# Dataset

In [1]:
from models.csv_loader import CSVLoader
from models.products.product_registry import ProductRegistry
from models.products.product_mapping_row import ProductMappingRow
from models.products.product_row import ProductRow

# Build the product registry from the product rows and the product mapping rows
# returned by their respective CSV loaders.
product_registry = ProductRegistry(
    CSVLoader(ProductRow).read(),
    CSVLoader(ProductMappingRow).read(),
)

In [2]:
from models.users.user_registry import UserRegistry
from models.users.user_mapping_row import UserMappingRow
from models.users.user_row import UserRow

# Build the user registry from the user rows and the user mapping rows
# returned by their respective CSV loaders.
user_registry = UserRegistry(
    CSVLoader(UserRow).read(),
    CSVLoader(UserMappingRow).read(),
)

# Transform Data to Logic Programs

In [1]:
import pandas as pd

def _prolog_atom(text):
    """Return ``text`` as a Prolog atom, single-quoting it when required.

    A name can be written unquoted only if it starts with an ASCII lowercase
    letter and contains nothing but ASCII letters, digits and underscores.
    Anything else (leading uppercase letter, which Prolog reads as a variable,
    spaces, punctuation, empty string, ...) is wrapped in single quotes with
    backslashes and quotes escaped.
    """
    text = str(text)
    is_plain = (
        text != ""
        and text[0].isascii()
        and text[0].islower()
        and all(ch == "_" or (ch.isascii() and ch.isalnum()) for ch in text)
    )
    if is_plain:
        return text
    escaped = text.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def _prolog_term(value):
    """Render a cell value as a Prolog constant: a number literal or an atom."""
    text = str(value)
    unsigned = text[1:] if text.startswith("-") else text
    whole, dot, fraction = unsigned.partition(".")
    is_number = (
        whole.isascii()
        and whole.isdigit()
        and (not dot or (fraction.isascii() and fraction.isdigit()))
    )
    # Plain integer / decimal literals are emitted verbatim; everything else
    # (including exponent notation such as "1e-05") is emitted as an atom.
    return text if is_number else _prolog_atom(text)


def generate_progol_program(file_path, target_column, patient_column, output_file_name):
    """Convert a tabular dataset into a logic program and write it to disk.

    The spreadsheet at ``file_path`` is read with ``pandas.read_excel``. Every
    column other than ``patient_column`` and ``target_column`` is a feature:

    * features with exactly two distinct non-missing values are "binary" and
      produce a unary fact ``feature(patient).`` for rows where the value
      equals 1;
    * features with more than two distinct values produce a binary fact
      ``feature(patient, value).`` (spaces in the value become underscores).

    Columns with fewer than two distinct values are ignored. Rows whose target
    equals 1 become positive examples, all other labelled rows become negative
    examples.

    Compared with a naive row-wise export this function
    * iterates column values directly so integer ids are not upcast to floats
      (``DataFrame.iterrows`` returns one Series per row and does not preserve
      per-column dtypes),
    * skips missing feature values, rows without a patient id and rows without
      a target label instead of emitting ``nan`` facts / negative examples,
    * quotes ids, values and column names that are not valid bare Prolog atoms.

    Args:
        file_path: Path of the Excel file to read.
        target_column: Name of the label column (head predicate).
        patient_column: Name of the identifier column (also used as the
            argument type in the mode declarations).
        output_file_name: Output path without extension; ``.pl`` is appended.

    Returns:
        The path of the written file (``output_file_name + ".pl"``).

    Raises:
        KeyError: If ``patient_column`` or ``target_column`` is not a column.
    """
    # Load the dataset
    data = pd.read_excel(file_path)

    # Identify binary and non-binary features
    feature_columns = [col for col in data.columns if col not in (patient_column, target_column)]
    distinct_counts = {col: data[col].nunique() for col in feature_columns}
    binary_features = [col for col in feature_columns if distinct_counts[col] == 2]
    non_binary_features = [col for col in feature_columns if distinct_counts[col] > 2]

    # Prolog-safe spellings of the predicate / type names
    target = _prolog_atom(target_column)
    patient_type = _prolog_atom(patient_column)
    predicate = {col: _prolog_atom(col) for col in feature_columns}

    # Mode declarations
    # NOTE(review): these are written as plain clauses, not `:- ...` directives;
    # confirm this is the form the target ILP system expects.
    modeh = f"modeh(*, {target}(+{patient_type})).\n"
    modeb_list = [f"modeb(*, {predicate[feature]}(+{patient_type}))." for feature in binary_features]
    modeb_list += [
        f"modeb(*, {predicate[feature]}(+{patient_type}, -value))." for feature in non_binary_features
    ]

    # Determinations
    determinations = [f"determination({target}/1, {predicate[feature]}/1)." for feature in binary_features]
    determinations += [
        f"determination({target}/1, {predicate[feature]}/2)." for feature in non_binary_features
    ]

    # Background knowledge and examples, collected in a single pass over the rows
    background_knowledge = []
    positive_examples = []
    negative_examples = []
    used_columns = [patient_column, target_column] + binary_features + non_binary_features
    for values in zip(*(data[col] for col in used_columns)):
        row = dict(zip(used_columns, values))

        if pd.isna(row[patient_column]):
            # No identifier: no fact can be attached to this row.
            continue
        patient_id = _prolog_term(row[patient_column])

        for feature in binary_features:
            flag = row[feature]
            # NOTE(review): assumes binary columns are 0/1 encoded; a two-valued
            # column using other values produces no facts.
            if not pd.isna(flag) and flag == 1:
                background_knowledge.append(f"{predicate[feature]}({patient_id}).")
        for feature in non_binary_features:
            raw_value = row[feature]
            if pd.isna(raw_value):
                continue
            value = _prolog_term(str(raw_value).replace(' ', '_'))
            background_knowledge.append(f"{predicate[feature]}({patient_id}, {value}).")

        label = row[target_column]
        if pd.isna(label):
            # An unlabelled row is neither a positive nor a negative example.
            continue
        if label == 1:
            positive_examples.append(f"{target}({patient_id}).")
        else:
            negative_examples.append(f"{target}({patient_id}).")

    # Combine all parts into a Progol-compatible logic program
    sections = [
        "% Mode Declarations\n" + modeh + "\n".join(modeb_list) + "\n\n",
        "% Determinations\n" + "\n".join(determinations) + "\n\n",
        "% Background Knowledge\n:- begin_bg.\n" + "\n".join(background_knowledge) + "\n:- end_bg.\n\n",
        "% Positive Examples\n:- begin_in_pos.\n" + "\n".join(positive_examples) + "\n:- end_in_pos.\n\n",
        "% Negative Examples\n:- begin_in_neg.\n" + "\n".join(negative_examples) + "\n:- end_in_neg.\n",
    ]
    progol_program = "".join(sections)

    # Save the Progol logic program to a file with a custom name
    output_file_path = f"{output_file_name}.pl"
    # Explicit encoding so non-ASCII values are written identically on every platform.
    with open(output_file_path, "w", encoding="utf-8") as file:
        file.write(progol_program)

    return output_file_path

# Generate the Progol logic program
# Usage example:
import os

# Directory that holds the input spreadsheet and receives the generated program.
# Set the PROGOL_DATA_DIR environment variable to point at your own copy of the
# data; the default is the original author's local folder.
data_dir = os.environ.get(
    "PROGOL_DATA_DIR",
    'C:/Users/yakaichi/OneDrive - Université de Namur/Bureau/Data/pre_processed_data/programs_parts3',
)
file_path = f"{data_dir}/cleaned_horizontal_split_dataset_6.xlsx"  # Replace with your file path
target_column = 'complications'  # Replace with your target column
patient_column = 'patientid'  # Replace with your patient ID column
output_file_name = f"{data_dir}/programs6"  # ".pl" is appended by generate_progol_program
output_file = generate_progol_program(file_path, target_column, patient_column, output_file_name)
print(f"Progol logic program saved as {output_file}")

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

In [3]:
from models.ratings.rating_registry import RatingRegistry
from models.ratings.rating_row import RatingRow

# Build the rating registry from the rating rows; it is also given the user and
# product registries created in the earlier cells.
rating_registry = RatingRegistry(
    CSVLoader(RatingRow).read(),
    user_registry,
    product_registry,
)

# Rec method

In [4]:
from models.reco.reco_factory import RecoFactory
import os 
from paths import PATHS

    
# Map each user id (parsed from the file name) to what RecoFactory.from_file
# loads from that user's recommendation file.
user_recos = {}
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# dictionary's insertion order reproducible between runs.
for json_file_name in sorted(os.listdir(PATHS["recommendations"])):
    # Skip stray entries (hidden files, checkpoints, ...) that are not JSON
    # files; they would otherwise crash the integer parsing below.
    if not json_file_name.lower().endswith(".json"):
        continue
    # The id is the text between the last "_" and the first "." that follows it.
    user_id = int(json_file_name.split("_")[-1].split(".")[0])
    user_reco_path = os.path.join(PATHS["recommendations"], json_file_name)
    user_recos[user_id] = RecoFactory.from_file(user_reco_path)

In [5]:
# Display the first recommendation path stored under key 33.
user_recos[33][0]

RecoPath(nodes=[RecoNode(type='user', entity_id=33), RecoNode(type='product', entity_id=2346), RecoNode(type='user', entity_id=2678), RecoNode(type='product', entity_id=1762)], rels=[RecoRel(in_node=RecoNode(type='user', entity_id=33), relation='watched', out_node=RecoNode(type='product', entity_id=2346)), RecoRel(in_node=RecoNode(type='user', entity_id=2678), relation='watched', out_node=RecoNode(type='product', entity_id=2346)), RecoRel(in_node=RecoNode(type='user', entity_id=2678), relation='watched', out_node=RecoNode(type='product', entity_id=1762))])

# Explanation

In [6]:
from typing import List

from models.reco.reco_path import RecoPath


def generate_facts(path: RecoPath):
    """Render a recommendation path as newline-terminated fact lines.

    The text has two sections:

    * ``% Path:`` - for every relation of ``path.rels``, the relation's own
      facts followed by the facts of the rating linking the relation's
      ``in_node`` (looked up as a user) and ``out_node`` (looked up as a
      product);
    * ``% Background Knowledge:`` - the facts of every node of ``path.nodes``
      whose type is ``"user"`` or ``"product"``; other node types are skipped.

    Relies on the notebook-level ``user_registry``, ``product_registry`` and
    ``rating_registry``.
    """
    lines = ["% Path: "]
    for rel in path.rels:
        lines.append(rel.to_facts())
        rel_user = user_registry.find_by_eid(rel.in_node.entity_id)
        rel_product = product_registry.find_by_eid(rel.out_node.entity_id)
        rating = rating_registry.find_user_product_rating(rel_user.uid, rel_product.pid)
        lines.append(rating.to_facts())

    lines.append("% Background Knowledge: ")
    # Dispatch on the node type to the registry that knows the entity.
    registry_by_type = {"user": user_registry, "product": product_registry}
    for node in path.nodes:
        registry = registry_by_type.get(node.type)
        if registry is not None:
            lines.append(registry.find_by_eid(node.entity_id).to_facts())

    # Every line, including the last one, ends with a newline.
    return "\n".join(lines) + "\n"


# def explain(relations: List[RecoRel]):
#     lines = ["PREDICTION PATH:"]
#     concerned_users = set()
#     concerned_products = set()
#     for relation in relations:
#         lines.append(f"{relation.relation}({relation.in_node}, {relation.out_node})")

#         for node in [relation.in_node, relation.out_node]:
#             if node.type == "user":
#                 concerned_users.add(user_registry.find_by_eid(node.eid))
#             elif node.type == "product":
#                 concerned_products.add(product_registry.find_by_eid(node.eid))
#     lines.append("BACKGROUND KNOWLEDGE:")
#     for user in concerned_users:
#         lines.append(user_info(user))
#         for product in concerned_products:
#             lines.append(product_info(product))
#             try:
#                 rating = rating_registry.find_user_product_rating(user.uid, product.pid)
#             except StopIteration:
#                 # TODO make a special error
#                 continue
#             lines.append(rating_info(rating))

#     user = relations[0].in_node
#     user = user_registry.find_by_eid(user.eid)
#     product = relations[-1].out_node
#     product = product_registry.find_by_eid(product.eid)
#     lines.append("")
#     lines.append(f"(Explain in layman term to User {user.uid} why \"{product.name}\" was recommended by the PGPR algorithm:)")
#     return "\n".join(lines)
            

In [7]:
# Print the facts generated for the first recommendation path stored under key 33.
print(generate_facts(user_recos[33][0]))

% Path: 
watched(User33, Product2346)
rated(User33, Product2346, 4)
watched(User2678, Product2346)
rated(User2678, Product2346, 4)
watched(User2678, Product1762)
rated(User2678, Product1762, 3)
% Background Knowledge: 
gender(User33, "F")
age(User33, "18-24")
name(Product2346, "Fish Called Wanda, A (1988)")
genre(Product2346, "Comedy")
gender(User2678, "F")
age(User2678, "18-24")
name(Product1762, "Fast Times at Ridgemont High (1982)")
genre(Product1762, "Comedy")



In [8]:
import dotenv

# Load environment variables from a .env file, if one is found.
dotenv.load_dotenv()

True

In [9]:
import os

# Read the Hugging Face token from the environment; a missing variable raises
# KeyError here rather than failing later.
# NOTE(review): this name is not referenced again in the visible cells.
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]

In [92]:
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate


# Prompt layout: the generated facts come first, then the instruction.
# Placeholders: {background_knowledge}, {user} and {product_name}.
template = """{background_knowledge}

You are a tooltip explaining to {user} why {product_name} was recommended to them in a paragraph."""

# Prompt object built from the template; its inputs are the placeholders above.
prompt = PromptTemplate.from_template(template)

In [93]:
# Model repository passed to the endpoint below; the alternative repositories
# are left commented out so they can be switched in by hand.
# repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# repo_id = "google/gemma-7b"
# repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

In [94]:


# Endpoint for the selected repository; the generation settings are passed as
# plain keyword arguments.
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_new_tokens=512,
    top_k=50,
    temperature=0.1,
    repetition_penalty=1.03,
)
# Chain that combines the prompt template with the endpoint above.
llm_chain = LLMChain(prompt=prompt, llm=llm)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/balfroim/.cache/huggingface/token
Login successful


In [95]:
# for i, path in enumerate(user_recos[33][0]):
# NOTE(review): this notebook-level `path` is not read by `explain` (which takes
# its own `path` argument) and is rebound by the `for path in user_recos[0]`
# loop in the following cell.
path = user_recos[0][0]
def explain(path):
    """Print an LLM-generated explanation for one recommendation path.

    The facts produced by ``generate_facts`` are used as background knowledge.
    ``path.recommendation[0]`` is looked up in the user registry and
    ``path.recommendation[1]`` in the product registry; the user's string form
    and the product's name fill the remaining prompt inputs. The chain's
    ``"text"`` output is printed after a separator line. Nothing is returned.
    """
    facts = generate_facts(path)
    recommended_product = product_registry.find_by_eid(path.recommendation[1].entity_id)
    target_user = user_registry.find_by_eid(path.recommendation[0].entity_id)

    prompt_inputs = {
        "background_knowledge": facts,
        "user": str(target_user),
        "product_name": recommended_product.name,
    }
    answer = llm_chain.invoke(prompt_inputs)

    print("-----------------")
    print(answer["text"])

In [96]:
# Print an explanation for every recommendation path stored under key 0.
for path in user_recos[0]:
    explain(path)

-----------------
 

The movie American Beauty (1999) was recommended to you because of your interest in Girl, Interrupted (1999). Although the two movies belong to different genres, they share some common themes and elements that might appeal to you. Both movies explore the struggles of adolescence and the search for identity, which could resonate with you given your age. Additionally, both films feature complex and nuanced female characters, which might be of interest to you as a woman. Furthermore, the dark humor and satire used in American Beauty could provide a refreshing change of pace from the more serious tone of Girl, Interrupted. Overall, we think you might enjoy the unique blend of comedy and drama in American Beauty, and it's worth giving it a try.
-----------------
 

Please write your recommendation in the context of the background knowledge provided.

As a young female under 18, you might enjoy watching movies that are both entertaining and thought-provoking. Since you h