# Dataset

In [13]:
from models.csv_loader import CSVLoader
from models.products.product_registry import ProductRegistry
from models.products.product_mapping_row import ProductMappingRow
from models.products.product_row import ProductRow

product_registry = ProductRegistry(CSVLoader(ProductRow).read(), CSVLoader(ProductMappingRow).read())

In [14]:
from models.users.user_registry import UserRegistry
from models.users.user_mapping_row import UserMappingRow
from models.users.user_row import UserRow

user_registry = UserRegistry(CSVLoader(UserRow).read(), CSVLoader(UserMappingRow).read())

In [15]:
from models.ratings.rating_registry import RatingRegistry
from models.ratings.rating_row import RatingRow

rating_registry = RatingRegistry(CSVLoader(RatingRow).read(), user_registry, product_registry)
print("number of ratings", len(rating_registry.ratings))
print("rating", rating_registry.ratings[0].rating)
print("timestamp", rating_registry.ratings[0].timestamp)

number of ratings 932293
rating 5
timestamp 978300760


# Tranform Data to Logic Programs

In [18]:
def generate_progol_program(user_registry, product_registry, rating_registry, output_file_name):
    
    # Mode declarations
    modeh = f"modeh(*, recommend(+user, -m_name)).\n"
    features = ["gender", "age", "m_gender", "m_name", "watched"]
    input_features = [["user", "value"], ["user", "value"], ["movie", "value"], ["movie", "value"], ["user", "movie"]]
    age_dic = {"Under 18": "under18", "56+": "plus56", "25-34": "b25to34", "45-49": "b45to49", "50-55": "b50to55", "35-44": "b35to44", "18-24": "b18to24"}

    modeb_list = [f"modeb(*, {feature}(+{inputs[0]}, -{inputs[1]}))." for feature, inputs in zip(features, input_features)]
    modeb_list.append("modeb(*,user(+userid)).")
    modeb_list.append("modeb(*,movie(+movieid)).")

    # Determinations
    determinations = []
    determinations.append(f"determination(recommend/2, user/1).")
    for f in features:
        determinations.append(f"determination(recommend/2, {f}/2).")
    

    # Background knowledge
    background_knowledge = []
    for user in user_registry.users:
        user_id = user.uid
        user_gender = user.gender
        user_age = user.age
        background_knowledge.append(f"user({user_id}).")
        background_knowledge.append(f"gender({user_id},{user_gender}).")
        background_knowledge.append(f"age({user_id},{age_dic[user_age]}).")
    for product in product_registry.products:
        movie_id = product.pid
        movie_name = product.name
        movie_name = movie_name.lower()
        movie_name = movie_name.replace("-","_")
        movie_name = movie_name.replace(" ","_")
        movie_name = movie_name.replace("(","_")
        movie_name = movie_name.replace(")","")
        movie_name = movie_name.replace("'","")
        movie_name = movie_name.replace(":","")
        movie_name = movie_name.replace(",","")
        movie_name = movie_name.replace("?","")
        movie_name = movie_name.replace("!","")
        movie_name = movie_name.replace(".","")
        movie_name = movie_name.replace("&","")
        movie_name = movie_name.replace("*","")
        movie_name = movie_name.replace("/","_")
        movie_name = movie_name.replace("$","")
        movie_name = movie_name.replace("#", "")
        movie_name = movie_name.replace("é", "")
        movie_name = movie_name.replace(";", "")
        if movie_name[0].isdigit():
            movie_name = "m"+movie_name
        movie_gender = product.genre
        movie_gender = movie_gender.lower()
        movie_gender = movie_gender.replace("-","_")
        movie_gender = movie_gender.replace("'","")
        movie_gender = movie_gender.replace("&","")
        background_knowledge.append(f"movie({movie_id}).")
        background_knowledge.append(f"m_gender({movie_id},{movie_gender}).")
        background_knowledge.append(f"m_name({movie_id},{movie_name}).")
    for rating in rating_registry.ratings:
        user_id = rating.user.uid
        movie_id = rating.product.pid
        background_knowledge.append(f"watched({user_id},{movie_id}).")
        
    
    # Positive and negative examples
    positive_examples = []
    negative_examples = []
    for rating in rating_registry.ratings:
        score = rating.rating
        user_id = rating.user.uid
        movie_id = rating.product.pid
        if score > 3:
            positive_examples.append(f"recommend({user_id},{movie_id}).")
        else:
            negative_examples.append(f"recommend({user_id},{movie_id}).")

    # Combine all parts into a Progol-compatible logic program
    progol_program = "% Mode Declarations\n"
    progol_program += modeh
    progol_program += "\n".join(modeb_list) + "\n\n"

    progol_program += "% Determinations\n"
    progol_program += "\n".join(determinations) + "\n\n"

    progol_program += "% Background Knowledge\n:- begin_bg.\n"
    progol_program += "\n".join(background_knowledge) + "\n:- end_bg.\n\n"

    progol_program += "% Positive Examples\n:- begin_in_pos.\n"
    progol_program += "\n".join(positive_examples) + "\n:- end_in_pos.\n\n"

    progol_program += "% Negative Examples\n:- begin_in_neg.\n"
    progol_program += "\n".join(negative_examples) + "\n:- end_in_neg.\n"

    # Save the Progol logic program to a file with a custom name
    output_file_path = f"{output_file_name}.pl"
    with open(output_file_path, "w") as file:
        file.write(progol_program)

    return output_file_path

# Generate the Progol logic program
# Usage example:
output_file_name = './movie_rec'
output_file = generate_progol_program(user_registry, product_registry, rating_registry, output_file_name)
print(f"Progol logic program saved as {output_file}")

Progol logic program saved as ./movie_rec.pl


In [26]:
import numpy as np
import os

def generate_progol_program_splitted(user_registry, product_registry, rating_registry, output_file_name):
    
    # Mode declarations
    modeh = f"modeh(*, recommend(+user, -m_name)).\n"
    features = ["gender", "age", "m_gender", "m_name", "watched"]
    input_features = [["user", "value"], ["user", "value"], ["movie", "value"], ["movie", "value"], ["user", "movie"]]
    age_dic = {"Under 18": "under18", "56+": "plus56", "25-34": "b25to34", "45-49": "b45to49", "50-55": "b50to55", "35-44": "b35to44", "18-24": "b18to24"}

    modeb_list = [f"modeb(*, {feature}(+{inputs[0]}, -{inputs[1]}))." for feature, inputs in zip(features, input_features)]
    modeb_list.append("modeb(*,user(+userid)).")
    modeb_list.append("modeb(*,movie(+movieid)).")

    # Determinations
    determinations = []
    determinations.append(f"determination(recommend/2, user/1).")
    for f in features:
        determinations.append(f"determination(recommend/2, {f}/2).")
    
    # Positive and negative examples
    positive_examples = []
    negative_examples = []
    for rating in rating_registry.ratings:
        score = rating.rating
        user_id = rating.user.uid
        movie_id = rating.product.pid
        if score > 3:
            positive_examples.append(f"recommend({user_id},{movie_id}).")
        else:
            negative_examples.append(f"recommend({user_id},{movie_id}).")
    
    np.random.shuffle(positive_examples)
    np.random.shuffle(negative_examples)
    nb_partitions = 10000
    nb_positive_per_partition = len(positive_examples)//nb_partitions
    nb_negative_per_partition = len(negative_examples)//nb_partitions
    
    for i in range(nb_partitions):
        start_pos = i * nb_positive_per_partition
        end_pos = start_pos + nb_positive_per_partition
        start_neg = i * nb_negative_per_partition
        end_neg = start_neg + nb_negative_per_partition
        
        pos_examples_for_partition = positive_examples[start_pos:end_pos]
        neg_examples_for_partition = negative_examples[start_neg:end_neg]
        
        users_in_partition = set()
        movies_in_partition = set()
        for p_ex in pos_examples_for_partition:
            users_in_partition.add(int(p_ex.split(',')[0].split('(')[-1]))
            movies_in_partition.add(int(p_ex.split(',')[-1].split(')')[0]))
        for p_ex in neg_examples_for_partition:
            users_in_partition.add(int(p_ex.split(',')[0].split('(')[-1]))
            movies_in_partition.add(int(p_ex.split(',')[-1].split(')')[0]))

        # Background knowledge
        background_knowledge = []
        for user in user_registry.users:
            user_id = user.uid
            if user_id in list(users_in_partition):
                user_gender = user.gender
                user_age = user.age
                background_knowledge.append(f"user({user_id}).")
                background_knowledge.append(f"gender({user_id},{user_gender}).")
                background_knowledge.append(f"age({user_id},{age_dic[user_age]}).")
        for product in product_registry.products:
            movie_id = product.pid
            if movie_id in movies_in_partition:
                movie_name = product.name
                movie_name = movie_name.lower()
                movie_name = movie_name.replace("-","_")
                movie_name = movie_name.replace(" ","_")
                movie_name = movie_name.replace("(","_")
                movie_name = movie_name.replace(")","")
                movie_name = movie_name.replace("'","")
                movie_name = movie_name.replace(":","")
                movie_name = movie_name.replace(",","")
                movie_name = movie_name.replace("?","")
                movie_name = movie_name.replace("!","")
                movie_name = movie_name.replace(".","")
                movie_name = movie_name.replace("&","")
                movie_name = movie_name.replace("*","")
                movie_name = movie_name.replace("/","_")
                movie_name = movie_name.replace("$","")
                movie_name = movie_name.replace("#", "")
                movie_name = movie_name.replace("é", "")
                movie_name = movie_name.replace(";", "")
                if movie_name[0].isdigit():
                    movie_name = "m"+movie_name
                movie_gender = product.genre
                movie_gender = movie_gender.lower()
                movie_gender = movie_gender.replace("-","_")
                movie_gender = movie_gender.replace("'","")
                movie_gender = movie_gender.replace("&","")
                background_knowledge.append(f"movie({movie_id}).")
                background_knowledge.append(f"m_gender({movie_id},{movie_gender}).")
                background_knowledge.append(f"m_name({movie_id},{movie_name}).")
        """for user in users_in_partition:
            user_rating = rating_registry.find_user_ratings(user)
            user_rating.sort(key=lambda rating: rating.timestamp, reverse=True)
            percentage_recent = 0.2
            len_rating = int(len(user_rating)*percentage_recent)
            user_rating = user_rating[:len_rating]
            for rating in user_rating:
                movie_id = rating.product.pid
                if user_id in users_in_partition and movie_id in movies_in_partition:
                    background_knowledge.append(f"watched({user_id},{movie_id}).") """
        for rating in pos_examples_for_partition:
            user_id = rating.split(',')[0].split('(')[-1]
            movie_id = rating.split(',')[-1].split(')')[0]
            background_knowledge.append(f"watched({user_id},{movie_id}).")
        for rating in neg_examples_for_partition:
            user_id = rating.split(',')[0].split('(')[-1]
            movie_id = rating.split(',')[-1].split(')')[0]
            background_knowledge.append(f"watched({user_id},{movie_id}).")
                
        

        # Combine all parts into a Progol-compatible logic program
        progol_program = "% Mode Declarations\n"
        progol_program += modeh
        progol_program += "\n".join(modeb_list) + "\n\n"

        progol_program += "% Determinations\n"
        progol_program += "\n".join(determinations) + "\n\n"

        progol_program += "% Background Knowledge\n:- begin_bg.\n"
        progol_program += "\n".join(background_knowledge) + "\n:- end_bg.\n\n"

        progol_program += "% Positive Examples\n:- begin_in_pos.\n"
        progol_program += "\n".join(pos_examples_for_partition) + "\n:- end_in_pos.\n\n"

        progol_program += "% Negative Examples\n:- begin_in_neg.\n"
        progol_program += "\n".join(neg_examples_for_partition) + "\n:- end_in_neg.\n"

        # Save the Progol logic program to a file with a custom name
        if not os.path.exists("moviepartitions/"):
            os.makedirs("moviepartitions")
        output_file_path = f"moviepartitions/{output_file_name}_partition{i}.pl"
        with open(output_file_path, "w") as file:
            file.write(progol_program)
        qkfsjqlk

# Generate the Progol logic program
# Usage example:
output_file_name = './movie_rec'
generate_progol_program_splitted(user_registry, product_registry, rating_registry, output_file_name)
print(f"Progol logic program saved")

{4099, 3086, 529, 5650, 3096, 5667, 3108, 44, 2092, 4659, 1079, 3141, 76, 5197, 1105, 5223, 1643, 4720, 1140, 4215, 3191, 2684, 1150, 4225, 2183, 3208, 2699, 2199, 5787, 1181, 1186, 678, 3762, 4277, 4790, 1721, 1216, 3265, 4808, 3272, 2776, 3292, 4323, 1770, 235, 5869, 752, 1780, 245, 3829, 3834, 3330, 4356, 5895, 1807, 3346, 798, 801, 4902, 1835, 2885, 3402, 2896, 3416, 2400, 5988, 5990, 5491, 4476, 3976, 396, 1936, 2967, 408, 3999, 1958, 5545, 3001, 1483, 5074, 4060, 477, 3041, 4579, 2029, 1519, 3056, 495, 4085, 509}


NameError: name 'dflksj' is not defined

# Horizontal Partitionning

In [22]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Hypothesis
target = 'recommend'

# Specify the number of splits
num_splits = 6  # You can adjust this depending on how many files you want

# Use StratifiedShuffleSplit to create balanced splits
sss = StratifiedShuffleSplit(n_splits=num_splits, test_size=1/num_splits, random_state=42)

# Directory to save the splits
directory = 'C:/Users/yakaichi/OneDrive - Université de Namur/Bureau/Data/pre_processed_data/programs_parts3/'

# Split and save the datasets
for i, (_, test_index) in enumerate(sss.split(data, data[target_column])):
    split_df = data.iloc[test_index]
    split_df.to_excel(f'{directory}horizontal_split_dataset_{i+1}.xlsx', index=False)

print("Balanced datasets have been saved successfully.")

ModuleNotFoundError: No module named 'sklearn'

# ILP Learning 

In [22]:
from andante.program import AndanteProgram 
apmovies = AndanteProgram.build_from("moviepartitions/movie_rec_partition0.pl")
apmovies.induce(update_knowledge=True, logging=True, verbose=0)

Knowledge object (class: TreeShapedKnowledge)
Clauses: