In [1]:
from models.csv_loader import CSVLoader
from models.products.product_registry import ProductRegistry
from models.products.product_mapping_row import ProductMappingRow
from models.products.product_row import ProductRow

# Build the product registry from the product rows and the product-mapping rows read through CSVLoader.
product_registry = ProductRegistry(CSVLoader(ProductRow).read(), CSVLoader(ProductMappingRow).read())

In [2]:
from models.users.user_registry import UserRegistry
from models.users.user_mapping_row import UserMappingRow
from models.users.user_row import UserRow

# Build the user registry from the user rows and the user-mapping rows.
# CSVLoader is imported in the first cell, so that cell must have been run before this one.
user_registry = UserRegistry(CSVLoader(UserRow).read(), CSVLoader(UserMappingRow).read())

In [3]:
from models.ratings.rating_registry import RatingRegistry
from models.ratings.rating_row import RatingRow

# Build the rating registry from the rating rows, passing in the user and product
# registries created by the two previous cells.
rating_registry = RatingRegistry(CSVLoader(RatingRow).read(), user_registry, product_registry)

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import scipy
import matplotlib.pyplot as plt

In [20]:
import pandas as pd

# Flatten each User object into one plain record (uid, gender, age).
user_data = [
    {
        "userid": user.uid,
        "gender": user.gender,
        "age": user.age,
    }
    for user in user_registry.users
]

# One row per user; the columns follow the record keys above.
users = pd.DataFrame(user_data)

# Show the first rows as the cell's rich output (the preview stored with this cell).
users.head()

   userid gender       age
0       1      F  Under 18
1       2      M       56+
2       3      M     25-34
3       4      M     45-49
4       5      M     25-34


In [None]:
# Preview only the first few ratings: the collection is iterated in full by the next
# cell, and echoing all of it here would flood the notebook output.
from itertools import islice

list(islice(rating_registry.ratings, 5))

In [30]:
# Flatten every Rating, together with the user and product it points to, into one record.
ratings_data = [
    dict(
        userid=entry.user.uid,
        gender=entry.user.gender,
        age=entry.user.age,
        productid=entry.product.pid,
        pname=entry.product.name,
        pgenre=entry.product.genre,
        rating=entry.rating,
        timestamp=entry.timestamp,
    )
    for entry in rating_registry.ratings
]

# One row per rating; column order follows the record keys.
ratings = pd.DataFrame(ratings_data)

# Preview the first five rows.
ratings.head()

Unnamed: 0,userid,gender,age,productid,pname,pgenre,rating,timestamp
0,1,F,Under 18,1193,One Flew Over the Cuckoo's Nest (1975),Drama,5,978300760
1,1,F,Under 18,661,James and the Giant Peach (1996),Animation,3,978302109
2,1,F,Under 18,914,My Fair Lady (1964),Musical,3,978301968
3,1,F,Under 18,3408,Erin Brockovich (2000),Drama,4,978300275
4,1,F,Under 18,2355,"Bug's Life, A (1998)",Animation,5,978824291


In [32]:
# Persist the full ratings frame as an Excel workbook; index=False leaves the row index out.
output_file = "ratings_dataset.xlsx"
ratings.to_excel(output_file, index=False)

print(f"Dataset successfully saved to {output_file}")


Dataset successfully saved to ratings_dataset.xlsx


In [77]:
# (number of rows, number of columns) of the ratings frame.
ratings.shape

(932293, 8)

In [33]:
# Draw 118 ratings at random; the fixed random_state makes the same rows come out on every run.
sample_df = ratings.sample(n=118, random_state=42)

# Saving the sample DataFrame to an Excel file
# (this workbook is the input of the partitioning cells further down).
output_file = "ratings_sample_dataset.xlsx"
sample_df.to_excel(output_file, index=False)

print(f"Sample dataset successfully saved to {output_file}")

Sample dataset successfully saved to ratings_sample_dataset.xlsx


# Divide Dataset

In [41]:
import pandas as pd
import numpy as np
import os

def generate_balanced_partitions(file_path, output_dir, num_partitions=5):
    """Split a ratings workbook into class-balanced partitions and write one Progol program each.

    Rows with ``rating > 3`` are positive examples and rows with ``rating <= 3`` are
    negative examples. Both groups are shuffled with a fixed seed and cut into
    ``num_partitions`` slices; partition ``i`` is the i-th positive slice followed by
    the i-th negative slice. The last partition also receives the rows left over by the
    integer division, so no rating is dropped.

    Args:
        file_path: Excel file readable by ``pd.read_excel``. The code here reads the
            ``rating`` column; the program writer additionally reads ``userid``,
            ``productid``, ``age``, ``gender`` and ``pgenre``.
        output_dir: Directory receiving ``prolog_program_partition_<k>.pl`` (k starts
            at 1). It is created when missing.
        num_partitions: Number of partitions to produce (default 5).

    Raises:
        ValueError: If ``num_partitions`` is smaller than 1.
    """
    if num_partitions < 1:
        raise ValueError("num_partitions must be at least 1")

    # Load the dataset
    data = pd.read_excel(file_path)

    # Age labels found in the data -> Prolog predicate names
    possible_ages = {
        "Under 18": "under_18",
        "18-24": "b18to24",
        "25-34": "b25to34",
        "35-44": "b35to44",
        "45-49": "b45to49",
        "50-55": "b50to55",
        "56+": "plus56"
    }

    # Genre labels found in the data -> Prolog predicate names
    # (genres missing from this table produce no genre fact downstream)
    possible_genres = {
        "Action": "action",
        "Adventure": "adventure",
        "Animation": "animation",
        "Comedy": "comedy",
        "Drama": "drama",
        "Sci-Fi": "sci_fi",
        "Romance": "romance",
        "Musical": "musical"
    }

    # Split the data into positive and negative examples
    positive_data = data[data['rating'] > 3].copy()
    negative_data = data[data['rating'] <= 3].copy()

    # Shuffle the data (fixed seed keeps the partitions reproducible)
    positive_data = positive_data.sample(frac=1, random_state=42).reset_index(drop=True)
    negative_data = negative_data.sample(frac=1, random_state=42).reset_index(drop=True)

    # Base size of each slice; the remainder goes to the last partition below
    pos_partition_size = len(positive_data) // num_partitions
    neg_partition_size = len(negative_data) // num_partitions

    # Create partitions
    partitions = []
    for i in range(num_partitions):
        pos_start = i * pos_partition_size
        neg_start = i * neg_partition_size
        if i == num_partitions - 1:
            # Last partition: run to the end so leftover rows are kept
            pos_end = len(positive_data)
            neg_end = len(negative_data)
        else:
            pos_end = pos_start + pos_partition_size
            neg_end = neg_start + neg_partition_size

        partition = pd.concat([
            positive_data.iloc[pos_start:pos_end],
            negative_data.iloc[neg_start:neg_end]
        ]).reset_index(drop=True)

        partitions.append(partition)

    # Generate Prolog programs for each partition
    os.makedirs(output_dir, exist_ok=True)

    for i, partition in enumerate(partitions):
        output_file_name = f"{output_dir}/prolog_program_partition_{i+1}"
        generate_progol_program_for_partition(partition, possible_ages, possible_genres, output_file_name)

def generate_progol_program_for_partition(partition, possible_ages, possible_genres, output_file_name):
    """Write ``<output_file_name>.pl``: a Progol-style program built from one ratings partition.

    Every row yields a ``recommend(u<userid>, m<productid>).`` example (positive when
    ``rating > 3``, negative otherwise) plus background facts for the user's age group,
    the user's lower-cased gender and the movie's genre. Age and genre facts are only
    emitted when the row's label is a key of ``possible_ages`` / ``possible_genres``.
    Mode declarations and determinations are emitted for every value of those two
    tables and for the gender predicates ``m`` and ``f``.

    Args:
        partition: DataFrame with the columns ``userid``, ``productid``, ``rating``,
            ``age``, ``gender`` and ``pgenre``.
        possible_ages: Mapping from age label to predicate name.
        possible_genres: Mapping from genre label to predicate name.
        output_file_name: Output path without the ``.pl`` extension.
    """
    pos_facts = []
    neg_facts = []
    bg_facts = set()

    # Turn each rating row into one example and up to three background facts
    for _, record in partition.iterrows():
        user_atom = f"u{record['userid']}"
        movie_atom = f"m{record['productid']}"
        score = record['rating']
        age_pred = possible_ages.get(record['age'], "").lower()
        gender_pred = record['gender'].lower()
        genre_pred = possible_genres.get(record['pgenre'], "").lower()

        example = f"recommend({user_atom}, {movie_atom})."
        target = pos_facts if score > 3 else neg_facts
        target.append(example)

        if age_pred:
            bg_facts.add(f"{age_pred}({user_atom}).")
        bg_facts.add(f"{gender_pred}({user_atom}).")
        if genre_pred:
            bg_facts.add(f"{genre_pred}({movie_atom}).")

    # Mode declarations: ages, then the two genders, then genres
    head_mode = "modeh(*, recommend(+user, +movie))."
    body_modes = []
    for age_name in possible_ages.values():
        body_modes.append(f"modeb(*, {age_name}(+user)).")
    body_modes.extend(["modeb(*, m(+user)).", "modeb(*, f(+user))."])
    for genre_name in possible_genres.values():
        body_modes.append(f"modeb(*, {genre_name}(+movie)).")

    # Determinations, in the same order as the mode declarations
    allowed = []
    for age_name in possible_ages.values():
        allowed.append(f"determination(recommend/2, {age_name}/1).")
    allowed.extend(["determination(recommend/2, m/1).", "determination(recommend/2, f/1)."])
    for genre_name in possible_genres.values():
        allowed.append(f"determination(recommend/2, {genre_name}/1).")

    # Assemble the program text section by section
    pieces = [
        "% Mode Declarations\n",
        head_mode + "\n",
        "\n".join(body_modes) + "\n\n",
        "% Determinations\n",
        "\n".join(allowed) + "\n\n",
        "% Background Knowledge\n:- begin_bg.\n",
        "\n".join(sorted(bg_facts)) + "\n:- end_bg.\n\n",
        "% Positive Examples\n:- begin_in_pos.\n",
        "\n".join(pos_facts) + "\n:- end_in_pos.\n\n",
        "% Negative Examples\n:- begin_in_neg.\n",
        "\n".join(neg_facts) + "\n:- end_in_neg.\n",
    ]
    program_text = "".join(pieces)

    # Save the program next to the requested base name
    output_file_path = f"{output_file_name}.pl"
    with open(output_file_path, "w") as file:
        file.write(program_text)

    print(f"Progol logic program saved as {output_file_path}")


# Usage example: partition the sample workbook written by the sampling cell above.
file_path = 'ratings_sample_dataset.xlsx'  # Replace with your actual file path
output_dir = './prolog_partitions'  # Directory to save the partitions
generate_balanced_partitions(file_path, output_dir)


Progol logic program saved as ./prolog_partitions/prolog_program_partition_1.pl
Progol logic program saved as ./prolog_partitions/prolog_program_partition_2.pl
Progol logic program saved as ./prolog_partitions/prolog_program_partition_3.pl
Progol logic program saved as ./prolog_partitions/prolog_program_partition_4.pl
Progol logic program saved as ./prolog_partitions/prolog_program_partition_5.pl


# Mapping of movie IDs (product ID ↔ knowledge-graph ID)

In [82]:
# Step 2: read the product mapping file into two lookup tables (one per direction).
# NOTE(review): each data line is parsed as "<rating_id>\t<new_id>". The table named
# *_product_to_kg is keyed by new_id and *_kg_to_product by rating_id — confirm that
# this direction is the one the names promise.
mapping_dict_product_to_kg = {}
mapping_dict_kg_to_product = {}
with open("results/ml1m/preprocessed/pgpr/mappings/product_mapping.txt", "r") as file:
    next(file)  # first line is the header
    for line in file:
        rating_id, new_id = line.strip().split("\t")
        mapping_dict_kg_to_product[int(rating_id)] = int(new_id)
        mapping_dict_product_to_kg[int(new_id)] = int(rating_id)

# Knowledge-graph triples; the first line of the file supplies the column names.
relations = pd.read_csv("results/ml1m/preprocessed/kg_final.txt", sep="\t", header=0)
# Relation id 4 is kept as the movie/actor edges and 9 as the movie/director edges
# (meaning taken from the variable names — verify against the relation table).
movies_actors = relations.loc[relations['relation'] == 4]
movies_directors = relations.loc[relations['relation'] == 9]



       entity_head  relation  entity_tail
61631          158         9         9652
61632         1460         9         8448
61633         2471         9         8976
61634         1276         9         5046
61635         1086         9         5381
...            ...       ...          ...
61993         2001         9         8976
61994         2064         9         7792
61995          773         9         9652
61996         1403         9         4952
61997          495         9        13690

[367 rows x 3 columns]


# Everything is encoded in here and working. 

In [98]:
import pandas as pd
import numpy as np
import os

def generate_balanced_partitions(file_path, output_dir, num_partitions):
    """Cut a ratings workbook into ``num_partitions`` balanced parts and write a program per part.

    Ratings above 3 form the positive pool, ratings of 3 or less the negative pool.
    Each pool is shuffled with seed 42 and sliced into equally sized chunks; the final
    chunk of each pool extends to the end of the pool so leftover rows are kept. Chunk
    ``k`` of both pools is concatenated and handed to
    ``generate_progol_program_for_partition`` with the base name
    ``<output_dir>/prolog_program_partition_<k>`` (k starts at 1).

    Args:
        file_path: Excel workbook passed to ``pd.read_excel``.
        output_dir: Target directory; created when it does not exist.
        num_partitions: Number of partitions to generate.
    """
    ratings_df = pd.read_excel(file_path)

    # Label -> predicate-name tables handed on to the program writer
    age_predicates = {
        "Under 18": "under_18",
        "18-24": "b18to24",
        "25-34": "b25to34",
        "35-44": "b35to44",
        "45-49": "b45to49",
        "50-55": "b50to55",
        "56+": "plus56"
    }

    genre_predicates = {
        "Action": "action",
        "Adventure": "adventure",
        "Animation": "animation",
        "Children's": "childrens",
        "Comedy": "comedy",
        "Crime": "crime",
        "Documentary": "documentary",
        "Drama": "drama",
        "Fantasy": "fantasy",
        "Film-Noir": "filmnoir",
        "Horror": "horror",
        "Musical": "musical",
        "Mystery": "mystery",
        "Romance": "romance",
        "Sci-Fi": "sci_fi",
        "Thriller": "thriller",
        "Western": "western",
        "War": "war"
    }

    gender_predicates = ["m", "f"]

    # Positive / negative pools, each shuffled reproducibly
    liked = ratings_df[ratings_df['rating'] > 3].copy()
    disliked = ratings_df[ratings_df['rating'] <= 3].copy()
    liked = liked.sample(frac=1, random_state=42).reset_index(drop=True)
    disliked = disliked.sample(frac=1, random_state=42).reset_index(drop=True)

    liked_chunk = len(liked) // num_partitions
    disliked_chunk = len(disliked) // num_partitions

    # Slice both pools in lockstep; the last slice absorbs the remainder
    final_index = num_partitions - 1
    chunks = []
    for idx in range(num_partitions):
        liked_lo = idx * liked_chunk
        disliked_lo = idx * disliked_chunk
        if idx == final_index:
            liked_hi = len(liked)
            disliked_hi = len(disliked)
        else:
            liked_hi = liked_lo + liked_chunk
            disliked_hi = disliked_lo + disliked_chunk

        combined = pd.concat([
            liked.iloc[liked_lo:liked_hi],
            disliked.iloc[disliked_lo:disliked_hi]
        ])
        chunks.append(combined.reset_index(drop=True))

    # Make sure the output directory is there before writing
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for number, chunk in enumerate(chunks, start=1):
        base_name = f"{output_dir}/prolog_program_partition_{number}"
        generate_progol_program_for_partition(chunk, age_predicates, genre_predicates, gender_predicates, base_name)

def generate_progol_program_for_partition(partition, possible_ages, possible_genres, possible_genders, output_file_name, seed=42):
    """Write ``<output_file_name>.pl``: a Progol-style program for one ratings partition.

    Per row, a ``recommend(u<userid>, m<productid>).`` example is created (positive when
    ``rating > 3``) together with background facts for age group, gender, genre and one
    ``movie_actor(<movie>,<actor>)`` fact per actor edge of the movie in the knowledge
    graph. Half of the positive and half of the negative examples (rounded down, chosen
    by a seeded shuffle) are moved into the background as ``watched/2`` facts, and one
    ``similar/2`` rule is added per genre present in the partition. Mode declarations
    and determinations cover only the predicates actually present, in sorted order, so
    the generated file is identical from run to run.

    Args:
        partition: DataFrame with the columns ``userid``, ``productid``, ``rating``,
            ``age``, ``gender`` and ``pgenre``.
        possible_ages: Mapping from age label to predicate name.
        possible_genres: Mapping from genre label to predicate name.
        possible_genders: Collection of accepted lower-case gender predicates.
        output_file_name: Output path without the ``.pl`` extension.
        seed: Seed for the example shuffle (default 42). Pass ``None`` for a
            different split on every call.

    Reads ``results/ml1m/preprocessed/pgpr/mappings/product_mapping.txt`` and
    ``results/ml1m/preprocessed/kg_final.txt`` relative to the working directory.
    """
    # NOTE(review): each mapping line is parsed as "<rating_id>\t<new_id>" and the table
    # is keyed by new_id; confirm that the partition's `productid` is that new_id.
    mapping_dict_product_to_kg = {}
    with open("results/ml1m/preprocessed/pgpr/mappings/product_mapping.txt", "r") as file:
        next(file, None)  # Skip the header (tolerates an empty file)
        for line in file:
            rating_id, new_id = line.strip().split("\t")
            mapping_dict_product_to_kg[str(new_id)] = str(rating_id)

    # Relation id 4 is used as the movie -> actor edge (per the original variable name).
    relations = pd.read_csv("results/ml1m/preprocessed/kg_final.txt", sep="\t", header=0)
    movies_actors = relations[relations['relation'] == 4]

    # Index the actor edges by movie entity once, instead of filtering the frame per row.
    actors_by_movie = {}
    for head, tail in zip(movies_actors['entity_head'], movies_actors['entity_tail']):
        actors_by_movie.setdefault(int(head), []).append(tail)

    # Prepare containers for Prolog facts
    positive_examples = []
    negative_examples = []
    background_knowledge = set()

    # Track the present predicates
    present_ages = set()
    present_genres = set()
    present_genders = set()

    # Process each row in the partition
    for _, row in partition.iterrows():
        user_id = f"u{row['userid']}"
        movie_id = f"m{row['productid']}"
        rating = row['rating']
        age_group = possible_ages.get(row['age'], "").lower()
        # str() keeps a missing gender from raising; it then fails the membership test below
        gender = str(row['gender']).lower()
        genre = possible_genres.get(row['pgenre'], "").lower()

        # Generate positive and negative examples
        if rating > 3:
            positive_examples.append(f"recommend({user_id}, {movie_id}).")
        else:
            negative_examples.append(f"recommend({user_id}, {movie_id}).")

        # Background knowledge: user attributes
        if age_group:
            background_knowledge.add(f"{age_group}({user_id}).")
            present_ages.add(age_group)
        if gender in possible_genders:
            background_knowledge.add(f"{gender}({user_id}).")
            present_genders.add(gender)

        # Background knowledge: movie genre
        if genre:
            background_knowledge.add(f"{genre}({movie_id}).")
            present_genres.add(genre)

        # Background knowledge: actors; movies without a mapping entry get no actor facts
        kg_entity = mapping_dict_product_to_kg.get(movie_id[1:])
        if kg_entity is None:
            continue
        for m_actor in actors_by_movie.get(int(kg_entity), ()):
            background_knowledge.add(f"movie_actor({movie_id},{m_actor}).")

    # Use a part of the recommendations as watched background knowledge
    rng = np.random.default_rng(seed)
    rng.shuffle(positive_examples)
    rng.shuffle(negative_examples)
    pos_half = len(positive_examples) // 2
    neg_half = len(negative_examples) // 2
    watched = positive_examples[:pos_half] + negative_examples[:neg_half]
    positive_examples = positive_examples[pos_half:]
    negative_examples = negative_examples[neg_half:]
    watched = [m.replace('recommend', 'watched') for m in watched]
    background_knowledge.update(watched)

    # Two users are similar when each watched some movie of the same genre
    background_knowledge.update(
        f"similar(A,B):- watched(A,X), {genre}(X), {genre}(Y), watched(B,Y)."
        for genre in present_genres
    )

    # Sorted so the declaration order does not depend on set iteration order
    ages = sorted(present_ages)
    genders = sorted(present_genders)
    genres = sorted(present_genres)

    # Mode declarations
    modeh_declaration = "modeh(*, recommend(+user, +movie))."
    modeb_declarations = (
        [f"modeb(*, {age}(+user))." for age in ages]
        + ["modeb(*, watched(+user, +movie))."]
        + [f"modeb(*, {gender}(+user))." for gender in genders]
        + [f"modeb(*, {genre}(+movie))." for genre in genres]
        + ["modeb(*, similar(+user, +user))."]
        + ["modeb(*, movie_actor(+movie, -actor))."]
    )

    # Determinations
    determinations = (
        [f"determination(recommend/2, {age}/1)." for age in ages]
        + ["determination(recommend/2, watched/2)."]
        + [f"determination(recommend/2, {gender}/1)." for gender in genders]
        + [f"determination(recommend/2, {genre}/1)." for genre in genres]
        + ["determination(recommend/2, similar/2)."]
        + ["determination(recommend/2, movie_actor/2)."]
    )

    # Combine all parts into a Progol-compatible logic program
    progol_program = "% Mode Declarations\n"
    progol_program += modeh_declaration + "\n"
    progol_program += "\n".join(modeb_declarations) + "\n\n"

    progol_program += "% Determinations\n"
    progol_program += "\n".join(determinations) + "\n\n"

    progol_program += "% Background Knowledge\n:- begin_bg.\n"
    progol_program += "\n".join(sorted(background_knowledge)) + "\n:- end_bg.\n\n"

    progol_program += "% Positive Examples\n:- begin_in_pos.\n"
    progol_program += "\n".join(positive_examples) + "\n:- end_in_pos.\n\n"

    progol_program += "% Negative Examples\n:- begin_in_neg.\n"
    progol_program += "\n".join(negative_examples) + "\n:- end_in_neg.\n"

    # Save the Progol logic program to a file with a custom name
    output_file_path = f"{output_file_name}.pl"
    with open(output_file_path, "w") as file:
        file.write(progol_program)

    print(f"Progol logic program saved as {output_file_path}")

# Usage example: the partition count is also appended to the output directory name,
# so runs with different counts do not overwrite each other.
num_partitions = 5
file_path = 'ratings_sample_dataset.xlsx'  # Replace with your actual file path
output_dir = './prolog_partitions_'+str(num_partitions)  # Directory to save the partitions
generate_balanced_partitions(file_path, output_dir, num_partitions)


934
{'0': '761', '1': '1080', '2': '704', '3': '2965', '4': '2393', '5': '3325', '6': '2103', '7': '256', '8': '2475', '9': '3612', '10': '2347', '11': '1250', '12': '764', '13': '2471', '14': '663', '15': '423', '16': '177', '17': '1256', '18': '3513', '19': '937', '20': '84', '21': '1552', '22': '1066', '23': '3401', '24': '2392', '25': '864', '26': '2110', '27': '2844', '28': '1257', '29': '3723', '30': '1954', '31': '1608', '32': '3444', '33': '1057', '34': '3675', '35': '3951', '36': '1750', '37': '2241', '38': '307', '39': '3022', '40': '733', '41': '468', '42': '1927', '43': '146', '44': '160', '45': '1289', '46': '931', '47': '238', '48': '2723', '49': '1659', '50': '2973', '51': '2007', '52': '2288', '53': '1569', '54': '2658', '55': '2662', '56': '2173', '57': '674', '58': '1426', '59': '3531', '60': '2501', '61': '3798', '62': '141', '63': '631', '64': '3443', '65': '153', '66': '270', '67': '2829', '68': '1263', '69': '2470', '70': '76', '71': '458', '72': '352', '73': '548

NameError: name 'dklsj' is not defined

In [61]:
from andante.program import AndanteProgram 
# Load the program written for partition 5 and run Andante's induction on it.
apmovies = AndanteProgram.build_from("prolog_partitions_5/prolog_program_partition_5.pl")
H = apmovies.induce(update_knowledge=True, logging=True, verbose=0)
# Display the clauses of the induced result.
H.clauses

OrderedSet([])

In [22]:
# Re-display the clauses currently held by H (H is whatever the last induction cell left behind).
H.clauses

OrderedSet([recommend(A, B) :- b25to34(A).])

# Combine rules from different partitions 

In [137]:
import collections
from andante.program import AndanteProgram
from andante.collections import OrderedSet
from andante.logic_concepts import Clause

from andante.knowledge import TreeShapedKnowledge
# Directory that holds the per-partition Prolog programs
prolog_directory = "prolog_partitions_six"

# Paths of the six partition programs (files are numbered from 1)
partition_files = [
    f"{prolog_directory}/prolog_program_partition_{number}.pl"
    for number in range(1, 7)
]

# Union of the rules induced from every partition, without duplicates
all_rules = OrderedSet()

for partition_file in partition_files:
    print(f"Processing {partition_file}...")
    ap = AndanteProgram.build_from(partition_file)
    induced_knowledge = ap.induce(update_knowledge=True, logging=True, verbose=0)

    # Anything other than a TreeShapedKnowledge is reported and skipped
    if not isinstance(induced_knowledge, TreeShapedKnowledge):
        print(f"Unexpected type for induced_rules: {type(induced_knowledge)}")
        continue

    # Keep only entries that are Clause objects
    for clause in induced_knowledge.clauses:
        if isinstance(clause, Clause):
            all_rules.add(clause)

# Show the combined rules
for rule in all_rules:
    print(rule)

# Store them one per line
with open("combined_rules.txt", "w") as f:
    f.writelines(str(rule) + "\n" for rule in all_rules)

print("Combined rules saved to combined_rules.txt")

Processing prolog_partitions_six/prolog_program_partition_1.pl...
Processing prolog_partitions_six/prolog_program_partition_2.pl...
Processing prolog_partitions_six/prolog_program_partition_3.pl...
Processing prolog_partitions_six/prolog_program_partition_4.pl...
Processing prolog_partitions_six/prolog_program_partition_5.pl...
Processing prolog_partitions_six/prolog_program_partition_6.pl...
recommend(A, B) :- b25to34(A), comedy(B).
recommend(A, B) :- crime(B).
recommend(A, B) :- f(A).
recommend(A, B) :- b25to34(A), action(B).
recommend(A, B) :- b45to49(A).
recommend(A, B) :- b18to24(A), drama(B).
recommend(A, B) :- horror(B).
recommend(A, B) :- b25to34(A), f(A).
recommend(A, B) :- adventure(B).
recommend(A, B) :- m(A), drama(B).
Combined rules saved to combined_rules.txt


# Apply Union with normalization and unification

In [40]:
import collections
from andante.program import AndanteProgram
from andante.collections import OrderedSet
from andante.logic_concepts import Clause, Atom, Variable, Predicate
from andante.knowledge import TreeShapedKnowledge

# Define the directory containing the Prolog partition files
prolog_directory = "prolog_partitions_six"

# List of partition file names
# (files are numbered from 1: prolog_program_partition_1.pl ... prolog_program_partition_6.pl)
partition_files = [
    f"{prolog_directory}/prolog_program_partition_{i+1}.pl"
    for i in range(6)  # Assuming 6 partitions, adjust as needed
]

# Initialize an OrderedSet to hold all unique rules
all_rules = OrderedSet()

# Function to normalize and unify clauses
def normalize_clause(clause):
    """Return a copy of ``clause`` with a sorted body and canonically renamed variables.

    Variables are renamed ``A``, ``B``, ``C``, ... in order of first appearance, scanning
    the head first and then the body literals sorted by their string form. Naming the
    head first guarantees that every variable of the clause receives a fresh name: when
    only body variables are named, a head variable that is absent from the body keeps
    its old name and can collide with a renamed one (``recommend(A, B) :- crime(B)``
    would turn into ``recommend(A, A) :- crime(A)``).

    NOTE(review): body atoms are iterated directly while the head is read through
    ``.terms``; both, and the ``Atom(predicate, terms)`` constructor, are kept as
    written and should be checked against the andante API.

    Args:
        clause: Clause exposing ``head`` and ``body``.

    Returns:
        A new ``Clause`` built from the renamed head and the sorted, renamed body.
    """
    var_mapping = {}

    def rename(term):
        # Non-variable terms pass through; each distinct variable gets the next letter.
        if not isinstance(term, Variable):
            return term
        if term not in var_mapping:
            var_mapping[term] = Variable(chr(ord('A') + len(var_mapping)))
        return var_mapping[term]

    # Head first, so its variables claim the first canonical names
    new_head = clause.head
    if clause.head:
        new_head = Atom(clause.head.predicate, [rename(term) for term in clause.head.terms])

    # Body literals in a stable order (sorted on their original string form)
    sorted_body = sorted(clause.body, key=lambda atom: str(atom))
    new_body = [
        Atom(atom.predicate, [rename(term) for term in atom])
        for atom in sorted_body
    ]

    return Clause(new_head, new_body)

# Iterate over each partition file and induce rules
for partition_file in partition_files:
    print(f"Processing {partition_file}...")
    # Parse the partition program and run induction on it
    ap = AndanteProgram.build_from(partition_file)
    induced_knowledge = ap.induce(update_knowledge=True, logging=True, verbose=0)

    # Only TreeShapedKnowledge results are mined for clauses; anything else is reported
    if not isinstance(induced_knowledge, TreeShapedKnowledge):
        print(f"Unexpected type for induced_rules: {type(induced_knowledge)}")
        continue

    for clause in induced_knowledge.clauses:
        if not isinstance(clause, Clause):
            continue
        normalized_clause = normalize_clause(clause)
        all_rules.add(normalized_clause)

# Function to check for redundancy and remove duplicates
def remove_redundancy(rules):
    """Return an OrderedSet holding each rule of ``rules`` once, in first-seen order."""
    kept = OrderedSet()
    for candidate in rules:
        if candidate in kept:
            continue
        kept.add(candidate)
    return kept

# Remove redundancy from all_rules
all_rules = remove_redundancy(all_rules)

# Output the combined, normalized, and unique rules
for rule in all_rules:
    print(rule)

# Save the combined rules, one per line. The path is named once so that the
# confirmation message below always reports the file that was actually written.
combined_rules_path = "combined_rules.txt"
with open(combined_rules_path, "w") as f:
    for rule in all_rules:
        f.write(str(rule) + "\n")

print(f"Combined, normalized, and unique rules saved to {combined_rules_path}")

Processing prolog_partitions_six/prolog_program_partition_1.pl...


SyntaxError: Failed to parse rule <compoundterm = word __ '(' __ term __ (',' __ term __)* ')'> (prolog_program_partition_1.pl, line 43)

In [135]:
import collections
from andante.program import AndanteProgram
from andante.collections import OrderedSet
from andante.logic_concepts import Clause, Atom, Variable
from andante.knowledge import TreeShapedKnowledge

# Define the directory containing the Prolog partition files
prolog_directory = "prolog_partitions_six"

# List of partition file names
# (files are numbered from 1: prolog_program_partition_1.pl ... prolog_program_partition_6.pl)
partition_files = [
    f"{prolog_directory}/prolog_program_partition_{i+1}.pl"
    for i in range(6)  # Adjust the number of partitions as needed
]

# Initialize an OrderedSet to hold all unique rules
all_rules = OrderedSet()

# Function to clean the predicate's string representation
def clean_predicate_str(pred):
    """Return ``str(pred)`` with every occurrence of the ``/1/1`` suffix artefact removed."""
    return str(pred).replace('/1/1', '')

# Function to normalize and unify clauses
def normalize_clause(clause, var_start='A'):
    """Return a copy of ``clause`` with a sorted body, cleaned predicates and renamed variables.

    Variables are renamed to consecutive letters starting at ``var_start`` in order of
    first appearance, scanning the head first and then the body literals sorted by
    their string form. Naming the head first guarantees that every variable receives a
    fresh name: when only body variables are named, a head variable that is absent from
    the body keeps its old name and can collide with a renamed one
    (``recommend(A, B) :- crime(B)`` would turn into ``recommend(A, A) :- crime(A)``).
    Predicates are passed through ``clean_predicate_str`` before the atoms are rebuilt.

    NOTE(review): the saved run of this cell stopped with
    ``AttributeError: 'Predicate' object has no attribute 'terms'``. The attribute
    names used here (``.terms``, ``.predicate``) and the ``Atom(predicate, terms)``
    constructor are kept as written and still need checking against the andante API.

    Args:
        clause: Clause exposing ``head`` and ``body``.
        var_start: First letter used for the canonical variable names (default ``'A'``).

    Returns:
        A new ``Clause`` built from the renamed head and the sorted, renamed body.
    """
    var_mapping = {}

    def canonical(term):
        # Non-variable terms pass through; each distinct variable gets the next letter.
        if not isinstance(term, Variable):
            return term
        if term not in var_mapping:
            var_mapping[term] = Variable(chr(ord(var_start) + len(var_mapping)))
        return var_mapping[term]

    # Head first, so its variables claim the first canonical names
    new_head = clause.head
    if clause.head:
        new_head = Atom(
            clean_predicate_str(clause.head.predicate),
            [canonical(term) for term in clause.head.terms],
        )

    # Body literals in a stable order (sorted on their original string form)
    sorted_body = sorted(clause.body, key=lambda atom: str(atom))
    new_body = [
        Atom(clean_predicate_str(atom.predicate), [canonical(term) for term in atom.terms])
        for atom in sorted_body
    ]

    return Clause(new_head, new_body)

# Iterate over each partition file and induce rules
for partition_file in partition_files:
    print(f"Processing {partition_file}...")
    # Parse the partition program and run induction on it
    ap = AndanteProgram.build_from(partition_file)
    induced_knowledge = ap.induce(update_knowledge=True, logging=True, verbose=0)

    # Only TreeShapedKnowledge results are mined for clauses; anything else is reported
    if not isinstance(induced_knowledge, TreeShapedKnowledge):
        print(f"Unexpected type for induced_rules: {type(induced_knowledge)}")
        continue

    for clause in induced_knowledge.clauses:
        if not isinstance(clause, Clause):
            continue
        normalized_clause = normalize_clause(clause)
        all_rules.add(normalized_clause)

# Function to check for redundancy and remove duplicates
def remove_redundancy(rules):
    """Normalize every rule and return the distinct results as an OrderedSet, in first-seen order."""
    kept = OrderedSet()
    for candidate in rules:
        canonical_rule = normalize_clause(candidate)
        if canonical_rule in kept:
            continue
        kept.add(canonical_rule)
    return kept

# Normalize once more and drop duplicate rules
all_rules = remove_redundancy(all_rules)

# Show the final rule set
for rule in all_rules:
    print(rule)

# Store the rules one per line
with open("combined_rules.txt", "w") as f:
    f.writelines(str(rule) + "\n" for rule in all_rules)

print("Combined, normalized, and unique rules saved to combined_rules.txt")


Processing prolog_partitions_six/prolog_program_partition_1.pl...
Processing prolog_partitions_six/prolog_program_partition_2.pl...


AttributeError: 'Predicate' object has no attribute 'terms'

# Query 

# Test with the encoding of negated facts (`not_age`, `not_genre`)

In [85]:
import pandas as pd
import numpy as np
import os

def generate_balanced_partitions(file_path, output_dir, num_partitions=6):
    """Split a ratings workbook into class-balanced partitions and write one Progol program each.

    Rows with ``rating > 3`` are positive examples and rows with ``rating <= 3`` are
    negative examples. Both groups are shuffled with a fixed seed and cut into
    ``num_partitions`` slices; partition ``i`` is the i-th positive slice followed by
    the i-th negative slice, and the last partition also takes the rows left over by
    the integer division.

    Args:
        file_path: Excel file readable by ``pd.read_excel``.
        output_dir: Directory receiving ``prolog_program_partition_<k>.pl`` (k starts
            at 1). It is created when missing.
        num_partitions: Number of partitions to produce. Defaults to 6, the value that
            used to be hard-coded here.

    Raises:
        ValueError: If ``num_partitions`` is smaller than 1.
    """
    if num_partitions < 1:
        raise ValueError("num_partitions must be at least 1")

    # Load the dataset
    data = pd.read_excel(file_path)

    # Define all possible categories for age, genres, and gender
    possible_ages = {
        "Under 18": "under_18",
        "18-24": "b18to24",
        "25-34": "b25to34",
        "35-44": "b35to44",
        "45-49": "b45to49",
        "50-55": "b50to55",
        "56+": "plus56"
    }

    possible_genres = {
        "Action": "action",
        "Adventure": "adventure",
        "Animation": "animation",
        "Children's": "childrens",
        "Comedy": "comedy",
        "Crime": "crime",
        "Documentary": "documentary",
        "Drama": "drama",
        "Fantasy": "fantasy",
        "Film-Noir": "filmnoir",
        "Horror": "horror",
        "Musical": "musical",
        "Mystery": "mystery",
        "Romance": "romance",
        "Sci-Fi": "sci_fi",
        "Thriller": "thriller",
        "Western": "western",
        "War": "war"
    }

    possible_genders = ["m", "f"]

    # Split the data into positive and negative examples
    positive_data = data[data['rating'] > 3].copy()
    negative_data = data[data['rating'] <= 3].copy()

    # Shuffle the data (fixed seed keeps the partitions reproducible)
    positive_data = positive_data.sample(frac=1, random_state=42).reset_index(drop=True)
    negative_data = negative_data.sample(frac=1, random_state=42).reset_index(drop=True)

    # Base size of each slice; the remainder goes to the last partition below
    pos_partition_size = len(positive_data) // num_partitions
    neg_partition_size = len(negative_data) // num_partitions

    # Ensure that the partitions are balanced
    partitions = []
    for i in range(num_partitions):
        pos_start = i * pos_partition_size
        pos_end = pos_start + pos_partition_size
        neg_start = i * neg_partition_size
        neg_end = neg_start + neg_partition_size

        # The last partition runs to the end so leftover rows are kept
        if i == num_partitions - 1:
            pos_end = len(positive_data)
            neg_end = len(negative_data)

        partition = pd.concat([
            positive_data.iloc[pos_start:pos_end],
            negative_data.iloc[neg_start:neg_end]
        ]).reset_index(drop=True)

        partitions.append(partition)

    # Generate Prolog programs for each partition
    os.makedirs(output_dir, exist_ok=True)

    for i, partition in enumerate(partitions):
        output_file_name = f"{output_dir}/prolog_program_partition_{i+1}"
        generate_progol_program_for_partition(partition, possible_ages, possible_genres, possible_genders, output_file_name)

def generate_progol_program_for_partition(partition, possible_ages, possible_genres, possible_genders, output_file_name):
    """Write one partition of ratings to disk as a Progol-style logic program.

    Every row contributes a ``recommend(u<userid>, m<productid>).`` example
    (positive when ``rating > 3``, negative otherwise) plus unary background
    facts for the user's age group, the user's gender and the movie's genre.
    For each user/movie with at least one recognised age group/genre, a
    ``not_<category>`` fact is added for every category of the mapping that
    the entity does *not* have.

    Args:
        partition: DataFrame with the columns ``userid``, ``productid``,
            ``rating``, ``age``, ``gender`` and ``pgenre``.
        possible_ages: dict mapping raw ``age`` values to predicate names.
        possible_genres: dict mapping raw ``pgenre`` values to predicate names.
        possible_genders: collection of accepted (lower-case) gender predicates.
        output_file_name: output path without extension; ``.pl`` is appended.

    Side effects:
        Writes ``<output_file_name>.pl`` and prints the path that was written.
    """
    positive_examples = []
    negative_examples = []
    background_knowledge = set()

    # Categories seen per entity, gathered in the same pass as the examples so
    # the partition is scanned once instead of once per present category.
    ages_by_user = {}
    genres_by_movie = {}
    present_genders = set()

    for _, row in partition.iterrows():
        user_id = f"u{row['userid']}"
        movie_id = f"m{row['productid']}"
        age_group = possible_ages.get(row['age'], "").lower()
        # str() lets a missing gender (e.g. NaN) fall through as "not a known
        # gender" instead of raising AttributeError on .lower().
        gender = str(row['gender']).lower()
        genre = possible_genres.get(row['pgenre'], "").lower()

        example = f"recommend({user_id}, {movie_id})."
        if row['rating'] > 3:
            positive_examples.append(example)
        else:
            negative_examples.append(example)

        if age_group:
            background_knowledge.add(f"{age_group}({user_id}).")
            ages_by_user.setdefault(user_id, set()).add(age_group)

        if gender in possible_genders:
            background_knowledge.add(f"{gender}({user_id}).")
            present_genders.add(gender)

        if genre:
            background_knowledge.add(f"{genre}({movie_id}).")
            genres_by_movie.setdefault(movie_id, set()).add(genre)

    # Negated facts are computed per entity against the full set of categories
    # that entity has in this partition, so `x(e).` and `not_x(e).` can never
    # both be emitted. Names are lower-cased exactly like the positive facts so
    # they line up with the `not_<category>` mode declarations below.
    all_ages = {age.lower() for age in possible_ages.values()}
    for user_id, user_ages in ages_by_user.items():
        for other_age in all_ages - user_ages:
            background_knowledge.add(f"not_{other_age}({user_id}).")

    all_genres = {genre.lower() for genre in possible_genres.values()}
    for movie_id, movie_genres in genres_by_movie.items():
        for other_genre in all_genres - movie_genres:
            background_knowledge.add(f"not_{other_genre}({movie_id}).")

    # Sorted so the generated file is identical from run to run (plain set
    # iteration order of strings is not stable across interpreter sessions).
    present_ages = sorted(set().union(*ages_by_user.values()))
    present_genres = sorted(set().union(*genres_by_movie.values()))
    gender_names = sorted(present_genders)

    # (predicate name, argument type) for every learnable body predicate, in
    # the section order: ages, genders, genres, negated ages, negated genres.
    body_predicates = (
        [(age, "user") for age in present_ages]
        + [(gender, "user") for gender in gender_names]
        + [(genre, "movie") for genre in present_genres]
        + [(f"not_{age}", "user") for age in present_ages]
        + [(f"not_{genre}", "movie") for genre in present_genres]
    )

    modeh_declaration = "modeh(*, recommend(+user, +movie))."
    modeb_declarations = [
        f"modeb(*, {name}(+{arg_type}))." for name, arg_type in body_predicates
    ]
    determinations = [
        f"determination(recommend/2, {name}/1)." for name, _ in body_predicates
    ]

    # Combine all parts into a Progol-compatible logic program
    progol_program = "% Mode Declarations\n"
    progol_program += modeh_declaration + "\n"
    progol_program += "\n".join(modeb_declarations) + "\n\n"

    progol_program += "% Determinations\n"
    progol_program += "\n".join(determinations) + "\n\n"

    progol_program += "% Background Knowledge\n:- begin_bg.\n"
    progol_program += "\n".join(sorted(background_knowledge)) + "\n:- end_bg.\n\n"

    progol_program += "% Positive Examples\n:- begin_in_pos.\n"
    progol_program += "\n".join(positive_examples) + "\n:- end_in_pos.\n\n"

    progol_program += "% Negative Examples\n:- begin_in_neg.\n"
    progol_program += "\n".join(negative_examples) + "\n:- end_in_neg.\n"

    # Save the Progol logic program to a file with a custom name
    output_file_path = f"{output_file_name}.pl"
    with open(output_file_path, "w") as file:
        file.write(progol_program)

    print(f"Progol logic program saved as {output_file_path}")

# Usage example: split the input file into partitions and write one program per
# partition as <output_dir>/prolog_program_partition_<n>.pl.
file_path = 'ratings_sample_dataset.xlsx'  # Replace with your actual file path
output_dir = './prolog_partitions'  # Directory to save the partitions (created if it does not exist)
generate_balanced_partitions(file_path, output_dir)


Progol logic program saved as ./prolog_partitions/prolog_program_partition_1.pl
Progol logic program saved as ./prolog_partitions/prolog_program_partition_2.pl
Progol logic program saved as ./prolog_partitions/prolog_program_partition_3.pl
Progol logic program saved as ./prolog_partitions/prolog_program_partition_4.pl
Progol logic program saved as ./prolog_partitions/prolog_program_partition_5.pl
Progol logic program saved as ./prolog_partitions/prolog_program_partition_6.pl


In [90]:
from andante.program import AndanteProgram 
# Load the program generated for partition 3; the relative path points into the
# ./prolog_partitions directory used as output_dir when the files were written.
apmovies = AndanteProgram.build_from("prolog_partitions/prolog_program_partition_3.pl")
# NOTE(review): presumably runs rule induction over the loaded examples and, with
# update_knowledge=True, stores the learned clauses on the program — confirm
# against the Andante documentation. The cell displays the returned knowledge object.
apmovies.induce(update_knowledge=True, logging=True, verbose=0)

Knowledge object (class: TreeShapedKnowledge)
Clauses: