In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from itertools import chain, combinations
from typing import List

In [2]:
# Load the movie catalogue, preview its first rows and report its size.
movies_df = pd.read_csv('../ml-25m/movies.csv')
print("Movies data loaded. Sample:", movies_df.head(), sep="\n")

total_movies = movies_df.shape[0]
print(f"Total number of movies: {total_movies}")

# Load the ratings table the same way: preview first, then the row count.
ratings_df = pd.read_csv('../ml-25m/ratings.csv')
print("Ratings data loaded. Sample:", ratings_df.head(), sep="\n")

total_ratings = ratings_df.shape[0]
print(f"Total number of ratings: {total_ratings}")

Movies data loaded. Sample:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
Total number of movies: 62423
Ratings data loaded. Sample:
   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510
Total number of ratings: 25000095


In [3]:
# A rating of 4.0 or higher is treated as a "like"; everything else is dropped.
liked_df = ratings_df.loc[ratings_df['rating'].ge(4.0)].copy()

# Distinct users / movies that appear in the liked data, in order of first appearance.
user_ids = liked_df['userId'].unique()
movie_ids = liked_df['movieId'].unique()

# Real ID -> position in the matrix (row for users, column for movies).
user_id_map = dict(zip(user_ids, range(len(user_ids))))
movieId_to_idx = dict(zip(movie_ids, range(len(movie_ids))))

# Column position -> real movieId (inverse of movieId_to_idx).
idx_to_movieId = dict(enumerate(movie_ids))

# movieId -> title lookup taken from the movie catalogue.
movieId_to_title = dict(zip(movies_df['movieId'], movies_df['title']))

# Translate every liked (userId, movieId) pair into (row, column) coordinates.
row_indices = liked_df['userId'].map(user_id_map)
col_indices = liked_df['movieId'].map(movieId_to_idx)
data = [1] * len(liked_df)  # one entry of value 1 per liked rating

# Sparse matrix with one row per user and one column per movie.
user_movie_sparse = csr_matrix(
    (data, (row_indices, col_indices)),
    shape=(user_ids.size, movie_ids.size),
)

print(f"Sparse binary matrix shape: {user_movie_sparse.shape}")
print(f"Non-zero entries (likes): {user_movie_sparse.nnz}")

def titles_from_indices(indices, idx_to_movieId, movieId_to_title):
    """Translate matrix column indices into movie titles.

    Args:
        indices: iterable of column indices.
        idx_to_movieId: mapping from column index to movieId; a missing
            index raises KeyError.
        movieId_to_title: mapping from movieId to title.

    Returns:
        A list of titles in the same order as `indices`. A movieId without
        a known title is rendered as "Unknown(<movieId>)".
    """
    movie_ids = (idx_to_movieId[col] for col in indices)
    return [
        movieId_to_title.get(movie_id, f"Unknown({movie_id})")
        for movie_id in movie_ids
    ]

Sparse binary matrix shape: (162342, 40858)
Non-zero entries (likes): 12452811


In [4]:
def powerset(s):
    """Return every non-empty combination of the items in `s` as a list of tuples.

    Combinations are ordered by size (singletons first, the full collection
    last); within one size they follow the order produced by
    itertools.combinations.
    """
    subsets = []
    for size in range(1, len(s) + 1):
        subsets.extend(combinations(s, size))
    return subsets

def generate_and_print_all_rules_with_sparse_confidence_lift(user_movie_sparse,
                                                            antecedent_movie_indices: List[int],
                                                            consequent_movie_indices: List[int],
                                                            idx_to_movieId: dict,
                                                            movieId_to_title: dict,
                                                            show_support: bool = False):
    """Print confidence and lift for every rule "antecedent subset -> consequent subset".

    Every non-empty subset of `antecedent_movie_indices` is paired with every
    non-empty subset of `consequent_movie_indices`. A pair is reported only if
    the two subsets share no column index and each subset is liked in full by
    at least one user.

    For a reported rule A -> C:
        confidence = (#users who liked all of A and all of C) / (#users who liked all of A)
        lift       = confidence / (fraction of all users who liked all of C)

    Args:
        user_movie_sparse: users x movies matrix; a non-zero entry means the
            user liked the movie. Must support `.shape` and column selection
            via `[:, list_of_columns]` returning an object with `.toarray()`.
        antecedent_movie_indices: column indices used for the left-hand side.
        consequent_movie_indices: column indices used for the right-hand side.
        idx_to_movieId: column index -> movieId (used for display only).
        movieId_to_title: movieId -> title (used for display only).
        show_support: when True, also print the support of the antecedent,
            the consequent and both together for every rule.

    Returns:
        None. All results are written to stdout, followed by the rule count.

    Note:
        The number of subsets grows exponentially with the length of each
        index list, and one boolean mask per subset is held in memory.
    """
    n_users = user_movie_sparse.shape[0]

    def users_liked_all(movies):
        """Boolean mask over users: True where the user liked every movie in `movies`."""
        return user_movie_sparse[:, list(movies)].toarray().all(axis=1)

    def supported_subsets(movie_indices):
        """Return (subset, mask, support_count) for each subset liked by at least one user.

        Each mask is computed once here instead of once per rule, so the dense
        column slice is not rebuilt inside the nested loop below. Subsets with
        zero support are dropped because they cannot form a valid rule
        (confidence or lift would divide by zero).
        """
        kept = []
        for subset in powerset(movie_indices):
            mask = users_liked_all(subset)
            support_count = int(mask.sum())
            if support_count > 0:
                kept.append((subset, mask, support_count))
        return kept

    antecedents = supported_subsets(antecedent_movie_indices)
    consequents = supported_subsets(consequent_movie_indices)

    count = 0
    for ant, ant_users, ant_support_count in antecedents:
        ant_set = set(ant)
        for con, con_users, con_support_count in consequents:
            # A movie may not appear on both sides of the same rule.
            if not ant_set.isdisjoint(con):
                continue

            both_support_count = int((ant_users & con_users).sum())

            confidence = both_support_count / ant_support_count
            # Non-zero: zero-support consequents were filtered out above.
            support_consequent = con_support_count / n_users
            lift = confidence / support_consequent

            ant_titles = titles_from_indices(ant, idx_to_movieId, movieId_to_title)
            con_titles = titles_from_indices(con, idx_to_movieId, movieId_to_title)
            ant_text = ', '.join(ant_titles)
            con_text = ', '.join(con_titles)

            count += 1
            print(f"Rule #{count}: {ant_titles} -> {con_titles}")
            print(f"  Confidence: {confidence:.3f}")
            print(f"  Lift: {lift:.3f}")
            if show_support:
                print(f"  Support (Antecedent): {ant_support_count / n_users:.3f}")
                print(f"  Support (Consequent): {support_consequent:.3f}")
                print(f"  Support (Both): {both_support_count / n_users:.3f}")
            if lift > 1:
                print(f"  → Users who like {ant_text} are {((lift - 1) * 100):.1f}% more likely to also like {con_text}.\n")
            elif lift < 1:
                print(f"  → Users who like {ant_text} are {((1 - lift) * 100):.1f}% *less* likely to also like {con_text}.\n")
            else:
                print(f"  → Liking {ant_text} has no effect on the likelihood of liking {con_text}.\n")

    print(f"Total valid rules generated: {count}")


In [5]:
# Step 1: Movies to analyse, identified by their exact catalogue titles
sample_titles = [
    "Matrix, The (1999)",
    "Star Wars: Episode IV - A New Hope (1977)",
    "Pulp Fiction (1994)",
    "Lord of the Rings: The Fellowship of the Ring, The (2001)"
]

# Step 2: Invert the movieId -> title lookup and resolve each title to its movieId
title_to_movieId = {title: movie_id for movie_id, title in movieId_to_title.items()}

sample_movie_ids = [title_to_movieId[name] for name in sample_titles]

# Step 3: Map each movieId to its column in the user-movie matrix
sample_indices = [movieId_to_idx[movie_id] for movie_id in sample_movie_ids]

# Step 4: First two movies form the antecedent pool, the rest the consequent pool
antecedent_indices, consequent_indices = sample_indices[:2], sample_indices[2:]

# Step 5: Print every rule between subsets of the two pools
generate_and_print_all_rules_with_sparse_confidence_lift(
    user_movie_sparse=user_movie_sparse,
    antecedent_movie_indices=antecedent_indices,
    consequent_movie_indices=consequent_indices,
    idx_to_movieId=idx_to_movieId,
    movieId_to_title=movieId_to_title,
)

Rule #1: ['Matrix, The (1999)'] -> ['Pulp Fiction (1994)']
  Confidence: 0.532
  Lift: 1.384
  → Users who like Matrix, The (1999) are 38.4% more likely to also like Pulp Fiction (1994).

Rule #2: ['Matrix, The (1999)'] -> ['Lord of the Rings: The Fellowship of the Ring, The (2001)']
  Confidence: 0.503
  Lift: 1.968
  → Users who like Matrix, The (1999) are 96.8% more likely to also like Lord of the Rings: The Fellowship of the Ring, The (2001).

Rule #3: ['Matrix, The (1999)'] -> ['Pulp Fiction (1994)', 'Lord of the Rings: The Fellowship of the Ring, The (2001)']
  Confidence: 0.299
  Lift: 2.279
  → Users who like Matrix, The (1999) are 127.9% more likely to also like Pulp Fiction (1994), Lord of the Rings: The Fellowship of the Ring, The (2001).

Rule #4: ['Star Wars: Episode IV - A New Hope (1977)'] -> ['Pulp Fiction (1994)']
  Confidence: 0.496
  Lift: 1.291
  → Users who like Star Wars: Episode IV - A New Hope (1977) are 29.1% more likely to also like Pulp Fiction (1994).

Rule 