In [1]:
# Mount Google Drive at /content/drive/ (Colab-only helper); later cells read the
# dataset and model files through paths that start with drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# https://github.com/Darel13712/ease_rec

import os
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder


class EASE:
    """Closed-form item-item recommender (adapted from the ease_rec project).

    `fit` learns a square item-item weight matrix ``B`` from a ratings table and
    stores it as a sparse matrix; `predict` scores unseen items for a new user
    from a handful of (item_id, rating) pairs.
    """

    def __init__(self):
        # Encoders map raw user/item ids to contiguous 0..n-1 matrix indices.
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        """Fit both encoders on `df` and return the encoded user and item index arrays."""
        users = self.user_enc.fit_transform(df.loc[:, 'user_id'])
        items = self.item_enc.fit_transform(df.loc[:, 'item_id'])
        return users, items

    def save_model(self, filepath):
        """Pickle the two encoders and the weight matrix B to `filepath`."""
        with open(filepath, 'wb') as f:
            pickle.dump((self.user_enc, self.item_enc, self.B), f)

    def load_model(self, filepath):
        """Restore the encoders and the weight matrix B from a file written by `save_model`.

        NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
        """
        with open(filepath, 'rb') as f:
            self.user_enc, self.item_enc, self.B = pickle.load(f)

    def fit(self, recommender, lambda_: float = 0.5, sparsity_coefficient=98, implicit=False, model_path=None):
        """
        Train the model, or load it from `model_path` when that file already exists.

        recommender: object exposing `ratings_path` and `read_ratings_to_df(path)`; the latter must
            return a pandas.DataFrame with columns user_id, item_id, and rating
        lambda_: l2-regularization term added to the diagonal of the item-item Gram matrix
        sparsity_coefficient: percentage of the smallest (absolute) values to zero out in the B matrix
            before converting it to sparse
        implicit: if True, ratings are ignored and taken as 1, else ratings divided by the maximum rating are used
        model_path: Path to save or load the model. If provided, will attempt to load the model;
            if not found, trains a new model and saves it there.
        """

        if model_path is not None and os.path.exists(model_path):
            print("Loading the model")
            self.load_model(model_path)
            print("Model loaded successfully.")
            return

        # Ratings are only read when training is actually required.
        ratings = recommender.read_ratings_to_df(recommender.ratings_path)

        print(f"Starting the fit process with lambda={lambda_}...")
        users, items = self._get_users_and_items(ratings)
        print("Preparing values...")
        values = (
            np.ones(ratings.shape[0])
            if implicit
            else ratings['rating'].to_numpy() / ratings['rating'].max()
        )

        print("Creating matrix X...")
        print(f'Users: {users.shape}. Items: {items.shape}. Values: {values.shape}.')
        # The encoders already know how many distinct labels exist; no need to
        # build Python sets over every interaction just to count them.
        n_users = len(self.user_enc.classes_)
        n_items = len(self.item_enc.classes_)
        X = csr_matrix((values, (users, items)), shape=(n_users, n_items))
        print(f'X shape: {X.shape}.')

        print("Computing matrix G...")
        G = X.T.dot(X).toarray()
        print("Adding lambda")
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        print("Inverting matrix G...")
        P = np.linalg.inv(G)
        # Column j of P is divided by -P[j, j]; the diagonal is then forced to zero
        # so an item never recommends itself.
        B = P / (-np.diag(P))
        B[diagIndices] = 0

        print("Converting the matrix to sparse...")
        # Convert B to sparse by zeroing out the smallest absolute values
        threshold = np.percentile(np.abs(B), sparsity_coefficient)
        B[np.abs(B) < threshold] = 0
        self.B = csr_matrix(B)

        if model_path is not None:
            self.save_model(model_path)
            print(f"Model saved successfully at {model_path}.")

    def _empty_recommendations(self):
        """Return a zero-row frame with the same columns `predict` normally produces."""
        return pd.DataFrame({
            'item_id': np.array([], dtype=self.item_enc.classes_.dtype),
            'score': np.array([], dtype=float),
        })

    def predict(self, new_user_ratings, k=10):
        """Recommend up to `k` items for a user described by (item_id, rating) pairs.

        new_user_ratings: iterable of (item_id, rating); item ids unknown to the model are ignored
        k: maximum number of recommendations; fewer are returned when the user has
            already rated all but a few items

        Returns a pandas.DataFrame with columns item_id and score, sorted by score
        descending. The frame is empty when none of the supplied items are known.
        """
        # Transform movie_id to the internal representation
        movie_ids = [x[0] for x in new_user_ratings]
        ratings = [x[1] for x in new_user_ratings]

        try:
            transformed_movie_ids = self.item_enc.transform(movie_ids)
        except ValueError:
            # Handles unknown movies by ignoring them
            valid_indices = [i for i, movie_id in enumerate(movie_ids) if movie_id in self.item_enc.classes_]
            transformed_movie_ids = self.item_enc.transform([movie_ids[i] for i in valid_indices])
            ratings = [ratings[i] for i in valid_indices]

        # The encoder returns an empty *float* array for empty input, which numpy
        # refuses to use as an index; normalise to an integer index array.
        transformed_movie_ids = np.asarray(transformed_movie_ids, dtype=np.intp)
        if transformed_movie_ids.size == 0:
            return self._empty_recommendations()

        # Create a user vector with ratings for the movies they've rated
        user_vector = np.zeros(self.B.shape[0])
        user_vector[transformed_movie_ids] = ratings

        # user_vector . B computed as B^T . user_vector so the sparse matrix is never
        # densified (toarray() would allocate an items x items dense array per call).
        scores = np.asarray(self.B.T.dot(user_vector)).ravel()

        scores[transformed_movie_ids] = -np.inf   # Remove items user has already rated

        # Never ask for more items than remain unrated, otherwise argpartition
        # raises or already-rated (-inf) items leak into the result.
        n_candidates = scores.size - np.unique(transformed_movie_ids).size
        top_k = min(k, n_candidates)
        if top_k <= 0:
            return self._empty_recommendations()

        # Get the top k items
        recommended_item_indices = np.argpartition(scores, -top_k)[-top_k:]
        recommended_scores = scores[recommended_item_indices]

        # Transform item indices back to original movie IDs
        recommended_movie_ids = self.item_enc.inverse_transform(recommended_item_indices)

        recommendations = pd.DataFrame({
            'item_id': recommended_movie_ids,
            'score': recommended_scores
        }).sort_values(by='score', ascending=False).reset_index(drop=True)

        return recommendations

In [3]:
!pip install Levenshtein

Collecting Levenshtein
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein)
  Downloading rapidfuzz-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.25.1 rapidfuzz-3.9.4


In [4]:
import Levenshtein
import pandas as pd

class Recommender:
    """Movie recommender that wraps an EASE model and a movie-metadata table.

    It resolves free-text movie names to item ids, delegates scoring to EASE,
    and enriches the resulting recommendations with title/popularity/imdbId.
    """

    def __init__(self, ratings_path, movie_names_path):
        self.ease = EASE()
        self.movie_names = self.read_movies_names_to_df(movie_names_path)
        self.ratings_path = ratings_path        # Ratings will be needed only if the model isn't already trained, so only store the path

    def read_ratings_to_df(self, file_path):
        """Read a JSON-lines ratings file and keep the user_id, item_id and rating columns."""
        df = pd.read_json(file_path, lines=True, orient='records')
        return df[['user_id', 'item_id', 'rating']]

    def read_movies_names_to_df(self, file_path):
        """Read a JSON-lines movie file; imdbId is read as a string."""
        df = pd.read_json(file_path, lines=True, orient='records', dtype={'imdbId': str})
        return df[['item_id', 'title', 'popularity', 'imdbId']]

    def distance(self, movie_name, database_name):
        """ Calculates the case-insensitive Levenshtein distance between two strings. Some movies in the database have multiple titles, written as "Title (a.k.a. Other Title)". In this case, the distance is calculated for each title and the minimum distance is returned.
        Args:
            movie_name (str): The name of the first (user) movie
            database_name (str): The name of the second (database) movie
        Returns:
            int: The Levenshtein distance between the two strings
        """

        movie_name_low = movie_name.lower()
        database_name_low = database_name.lower()
        aka_expression = " (a.k.a. "

        if aka_expression not in database_name_low:
            return Levenshtein.distance(movie_name_low, database_name_low)

        primary_title, *alternative_titles = database_name_low.split(aka_expression)
        candidates = [primary_title]
        for alternative in alternative_titles:
            # Each alternative still carries the ")" that closed its "(a.k.a. ..." group;
            # drop it so an exact alternative-title match scores 0 instead of 1.
            candidates.append(alternative[:-1] if alternative.endswith(")") else alternative)
        return min(Levenshtein.distance(movie_name_low, candidate) for candidate in candidates)


    def get_movie_id(self, movies, movie_name, max_distance_threshold = 2):
        """Find the item id of the movie whose title is closest to `movie_name`.

        Args:
            movies (pandas.DataFrame): Table with at least item_id and title columns
            movie_name (str): Title typed by the user (without release year)
            max_distance_threshold (int, optional): Largest edit distance still accepted as a match
        Returns:
            The matching item_id, or None when the best distance exceeds the threshold
        """
        best_distance = float('inf')
        best_match = None
        best_title = None

        # Iterating the two column arrays directly avoids the per-row Series that
        # iterrows() builds, and lets us remember the matched title as we go.
        for item_id, title in zip(movies['item_id'].to_numpy(), movies['title'].to_numpy()):
            database_name = title[:-7]   # Release year is always the last 7 characters
            distance = self.distance(movie_name, database_name)

            if distance < best_distance:
                best_distance = distance
                best_match = item_id
                best_title = title
                if distance == 0:
                    break   # Cannot do better than an exact match

        if best_distance > max_distance_threshold:
            print(f'Could not find a match for "{movie_name}"')
            return None

        print(f'Looking for "{movie_name}". Found "{best_title}"')
        return best_match

    def convert_movie_names_to_ids(self, user_ratings):
        """ Converts a list of (movie_name, rating) tuples into a list of (movie_id, rating) tuples. If a movie name is not found, it will be ignored. """
        converted_ratings = []
        for movie, rating in user_ratings:
            movie_id = self.get_movie_id(self.movie_names, movie)
            if movie_id is not None:
                converted_ratings.append((movie_id, rating))

        return converted_ratings



    def fit(self, model_path=None, lambda_=0.5, sparsity_coefficient=98, implicit=False):
        """Train (or load from `model_path`) the underlying EASE model; arguments are forwarded unchanged."""
        self.ease.fit(self, lambda_=lambda_, sparsity_coefficient=sparsity_coefficient, implicit=implicit, model_path=model_path)

    def merge_with_movie_names(self, predictions):
        """Inner-join predictions with the movie table on item_id and sort by score, best first."""
        return predictions.merge(self.movie_names, on='item_id').sort_values('score', ascending=False)

    def predict_most_popular(self, k=10):
        """Return up to k movies drawn at random from the first 10*k rows of the movie table.

        Assumes self.movie_names is already sorted by popularity, so the first
        10*k rows are the most popular movies.
        """
        pool = self.movie_names[:10*k]
        # Clamp the sample size so a short movie table yields fewer rows instead of raising.
        sample_size = min(k, len(pool))
        recommendations = pool.sample(sample_size)[['item_id', 'title', 'popularity', 'imdbId']].sort_values('popularity', ascending=False)

        # Score is the popularity divided by the largest popularity in the sample
        recommendations = recommendations.assign(
            score=recommendations['popularity'] / recommendations['popularity'].max()
        )
        recommendations = recommendations[['item_id', 'score', 'title', 'popularity', 'imdbId']]    # Reorder columns

        return recommendations


    def predict(self, user_ratings, k=10):
        """ Predicts k movies for the user based on the user's ratings.
        Args:
            user_ratings (list): List of tuples with item_id and rating [(item_id, rating), ...]
            k (int, optional): Number of recommendations to return. Defaults to 10.
        Returns:
            pandas.DataFrame: DataFrame with columns item_id, score, title, popularity, imdbId
        """

        no_ratings_provided = len(user_ratings) == 0
        if no_ratings_provided:
            return self.predict_most_popular(k)

        predictions =  self.ease.predict(user_ratings, k)
        return self.merge_with_movie_names(predictions)

In [5]:
# Load the held-out ratings (JSON-lines file on the mounted Drive) used by the
# evaluation helpers below, and preview the first rows.
test_data = pd.read_json("drive/MyDrive/bc/taggenome/ratings_test.json", lines=True)
test_data.head()

Unnamed: 0,item_id,user_id,rating
0,1,211495,4.0
1,6,211495,3.0
2,11,211495,3.0
3,12,211495,1.0
4,16,211495,3.0


In [6]:
# Distinct user ids present in the test ratings; the evaluation loop iterates over these.
unique_users = test_data['user_id'].unique()
len(unique_users)

1000

In [7]:
def calculate_precision_recall(recommender, user_id, k=20, ratings=None, random_state=None):
    """Precision@k and recall for one user using a random k-item profile split.

    k of the user's ratings are sampled and given to the recommender as the
    known profile; the user's remaining ratings form the hold-out set. A
    recommendation counts as a hit when its item_id is in the hold-out set.

    Args:
        recommender: object with predict(user_ratings, k) returning a DataFrame with an item_id column
        user_id: id of the user to evaluate
        k (int, optional): size of the sampled profile and number of recommendations requested
        ratings (pandas.DataFrame, optional): table with user_id, item_id and rating columns;
            defaults to the notebook-level `test_data`
        random_state (optional): forwarded to DataFrame.sample to make the split reproducible
    Returns:
        tuple: (precision, recall); recall is 0.0 when the user has no held-out ratings
    """
    if ratings is None:
        ratings = test_data

    user_ratings = ratings[ratings['user_id'] == user_id][['item_id', 'rating']]
    sample_user_ratings = user_ratings.sample(k, random_state=random_state)
    target_user_ratings = user_ratings.drop(sample_user_ratings.index)

    predictions = recommender.predict(sample_user_ratings.values.tolist(), k=k)
    # Count the hits once and reuse the number for both metrics.
    hits = int(predictions['item_id'].isin(target_user_ratings['item_id']).sum())

    precision = hits / k
    n_targets = len(target_user_ratings)
    # A user with exactly k ratings has nothing held out; avoid dividing by zero.
    recall = hits / n_targets if n_targets > 0 else 0.0

    return precision, recall

def f1_score(precision, recall):
    """Harmonic mean of precision and recall; 0 when their sum is not positive."""
    total = precision + recall
    if total > 0:
        return 2 * (precision * recall) / total
    return 0

def calculate_avg_precision_recall_f1(recommender, user_ids, k=20):
    """Average precision, recall and F1 over the given users.

    Each user is scored with calculate_precision_recall, the per-user F1 is
    derived from that pair, and the three metrics are averaged with np.mean.
    Returns (mean_precision, mean_recall, mean_f1).
    """
    per_user_metrics = []
    for uid in user_ids:
        user_precision, user_recall = calculate_precision_recall(recommender, uid, k)
        per_user_metrics.append((user_precision, user_recall, f1_score(user_precision, user_recall)))

    precision_values = [metrics[0] for metrics in per_user_metrics]
    recall_values = [metrics[1] for metrics in per_user_metrics]
    f1_values = [metrics[2] for metrics in per_user_metrics]
    return np.mean(precision_values), np.mean(recall_values), np.mean(f1_values)

In [9]:
# Regularisation strengths from the first, smaller-valued sweep (kept for reference; not iterated below).
lambdas = [0.01, 0.05, 0.25, 0.75, 1.0, 1.5, 2.0, 2.5, 3.0, 5.0, 10.0, 7.5, 15, 20, 30, 40, 50, 60, 70, 80, 100, 150, 200, 350, 500, 750, 1000, 1500]
# Larger lambdas were found to make the model more accurate
big_lambdas = [2500, 5000, 7500, 10000, 12500, 15000, 20000]

# Metric value per lambda, filled in by the sweep below.
precisions, recalls, f1_scores = {}, {}, {}

for lambda_ in big_lambdas:
    # A fresh recommender per lambda; fit() trains the model or loads it from model_path if it already exists.
    recommender = Recommender(
        ratings_path="drive/MyDrive/bc/taggenome/ratings.json",
        movie_names_path="drive/MyDrive/bc/taggenome/movies.json",
    )
    recommender.fit(
        model_path=f"drive/MyDrive/bc/models/model_{lambda_}_sparse.pkl",
        lambda_=lambda_,
        sparsity_coefficient=98,
    )

    precision, recall, f1 = calculate_avg_precision_recall_f1(recommender, unique_users)
    recalls[lambda_], precisions[lambda_], f1_scores[lambda_] = recall, precision, f1

    print(f'F1 score for lambda={lambda_}: {f1}')
    print(f'Recall for lambda={lambda_}: {recall}')


Starting the fit process with lambda=2500...
Preparing values...
Creating matrix X...
Users: (9573225,). Items: (9573225,). Values: (9573225,).
X shape: (42368, 17807).
Computing matrix G...
Adding lambda
Inverting matrix G...
Converting the matrix to sparse...
Model saved successfully at drive/MyDrive/bc/models/model_2500_sparse.pkl.
F1 score for lambda=2500: 0.17997152383014264
Recall for lambda=2500: 0.1479852137305716
Starting the fit process with lambda=5000...
Preparing values...
Creating matrix X...
Users: (9573225,). Items: (9573225,). Values: (9573225,).
X shape: (42368, 17807).
Computing matrix G...
Adding lambda
Inverting matrix G...
Converting the matrix to sparse...
Model saved successfully at drive/MyDrive/bc/models/model_5000_sparse.pkl.
F1 score for lambda=5000: 0.17912409843352997
Recall for lambda=5000: 0.14618823913821916
Starting the fit process with lambda=7500...
Preparing values...
Creating matrix X...
Users: (9573225,). Items: (9573225,). Values: (9573225,).
X s

In [10]:
def _rank_by_value(metric_by_lambda):
    """Return a new dict holding the same items ordered from highest to lowest value."""
    ordered_pairs = sorted(metric_by_lambda.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ordered_pairs)


# Precision per lambda, best first
sorted_precisions = _rank_by_value(precisions)
print("Precisions")
print(sorted_precisions)

# Recall per lambda, best first
sorted_recalls = _rank_by_value(recalls)
print("Recalls")
print(sorted_recalls)

# F1 per lambda, best first
sorted_f1_scores = _rank_by_value(f1_scores)
print("F1 scores")
print(sorted_f1_scores)

Precisions
{15000: 0.6077, 20000: 0.6076, 10000: 0.6028, 12500: 0.60205, 7500: 0.5908500000000001, 2500: 0.58265, 5000: 0.58185}
Recalls
{10000: 0.14946828462037404, 12500: 0.14922533670559926, 20000: 0.14816607399278423, 15000: 0.14801922489765967, 2500: 0.1479852137305716, 5000: 0.14618823913821916, 7500: 0.14450072699690514}
F1 scores
{10000: 0.18422925818979674, 12500: 0.18326891013840516, 15000: 0.1832375231123634, 20000: 0.18279367220773962, 2500: 0.17997152383014264, 7500: 0.17989031518762902, 5000: 0.17912409843352997}
