<a href="https://colab.research.google.com/github/andrejdaskalov/rec-sys-evaluation-paper/blob/main/RecPaperSystemEvaluations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
DATASET_DIR = "/gdrive/MyDrive/movielens_small/ml-latest-small/"


def get_dataset_path(filename: str) -> str:
    """Return the location of ``filename`` inside ``DATASET_DIR``.

    The name is appended verbatim (no separator is inserted), so
    ``DATASET_DIR`` has to end with a path separator.
    """
    return "".join((DATASET_DIR, filename))

The following function standardizes the names and output paths of model runs.

In [None]:
import datetime
RESULT_PATH = "results/"


def get_results_path(model_name: str, dataset_type: str) -> str:
    """Build the output directory path for one evaluation run.

    The directory is named ``<model_name>_<dataset_type>_<timestamp>`` and
    lives under ``RESULT_PATH`` inside the dataset directory (resolved with
    ``get_dataset_path``). The timestamp is the current local time, so every
    call yields a new path. The returned string ends with ``/``.
    """
    # isoformat() is str(datetime) with the separating space replaced by "T".
    timestamp = datetime.datetime.now().isoformat()
    run_name = f"{model_name}_{dataset_type}_{timestamp}"
    return get_dataset_path(f"{RESULT_PATH}{run_name}/")

# Load and import

In [None]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import os

In [None]:
movie_data = pd.read_csv(get_dataset_path("enriched.csv"), na_filter=False)
movie_data.head()

Unnamed: 0,movieId,title,genres,tag,imdbId,cast,writers,producers,runtime,boxOfficeBudget,plotOutline,year
0,1,Toy Story,adventure|animation|children|comedy|fantasy,pixar|pixar|fun,114709,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter|Pete Docter|Andrew Stanton|Joe R...,Bonnie Arnold|Ed Catmull|Ralph Guggenheim|Stev...,81.0,30000000.0,A little boy named Andy loves to be in his roo...,1995
1,2,Jumanji,adventure|children|fantasy,fantasy|magic board game|Robin Williams|game,113497,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh|Greg Taylor|Jim Strain|Greg...,Robert W. Cort|Ted Field|Larry Franco|Scott Kr...,104.0,50000000.0,"Jumanji, one of the most unique--and dangerous...",1995
2,3,Grumpier Old Men,comedy|romance,moldy|old,113228,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson|Mark Steven Johnson,Richard C. Berman|John Davis|George Folsey Jr....,101.0,25000000.0,Things don't seem to change much in Wabasha Co...,1995
3,4,Waiting to Exhale,comedy|drama|romance,,114885,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan|Terry McMillan|Ron Bass,Ron Bass|Caron K|Terry McMillan|Deborah Schind...,124.0,16000000.0,This story based on the best selling novel by ...,1995
4,5,Father of the Bride Part II,comedy,pregnancy|remake,113041,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett|Frances Goodrich|Nancy Meyers|C...,Carol Baum|Bruce A. Block|Julie B. Crane|Jim C...,106.0,30000000.0,"In this sequel to ""Father of the Bride"", Georg...",1995


In [None]:
USER_COLUMN_NAME = 'userId'
CONTENT_COLUMN_NAME = 'movieId'
RATING_COLUMN_NAME = 'rating'

In [None]:
# These columns pack several values into one "|"-delimited string; replace
# every cell with the list of its values.
for _column in ('genres', 'tag', 'cast', 'writers', 'producers'):
    movie_data[_column] = movie_data[_column].apply(lambda cell: cell.split("|"))
movie_data.head()

Unnamed: 0,movieId,title,genres,tag,imdbId,cast,writers,producers,runtime,boxOfficeBudget,plotOutline,year
0,1,Toy Story,"[adventure, animation, children, comedy, fantasy]","[pixar, pixar, fun]",114709,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Pete Docter, Andrew Stanton, J...","[Bonnie Arnold, Ed Catmull, Ralph Guggenheim, ...",81.0,30000000.0,A little boy named Andy loves to be in his roo...,1995
1,2,Jumanji,"[adventure, children, fantasy]","[fantasy, magic board game, Robin Williams, game]",113497,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Jonathan Hensleigh, Greg Taylor, Jim Strain, ...","[Robert W. Cort, Ted Field, Larry Franco, Scot...",104.0,50000000.0,"Jumanji, one of the most unique--and dangerous...",1995
2,3,Grumpier Old Men,"[comedy, romance]","[moldy, old]",113228,"[Walter Matthau, Jack Lemmon, Sophia Loren, An...","[Mark Steven Johnson, Mark Steven Johnson]","[Richard C. Berman, John Davis, George Folsey ...",101.0,25000000.0,Things don't seem to change much in Wabasha Co...,1995
3,4,Waiting to Exhale,"[comedy, drama, romance]",[N/A],114885,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Terry McMillan, Terry McMillan, Ron Bass]","[Ron Bass, Caron K, Terry McMillan, Deborah Sc...",124.0,16000000.0,This story based on the best selling novel by ...,1995
4,5,Father of the Bride Part II,[comedy],"[pregnancy, remake]",113041,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Albert Hackett, Frances Goodrich, Nancy Meyer...","[Carol Baum, Bruce A. Block, Julie B. Crane, J...",106.0,30000000.0,"In this sequel to ""Father of the Bride"", Georg...",1995


In [None]:
movie_data_indexed = movie_data.set_index(CONTENT_COLUMN_NAME)

# Datasets
> NOTE: run only one of the dataset preparation groups below; otherwise they overwrite each other.

# Prepare Normal Interactions dataset

In [None]:
DATASET_TYPE = "full"

In [None]:
movie_ratings = pd.read_csv(get_dataset_path("ratings.csv"))
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
len(movie_ratings)

100836

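Split each user's ratings chronologically by timestamp: the most recent `INTERACTIONS_TEST_SIZE` fraction of every user's ratings is held out as the test set, and the earlier ratings form the training set.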

In [None]:
INTERACTIONS_TEST_SIZE = 0.2
from math import ceil, floor

# Chronological per-user split: after sorting by timestamp, the last
# floor(INTERACTIONS_TEST_SIZE * n) ratings of each user are held out for
# testing and every earlier rating is used for training.
movie_ratings.sort_values('timestamp', inplace=True)


def _test_row_count(user_ratings):
    """Return how many of a user's most recent ratings go to the test set."""
    return floor(INTERACTIONS_TEST_SIZE * len(user_ratings))


_ratings_by_user = movie_ratings.groupby('userId', group_keys=False)
# The train size is the complement of the test size rather than an
# independently rounded ceil((1 - t) * n). Float products such as
# 0.29 * 100 == 28.999999999999996 could otherwise make the two sizes not add
# up to n, duplicating or dropping a row. For the 0.2 used here the result is
# unchanged.
interactions_train_df = _ratings_by_user.apply(lambda x: x.head(len(x) - _test_row_count(x)))
interactions_test_df = _ratings_by_user.apply(lambda x: x.tail(_test_row_count(x)))
interactions_train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
43,1,804,4.0,964980499
73,1,1210,5.0,964980499
171,1,2628,4.0,964980523
183,1,2826,4.0,964980523
120,1,2018,5.0,964980523


In [None]:
interactions_train_df_indexed = interactions_train_df.set_index('userId')
interactions_test_df_indexed = interactions_test_df.set_index('userId')
movie_ratings_indexed = movie_ratings.set_index('userId')

# Prepare Reduced interactions dataset

In [None]:
DATASET_TYPE = "reduced"

In [None]:
interactions_train_df_indexed = pd.read_csv(get_dataset_path("reduced_interactions.csv"), index_col="userId")
interactions_test_df_indexed = pd.read_csv(get_dataset_path("reduced_interactions_heldout.csv"), index_col="userId")
movie_ratings_indexed = pd.concat([interactions_train_df_indexed, interactions_test_df_indexed])
movie_ratings_indexed.head()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
429,595,5.0,828124615
429,588,5.0,828124615
429,590,5.0,828124615
107,140,4.0,829322340
107,105,4.0,829322340


In [None]:
interactions_train_df = interactions_train_df_indexed.reset_index()
interactions_test_df = interactions_test_df_indexed.reset_index()
movie_ratings = movie_ratings_indexed.reset_index()

# Prepare Mixed Interactions dataset

In [None]:
DATASET_TYPE = "mixed"

In [None]:
interactions_train_df_indexed = pd.read_csv(get_dataset_path("mixed_interactions.csv"), index_col="userId")
interactions_test_df_indexed = pd.read_csv(get_dataset_path("mixed_interactions_heldout.csv"), index_col="userId")
movie_ratings_indexed = pd.concat([interactions_train_df_indexed, interactions_test_df_indexed])
movie_ratings_indexed.head()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
448,69640,3.0,1289145810
352,90866,4.5,1493674691
599,1623,2.5,1498516912
474,2583,4.0,1081177421
465,2278,4.0,959896203


In [None]:
interactions_train_df = interactions_train_df_indexed.reset_index()
interactions_test_df = interactions_test_df_indexed.reset_index()
movie_ratings = movie_ratings_indexed.reset_index()

# Evaluation

In [None]:
def get_items_interacted(person_id, interactions_df):
    """Return the set of item ids that ``person_id`` has interacted with.

    Args:
        person_id: user id to look up in the index of ``interactions_df``.
        interactions_df: interactions frame indexed by user id that contains
            the ``CONTENT_COLUMN_NAME`` column.

    Returns:
        A ``set`` of item ids.

    Raises:
        KeyError: if ``person_id`` is not present in the index.
    """
    # Selecting with a list label always yields a Series of the item column,
    # even when the user has a single interaction. A scalar label would return
    # that single row as a Series spanning all columns, upcasting the id to
    # float when the frame also holds float columns.
    interacted_items = interactions_df.loc[[person_id], CONTENT_COLUMN_NAME]
    return set(interacted_items)

In [None]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:
    """Offline top-N evaluation of a recommender on held-out interactions.

    All interaction frames are expected to be indexed by user id and to
    contain the ``CONTENT_COLUMN_NAME`` column. An evaluated model must expose
    ``get_model_name()`` and ``recommend_items(person_id, items_to_ignore=...,
    topn=..., verbose=...)``; the latter must return a DataFrame ranked
    best-first that has a ``CONTENT_COLUMN_NAME`` column.
    """

    def __init__(self, interactions_df_full, interactions_df_train, interactions_df_test, content_df):
        self.interactions_df_full = interactions_df_full
        self.interactions_df_train = interactions_df_train
        self.interactions_df_test = interactions_df_test
        self.content_df = content_df

    def _get_non_interacted_items(self, person_id):
        """Return a list of every known item the user never interacted with."""
        interacted_items = get_items_interacted(person_id, self.interactions_df_full)
        all_items = set(self.interactions_df_full[CONTENT_COLUMN_NAME])
        return list(all_items - interacted_items)

    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42, candidates=None):
        """Draw a reproducible random sample of items the user has not seen.

        Args:
            person_id: user whose interactions are excluded.
            sample_size: number of items to draw; capped at the number of
                available candidates.
            seed: value used to re-seed the global ``random`` generator, so the
                same seed always yields the same sample.
            candidates: optional precomputed list of non-interacted items;
                computed from ``interactions_df_full`` when omitted.

        Returns:
            A ``set`` of sampled item ids.
        """
        if candidates is None:
            candidates = self._get_non_interacted_items(person_id)

        # The global generator is re-seeded on purpose (as before) so results
        # stay identical to earlier runs of this evaluation.
        random.seed(seed)
        sample_size = min(sample_size, len(candidates))
        return set(random.sample(candidates, sample_size))

    def _verify_hit_top_n(self, item_id, recommended_items, topn):
        """Return ``(hit, index)`` for ``item_id`` in ``recommended_items``.

        ``index`` is the position of the first occurrence of ``item_id`` (or
        ``-1`` when absent) and ``hit`` is 1 when that position is below
        ``topn``, else 0.
        """
        index = next((i for i, c in enumerate(recommended_items) if c == item_id), -1)
        hit = int(0 <= index < topn)
        return hit, index

    def _calculate_recall(self, person_interacted_items_testset, person_recs_df, person_id):
        """Compute recall@5 and recall@10 for one user.

        Each test item is ranked against a random sample of
        ``EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS`` items the user never
        interacted with (assumed to be non-relevant); a hit is counted when the
        test item lands in the top N of that reduced ranking.
        """
        hits_at_5_count = 0
        hits_at_10_count = 0

        interacted_items_count_testset = len(person_interacted_items_testset)
        # The candidate pool only depends on the user, so build it once
        # instead of once per test item.
        candidates = self._get_non_interacted_items(person_id)
        recommended_ids = person_recs_df[CONTENT_COLUMN_NAME]

        for item_id in person_interacted_items_testset:
            # Seeding with the item id keeps each item's sample reproducible.
            non_interacted_items_sample = self.get_not_interacted_items_sample(
                person_id,
                sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,
                seed=int(item_id) % (2**32),
                candidates=candidates)

            # Keep only recommendations that are either the test item or one
            # of the sampled non-interacted items.
            items_to_filter_recs = non_interacted_items_sample | {item_id}
            valid_recs = recommended_ids[recommended_ids.isin(items_to_filter_recs)].values

            hit_at_5, _ = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, _ = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        # Recall is the share of test items ranked among the top N when mixed
        # with the non-relevant sample. Guard against an empty test set.
        if interacted_items_count_testset:
            recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
            recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)
        else:
            recall_at_5 = 0.0
            recall_at_10 = 0.0

        recall_metrics = {'hits@5_count':hits_at_5_count,
                          'hits@10_count':hits_at_10_count,
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return recall_metrics

    def _calculate_precision(self, person_interacted_items_testset, person_recs_df : pd.DataFrame, person_id):
        """Compute precision@5 and precision@10 for one user.

        A hit is a recommendation among the first 5 (or 10) rows of
        ``person_recs_df`` that is part of the user's test items. The
        denominators are fixed at 5 and 10 even if fewer rows are recommended.
        """
        recommended_ids = person_recs_df[CONTENT_COLUMN_NAME]

        hits_at_5_count = int(recommended_ids.head(5).isin(person_interacted_items_testset).sum())
        hits_at_10_count = int(recommended_ids.head(10).isin(person_interacted_items_testset).sum())

        precision_at_5 = hits_at_5_count / 5.0
        precision_at_10 = hits_at_10_count / 10.0

        precision_metrics = {'hits@5_count':hits_at_5_count,
                          'hits@10_count':hits_at_10_count,
                          'precision@5': precision_at_5,
                          'precision@10': precision_at_10}
        return precision_metrics

    # Mean Reciprocal Rank (MRR) measures how early a ranking shows its first
    # relevant item: MRR = 1/U * sum_{u=1..U} (1 / rank_u), where U is the
    # number of users and rank_u is the 1-based position of the first relevant
    # item for user u. This method computes the per-user reciprocal rank; the
    # mean over all users is taken in evaluate_model.
    def _calculate_rr_user(self, person_interacted_items_testset: set[int], person_recs_df: pd.DataFrame, person_id: int):
        """Return ``{"rr": 1 / rank}`` of the first relevant recommendation.

        ``rank`` is the 1-based *position* of the first row of
        ``person_recs_df`` whose item is in the user's test items. ``{"rr": -1}``
        is returned when no recommended item is relevant.
        """
        is_relevant = person_recs_df[CONTENT_COLUMN_NAME].isin(person_interacted_items_testset).to_numpy()
        relevant_positions = np.flatnonzero(is_relevant)

        if relevant_positions.size == 0:
            return {"rr": -1 }

        # Use the row position, not the index label: labels only equal ranks
        # when the recommendations frame happens to carry a fresh RangeIndex.
        rr = 1 / (int(relevant_positions[0]) + 1)
        return {
            "rr": rr
        }

    def evaluate_model_for_user(self, model, person_id):
        """Compute recall, precision and reciprocal-rank metrics for one user.

        The model is asked for up to 1000 recommendations, excluding the items
        the user interacted with in the training set.

        Returns:
            ``{'recall': ..., 'precision': ..., 'rr': ...}`` with one metrics
            dict per entry.
        """
        # Items the user interacted with in the test set. ``.loc`` returns a
        # column Series for several rows and a scalar for a single row.
        interacted_values_testset = self.interactions_df_test.loc[person_id]
        test_items = interacted_values_testset[CONTENT_COLUMN_NAME]
        if isinstance(test_items, pd.Series):
            person_interacted_items_testset = set(test_items)
        else:
            person_interacted_items_testset = {int(test_items)}

        # Ranked recommendation list from the model for this user.
        person_recs_df = model.recommend_items(person_id,
                                               items_to_ignore=get_items_interacted(person_id,
                                                                                    self.interactions_df_train),
                                               topn=1000, verbose=True)

        recall_metrics = self._calculate_recall(person_interacted_items_testset, person_recs_df, person_id)
        precision_metrics = self._calculate_precision(person_interacted_items_testset, person_recs_df, person_id)
        rr_metric = self._calculate_rr_user(person_interacted_items_testset, person_recs_df, person_id)

        person_metrics = {
            'recall': recall_metrics,
            'precision': precision_metrics,
            'rr': rr_metric,
        }

        return person_metrics

    def evaluate_model(self, model) -> tuple[pd.DataFrame, dict]:
        """Evaluate ``model`` on every user of the test set.

        Returns:
            ``(global_metrics_df, dataframes)`` where ``global_metrics_df`` is
            a one-row frame with recall@5/10, precision@5/10 and MRR, and
            ``dataframes`` maps ``'recall'``, ``'precision'`` and ``'rr'`` to
            the per-user detail frames.

        Raises:
            ValueError: if the test set contains no users.
        """
        print(f'Evaluating {model.get_model_name()} recommendation model...')

        person_ids = list(self.interactions_df_test.index.unique().values)
        if not person_ids:
            raise ValueError('interactions_df_test contains no users to evaluate')

        people_metrics_recall = []
        people_metrics_precision = []
        people_metrics_rr = []
        for person_id in person_ids:
            person_metrics = self.evaluate_model_for_user(model, person_id)
            person_metrics_recall = person_metrics['recall']
            person_metrics_precision = person_metrics['precision']
            person_metrics_rr = person_metrics['rr']

            person_metrics_recall['_person_id'] = person_id
            people_metrics_recall.append(person_metrics_recall)

            person_metrics_precision['_person_id'] = person_id
            people_metrics_precision.append(person_metrics_precision)

            person_metrics_rr['_person_id'] = person_id
            people_metrics_rr.append(person_metrics_rr)
        # Report the number of users, not the last zero-based loop index.
        print('%d users processed' % len(person_ids))

        detailed_results_recall_df = pd.DataFrame(people_metrics_recall) \
                            .sort_values('interacted_count', ascending=False)

        detailed_results_precision_df = pd.DataFrame(people_metrics_precision)

        detailed_results_rr_df = pd.DataFrame(people_metrics_rr)

        total_interacted = float(detailed_results_recall_df['interacted_count'].sum())
        global_recall_at_5 = detailed_results_recall_df['hits@5_count'].sum() / total_interacted
        global_recall_at_10 = detailed_results_recall_df['hits@10_count'].sum() / total_interacted

        evaluated_users = len(detailed_results_precision_df)
        global_precision_at_5 = detailed_results_precision_df['hits@5_count'].sum() / float(evaluated_users * 5)
        global_precision_at_10 = detailed_results_precision_df['hits@10_count'].sum() / float(evaluated_users * 10)

        # Users without any relevant recommendation are stored as rr == -1;
        # they add nothing to the sum but still count in the denominator.
        sum_rr = detailed_results_rr_df[detailed_results_rr_df['rr'] != -1]['rr'].sum()
        num_users = len(person_ids)
        mean_reciprocal_rank = (1 / int(num_users) ) * sum_rr

        global_metrics = {'modelName': [model.get_model_name()],
                          'recall@5': [global_recall_at_5],
                          'recall@10': [global_recall_at_10],
                          'precision@5': [global_precision_at_5],
                          'precision@10': [global_precision_at_10],
                          'mrr': [mean_reciprocal_rank]
                          }
        global_metrics_df = pd.DataFrame(global_metrics)
        dataframes = {'recall': detailed_results_recall_df,
                      'precision': detailed_results_precision_df,
                      'rr': detailed_results_rr_df
        }
        return global_metrics_df, dataframes

    def print_results(self, global_metrics, dataframes):
        """Print the global metrics and the first 50 rows of each detail frame."""
        print('\nGlobal metrics:\n%s' % global_metrics)
        for df in dataframes.values():
            print(df.head(50))

    def save_results(self, global_metrics: pd.DataFrame, dataframes: dict[str, pd.DataFrame], model_name, dataset_type):
        """Write the metrics as CSV files into a new per-run results directory.

        The directory comes from ``get_results_path(model_name, dataset_type)``
        and is created if missing; it receives ``global_metrics.csv`` plus one
        ``<key>.csv`` per entry of ``dataframes``.
        """
        file_path = get_results_path(model_name, dataset_type)
        os.makedirs(file_path, exist_ok=True)
        global_metrics.to_csv(file_path + "global_metrics.csv")
        for key, df in dataframes.items():
            df.to_csv(file_path+key+".csv")


model_evaluator = ModelEvaluator(movie_ratings_indexed, interactions_train_df_indexed, interactions_test_df_indexed, movie_data)

# Popularity based (as a baseline)

In [None]:
class PopularityRecommender:
    """Non-personalised baseline that recommends the most-rated items.

    Items are ranked by the number of ratings they received ('views'), with
    the mean rating as tie-breaker.
    """

    MODEL_NAME = 'Popularity'

    def __init__(self, popularity_df, items_df=None):
        """
        Args:
            popularity_df: interactions frame containing the
                ``CONTENT_COLUMN_NAME`` and ``RATING_COLUMN_NAME`` columns.
            items_df: optional item metadata frame with a
                ``CONTENT_COLUMN_NAME`` column; required for verbose
                recommendations.
        """
        # Keep the raw interactions separately so compute_most_popular() can
        # be called again after popularity_df was replaced by the ranking.
        self._interactions_df = popularity_df
        self.popularity_df = popularity_df
        self.items_df = items_df
        self.compute_most_popular()

    def compute_most_popular(self):
        """Rebuild ``self.popularity_df`` as the popularity ranking.

        The result has one row per item with its mean rating and its number
        of ratings ('views'), sorted by views and then rating, both
        descending. It is computed from the interactions passed to the
        constructor rather than from a notebook-level global.
        """
        interactions = self._interactions_df
        movie_avg_ratings = interactions.groupby(CONTENT_COLUMN_NAME, as_index=False)[RATING_COLUMN_NAME].mean()
        movie_avg_ratings.columns = [CONTENT_COLUMN_NAME, RATING_COLUMN_NAME]
        movie_viewcount = interactions.groupby([CONTENT_COLUMN_NAME], as_index=False).size()
        movie_viewcount.columns = [CONTENT_COLUMN_NAME, 'views']
        self.popularity_df = movie_avg_ratings.merge(
            movie_viewcount,
            how='inner',
            on=CONTENT_COLUMN_NAME,
        ).sort_values(['views', RATING_COLUMN_NAME], ascending=[False, False])

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def get_most_popular(self):
        """Return the popularity ranking frame."""
        return self.popularity_df

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Recommend the most popular items the user has not seen yet.

        Args:
            user_id: unused; the ranking is the same for every user.
            items_to_ignore: item ids to exclude (``None`` means none).
            topn: maximum number of recommendations.
            verbose: when true, merge the item metadata from ``items_df``
                into the result.

        Returns:
            A DataFrame ranked best-first with a 'recStrength' column holding
            each item's views relative to the top recommendation.

        Raises:
            ValueError: if ``verbose`` is true but no ``items_df`` was given.
        """
        if verbose and self.items_df is None:
            raise ValueError('"items_df" is required in verbose mode')
        if items_to_ignore is None:
            items_to_ignore = []

        not_ignored = ~self.popularity_df[CONTENT_COLUMN_NAME].isin(items_to_ignore)
        # Copy so that adding a column does not write into a slice of the
        # shared ranking frame.
        recommendations_df = self.popularity_df[not_ignored].head(topn).copy()

        views = recommendations_df['views']
        # With nothing left to recommend there is no top item; any non-zero
        # divisor then yields an empty 'recStrength' column.
        most_views = int(views.iloc[0]) if len(views) else 1
        recommendations_df['recStrength'] = views / most_views

        if verbose:
            recommendations_df = recommendations_df.merge(self.items_df, how='left',
                                                          on=CONTENT_COLUMN_NAME)

        return recommendations_df

popularity_model = PopularityRecommender(movie_ratings.copy(), movie_data)

In [None]:
popularity = popularity_model.get_most_popular()
popularity.head(50)

Unnamed: 0,movieId,rating,views
314,356,4.164134,329
277,318,4.429022,317
257,296,4.197068,307
510,593,4.16129,279
1938,2571,4.192446,278
224,260,4.231076,251
418,480,3.75,238
97,110,4.031646,237
507,589,3.970982,224
461,527,4.225,220


In [None]:
pop_global_metrics, pop_results_dataframes = model_evaluator.evaluate_model(popularity_model)
model_evaluator.print_results(pop_global_metrics, pop_results_dataframes)
model_evaluator.save_results(pop_global_metrics, pop_results_dataframes, popularity_model.MODEL_NAME, DATASET_TYPE)


Evaluating Popularity recommendation model...
609 users processed

Global metrics:
    modelName  recall@5  recall@10  precision@5  precision@10      mrr
0  Popularity  0.484991   0.586914     0.460656      0.417377  0.63199
     hits@5_count  hits@10_count  interacted_count  recall@5  recall@10  \
170           903            904              2695  0.335065   0.335436   
564           760            786              2475  0.307071   0.317576   
195           545            615              2105  0.258907   0.292162   
228           527            593              1861  0.283181   0.318646   
321           573            616              1343  0.426657   0.458675   
543           353            441              1299  0.271747   0.339492   
307           725            741              1257  0.576770   0.589499   
556           473            534              1215  0.389300   0.439506   
322           329            390              1112  0.295863   0.350719   
206           452        

# Content-based (with semantics) with word2vec


In [None]:
!pip install --upgrade numpy gensim

Collecting numpy
  Using cached numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)


In [None]:
from gensim.models import Word2Vec, Phrases, KeyedVectors
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
import spacy
import regex as re

warnings.filterwarnings(action='ignore')

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
def clean_item(item):
    """Normalise a piece of text into a list of lowercase lemma tokens.

    The text is run through the spaCy pipeline ``nlp``; stop words are
    dropped and the remaining tokens are replaced by their lemmas. Every run
    of characters other than ASCII letters and apostrophes then becomes a
    single space, and the result is lowercased, stripped and split on spaces.

    Note: when nothing survives the filtering, the result is ``['']`` (a list
    with one empty string), not an empty list.
    """
    lemmas = []
    for token in nlp(item):
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    letters_only = re.sub("[^A-Za-z']+", ' ', ' '.join(lemmas))
    return letters_only.lower().strip().split(' ')


In [None]:
movie_data['cleaned_title'] = movie_data['title'].map(lambda x: clean_item(x))
movie_data.head()

Unnamed: 0,movieId,title,genres,tag,imdbId,cast,writers,producers,runtime,boxOfficeBudget,plotOutline,year,cleaned_title
0,1,Toy Story,"[adventure, animation, children, comedy, fantasy]","[pixar, pixar, fun]",114709,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Pete Docter, Andrew Stanton, J...","[Bonnie Arnold, Ed Catmull, Ralph Guggenheim, ...",81.0,30000000.0,A little boy named Andy loves to be in his roo...,1995,"[toy, story]"
1,2,Jumanji,"[adventure, children, fantasy]","[fantasy, magic board game, Robin Williams, game]",113497,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Jonathan Hensleigh, Greg Taylor, Jim Strain, ...","[Robert W. Cort, Ted Field, Larry Franco, Scot...",104.0,50000000.0,"Jumanji, one of the most unique--and dangerous...",1995,[jumanji]
2,3,Grumpier Old Men,"[comedy, romance]","[moldy, old]",113228,"[Walter Matthau, Jack Lemmon, Sophia Loren, An...","[Mark Steven Johnson, Mark Steven Johnson]","[Richard C. Berman, John Davis, George Folsey ...",101.0,25000000.0,Things don't seem to change much in Wabasha Co...,1995,"[grumpier, old, men]"
3,4,Waiting to Exhale,"[comedy, drama, romance]",[N/A],114885,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Terry McMillan, Terry McMillan, Ron Bass]","[Ron Bass, Caron K, Terry McMillan, Deborah Sc...",124.0,16000000.0,This story based on the best selling novel by ...,1995,"[wait, exhale]"
4,5,Father of the Bride Part II,[comedy],"[pregnancy, remake]",113041,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Albert Hackett, Frances Goodrich, Nancy Meyer...","[Carol Baum, Bruce A. Block, Julie B. Crane, J...",106.0,30000000.0,"In this sequel to ""Father of the Bride"", Georg...",1995,"[father, bride, ii]"


In [None]:
movie_data['cleaned_tags'] = movie_data['tag'].map(lambda x: clean_item(' '.join(x)))
movie_data.head()


Unnamed: 0,movieId,title,genres,tag,imdbId,cast,writers,producers,runtime,boxOfficeBudget,plotOutline,year,cleaned_title,cleaned_tags
0,1,Toy Story,"[adventure, animation, children, comedy, fantasy]","[pixar, pixar, fun]",114709,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Pete Docter, Andrew Stanton, J...","[Bonnie Arnold, Ed Catmull, Ralph Guggenheim, ...",81.0,30000000.0,A little boy named Andy loves to be in his roo...,1995,"[toy, story]","[pixar, pixar, fun]"
1,2,Jumanji,"[adventure, children, fantasy]","[fantasy, magic board game, Robin Williams, game]",113497,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Jonathan Hensleigh, Greg Taylor, Jim Strain, ...","[Robert W. Cort, Ted Field, Larry Franco, Scot...",104.0,50000000.0,"Jumanji, one of the most unique--and dangerous...",1995,[jumanji],"[fantasy, magic, board, game, robin, williams,..."
2,3,Grumpier Old Men,"[comedy, romance]","[moldy, old]",113228,"[Walter Matthau, Jack Lemmon, Sophia Loren, An...","[Mark Steven Johnson, Mark Steven Johnson]","[Richard C. Berman, John Davis, George Folsey ...",101.0,25000000.0,Things don't seem to change much in Wabasha Co...,1995,"[grumpier, old, men]","[moldy, old]"
3,4,Waiting to Exhale,"[comedy, drama, romance]",[N/A],114885,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Terry McMillan, Terry McMillan, Ron Bass]","[Ron Bass, Caron K, Terry McMillan, Deborah Sc...",124.0,16000000.0,This story based on the best selling novel by ...,1995,"[wait, exhale]",[n]
4,5,Father of the Bride Part II,[comedy],"[pregnancy, remake]",113041,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Albert Hackett, Frances Goodrich, Nancy Meyer...","[Carol Baum, Bruce A. Block, Julie B. Crane, J...",106.0,30000000.0,"In this sequel to ""Father of the Bride"", Georg...",1995,"[father, bride, ii]","[pregnancy, remake]"


In [None]:
vocabulary = movie_data['cleaned_title'].tolist() + movie_data['cleaned_tags'].tolist()
vocabulary

[['toy', 'story'],
 ['jumanji'],
 ['grumpier', 'old', 'men'],
 ['wait', 'exhale'],
 ['father', 'bride', 'ii'],
 ['heat'],
 ['sabrina'],
 ['tom', 'huck'],
 ['sudden', 'death'],
 ['goldeneye'],
 ['american', 'president'],
 ['dracula', 'dead', 'love'],
 ['balto'],
 ['nixon'],
 ['cutthroat', 'island'],
 ['casino'],
 ['sense', 'sensibility'],
 ['room'],
 ['ace', 'ventura', 'nature', 'call'],
 ['money', 'train'],
 ['shorty'],
 ['copycat'],
 ['assassin'],
 ['powder'],
 ['leave', 'las', 'vegas'],
 ['othello'],
 [''],
 ['persuasion'],
 ['city', 'lost', 'children', 'cit', 'des', 'enfant', 'perdus', 'la'],
 ['shanghai', 'triad', 'yao', 'yao', 'yao', 'dao', 'waipo', 'qiao'],
 ['dangerous', 'mind'],
 ['monkeys', 'a', 'k', 'a', 'monkeys'],
 ['babe'],
 ['dead', 'man', 'walk'],
 ['take'],
 ['clueless'],
 ['cry', 'beloved', 'country'],
 ['richard', 'iii'],
 ['dead', 'president'],
 ['restoration'],
 ['mortal', 'kombat'],
 ['die'],
 ['american', 'quilt'],
 ['seven', 'a', 'k', 'a', 'se', 'en'],
 ['pocahon

In [None]:
phrase_transformer = Phrases(vocabulary, min_count=30)
phrase_transformer.vocab

{'toy': 6,
 'story': 105,
 'toy_story': 3,
 'jumanji': 2,
 'grumpier': 1,
 'old': 17,
 'grumpier_old': 1,
 'men': 30,
 'old_men': 5,
 'wait': 8,
 'exhale': 1,
 'wait_exhale': 1,
 'father': 22,
 'bride': 18,
 'father_bride': 3,
 'ii': 114,
 'bride_ii': 1,
 'heat': 8,
 'sabrina': 2,
 'tom': 20,
 'huck': 2,
 'tom_huck': 1,
 'sudden': 2,
 'death': 65,
 'sudden_death': 1,
 'goldeneye': 1,
 'american': 65,
 'president': 11,
 'american_president': 1,
 'dracula': 12,
 'dead': 72,
 'dracula_dead': 1,
 'love': 122,
 'dead_love': 1,
 'balto': 1,
 'nixon': 3,
 'cutthroat': 1,
 'island': 20,
 'cutthroat_island': 1,
 'casino': 5,
 'sense': 6,
 'sensibility': 1,
 'sense_sensibility': 1,
 'room': 20,
 'ace': 5,
 'ventura': 3,
 'ace_ventura': 2,
 'nature': 6,
 'ventura_nature': 1,
 'call': 9,
 'nature_call': 2,
 'money': 21,
 'train': 25,
 'money_train': 1,
 'shorty': 1,
 'copycat': 1,
 'assassin': 13,
 'powder': 2,
 'leave': 8,
 'las': 6,
 'leave_las': 1,
 'vegas': 9,
 'las_vegas': 4,
 'othello': 4,
 

In [None]:
word2vec = Word2Vec(phrase_transformer[vocabulary], vector_size=100, window=5, min_count=1, workers=4, epochs=30)

In [None]:
# word2vec = KeyedVectors.load_word2vec_format("/gdrive/MyDrive/GoogleNews-vectors-negative300.bin.gz", binary=True)

In [None]:
genres = sorted(set(genre for sublist in movie_data['genres'] for genre in sublist))
genres

['N/A',
 'action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'film-noir',
 'horror',
 'imax',
 'musical',
 'mystery',
 'romance',
 'sci-fi',
 'thriller',
 'war',
 'western']

In [None]:
# One-hot encode the genres: one row per movie (same index as movie_data),
# one column per genre, 1 where the movie has that genre.
genre_matrix = pd.DataFrame(0, index=movie_data.index, columns=genres)
# Pair each genre list with its actual index label. An enumerate() position
# only matches the label when movie_data carries a default 0..n-1 index.
for row_label, genre_list in movie_data['genres'].items():
    genre_matrix.loc[row_label, genre_list] = 1

genre_matrix


Unnamed: 0,N/A,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,film-noir,horror,imax,musical,mystery,romance,sci-fi,thriller,war,western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9738,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9739,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9740,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# TF-IDF features over the raw movie titles: word uni-, bi- and tri-grams with
# English stop words removed. Terms are dropped when they occur in fewer than
# 0.3% or in more than 50% of the titles, and at most 1000 terms are kept.
tfidf = TfidfVectorizer(analyzer='word',
                        ngram_range=(1, 3),
                        min_df=0.003,
                        max_df=0.5,
                        max_features=1000,
                        stop_words='english'
                     )

# Sparse matrix with one row per row of movie_data.
title_vectors = tfidf.fit_transform(movie_data['title'])


In [None]:
combined_features = pd.concat([pd.DataFrame(title_vectors.toarray()), genre_matrix], axis=1)
combined_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,film-noir,horror,imax,musical,mystery,romance,sci-fi,thriller,war,western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# convert to sparse matrix
movie_features_sparse = csr_matrix(combined_features.to_numpy())
movie_features_sparse


<9742x73 sparse matrix of type '<class 'numpy.float64'>'
	with 24842 stored elements in Compressed Sparse Row format>

In [None]:
movie_data_indexed = movie_data.set_index(CONTENT_COLUMN_NAME)
movie_data_indexed.head()

Unnamed: 0_level_0,title,genres,tag,imdbId,cast,writers,producers,runtime,boxOfficeBudget,plotOutline,year,cleaned_title,cleaned_tags
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,Toy Story,"[adventure, animation, children, comedy, fantasy]","[pixar, pixar, fun]",114709,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Pete Docter, Andrew Stanton, J...","[Bonnie Arnold, Ed Catmull, Ralph Guggenheim, ...",81.0,30000000.0,A little boy named Andy loves to be in his roo...,1995,"[toy, story]","[pixar, pixar, fun]"
2,Jumanji,"[adventure, children, fantasy]","[fantasy, magic board game, Robin Williams, game]",113497,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Jonathan Hensleigh, Greg Taylor, Jim Strain, ...","[Robert W. Cort, Ted Field, Larry Franco, Scot...",104.0,50000000.0,"Jumanji, one of the most unique--and dangerous...",1995,[jumanji],"[fantasy, magic, board, game, robin, williams,..."
3,Grumpier Old Men,"[comedy, romance]","[moldy, old]",113228,"[Walter Matthau, Jack Lemmon, Sophia Loren, An...","[Mark Steven Johnson, Mark Steven Johnson]","[Richard C. Berman, John Davis, George Folsey ...",101.0,25000000.0,Things don't seem to change much in Wabasha Co...,1995,"[grumpier, old, men]","[moldy, old]"
4,Waiting to Exhale,"[comedy, drama, romance]",[N/A],114885,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Terry McMillan, Terry McMillan, Ron Bass]","[Ron Bass, Caron K, Terry McMillan, Deborah Sc...",124.0,16000000.0,This story based on the best selling novel by ...,1995,"[wait, exhale]",[n]
5,Father of the Bride Part II,[comedy],"[pregnancy, remake]",113041,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Albert Hackett, Frances Goodrich, Nancy Meyer...","[Carol Baum, Bruce A. Block, Julie B. Crane, J...",106.0,30000000.0,"In this sequel to ""Father of the Bride"", Georg...",1995,"[father, bride, ii]","[pregnancy, remake]"


In [None]:
def get_item_profile(item_id):
    """Return the one-row slice of ``movie_features_sparse`` for ``item_id``.

    The row position is the position of ``item_id`` in the module-level
    ``item_ids`` list, so ``list.index`` raises ``ValueError`` for an id that
    is not in that list.
    NOTE(review): presumably the rows of ``movie_features_sparse`` are in the
    same order as ``item_ids`` -- verify where the matrix is built.
    """
    row = item_ids.index(item_id)
    return movie_features_sparse[row:row + 1]

def get_item_profiles(ids):
    """Stack the feature rows of every id in ``ids`` into one sparse matrix.

    Rows of the result follow the order of ``ids``.
    """
    return scipy.sparse.vstack([get_item_profile(item_id) for item_id in ids])

def build_users_profile(person_id, interactions_indexed_df):
    """Build the L2-normalized content profile of one user.

    The profile is the average of the feature rows of the items the user
    interacted with, weighted by the interaction rating.

    Args:
        person_id: user id; must be present in the index of
            ``interactions_indexed_df``.
        interactions_indexed_df: interactions indexed by user id, holding the
            ``CONTENT_COLUMN_NAME`` and ``RATING_COLUMN_NAME`` columns.

    Returns:
        A dense 2-D array with a single row (the normalized profile).
    """
    # A list key keeps the result a DataFrame even when the user has a single
    # interaction; a scalar key would collapse that row into a Series and the
    # column lookups below would return scalars instead of columns.
    interactions_person_df = interactions_indexed_df.loc[[person_id]]
    user_item_profiles = get_item_profiles(interactions_person_df[CONTENT_COLUMN_NAME])

    # Column vector of ratings so it broadcasts across each item's feature row.
    user_item_strengths = np.array(interactions_person_df[RATING_COLUMN_NAME]).reshape(-1, 1)
    # Weighted average of item profiles by the interaction strength.
    weighted_sum = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0)
    user_item_strengths_weighted_avg = weighted_sum / np.sum(user_item_strengths)
    # np.sum over a sparse product yields np.matrix; normalize() wants an ndarray.
    return sklearn.preprocessing.normalize(np.asarray(user_item_strengths_weighted_avg))

def build_user_keywords(person_id, interactions_indexed_df):
    """Collect the title and tag tokens of a user's first 100 interactions.

    ``build_users_profiles`` passes the interactions sorted by timestamp in
    descending order, so the first 100 rows are the most recent ones.

    Args:
        person_id: user id; must be present in the index of
            ``interactions_indexed_df``.
        interactions_indexed_df: interactions indexed by user id.

    Returns:
        A flat list: all ``cleaned_title`` tokens followed by all
        ``cleaned_tags`` tokens of the selected movies.
    """
    # A list key always yields a DataFrame; a scalar key collapses a
    # single-interaction user into a Series, which breaks the lookups below.
    interactions_person_df = interactions_indexed_df.loc[[person_id]].head(100)
    interactions_ids = interactions_person_df[CONTENT_COLUMN_NAME].tolist()

    # One lookup serves both token columns.
    interacted_movies = movie_data_indexed.loc[interactions_ids]
    keyword_list = [token for tokens in interacted_movies['cleaned_title'] for token in tokens]
    tag_list = [token for tokens in interacted_movies['cleaned_tags'] for token in tokens]

    return keyword_list + tag_list

def build_users_profiles():
    """Build the feature profile and the keyword list of every training user.

    Only training interactions whose item id appears in ``movie_data`` are
    used. They are indexed by user id and sorted by ``timestamp``, newest
    first.

    Returns:
        ``(user_profiles, user_keywords)``: two dicts keyed by user id.
    """
    has_metadata = interactions_train_df[CONTENT_COLUMN_NAME].isin(movie_data[CONTENT_COLUMN_NAME])
    interactions_indexed_df = (
        interactions_train_df[has_metadata]
        .set_index(USER_COLUMN_NAME)
        .sort_values('timestamp', ascending=False)  # newest interactions first
    )

    person_ids = interactions_indexed_df.index.unique()
    user_profiles = {pid: build_users_profile(pid, interactions_indexed_df) for pid in person_ids}
    user_keywords = {pid: build_user_keywords(pid, interactions_indexed_df) for pid in person_ids}
    return user_profiles, user_keywords

In [None]:
# Ordered list of item ids; get_item_profile() uses a position in this list
# as the row index into movie_features_sparse.
item_ids = movie_data[CONTENT_COLUMN_NAME].tolist()

In [None]:
# Per-user feature profiles and per-user keyword lists, both keyed by user id.
user_profiles, user_keywords = build_users_profiles()

Generate the word2vec feature matrix (one mean word vector per movie) used for the NLP cosine similarity

In [None]:
# Empty (0, 100) matrix that the per-movie mean word vectors are stacked onto.
# NOTE(review): the width 100 presumably matches the word2vec vector size -- confirm.
nlp_feature_matrix = np.empty((0,100))

In [None]:
# Mean word2vec vector of each movie's title + tag tokens, one row per movie.
# The vectors are collected first and stacked once: calling np.vstack inside
# the loop re-copies the whole matrix on every iteration, and appending to the
# existing matrix duplicates every row when this cell is executed twice.
mean_vectors = [
    word2vec.wv.get_mean_vector(title_tokens + tag_tokens)
    for title_tokens, tag_tokens in zip(movie_data['cleaned_title'], movie_data['cleaned_tags'])
]
# The zero-row slice keeps the width and dtype of the pre-allocated matrix
# while discarding rows left over from an earlier run of this cell.
nlp_feature_matrix = np.vstack([nlp_feature_matrix[:0]] + mean_vectors)

nlp_feature_matrix

array([[-0.14693396,  0.09856401,  0.07261126, ..., -0.08653352,
        -0.04004115,  0.06121834],
       [-0.13134831,  0.10795542,  0.08560921, ..., -0.08825599,
        -0.0436101 ,  0.03664498],
       [-0.12745723,  0.10856656,  0.08219623, ..., -0.10108852,
        -0.0366875 ,  0.03874938],
       ...,
       [-0.13752475,  0.10546469,  0.09300745, ..., -0.09588847,
        -0.03554516,  0.03660046],
       [-0.1345007 ,  0.11921982,  0.08311919, ..., -0.09798641,
        -0.03299344,  0.03611523],
       [-0.1331241 ,  0.10729369,  0.08978736, ..., -0.09427726,
        -0.04238031,  0.03957961]])

In [None]:
class ContentBasedRecommenderNLP:
    """Content-based recommender blending two cosine-similarity scores.

    An item's score is ``feat_strength`` times the similarity between the
    user's feature profile and the item's feature row, plus ``nlp_strength``
    times the similarity between the mean word vector of the user's keywords
    and the item's mean word vector.

    The model reads the module-level ``user_profiles``, ``user_keywords``,
    ``movie_features_sparse``, ``nlp_feature_matrix`` and ``word2vec`` objects.
    """

    MODEL_NAME = 'Content-Based'

    def __init__(self, items_df=None, feat_strength=0.9, nlp_strength=1.0):
        """Store the item metadata and the blend weights.

        Args:
            items_df: item metadata with a ``title`` column; only required
                for ``recommend_items(..., verbose=True)``.
            feat_strength: weight of the feature-profile similarity.
            nlp_strength: weight of the word-vector similarity.
        """
        self.item_ids = item_ids
        self.items_df = items_df
        self.feat_strength = feat_strength
        self.nlp_strength = nlp_strength

    def get_model_name(self):
        """Return the display name of the model."""
        return self.MODEL_NAME

    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        """Return the ``topn`` best ``(item_id, score)`` pairs, best first."""
        # Cosine similarity between the user profile and all item profiles.
        cosine_similarities_features = cosine_similarity(user_profiles[person_id], movie_features_sparse)[0]

        # Cosine similarity between the user's mean keyword vector and every
        # item's mean word vector.
        user_vector = word2vec.wv.get_mean_vector(user_keywords[person_id])
        cosine_similarities_nlp = word2vec.wv.cosine_similarities(user_vector, nlp_feature_matrix)

        cosine_similarities = np.add(cosine_similarities_features * self.feat_strength,
                                     cosine_similarities_nlp * self.nlp_strength)

        # argsort is ascending, so the last ``topn`` positions are the best ones.
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        # Order the selected items by descending score.
        return sorted(((self.item_ids[i], cosine_similarities[i]) for i in similar_indices),
                      key=lambda pair: -pair[1])

    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        """Recommend up to ``topn`` items for ``user_id``.

        Candidates are the helper's default top-1000 items; entries listed in
        ``items_to_ignore`` are dropped before the ``topn`` cut.
        ``items_to_ignore`` is only read, never mutated.

        Returns:
            DataFrame with the item id column and ``recStrength``; in verbose
            mode ``recStrength``, the item id column and ``title``.

        Raises:
            Exception: in verbose mode when no ``items_df`` was supplied.
        """
        similar_items = self._get_similar_items_to_user_profile(user_id)
        # Ignore items the user has already interacted with.
        similar_items_filtered = [pair for pair in similar_items if pair[0] not in items_to_ignore]

        recommendations_df = pd.DataFrame(similar_items_filtered, columns=[CONTENT_COLUMN_NAME, 'recStrength']) \
                                    .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left',
                                                          left_on = CONTENT_COLUMN_NAME,
                                                          right_on = CONTENT_COLUMN_NAME)[['recStrength', CONTENT_COLUMN_NAME, 'title']]

        return recommendations_df

# movie_data is only used by the model in verbose mode, to attach the title.
content_based_recommender_nlp_model = ContentBasedRecommenderNLP(movie_data)

In [None]:
# Evaluate the content-based NLP model, then print and save the returned
# metrics (model_evaluator and DATASET_TYPE are defined outside this section).
cb5_global_metrics, cb5_results_dataframes = model_evaluator.evaluate_model(content_based_recommender_nlp_model)
model_evaluator.print_results(cb5_global_metrics, cb5_results_dataframes)
model_evaluator.save_results(cb5_global_metrics, cb5_results_dataframes, content_based_recommender_nlp_model.get_model_name(), DATASET_TYPE)


Evaluating Content-Based recommendation model...
609 users processed

Global metrics:
       modelName  recall@5  recall@10  precision@5  precision@10       mrr
0  Content-Based  0.090904   0.145395     0.066557      0.058689  0.165339
     hits@5_count  hits@10_count  interacted_count  recall@5  recall@10  \
170           172            295              2695  0.063822   0.109462   
564           180            305              2475  0.072727   0.123232   
195           233            302              2105  0.110689   0.143468   
228           276            383              1861  0.148307   0.205803   
321            62            114              1343  0.046165   0.084885   
543           138            222              1299  0.106236   0.170901   
307            77            123              1257  0.061257   0.097852   
556           269            375              1215  0.221399   0.308642   
322            92            167              1112  0.082734   0.150180   
206           

# Collaborative filtering

In [None]:
#Creating a sparse pivot table with users in rows and items in columns
# Dense pivot table with users in rows and items in columns, holding the
# rating of each (user, item) pair; pairs without a rating are filled with 0.
users_items_pivot_matrix_df = interactions_train_df.pivot(index=USER_COLUMN_NAME,
                                                          columns=CONTENT_COLUMN_NAME,
                                                          values=RATING_COLUMN_NAME).fillna(0)

users_items_pivot_matrix_df.head(10)

movieId,1,2,3,5,7,10,11,16,17,19,...,131724,134130,134853,137857,152081,160718,164179,164909,168250,177765
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# convert to sparse matrix
# Convert the dense pivot table to a CSR sparse matrix, the input of the
# truncated SVD.
user_movies_rating_sparse = csr_matrix(users_items_pivot_matrix_df.to_numpy())
user_movies_rating_sparse

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1830 stored elements and shape (610, 683)>

In [None]:
#The number of factors to factor the user-item matrix.
# Number of latent factors (singular values) kept by the truncated SVD.
NUMBER_OF_FACTORS_MF = 10 #change when more users are used
# Factorize the sparse user-item matrix into U, the singular values and Vt.
#U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
U, sigma, Vt = svds(user_movies_rating_sparse, k = NUMBER_OF_FACTORS_MF)

In [None]:
# svds returns the singular values as a 1-D vector; expand it into a diagonal
# matrix for the reconstruction product. The ndim guard makes the cell safe to
# re-run: np.diag applied to the already-diagonal 2-D matrix would extract the
# diagonal back into a vector.
if sigma.ndim == 1:
    sigma = np.diag(sigma)
sigma

array([[19.37897953,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        , 20.88803891,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        , 21.61714961,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , 22.71099716,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        , 24.05545424,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        24.77509023,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , 25.1726034 ,  0.        

In [None]:
# Low-rank reconstruction U * sigma * Vt of the user-item matrix: one row per
# user and one column per item, as in the pivot table it was factorized from.
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)
all_user_predicted_ratings

array([[-4.58635778e-02, -4.11692865e-02,  1.67762857e-04, ...,
        -1.61756877e-03, -7.39264967e-19, -9.85686623e-19],
       [-2.30056853e-02, -2.07788562e-03, -3.48370715e-05, ...,
        -9.22267655e-04, -3.32654160e-20, -4.43538881e-20],
       [ 1.19225722e-05,  1.40161706e-05,  6.42037962e-09, ...,
         6.24538385e-07,  6.39039599e-23,  8.52052799e-23],
       ...,
       [ 5.89944121e-05,  3.23820097e-05,  2.00125136e-09, ...,
         2.52407077e-06,  6.16305679e-23,  8.21740905e-23],
       [-3.73679753e-03,  1.72267717e-05, -8.92007234e-07, ...,
         8.20085025e-05, -4.63909729e-20, -6.18546306e-20],
       [ 8.05422177e-02, -2.84132064e-03,  7.65228009e-06, ...,
         4.08868827e-03,  2.93954123e-19,  3.91938830e-19]])

In [None]:
# User ids in pivot-table row order; used to label the rows of the predictions.
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [None]:
# normalize
all_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (all_user_predicted_ratings.max() - all_user_predicted_ratings.min())

# convert back to df
cf_preds_df = pd.DataFrame(all_user_predicted_ratings_norm, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.195507,0.198553,0.201621,0.20196,0.201017,0.201244,0.201656,0.202769,0.201619,0.233975,...,0.199163,0.201665,0.201618,0.202751,0.201847,0.20169,0.201619,0.201627,0.201121,0.212353
2,0.196133,0.201342,0.201621,0.201631,0.201611,0.201527,0.201621,0.201598,0.201619,0.201767,...,0.200854,0.201648,0.201619,0.201519,0.201588,0.201655,0.201619,0.201624,0.201622,0.201241
3,0.201642,0.201615,0.201619,0.201619,0.201619,0.20162,0.201619,0.20162,0.201619,0.201605,...,0.201609,0.201619,0.201619,0.20162,0.201619,0.201619,0.201619,0.201619,0.201619,0.20162
5,0.201675,0.20162,0.201619,0.201619,0.201619,0.20162,0.201619,0.20162,0.201619,0.201609,...,0.201625,0.201619,0.201619,0.20162,0.201619,0.201619,0.201619,0.201619,0.201619,0.201612
7,0.201619,0.201619,0.201619,0.201619,0.201619,0.201619,0.201619,0.201619,0.201619,0.201619,...,0.201619,0.201619,0.201619,0.201619,0.201619,0.201619,0.201619,0.201619,0.201619,0.201619
10,0.201567,0.201576,0.201619,0.201623,0.201608,0.20162,0.201619,0.201643,0.201619,0.201455,...,0.201688,0.201617,0.201619,0.201639,0.201617,0.201619,0.201619,0.201619,0.201609,0.201438
11,0.201049,0.200662,0.20162,0.201703,0.201731,0.201317,0.201638,0.201198,0.201619,0.218053,...,0.193876,0.201572,0.201619,0.201515,0.20154,0.201661,0.201619,0.201622,0.201732,0.206658
16,0.201646,0.201612,0.201619,0.201619,0.201618,0.20162,0.201619,0.201624,0.201619,0.201654,...,0.201557,0.20162,0.201619,0.201623,0.201618,0.201619,0.201619,0.201619,0.201618,0.201617
17,0.201513,0.20157,0.201619,0.201623,0.201616,0.201613,0.20162,0.201623,0.201619,0.201982,...,0.201529,0.201619,0.201619,0.201625,0.201621,0.20162,0.201619,0.201619,0.201617,0.20173
19,0.201562,0.201601,0.201619,0.20162,0.201621,0.201615,0.201619,0.201612,0.201619,0.201716,...,0.201584,0.201618,0.201619,0.201616,0.201619,0.20162,0.201619,0.201619,0.201621,0.20169


In [None]:
class CFRecommender:
    """Collaborative-filtering recommender backed by a prediction table.

    ``cf_predictions_df`` holds one column per user id and one row per item;
    each cell is the predicted strength of that item for that user.
    """

    MODEL_NAME = 'Collaborative Filtering'

    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of the model."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        """Return the ``topn`` highest-scored items not in ``items_to_ignore``.

        In verbose mode the result is joined with ``items_df`` and reduced to
        ``recStrength``, the item id column, ``title`` and ``genres``.

        Raises:
            Exception: in verbose mode when no ``items_df`` was supplied.
        """
        # The user's column of predictions, best first, as a two-column frame.
        user_scores = self.cf_predictions_df[user_id].sort_values(ascending=False)
        ranked_df = user_scores.reset_index().rename(columns={user_id: 'recStrength'})

        # Keep only the movies the user has not seen yet.
        is_new_item = ~ranked_df[CONTENT_COLUMN_NAME].isin(items_to_ignore)
        recommendations_df = ranked_df[is_new_item].sort_values('recStrength', ascending = False).head(topn)

        if not verbose:
            return recommendations_df

        if self.items_df is None:
            raise Exception('"items_df" is required in verbose mode')

        with_metadata = recommendations_df.merge(self.items_df, how = 'left',
                                                 left_on = CONTENT_COLUMN_NAME,
                                                 right_on = CONTENT_COLUMN_NAME)
        return with_metadata[['recStrength', CONTENT_COLUMN_NAME, 'title', 'genres']]

# movie_data supplies the title and genres columns for verbose output.
cf_recommender_model = CFRecommender(cf_preds_df, movie_data)

In [None]:
# Evaluate the collaborative-filtering model, then print and save the
# returned metrics.
cf_global_metrics, cf_results_dataframes = model_evaluator.evaluate_model(cf_recommender_model)
model_evaluator.print_results(cf_global_metrics, cf_results_dataframes)
model_evaluator.save_results(cf_global_metrics, cf_results_dataframes, cf_recommender_model.get_model_name(), DATASET_TYPE)

Evaluating Collaborative Filtering recommendation model...
609 users processed

Global metrics:
                 modelName  recall@5  recall@10  precision@5  precision@10  \
0  Collaborative Filtering   0.28067   0.363453     0.406557      0.344918   

        mrr  
0  0.590379  
     hits@5_count  hits@10_count  interacted_count  recall@5  recall@10  \
170           544            549              2695  0.201855   0.203711   
564           470            481              2475  0.189899   0.194343   
195           401            422              2105  0.190499   0.200475   
228           279            329              1861  0.149919   0.176787   
321           301            339              1343  0.224125   0.252420   
543           148            181              1299  0.113934   0.139338   
307           386            413              1257  0.307080   0.328560   
556           253            290              1215  0.208230   0.238683   
322           227            265            

# Hybrid recommender (combined)

In [None]:
class HybridRecommender:
    """Weighted ensemble of a content-based and a collaborative-filtering model.

    The hybrid score of an item is
    ``cf_ensemble_weight * recStrengthCF + cb_ensemble_weight * recStrengthCB``;
    an item missing from one model's candidate list scores 0 for that model.

    ``pop_rec_model`` and ``pop_ensemble_weight`` are stored but do not
    contribute to the score.
    """

    MODEL_NAME = 'Hybrid'

    def __init__(self, cb_rec_model, cf_rec_model, pop_rec_model, items_df, cb_ensemble_weight=1.0, cf_ensemble_weight=1.0, pop_ensemble_weight=1.0):
        self.cb_rec_model = cb_rec_model
        self.cf_rec_model = cf_rec_model
        self.pop_rec_model = pop_rec_model
        self.cb_ensemble_weight = cb_ensemble_weight
        self.cf_ensemble_weight = cf_ensemble_weight
        self.pop_ensemble_weight = pop_ensemble_weight
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of the model."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        """Return the ``topn`` items with the highest hybrid score.

        Returns:
            Non-verbose: the merged candidate frame (item id column,
            ``recStrengthCB``, ``recStrengthCF``, ``recStrengthHybrid``).
            Verbose: ``recStrengthHybrid``, the item id column and ``title``.

        Raises:
            Exception: in verbose mode when ``items_df`` is ``None``.
        """
        # The sub-models are always queried non-verbose: only their scores are
        # needed here. Forwarding ``verbose`` made each of them merge the item
        # metadata, leaving title_x/title_y/genres columns in the combined frame.
        # Top-1000 content-based recommendations.
        cb_recs_df = self.cb_rec_model.recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=False,
                                                           topn=1000).rename(columns={'recStrength': 'recStrengthCB'})

        # Top-1000 collaborative-filtering recommendations.
        cf_recs_df = self.cf_rec_model.recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=False,
                                                           topn=1000).rename(columns={'recStrength': 'recStrengthCF'})

        # Union of both candidate lists; a score missing on one side becomes 0.
        recs_df = cb_recs_df.merge(cf_recs_df,
                                   how = 'outer',
                                   left_on = CONTENT_COLUMN_NAME,
                                   right_on = CONTENT_COLUMN_NAME).fillna(0.0)

        # Weighted sum of the CF and CB scores.
        recs_df['recStrengthHybrid'] = (
            recs_df['recStrengthCF'] * self.cf_ensemble_weight
            + recs_df['recStrengthCB'] * self.cb_ensemble_weight
        )

        # Best hybrid score first.
        recommendations_df = recs_df.sort_values('recStrengthHybrid', ascending=False).head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            scored_df = recommendations_df[['recStrengthHybrid', CONTENT_COLUMN_NAME]]
            recommendations_df = scored_df.merge(self.items_df, how = 'left',
                                                 left_on = CONTENT_COLUMN_NAME,
                                                 right_on = CONTENT_COLUMN_NAME)[['recStrengthHybrid', CONTENT_COLUMN_NAME, 'title']]

        return recommendations_df

# CF scores are weighted 10x higher than content-based scores.
# NOTE: popularity_model and pop_ensemble_weight are stored by the class but
# not used in its score.
hybrid_recommender_model = HybridRecommender(content_based_recommender_nlp_model, cf_recommender_model, popularity_model, movie_data,
                                             cb_ensemble_weight=10.0, cf_ensemble_weight=100.0, pop_ensemble_weight=40.0)

In [None]:
# Evaluate the hybrid model, then print and save the returned metrics.
hybrid_global_metrics, hybrid_results_dataframes = model_evaluator.evaluate_model(hybrid_recommender_model)
model_evaluator.print_results(hybrid_global_metrics, hybrid_results_dataframes)
model_evaluator.save_results(hybrid_global_metrics, hybrid_results_dataframes, hybrid_recommender_model.get_model_name(), DATASET_TYPE)

Evaluating Hybrid recommendation model...
609 users processed

Global metrics:
  modelName  recall@5  recall@10  precision@5  precision@10       mrr
0    Hybrid  0.438666   0.578636     0.106557      0.088852  0.244342
     hits@5_count  hits@10_count  interacted_count  recall@5  recall@10  \
413            49             91               539  0.090909   0.168831   
598            24             56               495  0.048485   0.113131   
473           100            135               421  0.237530   0.320665   
447            35             56               372  0.094086   0.150538   
273            84            126               269  0.312268   0.468401   
609            59             88               260  0.226923   0.338462   
67             87            116               252  0.345238   0.460317   
379            68             94               243  0.279835   0.386831   
605            54             71               223  0.242152   0.318386   
287            44             6

# Reduced data weighted hybrid recommender

In [None]:
class LowDataHybridRecommender:
    """Weighted ensemble of a collaborative-filtering and a popularity model.

    The combined score of an item is
    ``cf_ensemble_weight * recStrengthCF + pop_ensemble_weight * recStrengthPop``;
    an item missing from one model's candidate list scores 0 for that model.
    """

    MODEL_NAME = 'LowDataHybrid'

    def __init__(self, cf_rec_model, pop_rec_model, items_df, cf_ensemble_weight=1.0, pop_ensemble_weight=1.0):
        self.cf_rec_model = cf_rec_model
        self.pop_rec_model = pop_rec_model
        self.cf_ensemble_weight = cf_ensemble_weight
        self.pop_ensemble_weight = pop_ensemble_weight
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of the model."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        """Return the ``topn`` items with the highest combined score.

        Returns:
            Non-verbose: the merged candidate frame including ``recStrengthCF``,
            ``recStrengthPop`` and the combined ``recStrength``.
            Verbose: ``recStrength``, the item id column and ``title``.

        Raises:
            Exception: in verbose mode when ``items_df`` is ``None``.
        """
        # The sub-models are always queried non-verbose: only their scores are
        # needed, and metadata columns added by them could clash with the
        # single metadata merge done at the end.
        # Top-1000 collaborative-filtering recommendations.
        cf_recs_df = self.cf_rec_model.recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=False,
                                                           topn=1000).rename(columns={'recStrength': 'recStrengthCF'})

        # Top-1000 popularity recommendations.
        pop_recs_df = self.pop_rec_model.recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=False,
                                                           topn=1000).rename(columns={'recStrength': 'recStrengthPop'})

        # Union of both candidate lists; a score missing on one side becomes 0.
        recs_df = pop_recs_df.merge(cf_recs_df,
                                   how = 'outer',
                                   left_on = CONTENT_COLUMN_NAME,
                                   right_on = CONTENT_COLUMN_NAME).fillna(0.0)

        # Weighted sum of the CF and popularity scores.
        recs_df['recStrength'] = (
            recs_df['recStrengthCF'] * self.cf_ensemble_weight
            + recs_df['recStrengthPop'] * self.pop_ensemble_weight
        )

        # Best combined score first.
        recommendations_df = recs_df.sort_values('recStrength', ascending=False).head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            # Keep only the id and the score so that no column contributed by
            # a sub-model can collide with a metadata column.
            scored_df = recommendations_df[['recStrength', CONTENT_COLUMN_NAME]]
            recommendations_df = scored_df.merge(self.items_df, how = 'left',
                                                 left_on = CONTENT_COLUMN_NAME,
                                                 right_on = CONTENT_COLUMN_NAME)[['recStrength', CONTENT_COLUMN_NAME, 'title']]

        return recommendations_df

# Popularity scores are weighted 60 and CF scores 40 in the combined score.
lowdata_hybrid_recommender_model = LowDataHybridRecommender(cf_recommender_model, popularity_model, movie_data,
                                              cf_ensemble_weight=40.0, pop_ensemble_weight=60.0)

In [None]:
# Evaluate the low-data hybrid model, then print and save the returned metrics.
lowdata_global_metrics, lowdata_results_dataframes = model_evaluator.evaluate_model(lowdata_hybrid_recommender_model)
model_evaluator.print_results(lowdata_global_metrics, lowdata_results_dataframes)
model_evaluator.save_results(lowdata_global_metrics, lowdata_results_dataframes, lowdata_hybrid_recommender_model.get_model_name(), DATASET_TYPE)

Evaluating LowDataHybrid recommendation model...
609 users processed

Global metrics:
       modelName  recall@5  recall@10  precision@5  precision@10      mrr
0  LowDataHybrid  0.451104   0.556067      0.46623      0.425738  0.63426
     hits@5_count  hits@10_count  interacted_count  recall@5  recall@10  \
170           849            852              2695  0.315028   0.316141   
564           696            733              2475  0.281212   0.296162   
195           557            612              2105  0.264608   0.290736   
228           467            538              1861  0.250940   0.289092   
321           506            561              1343  0.376768   0.417722   
543           261            368              1299  0.200924   0.283295   
307           633            672              1257  0.503580   0.534606   
556           415            481              1215  0.341564   0.395885   
322           315            390              1112  0.283273   0.350719   
206           43

# Switching hybrid recommender

In [None]:
class SwitchingHybridRecommender:
    """Pick one of three recommenders per user from the user's interaction count.

    * at most 3 interactions (or an unknown user): popularity model
    * exactly 4 interactions: hybrid model
    * 5 or more interactions: collaborative-filtering model
    """

    MODEL_NAME = 'SwitchingHybrid'

    # Set to True to print the chosen model and its first recommendations on
    # every call (floods the output when a whole user base is evaluated).
    DEBUG = False

    def __init__(self, cf_rec_model, pop_rec_model, hybrid_model, interactions_df, items_df):
        # interactions_df is looked up by user id through its index.
        self.interactions_df: pd.DataFrame = interactions_df
        self.cf_rec_model = cf_rec_model
        self.pop_rec_model = pop_rec_model
        self.hybrid_model = hybrid_model
        self.items_df: pd.DataFrame = items_df

    def get_model_name(self):
        """Return the display name of the model."""
        return self.MODEL_NAME

    def get_model_for_user(self, user_id):
        """Return the sub-model to use for ``user_id``.

        A user that does not occur in ``interactions_df`` counts as having
        zero interactions and gets the popularity model.
        """
        try:
            # A list key always returns a DataFrame, so ``len`` is the row
            # count. A scalar key returns a Series for a single-row user, and
            # the length of its index is the number of columns.
            num_interactions = len(self.interactions_df.loc[[user_id]])
        except KeyError:
            num_interactions = 0

        if num_interactions <= 3:
            return self.pop_rec_model
        elif num_interactions < 5:
            return self.hybrid_model
        else:
            return self.cf_rec_model

    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        """Return the ``topn`` recommendations of the model chosen for the user.

        The chosen model's output must contain a ``recStrength`` column.

        Returns:
            Non-verbose: the chosen model's frame, best ``recStrength`` first.
            Verbose: ``recStrength``, the item id column and ``title``.

        Raises:
            Exception: in verbose mode when ``items_df`` is ``None``.
        """
        # Top-1000 recommendations of whichever model fits this user.
        chosen_model = self.get_model_for_user(user_id)
        recs_df = chosen_model.recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=verbose,
                                                          topn=1000)

        if self.DEBUG:
            print(chosen_model)
            print(recs_df.head())

        # Best score first.
        recommendations_df = recs_df.sort_values('recStrength', ascending=False).head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            # The chosen model may already have attached metadata columns;
            # merge only the columns still missing (plus the join key) to
            # avoid a column clash.
            cols_to_use = self.items_df.columns.difference(recommendations_df.columns).append(pd.Index([CONTENT_COLUMN_NAME]))

            recommendations_df = recommendations_df.merge(self.items_df[cols_to_use], how = 'left',
                                                          left_on = CONTENT_COLUMN_NAME,
                                                          right_on = CONTENT_COLUMN_NAME)[['recStrength', CONTENT_COLUMN_NAME, 'title']]

        return recommendations_df

# The "hybrid" slot is filled with the low-data (CF + popularity) hybrid;
# interactions_train_df_indexed is the table used to count each user's interactions.
switching_hybrid_recommender_model = SwitchingHybridRecommender(cf_recommender_model, popularity_model, lowdata_hybrid_recommender_model,
                                                      interactions_train_df_indexed, movie_data)

In [None]:
# Evaluate the switching hybrid model, then print and save the returned metrics.
switching_global_metrics, switching_results_dataframes = model_evaluator.evaluate_model(switching_hybrid_recommender_model)
model_evaluator.print_results(switching_global_metrics, switching_results_dataframes)
model_evaluator.save_results(switching_global_metrics, switching_results_dataframes, switching_hybrid_recommender_model.get_model_name(), DATASET_TYPE)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
3                 [adventure, fantasy]  
4            [crime, horror, thriller]  
<__main__.CFRecommender object at 0x7f71672f7e10>
   recStrength  movieId                                              title  \
0     0.330922     7153     Lord of the Rings: The Return of the King, The   
1     0.328591      356                                       Forrest Gump   
2     0.326838      318                          Shawshank Redemption, The   
3     0.322523     4993  Lord of the Rings: The Fellowship of the Ring,...   
4     0.319795     5952             Lord of the Rings: The Two Towers, The   

                                genres  
0  [action, adventure, drama, fantasy]  
1        [comedy, drama, romance, war]  
2                       [crime, drama]  
3                 [adventure, fantasy]  
4                 [adventure, fantasy]  
<__main__.PopularityRecommender object at 0x7f71674ae410>
   movieId    rating  views  r

# Misc

In [None]:
# Release the Colab runtime now that the notebook has finished.
# NOTE(review): presumably runtime.unassign() disconnects and frees the VM -- confirm against the Colab docs.
from google.colab import runtime
runtime.unassign()