# Content-based model

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import scipy.stats
from scipy.sparse.linalg import svds
import math
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import warnings
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy.sparse import csr_matrix
from surprise import accuracy
from sklearn.model_selection import ParameterGrid
warnings.filterwarnings("ignore")

In [2]:
df_merged = pd.read_pickle('data/df_movies_cleaned.pkl')
df_ratings = pd.read_pickle('data/df_ratings_cleaned.pkl')

### Textual Feature - Combined Text

In [3]:
df_merged['combined_text'] = df_merged.apply(lambda row: ' '.join([
    ' '.join(row['genre_extracted']), 
    ' '.join(row['actors']), 
    ' '.join(row['keywords_extracted']), 
    row['overview'], 
    ' '.join(row['production_company_extracted'])
]).lower(), axis=1)

The combined_text feature aggregates critical textual metadata from genres, actors, keywords, and movie descriptions into a single comprehensive descriptor for each movie. This aggregation captures the essence of a movie’s content, thematic elements, and appeal, which is crucial for content-based filtering. By synthesizing this information, the recommender system can identify and suggest movies with similar thematic and content attributes, enhancing personalization and user engagement.

### Modeling Preprocessing

#### Combining df_ratings and df_merged

In [4]:
df_combined = pd.merge(df_ratings, df_merged, on='movieId', how='inner')

#### Setting Rating Threshold

The decision to set a threshold of 20 ratings for each movie before including it in the item-based recommender system is strategic, with the goal of ensuring the reliability and validity of the generated recommendations. This threshold acts as a quality control measure, weeding out movies with sparse feedback that could otherwise result in skewed or less confident recommendations due to insufficient user data. By setting this minimum, the system focuses on movies with a high level of viewer engagement, allowing recommendations to be built on a solid foundation of user feedback. This approach improves the system's ability to deliver accurate, trustworthy recommendations based on broad consensus rather than outliers or minimal feedback, resulting in a better user experience and increased overall credibility for the recommender system.

In [5]:
ratings_per_movie = df_combined.groupby('movieId').size()

movies_with_enough_ratings = ratings_per_movie[ratings_per_movie >= 20].index

df_item_modeling = df_combined[df_combined['movieId'].isin(movies_with_enough_ratings)]

print(f"Original dataset size: {df_combined.shape}")
print(f"Filtered dataset size: {df_item_modeling.shape}")

Original dataset size: (24669326, 19)
Filtered dataset size: (24548423, 19)


With the filtered dataset, df_item_modeling, now comprising 24.548.423 rows out of the original 24.669.326, it's evident that the vast majority of the data meets the threshold of having at least 20 ratings per movie. This minimal reduction in dataset size suggests that most movies in the dataset have a sufficient number of ratings, indicating robust user engagement across a wide range of movies.

##### Grouping Movies

In [6]:
df_grouped = df_item_modeling.groupby('movieId', as_index=False).agg({
    'title': 'first',
    'combined_text': 'first', 
})

df_grouped.head()

Unnamed: 0,movieId,title,combined_text
0,1,Toy Story,animation comedy family tom hanks tim allen do...
1,2,Jumanji,adventure fantasy family robin williams jonath...
2,3,Grumpier Old Men,romance comedy walter matthau jack lemmon ann-...
3,4,Waiting to Exhale,comedy drama romance whitney houston angela ba...
4,5,Father of the Bride Part II,comedy steve martin diane keaton martin short ...


# Content-Based Filtering

Vectorizing the combined_text using TF-IDF transforms qualitative textual information into quantitative vectors, facilitating the measurement of content similarity between movies. This numerical representation allows for sophisticated algorithms to compute similarities based on thematic elements, narrative structures, and genre affiliations. For our movie recommender system, this means being able to recommend movies that are contextually and thematically aligned with a user’s preferences, enhancing the discovery of relevant and appealing content.

## Baseline Model

In [7]:
class BaselineContentRecommender:
    def __init__(self, movies_df, k=100):
        self.movies_df = movies_df.copy()
        self.movies_df['movieId'] = self.movies_df['movieId'].astype(str)
        self.movie_id_to_index = {movie_id: i for i, movie_id in enumerate(self.movies_df['movieId'])}
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(df_grouped['combined_text'])
        self.similarity_matrix = cosine_similarity(self.tfidf_matrix)

    def recommend(self, movie_id, top_n=10):
        movie_id = str(movie_id)
        if movie_id not in self.movie_id_to_index:
            print(f"Movie ID {movie_id} not found in the dataset.")
            return []
        
        movie_index = self.movie_id_to_index[movie_id]
        similarity_scores = self.similarity_matrix[movie_index]
        top_k_indices = np.argsort(similarity_scores)[::-1][1:top_n+1]
        recommendations = self.movies_df.iloc[top_k_indices].copy()
        recommendations['cosine_similarity'] = similarity_scores[top_k_indices]
        
        return recommendations.sort_values('cosine_similarity', ascending=False)

In [8]:
recommender_base = BaselineContentRecommender(df_grouped, k=100)
recommendations_base1 = recommender_base.recommend('1', top_n=10)  
print(recommendations_base1[['movieId', 'title', 'cosine_similarity']])

      movieId            title  cosine_similarity
2874     3114      Toy Story 2           0.498092
12007   78499      Toy Story 3           0.417211
1722     1920   Small Soldiers           0.216866
2048     2253             Toys           0.186901
7112     7987            Dolls           0.180138
12339   83219  The Pixar Story           0.178043
1552     1707     Home Alone 3           0.163932
1793     1991     Child's Play           0.151260
9645    46948    Monster House           0.144818
1795     1993   Child's Play 3           0.143198


In [9]:
recommendations_base2 = recommender_base.recommend('2', top_n=10)  
print(recommendations_base2[['movieId', 'title', 'cosine_similarity']])

      movieId                             title  cosine_similarity
8558    27884                         Word Wars           0.246909
5855     6304                         Brainscan           0.164207
9503    44731                        Stay Alive           0.144236
14410  113889  Angry Video Game Nerd: The Movie           0.143081
9366    42725                     Grandma's Boy           0.142472
13344   97913                    Wreck-It Ralph           0.139435
7422     8633              The Last Starfighter           0.132591
15323  139847                         Chevalier           0.131599
8295    26985                           Nirvana           0.130764
14499  115534                             Ouija           0.127464


## Advanced Model 

Vectorized combined_text, which includes genres, keywords, and other descriptive elements, captures the thematic essence of films. This feature uses TF-IDF vectors to emphasize unique descriptors, allowing for the recommendation of movies with similar thematic and stylistic content. This is critical for a content-based system that relies on content similarities. This feature, as already used for the baseline-model, is critical and is shaping the foundation of our modeling approach. However, in order to advance the model we are increasing complexity by adding additonal features. 

#### Feature Engineering
Feature engineering is an important step in the development of machine learning models, including recommender systems, because it involves extracting meaningful variables from raw data to improve model performance and accuracy. This process transforms complex and often unstructured information into structured, analytically useful formats, allowing models to uncover previously unknown patterns, relationships, and insights. In the context of developing a movie recommendation system, effective feature engineering ensures that the nuances of movie content, user preferences, and contextual factors are accurately captured and used. By carefully selecting, combining, and transforming data into features such as weighted scores, combined textual data, and sentiment analysis, developers can significantly improve the system's ability to provide personalized, relevant, and appealing movie recommendations. This not only improves user satisfaction and engagement, but it also strengthens the business case by increasing platform usage and retention.

#### Weighted Score 

In [10]:
movie_stats = df_ratings.groupby('movieId').agg(average_rating=('rating', 'mean'), rating_count=('rating', 'count')).reset_index()

C = movie_stats['average_rating'].mean()
m = movie_stats['rating_count'].quantile(0.90)

def weighted_rating(x, m=m, C=C):
    v = x['rating_count']  
    R = x['average_rating'] 
    return (v/(v+m) * R) + (m/(m+v) * C)

movie_stats['weighted_score'] = movie_stats.apply(weighted_rating, axis=1)

df_ratings = df_ratings.merge(movie_stats[['movieId', 'weighted_score']], on='movieId', how='left')

df_ratings = df_ratings.merge(movie_stats[['movieId', 'average_rating', 'rating_count']], on='movieId', how='left')

print(df_ratings.head())

   userId  movieId  rating           timestamp  user_mean_rating  \
0       1      110     1.0 2015-03-09 22:52:09          4.277778   
1       1      147     4.5 2015-03-09 23:07:15          4.277778   
2       1      858     5.0 2015-03-09 22:52:03          4.277778   
3       1     1221     5.0 2015-03-09 22:52:26          4.277778   
4       1     1246     5.0 2015-03-09 22:52:36          4.277778   

   liked_by_user  weighted_score  average_rating  rating_count  
0          False        4.000352        4.010725         62332  
1           True        3.513227        3.581926          4559  
2           True        4.319932        4.336495         52237  
3           True        4.238059        4.261745         34163  
4           True        3.888786        3.911582         25012  


The weighted score combines a movie's average rating (vote_average) and the number of ratings (vote_count) it has received to provide a balanced metric that reflects both popularity and quality. This approach mitigates the bias towards movies with a high average rating but a low number of ratings, ensuring that the recommendations are not only high-quality but also broadly appreciated. For a movie recommender system, integrating the weighted score helps prioritize movies that have proven appeal, aligning recommendations with broader viewer satisfaction.

#### Movie Age

In [11]:
current_year = datetime.datetime.now().year

df_merged['movie_age'] = current_year - pd.to_datetime(df_merged['release_date']).dt.year

Calculating the movie_age from the release date provides insight into the recency and potential cultural relevance of a movie. In the context of a movie recommender system, this allows for temporal filtering and trend analysis, enabling recommendations that cater to preferences for newer releases or classic films. Understanding movie age is essential for aligning recommendations with temporal viewing trends and user preferences for contemporary versus classic cinema.

#### Sentiment Analysis of Overview

In [12]:
def get_sentiment(text):
    try:
        return TextBlob(text).sentiment.polarity
    except:
        return None 

df_merged['sentiment_polarity'] = df_merged['overview'].apply(get_sentiment)

Performing sentiment analysis on movie descriptions yields a sentiment_polarity score, offering a nuanced view of the emotional tone or mood conveyed by the movie's narrative. This feature is particularly important for recommending movies that match a user’s emotional preferences or current mood, adding an additional layer of personalization. By integrating sentiment analysis, your recommender system can differentiate movies not just by genre or content but also by the emotional experience they offer, enhancing user satisfaction and engagement.

#### Adding new features to df_grouped

In [13]:
df_ratings_aggregated = df_ratings.groupby('movieId', as_index=False).agg({
    'weighted_score': 'mean'  
})

df_merged_aggregated = df_merged.groupby('movieId', as_index=False).agg({
    'movie_age': 'mean', 
    'sentiment_polarity': 'mean'
})

We combine df_ratings and df_merged to simplify our dataset, ensuring that each movieId is represented by a single set of features. By averaging weighted_score, movie_age, and sentiment_polarity, we capture each film's overall essence, reflecting collective attributes and sentiments. This preprocessing step converts our data into a unified df_grouped format, with each movie listed uniquely, simplifying subsequent analyses and modeling efforts. This approach not only consolidates our dataset to improve efficiency, but it also aligns with our goal of building a cohesive and analytically robust foundation for our recommendation system.


In [14]:
df_grouped = df_grouped.merge(df_ratings_aggregated, on='movieId', how='left')
df_grouped = df_grouped.merge(df_merged_aggregated, on='movieId', how='left')

In [15]:
df_grouped

Unnamed: 0,movieId,title,combined_text,weighted_score,movie_age,sentiment_polarity
0,1,Toy Story,animation comedy family tom hanks tim allen do...,3.884349,29.0,0.112121
1,2,Jumanji,adventure fantasy family robin williams jonath...,3.227394,29.0,-0.218750
2,3,Grumpier Old Men,romance comedy walter matthau jack lemmon ann-...,3.141024,29.0,0.038889
3,4,Waiting to Exhale,comedy drama romance whitney houston angela ba...,2.895332,29.0,0.600000
4,5,Father of the Bride Part II,comedy steve martin diane keaton martin short ...,3.061166,29.0,0.466667
...,...,...,...,...,...,...
16122,173941,Atomic Blonde,action thriller charlize theron james mcavoy s...,3.067902,7.0,-0.266667
16123,174053,Black Mirror: White Christmas,drama horror mystery science fiction thriller ...,3.149876,10.0,0.067143
16124,174055,Dunkirk,action drama history thriller war fionn whiteh...,3.344268,7.0,0.000000
16125,174371,Once Upon a Time in Venice,action comedy thriller bruce willis jason momo...,3.026085,7.0,0.075000


When developing a content-based recommender system, the selection of features is critical to its success. The selected features - weighted_score, vectorized combined_text, movie_age, runtime, and sentiment_polarity - are critical for capturing the multifaceted nature of films and their reception by audiences.

The weighted_score is critical for determining a movie's appeal, as it combines the average rating with the number of ratings to provide a balanced picture of its popularity and acceptance. This feature helps to reduce biases toward movies with less ratings, ensuring that recommendations are not only popular but also well-regarded.

Movie_age incorporate personal preference and temporal relevance into the recommendation process.This feature allows the system to align recommendations with users' inclinations towards newer releases or classic films. These features provide additional layers of personalization, increasing user satisfaction by accommodating individual preferences for movie duration and novelty.

Finally, sentiment_polarity provides information about the emotional tone of movie descriptions or reviews. This feature allows the system to recommend movies that match not only in content but also in mood, providing a more nuanced approach to similarity that goes beyond simple thematic alignment.

Together, these features form a strong foundation for an item-based recommender system. By taking into account both content and key characteristics that influence viewer preferences, the system is better able to provide precise and satisfying movie recommendations.

In [16]:
class AdvancedContentRecommender:
    def __init__(self, movies_df, k=100):
        self.movies_df = movies_df.copy()
        self.movies_df['movieId'] = self.movies_df['movieId'].astype(str)
        self.k = k
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.movies_df['combined_text'])
        self.similarity_matrix = cosine_similarity(self.tfidf_matrix)

    def recommend(self, movie_id, top_n=10):
        movie_id = str(movie_id)
        if movie_id not in self.movies_df['movieId'].values:
            print(f"Movie ID {movie_id} not found in the dataset.")
            return pd.DataFrame()

        movie_index = self.movies_df.index[self.movies_df['movieId'] == movie_id].tolist()[0]
        similarity_scores = self.similarity_matrix[movie_index]
        top_k_indices = np.argsort(similarity_scores)[::-1][1:self.k+1]
        top_k_df = self.movies_df.iloc[top_k_indices].copy()
        
        scaler = MinMaxScaler()
        for feature in ['weighted_score', 'movie_age']:
            if feature in top_k_df:
                top_k_df[feature] = scaler.fit_transform(top_k_df[[feature]].values.reshape(-1, 1))

        top_k_df['cosine_similarity'] = similarity_scores[top_k_indices]
        top_k_df['combined_score'] = (
            0.5 * top_k_df['cosine_similarity'] +
            0.2 * top_k_df['weighted_score'] + 
            0.2 * top_k_df['sentiment_polarity'] +
            0.1 * top_k_df['movie_age']
        )
        
        return top_k_df.nlargest(top_n, 'combined_score')[['movieId', 'title', 'combined_score']]

In [17]:
recommender_advanced = AdvancedContentRecommender(df_grouped, k=100)
recommendations_advanced1 = recommender_advanced.recommend('1', top_n=10)  
print(recommendations_advanced1[['movieId', 'title', 'combined_score']])

      movieId                  title  combined_score
2874     3114            Toy Story 2        0.515295
12007   78499            Toy Story 3        0.378643
5555     5974    The Thief of Bagdad        0.355765
1130     1230             Annie Hall        0.343074
7112     7987                  Dolls        0.323468
1143     1244              Manhattan        0.315088
4019     4339     Von Ryan's Express        0.314689
2142     2355           A Bug's Life        0.313347
576       596              Pinocchio        0.308571
1024     1103  Rebel Without a Cause        0.302987


In [18]:
recommendations_advanced2 = recommender_advanced.recommend('2', top_n=10)  
print(recommendations_advanced2[['movieId', 'title', 'combined_score']])

      movieId                                              title  \
1100     1197                                 The Princess Bride   
5788     6232                                          Born Free   
10217   54259                                           Stardust   
1962     2161                              The NeverEnding Story   
2865     3105                                         Awakenings   
9304    41566  The Chronicles of Narnia: The Lion, the Witch ...   
12067   79318                                      Winnebago Man   
3901     4210                                          Manhunter   
7422     8633                               The Last Starfighter   
4768     5126                                  The Deadly Mantis   

       combined_score  
1100         0.394854  
5788         0.308018  
10217        0.303716  
1962         0.300134  
2865         0.278083  
9304         0.267593  
12067        0.250902  
3901         0.249790  
7422         0.243689  
4768       

The advanced model extends the baseline content recommender by including additional movie attributes—weighted_score, sentiment_polarity, and movie_age—as well as feature scaling for weighted_score and movie_age. This evolution enables the model to consider not only thematic content similarity, but also movie quality, viewer sentiment, and timeliness. By combining these various factors, the advanced model hopes to provide more nuanced and personalized recommendations. The strategic inclusion and scaling of these features improves the model's ability to better align recommendations with individual user preferences, potentially improving recommendation accuracy and user satisfaction over the baseline model.

## Evalutation of content-based models 

#### Sampling

In this scenario, the sampling technique used is to calculate a statistically significant sample size in order to estimate the proportion of movies rated 4.0 or higher in a dataset. This decision is based on a specific confidence level (95%) and margin of error (5%), with the goal of obtaining precise and reliable inferences about the population's characteristics from a sample of data. The method used employs a standard formula that includes the Z-score associated with the desired confidence level and the estimated proportion of interest, ensuring that the sample size is sufficient to accurately reflect the population. This technique is critical for designing studies or analyses that require accurate estimations of population parameters for decision-making or hypothesis testing, as it minimizes potential biases and errors caused by small or arbitrarily chosen sample sizes. By rigorously determining the required sample size, the approach improves the credibility and validity of the findings derived from the sample data, making it a cornerstone of statistical analysis and research methodologies.

In [19]:
def calculate_sample_size(confidence_level, margin_of_error, proportion):
    z_score = abs(scipy.stats.norm.ppf((1 - confidence_level) / 2))
    sample_size = math.ceil((z_score ** 2 * proportion * (1 - proportion)) / (margin_of_error ** 2))
    return sample_size

confidence_level = 0.95
margin_of_error = 0.05

proportion_higher_ratings = df_ratings[df_ratings['rating'] >= 4.0].shape[0] / df_ratings.shape[0]
required_sample_size = calculate_sample_size(confidence_level, margin_of_error, proportion_higher_ratings)
print(f"Required sample size: {required_sample_size}")

Required sample size: 385


In [20]:
sample_movie_ids = np.random.choice(df_grouped['movieId'].unique(), size=required_sample_size, replace=False)

In [21]:
subset_df_ratings = df_ratings[df_ratings['movieId'].isin(sample_movie_ids)]

#### Evaluation Function

In [56]:
def evaluate_movie(movie_id, df_ratings, recommender, top_n=10):
    """Evaluate a single movie for the recommender system, adjusted for actual user ratings."""
    recommendations = recommender.recommend(str(movie_id), top_n=top_n)
    if recommendations.empty:
        return np.array([]), None 

    recommended_ids = recommendations['movieId'].astype(str).tolist()
    
    matching_ratings = df_ratings[df_ratings['movieId'].astype(str).isin(recommended_ids)]
    
    hit_rate = (matching_ratings['rating'] >= 4.0).mean() if not matching_ratings.empty else None

    return np.array(matching_ratings['rating']), hit_rate

def evaluate_recommender(df_ratings, recommender, sample_movie_ids, top_n=10, threshold=4.0):
    """Evaluate the recommender system using sampled movie IDs, including adjusted hit rate."""
    all_ratings, hit_rates = [], []

    for movie_id in sample_movie_ids:
        movie_ratings, hit_rate = evaluate_movie(movie_id, df_ratings, recommender, top_n=top_n)
        if movie_ratings.size > 0:
            all_ratings.extend(movie_ratings)
        if hit_rate is not None:
            hit_rates.append(hit_rate)
    
    all_ratings = np.array(all_ratings)
    if len(all_ratings) > 0:
        mae = np.mean(np.abs(all_ratings - 5))
        mse = np.mean((all_ratings - 5) ** 2)
        rmse = np.sqrt(mse)
        precision = np.sum(all_ratings >= threshold) / len(all_ratings)
    else:
        mae, mse, rmse, precision = 0, 0, 0, 0

    avg_hit_rate = np.mean(hit_rates) if hit_rates else None  

    print(f"Sample Size: {len(sample_movie_ids)}")
    print(f"MAE: {mae:.4f}\nMSE: {mse:.4f}\nRMSE: {rmse:.4f}\nPrecision: {precision:.4f}\nAverage Hit Rate: {avg_hit_rate if avg_hit_rate is not None else 'N/A'}")

    return mae, mse, rmse, precision, avg_hit_rate


#### Evaluation of Baseline-Model

In [57]:
mae, mse, rmse, precision, avg_hit_rate = evaluate_recommender(df_ratings, recommender_base, sample_movie_ids, top_n=10, threshold=4.0)

KeyboardInterrupt: 

#### Evaluation of Advanced-Model

In [None]:
mae, mse, rmse, precision, avg_hit_rate = evaluate_recommender(df_ratings, recommender_advanced, sample_movie_ids, top_n=10, threshold=4.0)

Sample Size: 385
MAE: 1.1128
MSE: 2.1326
RMSE: 1.4603
Precision: 0.6535
Average Hit Rate: 0.6382666966643713


The advanced model outperforms the baseline model on all metrics. It has lower MAE, MSE, and RMSE values, implying that its recommendations are, on average, closer to ideal ratings and less prone to large errors. Its higher precision and average hit rate indicate that it is more effective at recommending movies that users are likely to rate highly (4.0 or higher), demonstrating a better understanding and matching of user preferences. In short, depending on the evaluation method used, the advanced model provides more accurate and user-aligned recommendations than the baseline model, making it the better option for increasing user satisfaction with the recommender system.

# Collaborative Filtering Model

Collaborative filtering is a method of making automatic predictions (filtering) about the interests of a user by collecting preferences from many users (collaborating). The underlying assumption of the collaborative filtering approach is that if a person A has the same opinion as a person B on an issue, A is more likely to have B's opinion on a different issue than that of a randomly chosen person.

There are two types of collaborative filtering: user-based and item-based. User-based collaborative filtering is based on the similarity between users and item-based collaborative filtering is based on the similarity between items. For our recommender system we chose an item-based approach. The reasons for that are many. Item-based collaborative filtering is often preferred over user-based collaborative filtering, particularly in environments where the item catalog is relatively stable and doesn't grow as quickly as the user base. Item-based systems have a better scalability and efficiency, especially with large user bases. Unlike user preferences, which can change rapidly and complicate similarity calculations, the characteristics of movies remain constant, making it easier to calculate and store the item similarities as their relationship are stable. An item-based approach sidesteps the complexity and computational demand of constantly updating user similarities, making it a more straightforward choice for delivering recommendations also for new users and less popular items.



### Item-based Collaborative Filtering

To build an item-based collaborative filtering system, we need to calculate the similarity between items based on the ratings users have given to those items. We will use the cosine similarity to calculate the similarity between items. 



In [22]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24848104 entries, 0 to 24848103
Data columns (total 9 columns):
 #   Column            Dtype         
---  ------            -----         
 0   userId            int64         
 1   movieId           int64         
 2   rating            Float64       
 3   timestamp         datetime64[ns]
 4   user_mean_rating  Float64       
 5   liked_by_user     boolean       
 6   weighted_score    float64       
 7   average_rating    Float64       
 8   rating_count      int64         
dtypes: Float64(3), boolean(1), datetime64[ns](1), float64(1), int64(3)
memory usage: 1.8 GB


In [23]:
df_ratings = df_ratings.drop(columns=['user_mean_rating', 'liked_by_user'])

In [24]:
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,weighted_score,average_rating,rating_count
count,24848100.0,24848100.0,24848104.0,24848100.0,24848104.0,24848100.0
mean,135003.6,16211.73,3.528737,3.488773,3.528737,15475.74
std,78175.12,31358.02,1.060048,0.4132034,0.462782,16857.05
min,1.0,1.0,0.5,1.776663,0.5,1.0
25%,67126.0,1088.0,3.0,3.19116,3.253788,3245.0
50%,135134.0,2670.0,3.5,3.510029,3.60369,9539.0
75%,202642.0,6711.0,4.0,3.802293,3.87012,21946.0
max,270896.0,176275.0,5.0,4.411433,5.0,82895.0


To safe computational time, we will use a subset of the data. We will only use ratings from 2016 onwards. A final implementation could use the entire dataset to improve accuracy.

In [25]:
#df_ratings['timestamp'] = pd.to_datetime(df_ratings['timestamp'])
df_ratings = df_ratings.sort_values('timestamp')
df_ratings_subset = df_ratings[df_ratings['timestamp'] > '2016-01-01']
df_ratings_subset = df_ratings_subset.drop(columns=['timestamp'])
df_ratings_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3321925 entries, 1319144 to 16502857
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   userId          int64  
 1   movieId         int64  
 2   rating          Float64
 3   weighted_score  float64
 4   average_rating  Float64
 5   rating_count    int64  
dtypes: Float64(2), float64(1), int64(3)
memory usage: 183.7 MB


In [26]:
# memory usage of subset / original
(df_ratings_subset.memory_usage() / df_ratings.memory_usage()) * 100

Index             13.368927
average_rating    13.368927
movieId           13.368927
rating            13.368927
rating_count      13.368927
timestamp               NaN
userId            13.368927
weighted_score    13.368927
dtype: float64

For the train/test split we will split the data historically. Temporal splitting ensures that the training data contains information from the past, and the test data contains information from the future. This reflects a real-world scenario better, where the system is trained on historical data and evaluated on more recent/future data to assess its performance. We also tried a random split that resulted in better RMSE values than the temporal split. However, we decided to use the temporal split for the sake of a more realistic approach and to align with industry standards. As a model deployment is not possible we can ensure a better real-world performance by that, at this stage.  

We will use 80% of the data for training and 20% for testing.

In [27]:
# Temporal Train/Test Split
split_index = int(len(df_ratings_subset) * 0.8)

train_data = df_ratings_subset[:split_index]
test_data = df_ratings_subset[split_index:]

We will now create a similarity matrix. The matrix will contain the similarity between each pair of items. We will use the cosine similarity to calculate the similarity between items.

In [28]:
# User-Item Matrix for Training
user_item_matrix_train = train_data.pivot_table(index='userId', columns='movieId', values='rating')

# Item-Item Similarity Matrix
item_similarity = cosine_similarity(user_item_matrix_train.fillna(0).T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix_train.columns, columns=user_item_matrix_train.columns)

print(item_similarity_df)
print(item_similarity_df.info())


Looking at our output we encounter a first problem: the matrix size. By only using roughly 13 % of the original data (calculated in memory usage) we end up with a similarity matrix of almost 10 GB in size. This is not feasible for our use case. Consequently, we will implement a Singular Value Decomposition (SVD) to reduce the dimensionality of the matrix.

### Singular Value Decomposition (SVD)


SVD helps in extracting latent factors that explain observed ratings, efficiently reducing data dimensionality while preserving essential information. This significantly speeds up calculations, making the process of predicting ratings more efficient, especially when dealing with a large dataset like ours. Additionally, by focusing on these latent factors, SVD enables a deeper understanding of user preferences and item characteristics, promising more personalized and accurate recommendations.

In [28]:
# create matrix
user_item_matrix_train = train_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)
user_item_matrix_sparse = csr_matrix(user_item_matrix_train.values.astype(float))

# mean centering
mean_user_rating = user_item_matrix_sparse.mean(axis=1)
user_item_matrix_centered = user_item_matrix_sparse - mean_user_rating

# SVD 
U, sigma, Vt = svds(user_item_matrix_centered, k=50) # k selected manually at this stage
sigma_matrix = np.diag(sigma)

# Predict ratings for all users
all_user_predicted_ratings = np.dot(np.dot(U, sigma_matrix), Vt) + mean_user_rating.A1.reshape(-1, 1)

# Create a DataFrame with the predicted ratings
preds_df = pd.DataFrame(all_user_predicted_ratings, index=user_item_matrix_train.index, columns=user_item_matrix_train.columns)

# Predict ratings for the test set
def safe_get_prediction(row):
    try:
        return preds_df.loc[row['userId'], row['movieId']]
    except KeyError:
        return np.nan

test_data['predicted'] = test_data.apply(safe_get_prediction, axis=1)

# filter only rows where we have a prediction
filtered_test_data = test_data.dropna(subset=['predicted'])

# RMSE 
rmse = sqrt(mean_squared_error(filtered_test_data['rating'], filtered_test_data['predicted']))
print(f'RMSE: {rmse}')


Given a 1-to-5 scale, an RMSE of 3.165 is quite high, indicating that the predictions can be quite far off from the actual ratings. Let's try to improve our model.

For that, we will use the surprise library. Surprise automatically handles normalization and scaling of the data as well as the handling of cold start and sparsity issues.


In [None]:
reader = Reader()
data = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)

svd = SVD()

# Fit the model 
svd.fit(data.build_full_trainset())

# Predict ratings for the test set
testset = list(zip(test_data['userId'].values, test_data['movieId'].values, test_data['rating'].values))
predictions = svd.test(testset)

print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))

RMSE: 0.9694
0.9693988323484404
MAE:  0.7421
0.7421251681528029


We can see a major improvement of the metrics when using the Surprise library compared to our previous approach!

A Root Mean Square Error (RMSE) of approximately 0.9694 suggests that, on average, our predicted ratings deviate from the actual ratings by around 0.97 units on a scale of 1 to 5. Without considering their direction, they deviate around around 0.7420 units (MAE). We consider this level of error as moderate to good. 


Let's also compute the RMSE and MAE with a random split for illustrative purposes before fine tuning the model on a temporal split.

In [None]:
reader_random = Reader()

data_random_split = Dataset.load_from_df(df_ratings_subset[['userId', 'movieId', 'rating']], reader_random)

svd_random = SVD()

cross_validate(svd_random, data_random_split, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7848  0.7854  0.7840  0.7847  0.7846  0.7847  0.0005  
MAE (testset)     0.5841  0.5849  0.5834  0.5838  0.5838  0.5840  0.0005  
Fit time          18.08   18.23   18.77   18.17   18.26   18.30   0.24    
Test time         2.99    4.10    2.73    2.45    2.66    2.99    0.58    


{'test_rmse': array([0.78475205, 0.78539078, 0.78395027, 0.78474373, 0.78458848]),
 'test_mae': array([0.58412186, 0.58493104, 0.58343147, 0.58379567, 0.58384229]),
 'fit_time': (18.07552695274353,
  18.23200798034668,
  18.76857304573059,
  18.16711926460266,
  18.25535297393799),
 'test_time': (2.991689920425415,
  4.100869178771973,
  2.7325620651245117,
  2.454019784927368,
  2.6558640003204346)}

The superior performance of the random split (model RMSE 0.7847) suggests that it may offer a more balanced and varied dataset for both training and testing phases, potentially leading to a model that is better at generalizing across the entire dataset. 

Yet, as already mentioned for a real-world recommender systems, a temporal split is often preferred to account for evolving preferences and trends over time. For a movie recommender system, especially one like DreamStream that might experience frequent updates to its movie catalog and shifts in user preferences, we  suggest a temporal split. This approach acknowledges the evolving nature of both movies and user tastes, preparing the system to adapt to real-world scenarios more effectively. It also allows the system to better handle cold start problems with new releases. 

Let's get back to our temporal split and try to optimize our model using a GridSearch to find the best combination of hyperparameter for the model. 

In [28]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)

trainset = data.build_full_trainset()
testset = list(zip(test_data['userId'].values, test_data['movieId'].values, test_data['rating'].values))

# our grid of parameters
param_grid = {'n_factors': [50, 100],  # Number of factors
              'n_epochs': [20],         # Number of iterations
              'lr_all': [0.005, 0.01],      # Learning rate
              'reg_all': [0.02, 0.05]}      # Regularization term

svd = SVD()

best_rmse = float('inf')
best_params = None

# Loop through parameter combinations
for params in ParameterGrid(param_grid):
    svd = SVD(**params)
    svd.fit(trainset)


    predictions = svd.test(testset)

    # RMSE
    rmse = accuracy.rmse(predictions)

    # Update best RMSE and parameters if necessary
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = params

print("Best RMSE score obtained: ", best_rmse)
print("Best parameters: ", best_params)




RMSE: 0.9689
RMSE: 0.9687
RMSE: 0.9690
RMSE: 0.9686
RMSE: 0.9723
RMSE: 0.9703
RMSE: 0.9720
RMSE: 0.9699
Best RMSE score obtained:  0.968631935393709
Best parameters:  {'lr_all': 0.005, 'n_epochs': 20, 'n_factors': 100, 'reg_all': 0.05}


The best RMSE score obtained is 0.9686 with the following parameters: 

lr_all: 0.005, n_epochs: 20, n_factors: 100, reg_all: 0.05

This is a slightly  better RMSE score as we obtained with the default parameters (RMSE 0.9698). With higher computational power and time, we could further optimize the model by testing more hyperparameters and combinations. At this stage we will stick with the selected parameters from our GridSearch.

Let us now train the best version of our model on the full subset and predict the top ten recommendations for a selected user.

In [29]:
svd = SVD(**best_params)
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x212dfe3bfd0>

Prediction for user: 14204

In [30]:
selected_user_id = 14204
rated_movie_ids = df_ratings_subset[df_ratings_subset['userId'] == selected_user_id]['movieId'].unique()
all_movie_ids = df_ratings_subset['movieId'].unique()

# Predict ratings for all movies that the selected user has not rated yet
predicted_unrated_movies = []
for movie_id in all_movie_ids:
    if movie_id not in rated_movie_ids:
        prediction = svd.predict(uid=selected_user_id, iid=movie_id)
        predicted_unrated_movies.append((movie_id, prediction.est))

# sorting
sorted_predicted_unrated_movies = sorted(predicted_unrated_movies, key=lambda x: x[1], reverse=True)
top_10_unrated_movies = sorted_predicted_unrated_movies[:10]

# Top 10 predicted ratings for the selected user
print(f"Top 10 recommended movies for User {selected_user_id}:")
for i, (movie_id, predicted_rating) in enumerate(top_10_unrated_movies, start=1):
    print(f"Rank {i}: Movie ID {movie_id}, Predicted Rating: {predicted_rating}")


Top 10 recommended movies for User 14204:
Rank 1: Movie ID 93040, Predicted Rating: 4.195004543536218
Rank 2: Movie ID 170705, Predicted Rating: 4.134805189136601
Rank 3: Movie ID 140265, Predicted Rating: 4.062797962633411
Rank 4: Movie ID 159817, Predicted Rating: 4.054666416352564
Rank 5: Movie ID 157373, Predicted Rating: 4.0277859449841396
Rank 6: Movie ID 8484, Predicted Rating: 4.020928384373775
Rank 7: Movie ID 82143, Predicted Rating: 4.002642547273006
Rank 8: Movie ID 5475, Predicted Rating: 3.984403508927792
Rank 9: Movie ID 111130, Predicted Rating: 3.9779020069275632
Rank 10: Movie ID 107412, Predicted Rating: 3.977519160419064


# Hybrid Model

In our pursuit of creating a more nuanced and effective recommendation system, we've decided to integrate our two models — a content-based model and an item-based collaborative filtering model using SVD — into a singular hybrid approach. This approach is driven by our goal to combine the strengths of both models: the content-based model's ability to recommend items based on their intrinsic properties and similarities, and the collaborative filtering model's capacity to incorporate user preferences and historical interactions to predict item ratings with high accuracy. By combining these approaches, we aim to deliver more personalized, diverse, and contextually relevant recommendations, thereby enhancing user satisfaction and engagement with our platform.

In [31]:
def hybrid_model(movie_id, user_id, top_n, svd_model):
    # Step 1: Get top N content-based recommendations
    content_recs = recommender_advanced.recommend(movie_id, top_n)

    content_recs['movieId'] = content_recs['movieId'].astype(int)

    # Step 2: Apply SVD to predict ratings for the top N movies
    content_recs['predicted_rating'] = content_recs['movieId'].apply(
        lambda x: svd_model.predict(user_id, x).est
    )
    
    # Step 3: Sort recommendations by predicted ratings, then by cosine similarity
    final_recs = content_recs.sort_values(
        by=['predicted_rating', 'combined_score'], ascending=[False, False]
    )
    
    final_recs = final_recs[['movieId', 'title', 'combined_score', 'predicted_rating']]

    return final_recs

In [42]:
hybrid_model(1, 2, 10, svd)

Unnamed: 0,movieId,title,combined_score,predicted_rating
1143,1244,Manhattan,0.315088,3.948005
1024,1103,Rebel Without a Cause,0.302987,3.775396
1130,1230,Annie Hall,0.343074,3.762786
12007,78499,Toy Story 3,0.378643,3.677054
4019,4339,Von Ryan's Express,0.314689,3.664717
2874,3114,Toy Story 2,0.515295,3.618099
5555,5974,The Thief of Bagdad,0.355765,3.578651
7112,7987,Dolls,0.323468,3.460829
576,596,Pinocchio,0.308571,3.387574
2142,2355,A Bug's Life,0.313347,3.352371


In [43]:
hybrid_model(50, 2, 10, svd)

Unnamed: 0,movieId,title,combined_score,predicted_rating
10576,58559,The Dark Knight,0.257266,4.018121
411,428,A Bronx Tale,0.249364,3.905122
2800,3039,Trading Places,0.236398,3.776068
5610,6035,Pépé le Moko,0.294232,3.775973
6543,7076,Bullitt,0.251069,3.767918
10367,55765,American Gangster,0.236905,3.720463
8230,26774,Innocent Blood,0.31895,3.669139
310,322,Swimming with Sharks,0.26564,3.541532
5110,5487,Harry and Walter Go To New York,0.237823,3.471251
8590,30818,Beyond the Sea,0.237414,3.339717
