# Content-based model

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import scipy.stats
import math
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import warnings
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
warnings.filterwarnings("ignore")

In [2]:
df_merged = pd.read_pickle('data/df_movies_cleaned.pkl')
df_ratings = pd.read_pickle('data/df_ratings_cleaned.pkl')

### Textual Feature - Combined Text

In [3]:
df_merged['combined_text'] = df_merged.apply(lambda row: ' '.join([
    ' '.join(row['genre_extracted']), 
    ' '.join(row['actors']), 
    ' '.join(row['keywords_extracted']), 
    row['overview'], 
    ' '.join(row['production_company_extracted'])
]).lower(), axis=1)

The combined_text feature aggregates critical textual metadata from genres, actors, keywords, and movie descriptions into a single comprehensive descriptor for each movie. This aggregation captures the essence of a movie’s content, thematic elements, and appeal, which is crucial for content-based filtering. By synthesizing this information, the recommender system can identify and suggest movies with similar thematic and content attributes, enhancing personalization and user engagement.

### Modeling Preprocessing

#### Combining df_ratings and df_merged

In [4]:
df_combined = pd.merge(df_ratings, df_merged, on='movieId', how='inner')

#### Setting Rating Threshold

The decision to set a threshold of 20 ratings for each movie before including it in the item-based recommender system is strategic, with the goal of ensuring the reliability and validity of the generated recommendations. This threshold acts as a quality control measure, weeding out movies with sparse feedback that could otherwise result in skewed or less confident recommendations due to insufficient user data. By setting this minimum, the system focuses on movies with a high level of viewer engagement, allowing recommendations to be built on a solid foundation of user feedback. This approach improves the system's ability to deliver accurate, trustworthy recommendations based on broad consensus rather than outliers or minimal feedback, resulting in a better user experience and increased overall credibility for the recommender system.

In [5]:
ratings_per_movie = df_combined.groupby('movieId').size()

movies_with_enough_ratings = ratings_per_movie[ratings_per_movie >= 20].index

df_item_modeling = df_combined[df_combined['movieId'].isin(movies_with_enough_ratings)]

print(f"Original dataset size: {df_combined.shape}")
print(f"Filtered dataset size: {df_item_modeling.shape}")

Original dataset size: (24669326, 19)
Filtered dataset size: (24548423, 19)


With the filtered dataset, df_item_modeling, now comprising 24.548.423 rows out of the original 24.669.326, it's evident that the vast majority of the data meets the threshold of having at least 20 ratings per movie. This minimal reduction in dataset size suggests that most movies in the dataset have a sufficient number of ratings, indicating robust user engagement across a wide range of movies.

##### Grouping Movies

In [6]:
df_grouped = df_item_modeling.groupby('movieId', as_index=False).agg({
    'title': 'first',
    'combined_text': 'first', 
})

df_grouped.head()

Unnamed: 0,movieId,title,combined_text
0,1,Toy Story,animation comedy family tom hanks tim allen do...
1,2,Jumanji,adventure fantasy family robin williams jonath...
2,3,Grumpier Old Men,romance comedy walter matthau jack lemmon ann-...
3,4,Waiting to Exhale,comedy drama romance whitney houston angela ba...
4,5,Father of the Bride Part II,comedy steve martin diane keaton martin short ...


# Content-Based Filtering

Vectorizing the combined_text using TF-IDF transforms qualitative textual information into quantitative vectors, facilitating the measurement of content similarity between movies. This numerical representation allows for sophisticated algorithms to compute similarities based on thematic elements, narrative structures, and genre affiliations. For our movie recommender system, this means being able to recommend movies that are contextually and thematically aligned with a user’s preferences, enhancing the discovery of relevant and appealing content.

## Baseline Model

In [7]:
class BaselineContentRecommender:
    def __init__(self, movies_df, k=100):
        self.movies_df = movies_df.copy()
        self.movies_df['movieId'] = self.movies_df['movieId'].astype(str)
        self.movie_id_to_index = {movie_id: i for i, movie_id in enumerate(self.movies_df['movieId'])}
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(df_grouped['combined_text'])
        self.similarity_matrix = cosine_similarity(self.tfidf_matrix)

    def recommend(self, movie_id, top_n=10):
        movie_id = str(movie_id)
        if movie_id not in self.movie_id_to_index:
            print(f"Movie ID {movie_id} not found in the dataset.")
            return []
        
        movie_index = self.movie_id_to_index[movie_id]
        similarity_scores = self.similarity_matrix[movie_index]
        top_k_indices = np.argsort(similarity_scores)[::-1][1:top_n+1]
        recommendations = self.movies_df.iloc[top_k_indices].copy()
        recommendations['cosine_similarity'] = similarity_scores[top_k_indices]
        
        return recommendations.sort_values('cosine_similarity', ascending=False)

In [8]:
recommender_base = BaselineContentRecommender(df_grouped, k=100)
recommendations_base1 = recommender_base.recommend('1', top_n=10)  
print(recommendations_base1[['movieId', 'title', 'cosine_similarity']])

      movieId            title  cosine_similarity
2874     3114      Toy Story 2           0.498092
12007   78499      Toy Story 3           0.417211
1722     1920   Small Soldiers           0.216866
2048     2253             Toys           0.186901
7112     7987            Dolls           0.180138
12339   83219  The Pixar Story           0.178043
1552     1707     Home Alone 3           0.163932
1793     1991     Child's Play           0.151260
9645    46948    Monster House           0.144818
1795     1993   Child's Play 3           0.143198


In [9]:
recommendations_base2 = recommender_base.recommend('2', top_n=10)  
print(recommendations_base2[['movieId', 'title', 'cosine_similarity']])

      movieId                             title  cosine_similarity
8558    27884                         Word Wars           0.246909
5855     6304                         Brainscan           0.164207
9503    44731                        Stay Alive           0.144236
14410  113889  Angry Video Game Nerd: The Movie           0.143081
9366    42725                     Grandma's Boy           0.142472
13344   97913                    Wreck-It Ralph           0.139435
7422     8633              The Last Starfighter           0.132591
15323  139847                         Chevalier           0.131599
8295    26985                           Nirvana           0.130764
14499  115534                             Ouija           0.127464


## Advanced Model 

Vectorized combined_text, which includes genres, keywords, and other descriptive elements, captures the thematic essence of films. This feature uses TF-IDF vectors to emphasize unique descriptors, allowing for the recommendation of movies with similar thematic and stylistic content. This is critical for a content-based system that relies on content similarities. This feature, as already used for the baseline-model, is critical and is shaping the foundation of our modeling approach. However, in order to advance the model we are increasing complexity by adding additonal features. 

#### Feature Engineering
Feature engineering is an important step in the development of machine learning models, including recommender systems, because it involves extracting meaningful variables from raw data to improve model performance and accuracy. This process transforms complex and often unstructured information into structured, analytically useful formats, allowing models to uncover previously unknown patterns, relationships, and insights. In the context of developing a movie recommendation system, effective feature engineering ensures that the nuances of movie content, user preferences, and contextual factors are accurately captured and used. By carefully selecting, combining, and transforming data into features such as weighted scores, combined textual data, and sentiment analysis, developers can significantly improve the system's ability to provide personalized, relevant, and appealing movie recommendations. This not only improves user satisfaction and engagement, but it also strengthens the business case by increasing platform usage and retention.

#### Weighted Score 

In [10]:
movie_stats = df_ratings.groupby('movieId').agg(average_rating=('rating', 'mean'), rating_count=('rating', 'count')).reset_index()

C = movie_stats['average_rating'].mean()
m = movie_stats['rating_count'].quantile(0.90)

def weighted_rating(x, m=m, C=C):
    v = x['rating_count']  
    R = x['average_rating'] 
    return (v/(v+m) * R) + (m/(m+v) * C)

movie_stats['weighted_score'] = movie_stats.apply(weighted_rating, axis=1)

df_ratings = df_ratings.merge(movie_stats[['movieId', 'weighted_score']], on='movieId', how='left')

df_ratings = df_ratings.merge(movie_stats[['movieId', 'average_rating', 'rating_count']], on='movieId', how='left')

print(df_ratings.head())

   userId  movieId  rating           timestamp  user_mean_rating  \
0       1      110     1.0 2015-03-09 22:52:09          4.277778   
1       1      147     4.5 2015-03-09 23:07:15          4.277778   
2       1      858     5.0 2015-03-09 22:52:03          4.277778   
3       1     1221     5.0 2015-03-09 22:52:26          4.277778   
4       1     1246     5.0 2015-03-09 22:52:36          4.277778   

   liked_by_user  weighted_score  average_rating  rating_count  
0          False        4.000352        4.010725         62332  
1           True        3.513227        3.581926          4559  
2           True        4.319932        4.336495         52237  
3           True        4.238059        4.261745         34163  
4           True        3.888786        3.911582         25012  


The weighted score combines a movie's average rating (vote_average) and the number of ratings (vote_count) it has received to provide a balanced metric that reflects both popularity and quality. This approach mitigates the bias towards movies with a high average rating but a low number of ratings, ensuring that the recommendations are not only high-quality but also broadly appreciated. For a movie recommender system, integrating the weighted score helps prioritize movies that have proven appeal, aligning recommendations with broader viewer satisfaction.

#### Movie Age

In [11]:
current_year = datetime.datetime.now().year

df_merged['movie_age'] = current_year - pd.to_datetime(df_merged['release_date']).dt.year

Calculating the movie_age from the release date provides insight into the recency and potential cultural relevance of a movie. In the context of a movie recommender system, this allows for temporal filtering and trend analysis, enabling recommendations that cater to preferences for newer releases or classic films. Understanding movie age is essential for aligning recommendations with temporal viewing trends and user preferences for contemporary versus classic cinema.

#### Sentiment Analysis of Overview

In [12]:
def get_sentiment(text):
    try:
        return TextBlob(text).sentiment.polarity
    except:
        return None 

df_merged['sentiment_polarity'] = df_merged['overview'].apply(get_sentiment)

Performing sentiment analysis on movie descriptions yields a sentiment_polarity score, offering a nuanced view of the emotional tone or mood conveyed by the movie's narrative. This feature is particularly important for recommending movies that match a user’s emotional preferences or current mood, adding an additional layer of personalization. By integrating sentiment analysis, your recommender system can differentiate movies not just by genre or content but also by the emotional experience they offer, enhancing user satisfaction and engagement.

#### Adding new features to df_grouped

In [13]:
df_ratings_aggregated = df_ratings.groupby('movieId', as_index=False).agg({
    'weighted_score': 'mean'  
})

df_merged_aggregated = df_merged.groupby('movieId', as_index=False).agg({
    'movie_age': 'mean', 
    'sentiment_polarity': 'mean'
})

We combine df_ratings and df_merged to simplify our dataset, ensuring that each movieId is represented by a single set of features. By averaging weighted_score, movie_age, and sentiment_polarity, we capture each film's overall essence, reflecting collective attributes and sentiments. This preprocessing step converts our data into a unified df_grouped format, with each movie listed uniquely, simplifying subsequent analyses and modeling efforts. This approach not only consolidates our dataset to improve efficiency, but it also aligns with our goal of building a cohesive and analytically robust foundation for our recommendation system.


In [14]:
df_grouped = df_grouped.merge(df_ratings_aggregated, on='movieId', how='left')
df_grouped = df_grouped.merge(df_merged_aggregated, on='movieId', how='left')

In [15]:
df_grouped

Unnamed: 0,movieId,title,combined_text,weighted_score,movie_age,sentiment_polarity
0,1,Toy Story,animation comedy family tom hanks tim allen do...,3.884349,29.0,0.112121
1,2,Jumanji,adventure fantasy family robin williams jonath...,3.227394,29.0,-0.218750
2,3,Grumpier Old Men,romance comedy walter matthau jack lemmon ann-...,3.141024,29.0,0.038889
3,4,Waiting to Exhale,comedy drama romance whitney houston angela ba...,2.895332,29.0,0.600000
4,5,Father of the Bride Part II,comedy steve martin diane keaton martin short ...,3.061166,29.0,0.466667
...,...,...,...,...,...,...
16122,173941,Atomic Blonde,action thriller charlize theron james mcavoy s...,3.067902,7.0,-0.266667
16123,174053,Black Mirror: White Christmas,drama horror mystery science fiction thriller ...,3.149876,10.0,0.067143
16124,174055,Dunkirk,action drama history thriller war fionn whiteh...,3.344268,7.0,0.000000
16125,174371,Once Upon a Time in Venice,action comedy thriller bruce willis jason momo...,3.026085,7.0,0.075000


When developing a content-based recommender system, the selection of features is critical to its success. The selected features - weighted_score, vectorized combined_text, movie_age, runtime, and sentiment_polarity - are critical for capturing the multifaceted nature of films and their reception by audiences.

The weighted_score is critical for determining a movie's appeal, as it combines the average rating with the number of ratings to provide a balanced picture of its popularity and acceptance. This feature helps to reduce biases toward movies with less ratings, ensuring that recommendations are not only popular but also well-regarded.

Movie_age incorporate personal preference and temporal relevance into the recommendation process.This feature allows the system to align recommendations with users' inclinations towards newer releases or classic films. These features provide additional layers of personalization, increasing user satisfaction by accommodating individual preferences for movie duration and novelty.

Finally, sentiment_polarity provides information about the emotional tone of movie descriptions or reviews. This feature allows the system to recommend movies that match not only in content but also in mood, providing a more nuanced approach to similarity that goes beyond simple thematic alignment.

Together, these features form a strong foundation for an item-based recommender system. By taking into account both content and key characteristics that influence viewer preferences, the system is better able to provide precise and satisfying movie recommendations.

In [18]:
class AdvancedContentRecommender:
    def __init__(self, movies_df, k=100):
        self.movies_df = movies_df.copy()
        self.movies_df['movieId'] = self.movies_df['movieId'].astype(str)
        self.k = k
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.movies_df['combined_text'])
        self.similarity_matrix = cosine_similarity(self.tfidf_matrix)

    def recommend(self, movie_id, top_n=10):
        movie_id = str(movie_id)
        if movie_id not in self.movies_df['movieId'].values:
            print(f"Movie ID {movie_id} not found in the dataset.")
            return pd.DataFrame()

        movie_index = self.movies_df.index[self.movies_df['movieId'] == movie_id].tolist()[0]
        similarity_scores = self.similarity_matrix[movie_index]
        top_k_indices = np.argsort(similarity_scores)[::-1][1:self.k+1]
        top_k_df = self.movies_df.iloc[top_k_indices].copy()
        
        scaler = MinMaxScaler()
        for feature in ['weighted_score', 'movie_age']:
            if feature in top_k_df:
                top_k_df[feature] = scaler.fit_transform(top_k_df[[feature]].values.reshape(-1, 1))

        top_k_df['cosine_similarity'] = similarity_scores[top_k_indices]
        top_k_df['combined_score'] = (
            0.5 * top_k_df['cosine_similarity'] +
            0.2 * top_k_df['weighted_score'] + 
            0.2 * top_k_df['sentiment_polarity'] +
            0.1 * top_k_df['movie_age']
        )
        
        return top_k_df.nlargest(top_n, 'combined_score')[['movieId', 'title', 'combined_score']]

In [34]:
recommender_advanced = AdvancedContentRecommender(df_grouped, k=100)
recommendations_advanced1 = recommender_advanced.recommend('1', top_n=10)  
print(recommendations_advanced1[['movieId', 'title', 'combined_score']])

      movieId                  title  combined_score
2874     3114            Toy Story 2        0.515295
12007   78499            Toy Story 3        0.378643
5555     5974    The Thief of Bagdad        0.355765
1130     1230             Annie Hall        0.343074
7112     7987                  Dolls        0.323468
1143     1244              Manhattan        0.315088
4019     4339     Von Ryan's Express        0.314689
2142     2355           A Bug's Life        0.313347
576       596              Pinocchio        0.308571
1024     1103  Rebel Without a Cause        0.302987


In [35]:
recommendations_advanced2 = recommender_advanced.recommend('2', top_n=10)  
print(recommendations_advanced2[['movieId', 'title', 'combined_score']])

      movieId                                              title  \
1100     1197                                 The Princess Bride   
5788     6232                                          Born Free   
10217   54259                                           Stardust   
1962     2161                              The NeverEnding Story   
2865     3105                                         Awakenings   
9304    41566  The Chronicles of Narnia: The Lion, the Witch ...   
12067   79318                                      Winnebago Man   
3901     4210                                          Manhunter   
7422     8633                               The Last Starfighter   
4768     5126                                  The Deadly Mantis   

       combined_score  
1100         0.394854  
5788         0.308018  
10217        0.303716  
1962         0.300134  
2865         0.278083  
9304         0.267593  
12067        0.250902  
3901         0.249790  
7422         0.243689  
4768       

The advanced model extends the baseline content recommender by including additional movie attributes—weighted_score, sentiment_polarity, and movie_age—as well as feature scaling for weighted_score and movie_age. This evolution enables the model to consider not only thematic content similarity, but also movie quality, viewer sentiment, and timeliness. By combining these various factors, the advanced model hopes to provide more nuanced and personalized recommendations. The strategic inclusion and scaling of these features improves the model's ability to better align recommendations with individual user preferences, potentially improving recommendation accuracy and user satisfaction over the baseline model.

## Evalutation of content-based models 

#### Sampling

In this scenario, the sampling technique used is to calculate a statistically significant sample size in order to estimate the proportion of movies rated 4.0 or higher in a dataset. This decision is based on a specific confidence level (95%) and margin of error (5%), with the goal of obtaining precise and reliable inferences about the population's characteristics from a sample of data. The method used employs a standard formula that includes the Z-score associated with the desired confidence level and the estimated proportion of interest, ensuring that the sample size is sufficient to accurately reflect the population. This technique is critical for designing studies or analyses that require accurate estimations of population parameters for decision-making or hypothesis testing, as it minimizes potential biases and errors caused by small or arbitrarily chosen sample sizes. By rigorously determining the required sample size, the approach improves the credibility and validity of the findings derived from the sample data, making it a cornerstone of statistical analysis and research methodologies.

In [20]:
def calculate_sample_size(confidence_level, margin_of_error, proportion):
    z_score = abs(scipy.stats.norm.ppf((1 - confidence_level) / 2))
    sample_size = math.ceil((z_score ** 2 * proportion * (1 - proportion)) / (margin_of_error ** 2))
    return sample_size

confidence_level = 0.95
margin_of_error = 0.05

proportion_higher_ratings = df_ratings[df_ratings['rating'] >= 4.0].shape[0] / df_ratings.shape[0]
required_sample_size = calculate_sample_size(confidence_level, margin_of_error, proportion_higher_ratings)
print(f"Required sample size: {required_sample_size}")

Required sample size: 385


In [21]:
sample_movie_ids = np.random.choice(df_grouped['movieId'].unique(), size=required_sample_size, replace=False)

In [24]:
subset_df_ratings = df_ratings[df_ratings['movieId'].isin(sample_movie_ids)]

#### Evaluation Function

In [30]:
def evaluate_movie(movie_id, df_ratings, recommender, top_n=10):
    """Evaluate a single movie for the recommender system, adjusted for actual user ratings."""
    recommendations = recommender.recommend(str(movie_id), top_n=top_n)
    if recommendations.empty:
        return np.array([]), None 

    recommended_ids = recommendations['movieId'].astype(str).tolist()
    
    matching_ratings = df_ratings[df_ratings['movieId'].astype(str).isin(recommended_ids)]
    
    hit_rate = (matching_ratings['rating'] >= 4.0).mean() if not matching_ratings.empty else None

    return np.array(matching_ratings['rating']), hit_rate

def evaluate_recommender(df_ratings, recommender, sample_movie_ids, top_n=10, threshold=4.0):
    """Evaluate the recommender system using sampled movie IDs, including adjusted hit rate."""
    all_ratings, hit_rates = [], []

    for movie_id in sample_movie_ids:
        movie_ratings, hit_rate = evaluate_movie(movie_id, df_ratings, recommender, top_n=top_n)
        if movie_ratings.size > 0:
            all_ratings.extend(movie_ratings)
        if hit_rate is not None:
            hit_rates.append(hit_rate)
    
    all_ratings = np.array(all_ratings)
    if len(all_ratings) > 0:
        mae = np.mean(np.abs(all_ratings - 5))
        mse = np.mean((all_ratings - 5) ** 2)
        rmse = np.sqrt(mse)
        precision = np.sum(all_ratings >= threshold) / len(all_ratings)
    else:
        mae, mse, rmse, precision = 0, 0, 0, 0

    avg_hit_rate = np.mean(hit_rates) if hit_rates else None  

    print(f"Sample Size: {len(sample_movie_ids)}")
    print(f"MAE: {mae:.4f}\nMSE: {mse:.4f}\nRMSE: {rmse:.4f}\nPrecision: {precision:.4f}\nAverage Hit Rate: {avg_hit_rate if avg_hit_rate is not None else 'N/A'}")

    return mae, mse, rmse, precision, avg_hit_rate


#### Evaluation of Baseline-Model

In [33]:
mae, mse, rmse, precision, avg_hit_rate = evaluate_recommender(df_ratings, recommender_base, sample_movie_ids, top_n=10, threshold=4.0)

Sample Size: 385
MAE: 1.4382
MSE: 3.1930
RMSE: 1.7869
Precision: 0.5136
Average Hit Rate: 0.46579011848169105


#### Evaluation of Advanced-Model

In [36]:
mae, mse, rmse, precision, avg_hit_rate = evaluate_recommender(df_ratings, recommender_advanced, sample_movie_ids, top_n=10, threshold=4.0)

Sample Size: 385
MAE: 1.1128
MSE: 2.1326
RMSE: 1.4603
Precision: 0.6535
Average Hit Rate: 0.6382666966643713


The advanced model outperforms the baseline model on all metrics. It has lower MAE, MSE, and RMSE values, implying that its recommendations are, on average, closer to ideal ratings and less prone to large errors. Its higher precision and average hit rate indicate that it is more effective at recommending movies that users are likely to rate highly (4.0 or higher), demonstrating a better understanding and matching of user preferences. In short, depending on the evaluation method used, the advanced model provides more accurate and user-aligned recommendations than the baseline model, making it the better option for increasing user satisfaction with the recommender system.

# Collaborative Filtering Model

### Setting Rating Thresholds

In [None]:
ratings_per_user = df_ratings["userId"].value_counts()
ratings_per_movie = df_ratings["movieId"].value_counts()

print(ratings_per_user.describe())
print(ratings_per_movie.describe())

count    166444.000000
mean        149.288073
std         248.021364
min          20.000000
25%          35.000000
50%          69.000000
75%         158.000000
max       18276.000000
Name: count, dtype: float64
count    45028.000000
mean       551.836724
std       2869.798512
min          1.000000
25%          2.000000
50%          8.000000
75%         68.000000
max      82895.000000
Name: count, dtype: float64


In [None]:
ratings_count_per_movie = df_ratings.groupby('movieId').size()
movies_with_enough_ratings = ratings_count_per_movie[ratings_count_per_movie >= 20].index
prelim_df_ratings_filtered = df_ratings[df_ratings['movieId'].isin(movies_with_enough_ratings)]

ratings_count_per_user = prelim_df_ratings_filtered.groupby('userId').size()
users_with_enough_ratings = ratings_count_per_user[ratings_count_per_user >= 20].index
df_ratings_filtered_final = prelim_df_ratings_filtered[prelim_df_ratings_filtered['userId'].isin(users_with_enough_ratings)]

final_ratings_count_per_movie = df_ratings_filtered_final.groupby('movieId').size()
final_movies_with_enough_ratings = final_ratings_count_per_movie[final_ratings_count_per_movie >= 20].index
df_ratings_filtered_final = df_ratings_filtered_final[df_ratings_filtered_final['movieId'].isin(final_movies_with_enough_ratings)]

print(df_ratings_filtered_final.groupby('userId').size().describe())
print(df_ratings_filtered_final.groupby('movieId').size().describe())

count    166317.000000
mean        148.571679
std         239.911174
min          20.000000
25%          35.000000
50%          69.000000
75%         158.000000
max        9503.000000
dtype: float64
count    16705.000000
mean      1479.197606
std       4563.793861
min         20.000000
25%         48.000000
50%        163.000000
75%        794.000000
max      82877.000000
dtype: float64


In [27]:
df_ratings_subset = df_ratings_filtered_final.sample(frac=0.05, random_state=42)
df_ratings_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1235500 entries, 14910339 to 23314789
Data columns (total 6 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   userId            1235500 non-null  int64         
 1   movieId           1235500 non-null  int64         
 2   rating            1235500 non-null  Float64       
 3   timestamp         1235500 non-null  datetime64[ns]
 4   user_mean_rating  1235500 non-null  Float64       
 5   liked_by_user     1235500 non-null  boolean       
dtypes: Float64(2), boolean(1), datetime64[ns](1), int64(2)
memory usage: 61.3 MB


In [28]:
reader = Reader(rating_scale=(0.5, 5))

# Prepare the data for Surprise
data = Dataset.load_from_df(df_ratings_subset[['userId', 'movieId', 'rating']], reader)

# Initialize the SVD algorithm
svd = SVD()

# Perform cross-validation
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9130  0.9133  0.9135  0.9132  0.9129  0.9132  0.0002  
MAE (testset)     0.7031  0.7028  0.7036  0.7036  0.7034  0.7033  0.0003  
Fit time          9.91    9.58    10.30   9.77    9.89    9.89    0.24    
Test time         1.11    1.07    1.12    1.41    1.33    1.21    0.14    


{'test_rmse': array([0.91303477, 0.91332888, 0.91352622, 0.91320278, 0.91292555]),
 'test_mae': array([0.70313489, 0.7028378 , 0.70359434, 0.70358235, 0.70335566]),
 'fit_time': (9.909661293029785,
  9.575968027114868,
  10.30473780632019,
  9.768281936645508,
  9.891952991485596),
 'test_time': (1.1085481643676758,
  1.0685420036315918,
  1.11647629737854,
  1.4074289798736572,
  1.3325819969177246)}

In [24]:
# TODO GILIAN: Matrix geben lassen:


We get a mean Root Mean Sqaure Error of 0.95 approx which is more than good enough for our case. Let us now train on our dataset and arrive at predictions.

In [29]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x3ae7a0190>

In [38]:
svd.predict(2, 150)

Prediction(uid=2, iid=150, r_ui=None, est=4.232214969794782, details={'was_impossible': False})

### Grid Search CV

In [72]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV

param_grid = {
    'n_factors': [50, 100],  # Number of factors
    'n_epochs': [20],        # Number of iterations
    'lr_all': [0.005, 0.01], # Learning rate
    'reg_all': [0.02, 0.05]  # Regularization term
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)

gs.fit(data)

print("Best RMSE score obtained: ", gs.best_score['rmse'])
print("Best parameters: ", gs.best_params['rmse'])

optimized_svd = gs.best_estimator['rmse']

Best RMSE score obtained:  0.9041399444694408
Best parameters:  {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.05}


-> intializing optimized model

In [84]:
optimized_svd = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.05)

reader = Reader(rating_scale=(0.5, 5))  
data = Dataset.load_from_df(df_ratings_subset[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

In [85]:
optimized_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x581b81290>

In [86]:
prediction = optimized_svd.predict(uid=2, iid=3114)
print(prediction.est)

3.973394488110365


# Hybrid Model

In [87]:
def hybrid_model(movie_id, user_id, top_n, svd_model):
    # Step 1: Get top N content-based recommendations
    content_recs = recommender_base.recommend(movie_id, top_n)

    content_recs['movieId'] = content_recs['movieId'].astype(int)

    # Step 2: Apply SVD to predict ratings for the top N movies
    content_recs['predicted_rating'] = content_recs['movieId'].apply(
        lambda x: svd_model.predict(user_id, x).est
    )
    
    # Step 3: Sort recommendations by predicted ratings, then by cosine similarity
    final_recs = content_recs.sort_values(
        by=['predicted_rating', 'cosine_similarity'], ascending=[False, False]
    )
    
    final_recs = final_recs[['movieId', 'title', 'cosine_similarity', 'predicted_rating']]

    return final_recs

In [88]:
hybrid_model(1, 1, 10, optimized_svd)

Unnamed: 0,movieId,title,cosine_similarity,predicted_rating
12007,78499,Toy Story 3,0.417211,4.015826
2874,3114,Toy Story 2,0.498092,3.799826
12339,83219,The Pixar Story,0.178043,3.752612
7112,7987,Dolls,0.180138,3.623393
9645,46948,Monster House,0.144818,3.47235
1722,1920,Small Soldiers,0.216866,2.87554
1793,1991,Child's Play,0.15126,2.797392
2048,2253,Toys,0.186901,2.783716
1795,1993,Child's Play 3,0.143198,2.366231
1552,1707,Home Alone 3,0.163932,2.00895
