# Content-based model

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import warnings
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
warnings.filterwarnings("ignore")

In [2]:
# Load the movie metadata and the ratings table from local pickle files.
# NOTE(review): unpickling can execute arbitrary code, so these files must only
# come from a trusted source (presumably this project's own cleaning step — confirm).
df_merged = pd.read_pickle('data/df_movies_cleaned.pkl')
df_ratings = pd.read_pickle('data/df_ratings_cleaned.pkl')

### Setting Rating Thresholds

In [3]:
# Number of ratings contributed by each user and received by each movie.
ratings_per_user = df_ratings["userId"].value_counts()
ratings_per_movie = df_ratings["movieId"].value_counts()

# Summary statistics of both distributions, users first, then movies.
for rating_counts in (ratings_per_user, ratings_per_movie):
    print(rating_counts.describe())

count    166444.000000
mean        149.288073
std         248.021364
min          20.000000
25%          35.000000
50%          69.000000
75%         158.000000
max       18276.000000
Name: count, dtype: float64
count    45028.000000
mean       551.836724
std       2869.798512
min          1.000000
25%          2.000000
50%          8.000000
75%         68.000000
max      82895.000000
Name: count, dtype: float64


In [4]:
# Minimum number of ratings every retained movie AND every retained user must have.
MIN_RATINGS = 20

# Dropping sparse movies can push some users below the threshold, and dropping
# those users can in turn push movies below it again. A fixed number of passes
# therefore does not guarantee both constraints; repeat the movie filter and the
# user filter until a complete pass removes no rows.
df_ratings_filtered_final = df_ratings
while True:
    rows_before = len(df_ratings_filtered_final)
    for id_column in ('movieId', 'userId'):
        # Per-row count of how many ratings share this row's movie/user ID.
        ratings_per_id = df_ratings_filtered_final.groupby(id_column)[id_column].transform('size')
        df_ratings_filtered_final = df_ratings_filtered_final[ratings_per_id >= MIN_RATINGS]
    if len(df_ratings_filtered_final) == rows_before:
        break

# Sanity check: the minimum of both distributions should now be >= MIN_RATINGS.
print(df_ratings_filtered_final.groupby('userId').size().describe())
print(df_ratings_filtered_final.groupby('movieId').size().describe())

count    166317.000000
mean        148.571679
std         239.911174
min          20.000000
25%          35.000000
50%          69.000000
75%         158.000000
max        9503.000000
dtype: float64
count    16705.000000
mean      1479.197606
std       4563.793861
min         20.000000
25%         48.000000
50%        163.000000
75%        794.000000
max      82877.000000
dtype: float64


#### Textual Feature - Combined Text

In [5]:
def _build_combined_text(row):
    """Join one movie's genres, actors, keywords, overview and production companies into one lowercase string."""
    text_parts = [
        ' '.join(row['genre_extracted']),
        ' '.join(row['actors']),
        ' '.join(row['keywords_extracted']),
        row['overview'],
        ' '.join(row['production_company_extracted']),
    ]
    return ' '.join(text_parts).lower()


# One combined descriptor per movie row; this is the text that gets vectorised later.
df_merged['combined_text'] = df_merged.apply(_build_combined_text, axis=1)

The combined_text feature aggregates critical textual metadata from genres, actors, keywords, movie descriptions, and production companies into a single comprehensive descriptor for each movie. This aggregation captures the essence of a movie’s content, thematic elements, and appeal, which is crucial for content-based filtering. By synthesizing this information, the recommender system can identify and suggest movies with similar thematic and content attributes, enhancing personalization and user engagement.

## Modeling Preprocessing

#### Combining df_ratings and df_merged

In [6]:
# Attach each rating to its movie's metadata; ratings whose movieId has no
# metadata row (and movies without ratings) are dropped by the inner join.
df_combined = df_ratings.merge(df_merged, how='inner', on='movieId')

#### Setting Rating Threshold

The decision to set a threshold of 20 ratings for each movie before including it in the item-based recommender system is strategic, with the goal of ensuring the reliability and validity of the generated recommendations. This threshold acts as a quality control measure, weeding out movies with sparse feedback that could otherwise result in skewed or less confident recommendations due to insufficient user data. By setting this minimum, the system focuses on movies with a high level of viewer engagement, allowing recommendations to be built on a solid foundation of user feedback. This approach improves the system's ability to deliver accurate, trustworthy recommendations based on broad consensus rather than outliers or minimal feedback, resulting in a better user experience and increased overall credibility for the recommender system.

In [7]:
# Count the ratings each movie has in the merged frame.
ratings_per_movie = df_combined.groupby('movieId').size()

# IDs of the movies that reach the 20-rating minimum.
movies_with_enough_ratings = ratings_per_movie.loc[ratings_per_movie.ge(20)].index

# Keep only the rating rows that belong to those movies.
has_enough_ratings = df_combined['movieId'].isin(movies_with_enough_ratings)
df_item_modeling = df_combined.loc[has_enough_ratings]

print(f"Original dataset size: {df_combined.shape}")
print(f"Filtered dataset size: {df_item_modeling.shape}")

Original dataset size: (24669326, 19)
Filtered dataset size: (24548423, 19)


With the filtered dataset, df_item_modeling, now comprising 24,548,423 rows out of the original 24,669,326, it's evident that the vast majority of the ratings belong to movies that meet the threshold of at least 20 ratings. This minimal reduction in dataset size (about 0.5% of the rows) shows that the movies below the threshold account for only a small share of all ratings, so very little user feedback is lost by excluding them.

##### Grouping Movies

In [8]:
# Collapse the rating-level frame to one row per movie, keeping the first
# title and combined_text seen for each movieId.
df_grouped = (
    df_item_modeling
    .groupby('movieId', as_index=False)
    .agg(
        title=('title', 'first'),
        combined_text=('combined_text', 'first'),
    )
)

In [9]:
df_grouped

Unnamed: 0,movieId,title,combined_text
0,1,Toy Story,animation comedy family tom hanks tim allen do...
1,2,Jumanji,adventure fantasy family robin williams jonath...
2,3,Grumpier Old Men,romance comedy walter matthau jack lemmon ann-...
3,4,Waiting to Exhale,comedy drama romance whitney houston angela ba...
4,5,Father of the Bride Part II,comedy steve martin diane keaton martin short ...
...,...,...,...
16122,173941,Atomic Blonde,action thriller charlize theron james mcavoy s...
16123,174053,Black Mirror: White Christmas,drama horror mystery science fiction thriller ...
16124,174055,Dunkirk,action drama history thriller war fionn whiteh...
16125,174371,Once Upon a Time in Venice,action comedy thriller bruce willis jason momo...


### Content-Based Filtering

#### Vectorizing 'combined_text' feature

In [10]:
# TF-IDF over each movie's combined text: English stop words are removed and
# the vocabulary is limited to 10000 features. Row i of tfidf_matrix
# corresponds to row i of df_grouped.
combined_text_corpus = df_grouped['combined_text']
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_text_corpus)

Vectorizing the combined_text using TF-IDF transforms qualitative textual information into quantitative vectors, facilitating the measurement of content similarity between movies. This numerical representation allows for sophisticated algorithms to compute similarities based on thematic elements, narrative structures, and genre affiliations. For our movie recommender system, this means being able to recommend movies that are contextually and thematically aligned with a user’s preferences, enhancing the discovery of relevant and appealing content.

### Baseline Model

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

class SimplifiedContentRecommender:
    """Content-based recommender ranking movies by cosine similarity of their feature vectors.

    Lookups are positional: row ``i`` of ``tfidf_matrix`` must describe the movie
    in row ``i`` of ``movies_df``.
    """

    def __init__(self, movies_df, tfidf_matrix, k=100):
        """Index the movies and precompute all pairwise similarities.

        Args:
            movies_df: DataFrame with a ``movieId`` column, one row per movie.
            tfidf_matrix: Feature matrix with one row per row of ``movies_df``.
            k: Not used by the ranking logic; stored only so that existing
                callers passing it keep working.
        """
        self.movies_df = movies_df.copy()
        # IDs are normalised to strings so that 1 and '1' address the same movie.
        self.movies_df['movieId'] = self.movies_df['movieId'].astype(str)
        self.movie_id_to_index = {movie_id: i for i, movie_id in enumerate(self.movies_df['movieId'])}
        self.tfidf_matrix = tfidf_matrix
        self.k = k
        # Full (n_movies x n_movies) similarity matrix; memory grows quadratically
        # with the number of movies.
        self.similarity_matrix = cosine_similarity(self.tfidf_matrix)

    def recommend(self, movie_id, top_n=10):
        """Return the ``top_n`` movies most similar to ``movie_id``.

        Args:
            movie_id: ID of the query movie (int or str; compared as str).
            top_n: Maximum number of recommendations to return.

        Returns:
            DataFrame with the columns of ``movies_df`` plus ``cosine_similarity``,
            sorted by descending similarity. When the ID is unknown, a message is
            printed and an empty DataFrame with those same columns is returned, so
            callers can rely on ``.empty`` and column access in every case.
        """
        movie_id = str(movie_id)
        if movie_id not in self.movie_id_to_index:
            print(f"Movie ID {movie_id} not found in the dataset.")
            return pd.DataFrame(columns=[*self.movies_df.columns, 'cosine_similarity'])

        movie_index = self.movie_id_to_index[movie_id]
        similarity_scores = self.similarity_matrix[movie_index]
        ranked_indices = np.argsort(similarity_scores)[::-1]
        # Exclude the query movie explicitly instead of assuming it is ranked
        # first: with an all-zero vector or duplicated texts it may not be, and
        # slicing off position 0 would then drop a real recommendation and
        # return the query movie itself.
        top_k_indices = ranked_indices[ranked_indices != movie_index][:top_n]
        recommendations = self.movies_df.iloc[top_k_indices].copy()
        recommendations['cosine_similarity'] = similarity_scores[top_k_indices]

        return recommendations.sort_values('cosine_similarity', ascending=False)

In [12]:
# Build the baseline content recommender over the per-movie frame and its TF-IDF matrix.
recommender_base = SimplifiedContentRecommender(df_grouped, tfidf_matrix, k=100)
# Smoke test: the ten movies most similar to movieId '1'.
recommendations_base = recommender_base.recommend('1', top_n=10)  
print(recommendations_base[['movieId', 'title', 'cosine_similarity']])

      movieId            title  cosine_similarity
2874     3114      Toy Story 2           0.498092
12007   78499      Toy Story 3           0.417211
1722     1920   Small Soldiers           0.216866
2048     2253             Toys           0.186901
7112     7987            Dolls           0.180138
12339   83219  The Pixar Story           0.178043
1552     1707     Home Alone 3           0.163932
1793     1991     Child's Play           0.151260
9645    46948    Monster House           0.144818
1795     1993   Child's Play 3           0.143198


### Sampling

In this scenario, the sampling technique used is to calculate a statistically significant sample size in order to estimate the proportion of ratings that are 4.0 or higher in a dataset. This decision is based on a specific confidence level (95%) and margin of error (5%), with the goal of obtaining precise and reliable inferences about the population's characteristics from a sample of data. The method used employs a standard formula that includes the Z-score associated with the desired confidence level and the estimated proportion of interest, ensuring that the sample size is sufficient to accurately reflect the population. This technique is critical for designing studies or analyses that require accurate estimations of population parameters for decision-making or hypothesis testing, as it minimizes potential biases and errors caused by small or arbitrarily chosen sample sizes. By rigorously determining the required sample size, the approach improves the credibility and validity of the findings derived from the sample data, making it a cornerstone of statistical analysis and research methodologies.

In [13]:
import scipy.stats
import math

def calculate_sample_size(confidence_level, margin_of_error, proportion):
    """Return the sample size needed to estimate a proportion.

    Computes ceil(z^2 * p * (1 - p) / e^2), where z is the two-sided normal
    critical value for ``confidence_level``, p is ``proportion`` and e is
    ``margin_of_error``.
    """
    # Probability mass in one tail of the two-sided interval.
    tail_probability = (1 - confidence_level) / 2
    z_score = abs(scipy.stats.norm.ppf(tail_probability))
    numerator = z_score ** 2 * proportion * (1 - proportion)
    return math.ceil(numerator / (margin_of_error ** 2))

confidence_level = 0.95
margin_of_error = 0.05

# Share of all rating rows whose rating is 4.0 or higher.
high_rating_count = df_ratings[df_ratings['rating'] >= 4.0].shape[0]
total_rating_count = df_ratings.shape[0]
proportion_higher_ratings = high_rating_count / total_rating_count

required_sample_size = calculate_sample_size(confidence_level, margin_of_error, proportion_higher_ratings)
print(f"Required sample size: {required_sample_size}")

Required sample size: 385


In [14]:
# Seeded generator so the evaluation sample, and every metric computed from it,
# is the same on each Restart & Run All.
sample_rng = np.random.default_rng(42)
candidate_movie_ids = df_grouped['movieId'].unique()
sample_movie_ids = sample_rng.choice(
    candidate_movie_ids,
    # Sampling without replacement cannot draw more IDs than exist.
    size=min(required_sample_size, len(candidate_movie_ids)),
    replace=False,
)

## Evaluation Function

In [74]:
def evaluate_movie(movie_id, df_ratings, recommender, top_n=10, threshold=4.0):
    """Collect the observed ratings of the movies recommended for one query movie.

    Args:
        movie_id: ID of the query movie; passed to the recommender as a string.
        df_ratings: DataFrame with ``movieId`` and ``rating`` columns.
        recommender: Object whose ``recommend(movie_id, top_n=...)`` returns a
            DataFrame with a ``movieId`` column (an empty DataFrame, an empty
            list or None all count as "no recommendations").
        top_n: Number of recommendations to request.
        threshold: Minimum rating that counts as a hit.

    Returns:
        Tuple ``(ratings, hit_rate)``. ``ratings`` is a float array of every
        non-missing rating, from any user, given to any recommended movie.
        ``hit_rate`` is the fraction of those ratings that are >= ``threshold``,
        or None when there are no such ratings.
    """
    recommendations = recommender.recommend(str(movie_id), top_n=top_n)
    # len() handles both an empty DataFrame and the empty list some recommenders return.
    if recommendations is None or len(recommendations) == 0:
        return np.array([]), None  # Use None to indicate no data for calculation

    recommended_ids = recommendations['movieId'].astype(str).tolist()

    # IDs are compared as strings, but only the distinct IDs are stringified
    # rather than the whole ratings column on every call.
    rated_movie_ids = df_ratings['movieId']
    distinct_ids = pd.Series(rated_movie_ids.unique())
    matched_ids = distinct_ids[distinct_ids.astype(str).isin(recommended_ids)]
    matching_ratings = df_ratings.loc[rated_movie_ids.isin(matched_ids), 'rating'].dropna()

    if matching_ratings.empty:
        return np.array([]), None

    # Explicit float conversion so nullable dtypes do not become object arrays.
    ratings = matching_ratings.to_numpy(dtype=float)
    hit_rate = float(np.mean(ratings >= threshold))

    return ratings, hit_rate


def evaluate_recommender(df_ratings, recommender, sample_movie_ids, top_n=10, threshold=4.0):
    """Evaluate the recommender over a sample of query movies and print a summary.

    For every sampled movie the observed ratings of its recommendations are
    pooled. MAE, MSE and RMSE measure the distance of those ratings from 5, i.e.
    each recommendation is treated as an implicit prediction of a rating of 5.
    Precision is the share of pooled ratings >= ``threshold``; the average hit
    rate is the mean of the per-movie hit rates using the same ``threshold``.

    Args:
        df_ratings: DataFrame with ``movieId`` and ``rating`` columns.
        recommender: Object exposing ``recommend(movie_id, top_n=...)``.
        sample_movie_ids: Iterable of query movie IDs.
        top_n: Number of recommendations requested per query movie.
        threshold: Minimum rating that counts as relevant / a hit.

    Returns:
        Tuple ``(mae, mse, rmse, precision, avg_hit_rate)``. When no ratings were
        found, the first four values are NaN (not 0, which would read as a
        perfect score) and ``avg_hit_rate`` is None.
    """
    ideal_rating = 5
    rating_chunks, hit_rates = [], []

    for movie_id in sample_movie_ids:
        movie_ratings, hit_rate = evaluate_movie(
            movie_id, df_ratings, recommender, top_n=top_n, threshold=threshold
        )
        if movie_ratings.size > 0:
            rating_chunks.append(movie_ratings)
        if hit_rate is not None:
            hit_rates.append(hit_rate)

    all_ratings = np.concatenate(rating_chunks) if rating_chunks else np.array([])

    if all_ratings.size > 0:
        errors = all_ratings - ideal_rating
        mae = np.mean(np.abs(errors))
        mse = np.mean(errors ** 2)
        rmse = np.sqrt(mse)
        precision = np.sum(all_ratings >= threshold) / len(all_ratings)
    else:
        mae = mse = rmse = precision = float('nan')

    avg_hit_rate = np.mean(hit_rates) if hit_rates else None  # None when no hit rates are available

    print(f"Sample Size: {len(sample_movie_ids)}")
    # avg_hit_rate may be None, so it is rendered as 'N/A' in that case.
    print(f"MAE: {mae:.4f}\nMSE: {mse:.4f}\nRMSE: {rmse:.4f}\nPrecision: {precision:.4f}\nAverage Hit Rate: {avg_hit_rate if avg_hit_rate is not None else 'N/A'}")

    return mae, mse, rmse, precision, avg_hit_rate


# Item-based Collaborative Filtering model

In [27]:
# Reproducible random 5% sample of the threshold-filtered ratings (fixed seed).
# NOTE(review): rows are sampled individually, so users and movies in this subset
# presumably no longer all have at least 20 ratings — confirm this is acceptable.
df_ratings_subset = df_ratings_filtered_final.sample(frac=0.05, random_state=42)
df_ratings_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1235500 entries, 14910339 to 23314789
Data columns (total 6 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   userId            1235500 non-null  int64         
 1   movieId           1235500 non-null  int64         
 2   rating            1235500 non-null  Float64       
 3   timestamp         1235500 non-null  datetime64[ns]
 4   user_mean_rating  1235500 non-null  Float64       
 5   liked_by_user     1235500 non-null  boolean       
dtypes: Float64(2), boolean(1), datetime64[ns](1), int64(2)
memory usage: 61.3 MB


In [28]:
# Rating scale passed to Surprise: ratings range from 0.5 to 5.
reader = Reader(rating_scale=(0.5, 5))

# Prepare the data for Surprise; the columns are passed in the order user, item, rating.
data = Dataset.load_from_df(df_ratings_subset[['userId', 'movieId', 'rating']], reader)

# Initialize the SVD algorithm with the library's default hyperparameters
svd = SVD()

# Perform 5-fold cross-validation, reporting RMSE and MAE per fold
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9130  0.9133  0.9135  0.9132  0.9129  0.9132  0.0002  
MAE (testset)     0.7031  0.7028  0.7036  0.7036  0.7034  0.7033  0.0003  
Fit time          9.91    9.58    10.30   9.77    9.89    9.89    0.24    
Test time         1.11    1.07    1.12    1.41    1.33    1.21    0.14    


{'test_rmse': array([0.91303477, 0.91332888, 0.91352622, 0.91320278, 0.91292555]),
 'test_mae': array([0.70313489, 0.7028378 , 0.70359434, 0.70358235, 0.70335566]),
 'fit_time': (9.909661293029785,
  9.575968027114868,
  10.30473780632019,
  9.768281936645508,
  9.891952991485596),
 'test_time': (1.1085481643676758,
  1.0685420036315918,
  1.11647629737854,
  1.4074289798736572,
  1.3325819969177246)}

In [24]:
# TODO GILIAN: have the matrix returned/output:


We get a mean Root Mean Square Error of approximately 0.91, which is more than good enough for our case. Let us now train on our dataset and arrive at predictions.

In [29]:
# Refit on every rating in `data` (no held-out fold) so the model can be used for predictions.
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x3ae7a0190>

In [38]:
svd.predict(2, 150)

Prediction(uid=2, iid=150, r_ui=None, est=4.232214969794782, details={'was_impossible': False})

### Grid Search CV

In [72]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV

# Hyperparameter grid for SVD: 2 x 1 x 2 x 2 = 8 combinations.
param_grid = {
    'n_factors': [50, 100],  # Number of factors
    'n_epochs': [20],        # Number of iterations
    'lr_all': [0.005, 0.01], # Learning rate
    'reg_all': [0.02, 0.05]  # Regularization term
}

# 3-fold cross-validation for every combination; n_jobs=-1 requests all available CPUs.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)

gs.fit(data)

print("Best RMSE score obtained: ", gs.best_score['rmse'])
print("Best parameters: ", gs.best_params['rmse'])

# NOTE(review): presumably an SVD instance configured with the best parameters
# that still has to be fitted before predicting — confirm against the Surprise docs.
optimized_svd = gs.best_estimator['rmse']

Best RMSE score obtained:  0.9041399444694408
Best parameters:  {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.05}


-> Initializing the optimized model

In [84]:
# Hyperparameters copied by hand from the grid-search output above, so this cell
# does not require the grid search to be re-run.
optimized_svd = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.05)

# Rebuild the Surprise dataset and a full training set from the 5% ratings subset.
reader = Reader(rating_scale=(0.5, 5))  
data = Dataset.load_from_df(df_ratings_subset[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

In [85]:
optimized_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x581b81290>

In [86]:
prediction = optimized_svd.predict(uid=2, iid=3114)
print(prediction.est)

3.973394488110365


# Hybrid Model

In [87]:
def hybrid_model(movie_id, user_id, top_n, svd_model, recommender=None):
    """Re-rank content-based recommendations by a user's predicted ratings.

    Args:
        movie_id: ID of the movie the content-based candidates are generated for.
        user_id: ID of the user the ratings are predicted for.
        top_n: Number of content-based candidates to request.
        svd_model: Fitted model whose ``predict(user_id, movie_id)`` returns an
            object with an ``est`` attribute.
        recommender: Content-based recommender exposing ``recommend(movie_id, top_n)``.
            Defaults to the notebook-level ``recommender_base``.

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``cosine_similarity`` and
        ``predicted_rating``, sorted by predicted rating and then by cosine
        similarity (both descending). It is empty when the recommender has no
        candidates, e.g. for an unknown ``movie_id``.
    """
    if recommender is None:
        recommender = recommender_base

    output_columns = ['movieId', 'title', 'cosine_similarity', 'predicted_rating']

    # Step 1: Get top N content-based recommendations
    content_recs = recommender.recommend(movie_id, top_n)
    # The recommender may return an empty DataFrame, an empty list or None.
    if content_recs is None or len(content_recs) == 0:
        return pd.DataFrame(columns=output_columns)

    # Work on a copy so the recommender's own result is never modified.
    content_recs = content_recs.copy()
    # The recommender returns string IDs; they are converted to integers before
    # being passed to the rating model.
    content_recs['movieId'] = content_recs['movieId'].astype(int)

    # Step 2: Apply SVD to predict ratings for the top N movies
    content_recs['predicted_rating'] = content_recs['movieId'].apply(
        lambda candidate_id: svd_model.predict(user_id, candidate_id).est
    )

    # Step 3: Sort recommendations by predicted ratings, then by cosine similarity
    final_recs = content_recs.sort_values(
        by=['predicted_rating', 'cosine_similarity'], ascending=[False, False]
    )

    return final_recs[output_columns]

In [88]:
hybrid_model(1, 1, 10, optimized_svd)

Unnamed: 0,movieId,title,cosine_similarity,predicted_rating
12007,78499,Toy Story 3,0.417211,4.015826
2874,3114,Toy Story 2,0.498092,3.799826
12339,83219,The Pixar Story,0.178043,3.752612
7112,7987,Dolls,0.180138,3.623393
9645,46948,Monster House,0.144818,3.47235
1722,1920,Small Soldiers,0.216866,2.87554
1793,1991,Child's Play,0.15126,2.797392
2048,2253,Toys,0.186901,2.783716
1795,1993,Child's Play 3,0.143198,2.366231
1552,1707,Home Alone 3,0.163932,2.00895
