# Content-based model

In [2]:
!pip install textblob

Collecting textblob
  Obtaining dependency information for textblob from https://files.pythonhosted.org/packages/02/07/5fd2945356dd839974d3a25de8a142dc37293c21315729a41e775b5f3569/textblob-0.18.0.post0-py3-none-any.whl.metadata
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
   --------------------- ------------------ 337.9/626.3 kB 7.1 MB/s eta 0:00:01
   ---------------------------------------- 626.3/626.3 kB 7.9 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.18.0.post0


In [2]:
!pip install surprise

Collecting surprise
  Obtaining dependency information for surprise from https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl.metadata
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp311-cp311-macosx_11_0_arm64.whl size=1091189 sha256=71a3e2ead1c476eb260af70a9defb319c8101f6495a4c31677a12ea39a5dc471
  Stored in directory: /Users/bene/Library/Caches/pip/

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import warnings
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
warnings.filterwarnings("ignore")

In [2]:
df_merged = pd.read_pickle('data/df_movies_cleaned.pkl')
df_ratings = pd.read_pickle('data/df_ratings_cleaned.pkl')

### Setting Rating Thresholds

In [3]:
ratings_per_user = df_ratings["userId"].value_counts()
ratings_per_movie = df_ratings["movieId"].value_counts()

print(ratings_per_user.describe())
print(ratings_per_movie.describe())

count    166444.000000
mean        149.288073
std         248.021364
min          20.000000
25%          35.000000
50%          69.000000
75%         158.000000
max       18276.000000
Name: count, dtype: float64
count    45028.000000
mean       551.836724
std       2869.798512
min          1.000000
25%          2.000000
50%          8.000000
75%         68.000000
max      82895.000000
Name: count, dtype: float64


In [4]:
ratings_count_per_movie = df_ratings.groupby('movieId').size()
movies_with_enough_ratings = ratings_count_per_movie[ratings_count_per_movie >= 20].index
prelim_df_ratings_filtered = df_ratings[df_ratings['movieId'].isin(movies_with_enough_ratings)]

ratings_count_per_user = prelim_df_ratings_filtered.groupby('userId').size()
users_with_enough_ratings = ratings_count_per_user[ratings_count_per_user >= 20].index
df_ratings_filtered_final = prelim_df_ratings_filtered[prelim_df_ratings_filtered['userId'].isin(users_with_enough_ratings)]

final_ratings_count_per_movie = df_ratings_filtered_final.groupby('movieId').size()
final_movies_with_enough_ratings = final_ratings_count_per_movie[final_ratings_count_per_movie >= 20].index
df_ratings_filtered_final = df_ratings_filtered_final[df_ratings_filtered_final['movieId'].isin(final_movies_with_enough_ratings)]

print(df_ratings_filtered_final.groupby('userId').size().describe())
print(df_ratings_filtered_final.groupby('movieId').size().describe())

count    166317.000000
mean        148.571679
std         239.911174
min          20.000000
25%          35.000000
50%          69.000000
75%         158.000000
max        9503.000000
dtype: float64
count    16705.000000
mean      1479.197606
std       4563.793861
min         20.000000
25%         48.000000
50%        163.000000
75%        794.000000
max      82877.000000
dtype: float64


#### Textual Feature - Combined Text

In [5]:
df_merged['combined_text'] = df_merged.apply(lambda row: ' '.join([
    ' '.join(row['genre_extracted']), 
    ' '.join(row['actors']), 
    ' '.join(row['keywords_extracted']), 
    row['overview'], 
    ' '.join(row['production_company_extracted'])
]).lower(), axis=1)

The combined_text feature aggregates critical textual metadata from genres, actors, keywords, and movie descriptions into a single comprehensive descriptor for each movie. This aggregation captures the essence of a movie’s content, thematic elements, and appeal, which is crucial for content-based filtering. By synthesizing this information, the recommender system can identify and suggest movies with similar thematic and content attributes, enhancing personalization and user engagement.

## Modeling Preprocessing

#### Combining df_ratings and df_merged

In [6]:
df_combined = pd.merge(df_ratings, df_merged, on='movieId', how='inner')

#### Setting Rating Threshold

The decision to set a threshold of 20 ratings for each movie before including it in the item-based recommender system is strategic, with the goal of ensuring the reliability and validity of the generated recommendations. This threshold acts as a quality control measure, weeding out movies with sparse feedback that could otherwise result in skewed or less confident recommendations due to insufficient user data. By setting this minimum, the system focuses on movies with a high level of viewer engagement, allowing recommendations to be built on a solid foundation of user feedback. This approach improves the system's ability to deliver accurate, trustworthy recommendations based on broad consensus rather than outliers or minimal feedback, resulting in a better user experience and increased overall credibility for the recommender system.

In [7]:
ratings_per_movie = df_combined.groupby('movieId').size()

movies_with_enough_ratings = ratings_per_movie[ratings_per_movie >= 20].index

df_item_modeling = df_combined[df_combined['movieId'].isin(movies_with_enough_ratings)]

print(f"Original dataset size: {df_combined.shape}")
print(f"Filtered dataset size: {df_item_modeling.shape}")

Original dataset size: (24669326, 19)
Filtered dataset size: (24548423, 19)


With the filtered dataset, df_item_modeling, now comprising 24,528,484 rows out of the original 24,639,944, it's evident that the vast majority of the data meets the threshold of having at least 20 ratings per movie. This minimal reduction in dataset size suggests that most movies in the dataset have a sufficient number of ratings, indicating robust user engagement across a wide range of movies.

##### Grouping Movies

In [8]:
df_grouped = df_item_modeling.groupby('movieId', as_index=False).agg({
    'title': 'first',
    'combined_text': 'first',  # Picking the first since all are the same
})

In [9]:
df_grouped

Unnamed: 0,movieId,title,combined_text
0,1,Toy Story,animation comedy family tom hanks tim allen do...
1,2,Jumanji,adventure fantasy family robin williams jonath...
2,3,Grumpier Old Men,romance comedy walter matthau jack lemmon ann-...
3,4,Waiting to Exhale,comedy drama romance whitney houston angela ba...
4,5,Father of the Bride Part II,comedy steve martin diane keaton martin short ...
...,...,...,...
16122,173941,Atomic Blonde,action thriller charlize theron james mcavoy s...
16123,174053,Black Mirror: White Christmas,drama horror mystery science fiction thriller ...
16124,174055,Dunkirk,action drama history thriller war fionn whiteh...
16125,174371,Once Upon a Time in Venice,action comedy thriller bruce willis jason momo...


### Content-Based Filtering

#### Vectorizing 'combined_text' feature

In [10]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_grouped['combined_text'])

Vectorizing the combined_text using TF-IDF transforms qualitative textual information into quantitative vectors, facilitating the measurement of content similarity between movies. This numerical representation allows for sophisticated algorithms to compute similarities based on thematic elements, narrative structures, and genre affiliations. For our movie recommender system, this means being able to recommend movies that are contextually and thematically aligned with a user’s preferences, enhancing the discovery of relevant and appealing content.

### BaseLine Model

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

class SimplifiedContentRecommender:
    def __init__(self, movies_df, tfidf_matrix, k=100):
        self.movies_df = movies_df.copy()
        self.movies_df['movieId'] = self.movies_df['movieId'].astype(str)
        self.movie_id_to_index = {movie_id: i for i, movie_id in enumerate(self.movies_df['movieId'])}
        self.tfidf_matrix = tfidf_matrix 
        self.similarity_matrix = cosine_similarity(self.tfidf_matrix)

    def recommend(self, movie_id, top_n=10):
        movie_id = str(movie_id)
        if movie_id not in self.movie_id_to_index:
            print(f"Movie ID {movie_id} not found in the dataset.")
            return []
        
        movie_index = self.movie_id_to_index[movie_id]
        similarity_scores = self.similarity_matrix[movie_index]
        top_k_indices = np.argsort(similarity_scores)[::-1][1:top_n+1]
        recommendations = self.movies_df.iloc[top_k_indices].copy()
        recommendations['cosine_similarity'] = similarity_scores[top_k_indices]
        
        return recommendations.sort_values('cosine_similarity', ascending=False)

In [12]:
recommender_base = SimplifiedContentRecommender(df_grouped, tfidf_matrix, k=100)
recommendations_base = recommender_base.recommend('1', top_n=10)  
print(recommendations_base[['movieId', 'title', 'cosine_similarity']])

      movieId            title  cosine_similarity
2874     3114      Toy Story 2           0.498092
12007   78499      Toy Story 3           0.417211
1722     1920   Small Soldiers           0.216866
2048     2253             Toys           0.186901
7112     7987            Dolls           0.180138
12339   83219  The Pixar Story           0.178043
1552     1707     Home Alone 3           0.163932
1793     1991     Child's Play           0.151260
9645    46948    Monster House           0.144818
1795     1993   Child's Play 3           0.143198


### Sampling

In this scenario, the sampling technique used is to calculate a statistically significant sample size in order to estimate the proportion of movies rated 4.0 or higher in a dataset. This decision is based on a specific confidence level (95%) and margin of error (5%), with the goal of obtaining precise and reliable inferences about the population's characteristics from a sample of data. The method used employs a standard formula that includes the Z-score associated with the desired confidence level and the estimated proportion of interest, ensuring that the sample size is sufficient to accurately reflect the population. This technique is critical for designing studies or analyses that require accurate estimations of population parameters for decision-making or hypothesis testing, as it minimizes potential biases and errors caused by small or arbitrarily chosen sample sizes. By rigorously determining the required sample size, the approach improves the credibility and validity of the findings derived from the sample data, making it a cornerstone of statistical analysis and research methodologies.

In [13]:
import scipy.stats
import math

def calculate_sample_size(confidence_level, margin_of_error, proportion):
    z_score = abs(scipy.stats.norm.ppf((1 - confidence_level) / 2))
    sample_size = math.ceil((z_score ** 2 * proportion * (1 - proportion)) / (margin_of_error ** 2))
    return sample_size

confidence_level = 0.95
margin_of_error = 0.05

proportion_higher_ratings = df_ratings[df_ratings['rating'] >= 4.0].shape[0] / df_ratings.shape[0]
required_sample_size = calculate_sample_size(confidence_level, margin_of_error, proportion_higher_ratings)
print(f"Required sample size: {required_sample_size}")

Required sample size: 385


In [14]:
sample_movie_ids = np.random.choice(df_grouped['movieId'].unique(), size=required_sample_size, replace=False)

## Evaluation Function

In [20]:
def evaluate_movie(movie_id, df_ratings, recommender, top_n=10):
    """Evaluate a single movie for the recommender system, adjusted for actual user ratings."""
    recommendations = recommender.recommend(str(movie_id), top_n=top_n)
    if recommendations.empty:
        return np.array([]), None  # Use None to indicate no data for calculation

    recommended_ids = recommendations['movieId'].astype(str).tolist()
    # Filter ratings to those that match the recommended movie IDs
    matching_ratings = df_ratings[df_ratings['movieId'].astype(str).isin(recommended_ids)]
    
    # Calculate hit rate only for recommended movies that have been rated
    hit_rate = (matching_ratings['rating'] >= 4.0).mean() if not matching_ratings.empty else None

    return np.array(matching_ratings['rating']), hit_rate

def evaluate_recommender(df_ratings, recommender, sample_movie_ids, top_n=10, threshold=4.0):
    """Evaluate the recommender system using sampled movie IDs, including adjusted hit rate."""
    all_ratings, hit_rates = [], []

    for movie_id in sample_movie_ids:
        movie_ratings, hit_rate = evaluate_movie(movie_id, df_ratings, recommender, top_n=top_n)
        if movie_ratings.size > 0:
            all_ratings.extend(movie_ratings)
        if hit_rate is not None:
            hit_rates.append(hit_rate)
    
    all_ratings = np.array(all_ratings)
    # Adjust calculations to handle potential None values in hit_rates
    if len(all_ratings) > 0:
        mae = np.mean(np.abs(all_ratings - 5))
        mse = np.mean((all_ratings - 5) ** 2)
        rmse = np.sqrt(mse)
        precision = np.sum(all_ratings >= threshold) / len(all_ratings)
    else:
        mae, mse, rmse, precision = 0, 0, 0, 0

    avg_hit_rate = np.mean(hit_rates) if hit_rates else None  # Use None or a placeholder if no hit rates available

    print(f"Sample Size: {len(sample_movie_ids)}")
    # Adjust the print statement to handle None value for avg_hit_rate
    print(f"MAE: {mae:.4f}\nMSE: {mse:.4f}\nRMSE: {rmse:.4f}\nPrecision: {precision:.4f}\nAverage Hit Rate: {avg_hit_rate if avg_hit_rate is not None else 'N/A'}")

    return mae, mse, rmse, precision, avg_hit_rate


# Item-based Collaborative Filtering model

In [27]:
df_ratings_subset = df_ratings_filtered_final.sample(frac=0.05, random_state=42)
df_ratings_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1235500 entries, 14910339 to 23314789
Data columns (total 6 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   userId            1235500 non-null  int64         
 1   movieId           1235500 non-null  int64         
 2   rating            1235500 non-null  Float64       
 3   timestamp         1235500 non-null  datetime64[ns]
 4   user_mean_rating  1235500 non-null  Float64       
 5   liked_by_user     1235500 non-null  boolean       
dtypes: Float64(2), boolean(1), datetime64[ns](1), int64(2)
memory usage: 61.3 MB


In [28]:
reader = Reader()

# Prepare the data for Surprise
data = Dataset.load_from_df(df_ratings_subset[['userId', 'movieId', 'rating']], reader)

# Initialize the SVD algorithm
svd = SVD()

# Perform cross-validation
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9130  0.9133  0.9135  0.9132  0.9129  0.9132  0.0002  
MAE (testset)     0.7031  0.7028  0.7036  0.7036  0.7034  0.7033  0.0003  
Fit time          9.91    9.58    10.30   9.77    9.89    9.89    0.24    
Test time         1.11    1.07    1.12    1.41    1.33    1.21    0.14    


{'test_rmse': array([0.91303477, 0.91332888, 0.91352622, 0.91320278, 0.91292555]),
 'test_mae': array([0.70313489, 0.7028378 , 0.70359434, 0.70358235, 0.70335566]),
 'fit_time': (9.909661293029785,
  9.575968027114868,
  10.30473780632019,
  9.768281936645508,
  9.891952991485596),
 'test_time': (1.1085481643676758,
  1.0685420036315918,
  1.11647629737854,
  1.4074289798736572,
  1.3325819969177246)}

In [24]:
# TODO GILIAN: Matrix geben lassen:


We get a mean Root Mean Sqaure Error of 0.95 approx which is more than good enough for our case. Let us now train on our dataset and arrive at predictions.

In [29]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x3ae7a0190>

In [38]:
svd.predict(2, 150)

Prediction(uid=2, iid=150, r_ui=None, est=4.232214969794782, details={'was_impossible': False})

### Grid Search CV

In [72]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV

# Assuming 'data' is your Dataset object and is correctly loaded
# For example:
# data = Dataset.load_from_df(df_ratings_subset[['userId', 'movieId', 'rating']], Reader(rating_scale=(1, 5)))

param_grid = {
    'n_factors': [50, 100],  # Number of factors
    'n_epochs': [20],        # Number of iterations
    'lr_all': [0.005, 0.01], # Learning rate
    'reg_all': [0.02, 0.05]  # Regularization term
}

# Initialize GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)

# Perform grid search. Note that we use 'data' directly here, not 'trainset'
gs.fit(data)

# After fitting, we can now access the results
print("Best RMSE score obtained: ", gs.best_score['rmse'])
print("Best parameters: ", gs.best_params['rmse'])

# Use the best estimator for further predictions
optimized_svd = gs.best_estimator['rmse']


Best RMSE score obtained:  0.9041399444694408
Best parameters:  {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.05}


# Hybrid Model

In [66]:
def hybrid_model(movie_id, user_id, top_n, svd_model):
    # Step 1: Get top N content-based recommendations
    content_recs = recommender_base.recommend(movie_id, top_n)

    content_recs['movieId'] = content_recs['movieId'].astype(int)

    # Step 2: Apply SVD to predict ratings for the top N movies
    content_recs['predicted_rating'] = content_recs['movieId'].apply(
        lambda x: svd_model.predict(user_id, x).est
    )
    
    # Step 3: Sort recommendations by predicted ratings, then by cosine similarity
    final_recs = content_recs.sort_values(
        by=['predicted_rating', 'cosine_similarity'], ascending=[False, False]
    )
    
    final_recs = final_recs[['movieId', 'title', 'cosine_similarity', 'predicted_rating']]

    return final_recs

In [67]:
hybrid_model(1, 1, 10, svd)

Unnamed: 0,movieId,title,cosine_similarity,predicted_rating
12007,78499,Toy Story 3,0.417211,3.973841
7112,7987,Dolls,0.180138,3.711014
2874,3114,Toy Story 2,0.498092,3.70599
12339,83219,The Pixar Story,0.178043,3.536388
9645,46948,Monster House,0.144818,3.334867
2048,2253,Toys,0.186901,2.775245
1793,1991,Child's Play,0.15126,2.715609
1722,1920,Small Soldiers,0.216866,2.601583
1795,1993,Child's Play 3,0.143198,2.212409
1552,1707,Home Alone 3,0.163932,1.877622


In [59]:
content_recs_test = recommender_base.recommend(1, 10)
content_recs_test.dtypes

movieId               object
title                 object
combined_text         object
cosine_similarity    float64
dtype: object

In [36]:
recommendations_base = recommender_base.recommend('1', top_n=10)  
print(recommendations_base[['movieId', 'title', 'cosine_similarity']])

      movieId            title  cosine_similarity
2874     3114      Toy Story 2           0.498092
12007   78499      Toy Story 3           0.417211
1722     1920   Small Soldiers           0.216866
2048     2253             Toys           0.186901
7112     7987            Dolls           0.180138
12339   83219  The Pixar Story           0.178043
1552     1707     Home Alone 3           0.163932
1793     1991     Child's Play           0.151260
9645    46948    Monster House           0.144818
1795     1993   Child's Play 3           0.143198


In [33]:
# Customize weights

weight_similarity = 0.5
weight_svd = 0.5

In [48]:
svd.predict(1, 3114)

Prediction(uid=1, iid=3114, r_ui=None, est=3.705989519002013, details={'was_impossible': False})

In [46]:
svd.predict(1, 78499)

Prediction(uid=1, iid=78499, r_ui=None, est=3.9738405797222405, details={'was_impossible': False})

In [47]:
svd.predict(1, 1920)

Prediction(uid=1, iid=1920, r_ui=None, est=2.601583327439441, details={'was_impossible': False})

In [42]:
svd.predict(1, 2253)

Prediction(uid=1, iid=2253, r_ui=None, est=2.775245138568771, details={'was_impossible': False})

In [34]:
def hybrid_recommendation(movie_id, user_id, top_n=10):
    # Get top N content-based recommendations
    content_recs = recommender_base.recommend(movie_id, top_n)
    
    # Prepare final recommendations with an additional column for SVD predictions
    content_recs['svd_prediction'] = content_recs['movieId'].apply(lambda x: svd.predict(user_id, x).est)
    
    # Sort the recommendations based solely on the SVD predictions
    final_recs = content_recs.sort_values('svd_prediction', ascending=False).head(top_n)
    
    return final_recs

In [35]:
hybrid_recommendation(9, 1, top_n=50)

Unnamed: 0,movieId,title,combined_text,cosine_similarity,svd_prediction
1997,2196,Knock Off,action adventure thriller jean-claude van damm...,0.269357,3.543738
3891,4199,Death Warrant,action crime drama mystery thriller jean-claud...,0.138215,3.543738
15962,165087,Brimstone,mystery thriller western guy pearce dakota fan...,0.159557,3.543738
3475,3766,Missing in Action,action adventure thriller war chuck norris m. ...,0.158418,3.543738
11563,71810,Legionnaire,adventure drama action history thriller jean-c...,0.156425,3.543738
12773,90434,Assassination Games,drama action crime jean-claude van damme scott...,0.156123,3.543738
8527,27828,The Memory Of A Killer,crime drama thriller action koen de bouw werne...,0.15474,3.543738
8728,31892,"No Retreat, No Surrender",action kurt mckinney jean-claude van damme j.w...,0.154175,3.543738
15099,133689,Pound of Flesh,action jean-claude van damme john ralston darr...,0.151983,3.543738
2578,2808,Universal Soldier,thriller action science fiction crime jean-cla...,0.145358,3.543738


In [76]:
import numpy as np
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Assuming df_grouped and df_ratings are already defined

# Content-based recommender setup
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_grouped['combined_text'])

class SimplifiedContentRecommender:
    def __init__(self, movies_df, tfidf_matrix):
        self.movies_df = movies_df.copy()
        self.movies_df['movieId'] = self.movies_df['movieId'].astype(str)
        self.movie_id_to_index = {movie_id: i for i, movie_id in enumerate(self.movies_df['movieId'])}
        self.tfidf_matrix = tfidf_matrix 
        self.similarity_matrix = cosine_similarity(self.tfidf_matrix)

    def recommend(self, movie_id, top_n=10):
        movie_id = str(movie_id)
        if movie_id not in self.movie_id_to_index:
            print(f"Movie ID {movie_id} not found in the dataset.")
            return []
        
        movie_index = self.movie_id_to_index[movie_id]
        similarity_scores = self.similarity_matrix[movie_index]
        top_k_indices = np.argsort(similarity_scores)[::-1][1:top_n+1]
        recommendations = self.movies_df.iloc[top_k_indices].copy()
        recommendations['cosine_similarity'] = similarity_scores[top_k_indices]
        
        return recommendations

# Prepare the SVD model with filtered dataset based on sample_movie_ids
# Assuming sample_movie_ids is defined and contains movie IDs from df_grouped
df_ratings_aligned_subset = df_ratings[df_ratings['movieId'].astype(str).isin(sample_movie_ids.astype(str))]

reader = Reader(rating_scale=(0.5, 5))  # Adjust based on your dataset
data_aligned_subset = Dataset.load_from_df(df_ratings_aligned_subset[['userId', 'movieId', 'rating']], reader)
trainset_aligned_subset = data_aligned_subset.build_full_trainset()

svd = SVD()
svd.fit(trainset_aligned_subset)

# Define the hybrid recommendation function
def hybrid_recommendation(user_id, movie_id, top_n=10):
    # Content-based recommendations
    content_recommendations = recommender_base.recommend(movie_id, top_n)
    
    # SVD predictions
    predictions = []
    for _, row in content_recommendations.iterrows():
        predicted_rating = svd.predict(user_id, str(row['movieId'])).est  # Ensure movieId is treated as a string
        predictions.append((row['movieId'], row['title'], row['cosine_similarity'], predicted_rating))
    
    # Sort and prepare final DataFrame
    predictions.sort(key=lambda x: x[3], reverse=True)
    recommendations_df = pd.DataFrame(predictions, columns=['movieId', 'title', 'cosine_similarity', 'predicted_rating'])
    
    return recommendations_df

# Initialize the content-based recommender
recommender_base = SimplifiedContentRecommender(df_grouped, tfidf_matrix)

# Example usage
hybrid_recommendations = hybrid_recommendation(user_id=1, movie_id='1', top_n=40)
print(hybrid_recommendations)

   movieId                                              title  \
0     3114                                        Toy Story 2   
1    78499                                        Toy Story 3   
2     1920                                     Small Soldiers   
3     2253                                               Toys   
4     7987                                              Dolls   
5    83219                                    The Pixar Story   
6     1707                                       Home Alone 3   
7     1991                                       Child's Play   
8    46948                                      Monster House   
9     1993                                     Child's Play 3   
10   55999                     Mr. Magorium's Wonder Emporium   
11    1992                                     Child's Play 2   
12   55176                          In the Shadow of the Moon   
13    1103                              Rebel Without a Cause   
14    4886               

In [27]:
from surprise.model_selection import cross_validate
cross_validate(svd, data_aligned_subset, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9254  0.9278  0.9309  0.9281  0.0022  
MAE (testset)     0.7101  0.7115  0.7143  0.7120  0.0018  
Fit time          3.16    3.20    3.47    3.28    0.14    
Test time         0.96    0.92    0.62    0.83    0.15    


{'test_rmse': array([0.92544584, 0.92783799, 0.93090299]),
 'test_mae': array([0.71006841, 0.71149897, 0.71433969]),
 'fit_time': (3.1633121967315674, 3.202285051345825, 3.469529867172241),
 'test_time': (0.9583938121795654, 0.9194438457489014, 0.6153781414031982)}

In [78]:
# Sample some user-item pairs
sample_user = 8  # Example user ID
sample_items = df_ratings['movieId'].sample(10).tolist()  # Replace with your actual data frame and sample items

# Predict ratings for these items using the SVD model
for item in sample_items:
    predicted_rating = svd.predict(sample_user, item).est
    print(f"Predicted rating for user {sample_user} and item {item}: {predicted_rating}")


Predicted rating for user 8 and item 303: 3.323557974653404
Predicted rating for user 8 and item 590: 3.323557974653404
Predicted rating for user 8 and item 3793: 3.323557974653404
Predicted rating for user 8 and item 1914: 3.323557974653404
Predicted rating for user 8 and item 7254: 3.323557974653404
Predicted rating for user 8 and item 452: 3.323557974653404
Predicted rating for user 8 and item 58559: 3.323557974653404
Predicted rating for user 8 and item 2699: 3.323557974653404
Predicted rating for user 8 and item 6060: 3.323557974653404
Predicted rating for user 8 and item 142448: 3.323557974653404


In [83]:
df_ratings_aligned_subset.describe()

Unnamed: 0,userId,movieId,rating,timestamp,user_mean_rating
count,224397.0,224397.0,224397.0,224397,224397.0
mean,134989.979042,8121.098615,3.633496,2006-05-31 13:36:14.770674560,3.553506
min,2.0,4.0,0.5,1996-02-01 14:33:45,0.5
25%,67202.0,260.0,3.0,2000-11-22 07:45:04,3.273529
50%,135324.0,1339.0,4.0,2005-05-14 08:55:35,3.576067
75%,202520.0,3977.0,4.5,2011-08-15 19:45:15,3.862595
max,270896.0,175977.0,5.0,2017-08-04 06:07:40,5.0
std,78095.86123,23635.460498,1.104368,,0.46752


In [37]:
df_ratings_aligned_subset

Unnamed: 0,userId,movieId,rating,timestamp,user_mean_rating,liked_by_user
100,4,3798,4.0,2003-01-15 23:16:20,3.5,True
176,7,80463,3.5,2017-02-05 00:16:35,3.367925,True
217,8,1343,3.0,2002-02-11 16:06:36,2.99115,True
255,8,3175,5.0,2002-02-11 16:09:56,2.99115,True
294,8,5002,2.0,2002-02-11 15:56:53,2.99115,False
...,...,...,...,...,...,...
24847949,270896,1923,3.0,2009-10-31 23:30:45,3.980769,False
24847968,270896,2502,4.5,2009-11-01 00:02:51,3.980769,True
24848017,270896,4995,5.0,2009-10-31 23:25:03,3.980769,True
24848032,270896,5989,4.5,2009-11-01 00:02:19,3.980769,True


In [81]:
# Check the distribution of ratings
print(df_ratings_aligned_subset['rating'].value_counts().sort_index())


rating
0.5     3457
1.0     6999
1.5     3370
2.0    14365
2.5     9722
3.0    41574
3.5    22212
4.0    58800
4.5    17252
5.0    46646
Name: count, dtype: Int64


In [85]:
# Number of ratings per movie
ratings_per_movie = df_ratings_aligned_subset['movieId'].value_counts()
print(ratings_per_movie.describe())

# Number of ratings per user
ratings_per_user = df_ratings_aligned_subset['userId'].value_counts()
print(ratings_per_user.describe())


count      385.000000
mean       582.849351
std       3969.955469
min          1.000000
25%          2.000000
50%          7.000000
75%         49.000000
max      69393.000000
Name: count, dtype: float64
count    98428.000000
mean         2.279809
std          2.420443
min          1.000000
25%          1.000000
50%          1.000000
75%          3.000000
max        144.000000
Name: count, dtype: float64
