# Model Experiments

## Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import accuracy

## Files

In [3]:
### ratings.csv

ratings = pd.read_csv('DATA/ratings.csv', index_col=False)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [4]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [5]:
### movies.csv

movies = pd.read_csv('DATA/movies.csv', index_col=False)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  86537 non-null  int64 
 1   title    86537 non-null  object
 2   genres   86537 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


### Merge

In [7]:
mr = pd.merge(ratings, movies, on='movieId', how='outer')
mr.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1.0,1,4.0,1225735000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2.0,1,5.0,835816000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7.0,1,4.0,974518000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,10.0,1,3.0,1430666000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,12.0,1,5.0,862500700.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [8]:
mr = mr.drop('timestamp', axis=1)
mr

Unnamed: 0,userId,movieId,rating,title,genres
0,1.0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2.0,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7.0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,10.0,1,3.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,12.0,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
33835455,47791.0,288967,3.5,State of Siege: Temple Attack (2021),Action|Drama
33835456,98408.0,288971,0.5,Ouija Japan (2021),Action|Horror
33835457,154483.0,288975,4.0,The Men Who Made the Movies: Howard Hawks (1973),Documentary
33835458,291389.0,288977,3.0,Skinford: Death Sentence (2023),Crime|Thriller


In [9]:
mr.isna().sum()

userId     3298
movieId       0
rating     3298
title         0
genres        0
dtype: int64

In [10]:
mr = mr.dropna()

In [11]:
mr.isna().sum()

userId     0
movieId    0
rating     0
title      0
genres     0
dtype: int64

In [12]:
mr['userId'] = mr['userId'].astype(int)
mr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mr['userId'] = mr['userId'].astype(int)


Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,10,1,3.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,12,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
33835455,47791,288967,3.5,State of Siege: Temple Attack (2021),Action|Drama
33835456,98408,288971,0.5,Ouija Japan (2021),Action|Horror
33835457,154483,288975,4.0,The Men Who Made the Movies: Howard Hawks (1973),Documentary
33835458,291389,288977,3.0,Skinford: Death Sentence (2023),Crime|Thriller


### Filter

In [13]:
sample = mr.sample(n=3000000, random_state=1)
sample

Unnamed: 0,userId,movieId,rating,title,genres
33182245,243485,190973,2.5,Hidden Reserves (2017),Drama|Sci-Fi
16176049,71415,3005,4.0,"Bone Collector, The (1999)",Thriller
32084944,171084,138036,4.0,The Man from U.N.C.L.E. (2015),Action|Adventure|Comedy
6082545,256240,805,5.0,"Time to Kill, A (1996)",Drama|Thriller
16125818,93175,2997,3.5,Being John Malkovich (1999),Comedy|Drama|Fantasy
...,...,...,...,...,...
29704749,312288,90645,3.0,Anonymous (2011),Drama
11211111,40095,1704,5.0,Good Will Hunting (1997),Drama|Romance
3233117,22946,357,3.0,Four Weddings and a Funeral (1994),Comedy|Romance
9600547,217248,1307,3.0,When Harry Met Sally... (1989),Comedy|Romance


In [14]:
count = sample['userId'].value_counts()

In [15]:
valid_user = count[count > 150].index

In [16]:
sample_f = sample[sample['userId'].isin(valid_user)]

In [17]:
sample_f['userId'].value_counts()

userId
189614    2949
48766      851
76618      837
207216     817
175998     809
          ... 
9012       151
136471     151
328058     151
278369     151
36774      151
Name: count, Length: 1006, dtype: int64

In [18]:
sample_f.head(20)

Unnamed: 0,userId,movieId,rating,title,genres
18886269,20459,4034,3.5,Traffic (2000),Crime|Drama|Thriller
21129870,10129,5459,3.5,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi
30090305,211422,96110,4.0,"Campaign, The (2012)",Comedy
33624911,114227,224869,1.0,Winnie the Pooh: Springtime with Roo (2004),Animation|Children
27288136,189614,59273,3.0,Delirious (2006),Comedy|Drama
31174192,213479,112552,4.5,Whiplash (2014),Drama
11877966,153377,1939,3.0,"Best Years of Our Lives, The (1946)",Drama|War
23038858,166208,7139,5.0,In America (2002),Drama|Romance
24568186,49000,27728,0.5,Ghost in the Shell 2: Innocence (a.k.a. Innoce...,Action|Animation|Drama|Sci-Fi|Thriller
30352038,262716,99415,2.5,Parental Guidance (2012),Comedy


## Model

In [19]:
reader = Reader(rating_scale=(0, 5))  # Define the rating scale
data = Dataset.load_from_df(sample_f[['userId', 'movieId', 'rating']], reader)

In [20]:
trainset, testset = train_test_split(data, test_size=0.2)

In [21]:
svd = SVD(n_epochs=50)

In [22]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11e7931a0>

In [23]:
prediction = svd.test(testset)

In [24]:
rmse = accuracy.rmse(prediction)
print(f"RMSE: {rmse:.4f}")

RMSE: 0.8811
RMSE: 0.8811


In [25]:
print(accuracy.mae(prediction))

MAE:  0.6730
0.6729893456746462


---------------------

### Cross Validation

In [26]:
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8755  0.8695  0.8731  0.8784  0.8745  0.8742  0.0029  
MAE (testset)     0.6701  0.6642  0.6665  0.6695  0.6669  0.6674  0.0021  
Fit time          3.82    3.73    3.62    3.60    3.57    3.67    0.09    
Test time         0.19    0.18    0.18    0.18    0.18    0.18    0.00    


In [27]:
print("\nAverage RMSE with default parameters: ", results['test_rmse'].mean())
print("Average MAE with default parameters: ", results['test_mae'].mean())


Average RMSE with default parameters:  0.8742076269472958
Average MAE with default parameters:  0.6674361124209323


In [28]:
param_grid = {
    'n_factors': [50, 100, 150],
    'reg_all': [0.01, 0.1, 0.2],
    'lr_all': [0.005, 0.01, 0.05]
}


In [29]:
grid_search = GridSearchCV(SVD, param_grid, measures=['RMSE', 'MAE'], cv=5)
grid_search.fit(data)

In [30]:
print("Best parameters found: ", grid_search.best_params)
print("Best RMSE: ", grid_search.best_score['rmse'])

Best parameters found:  {'rmse': {'n_factors': 150, 'reg_all': 0.1, 'lr_all': 0.05}, 'mae': {'n_factors': 150, 'reg_all': 0.1, 'lr_all': 0.05}}
Best RMSE:  0.8421249024932635


In [32]:
best_svd = grid_search.best_estimator['rmse']
best_svd.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1069c9190>

In [33]:
trainset = best_svd.trainset
testset = trainset.build_testset()
predictions = best_svd.test(testset)

In [34]:
print("RMSE after tuning: ", accuracy.rmse(predictions))
print("MAE after tuning: ", accuracy.mae(predictions))

RMSE: 0.4494
RMSE after tuning:  0.4493743071800926
MAE:  0.3424
MAE after tuning:  0.3423865979568008


## Recommendation

--------------------------

In [1]:
def get_movie_recommendations(user_id, top_n=5):
    # Get all movie IDs
    all_movie_ids = sample_f['movieId'].unique()
    
    # Get the movies the user has already rated
    rated_movie_ids = sample_f[sample_f['userId'] == user_id]['movieId'].tolist()
    
    # Predict ratings for all movies the user has not rated
    predictions = []
    for movie_id in all_movie_ids:
        if movie_id not in rated_movie_ids:
            pred = best_svd.predict(user_id, movie_id)
            predictions.append((movie_id, pred.est))
    
    # Sort predictions by estimated rating (highest first)
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    # Get the top N recommended movies
    top_predictions = predictions[:top_n]
    
    # Get movie titles for the top N recommended movies
    recommendations = []
    for movie_id, _ in top_predictions:
        movie_title = sample_f[sample_f['movieId'] == movie_id]['title'].iloc[0]  # get movie
        movie_genres = sample_f[sample_f['movieId'] == movie_id]['genres'].iloc[0] # get genre
        recommendations.append(movie_title)
    
    return recommendations

In [2]:
user_id = 20459
recommended_movies = get_movie_recommendations(user_id, top_n=5)

NameError: name 'sample_f' is not defined

In [37]:
print(f"Recommended Movies for User {user_id}:")
for movie in recommended_movies:
    print(f"- {movie}")

Recommended Movies for User 20459:
- Human Condition III, The (Ningen no joken III) (1961)
- The Fool (2014)
- Crazy About Tiffany's (2016)
- The Boy and the World (2013)
- Last Lions, The (2011)
