# RECOMMENDATION SYSTEM
* ratings.csv

-----------

## Imports

In [23]:
import pandas as pd
import numpy as np

In [47]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split, GridSearchCV

------------

## Load Needed Files

In [25]:
ratings = pd.read_csv('DATA/ratings.csv', index_col=False)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


-------------

In [26]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [27]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

### Create Sample

In [28]:
ratings_sample = ratings.sample(n=100000, random_state=1)
ratings_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 33179850 to 21265524
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100000 non-null  int64  
 1   movieId    100000 non-null  int64  
 2   rating     100000 non-null  float64
 3   timestamp  100000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.8 MB


--------------

## Model

In [29]:
# reader
reader = Reader(rating_scale=(1,5))
reader

<surprise.reader.Reader at 0x133c529f0>

In [30]:
# Dataset
data_gen = Dataset.load_from_df(ratings_sample[['userId', 'movieId', 'rating']], reader)
data_gen

<surprise.dataset.DatasetAutoFolds at 0x133c53da0>

In [31]:
trainset, testset = train_test_split(data_gen, test_size=0.2, random_state=42)

### SVD base

In [32]:
SVD_model = SVD()

In [33]:
SVD_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x133c53320>

In [34]:
prediction = SVD_model.test(testset)

In [35]:
print(accuracy.mae(prediction))

MAE:  0.7630
0.7629697110962704


In [36]:
print(accuracy.fcp(prediction))

FCP:  0.4791
0.47913708356089574


We are getting pairwise item ranking preferences for each user right ~ 62% of the time.

#### getting predictions

In [37]:
pred = pd.DataFrame(prediction)
pred

Unnamed: 0,uid,iid,r_ui,est,details
0,320615,457,4.0,3.953768,{'was_impossible': False}
1,85899,48516,4.0,3.965865,{'was_impossible': False}
2,16521,4896,3.5,3.345697,{'was_impossible': False}
3,199688,67255,4.5,3.351295,{'was_impossible': False}
4,274004,1672,4.0,3.702138,{'was_impossible': False}
...,...,...,...,...,...
19995,243459,513,2.0,3.510620,{'was_impossible': False}
19996,82623,72369,3.5,3.541059,{'was_impossible': False}
19997,1011,56511,3.0,3.552293,{'was_impossible': False}
19998,226523,318,5.0,4.456570,{'was_impossible': False}


In [38]:
pred['uid'].value_counts()[:10]

uid
189614    19
76618     12
207216     9
233891     8
186916     8
116432     7
299380     7
20932      6
193414     6
268068     6
Name: count, dtype: int64

------------

In [39]:
user_id = 189614
movie_id = 364

In [40]:
predictions = SVD_model.predict(user_id, movie_id)

In [41]:
print(f"Predicted rating for user {user_id} and movie {movie_id}: {predictions.est}")

Predicted rating for user 189614 and movie 364: 3.504219280959764


-------------

#### Top 5's

In [42]:
def get_top_n_recommendations(prediction, n=5):
    # Build a dictionary of predictions for each user
    top_n = {}
    for uid, iid, true_r, est, _ in prediction:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Now sort the predictions for each user and get the n highest rated items
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [43]:
top_n_recommendations = get_top_n_recommendations(prediction, n=5)

In [46]:
user_id = 76618
top_5_for_user = top_n_recommendations.get(user_id, [])
print(f"Top 5 recommendations for user {user_id}:")
for movie_id, est_rating in top_5_for_user:
    print(f"Movie ID: {movie_id}, Predicted Rating: {est_rating:.2f}")

Top 5 recommendations for user 76618:
Movie ID: 1259, Predicted Rating: 3.21
Movie ID: 4642, Predicted Rating: 2.71
Movie ID: 55875, Predicted Rating: 2.67
Movie ID: 157881, Predicted Rating: 2.59
Movie ID: 2315, Predicted Rating: 2.59


In [45]:
movie_id_list = np.array(list(zip(*top_n_recommendations[76618]))[0], dtype = 'int')
movie_id_list

array([  1259,   4642,  55875, 157881,   2315])

### GridSearch

In [53]:
param_grid = {
    'n_factors': [50, 100, 200],  # Latent factors
    'reg_all': [0.02, 0.1, 0.2],   # Regularization parameter
    'lr_all': [0.002, 0.005]     # Learning rate          
}

In [57]:
grid_search = GridSearchCV(SVD, param_grid, measures=["mae", "fcp"], cv=5)
grid_search.fit(data_gen)

In [58]:
grid_search.best_params

{'mae': {'n_factors': 50, 'reg_all': 0.02, 'lr_all': 0.005},
 'fcp': {'n_factors': 50, 'reg_all': 0.1, 'lr_all': 0.005}}

In [59]:
grid_search.best_estimator

{'mae': <surprise.prediction_algorithms.matrix_factorization.SVD at 0x1334b73b0>,
 'fcp': <surprise.prediction_algorithms.matrix_factorization.SVD at 0x126370650>}

In [60]:
grid_search.best_score

{'mae': 0.7630159112975148, 'fcp': 0.48935477698789187}