### Lecture code

### Imports

In [1]:
# imports

import pandas as pd
import numpy as np

In [2]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

-----------------

### File

In [30]:
df = pd.read_csv('/Users/zachariamwaura/Documents/Flatiron/Phase_4/Phase_4_Project/DATA/ratings.csv', index_col=False)
df.head(20)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
5,1,381,3.5,1225734105
6,1,596,4.0,1225733524
7,1,1036,5.0,1225735626
8,1,1049,3.0,1225734079
9,1,1066,4.0,1225736961


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [5]:
df.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [33]:
df['movieId'].min(), df['movieId'].max()

(1, 288983)

In [34]:
df['movieId'].value_counts()

movieId
318       122296
356       113581
296       108756
2571      107056
593       101802
           ...  
261715         1
257463         1
256693         1
161766         1
269782         1
Name: count, Length: 83239, dtype: int64

 ----------

#### create sample

In [6]:
# create big enough sample
df_sample = df.sample(n=3300000, random_state=1)
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3300000 entries, 33179850 to 33474626
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 125.9 MB


In [31]:
df_sample.head(20)

Unnamed: 0,userId,movieId,rating,timestamp
33179850,324684,3898,1.5,1144760809
16176049,158793,922,4.0,1450700720
32084054,313695,2423,4.0,1133582037
6082545,59202,5218,3.0,1033884899
16125818,158269,1779,2.0,1660913035
27097698,264464,1957,4.0,953067620
1315136,12727,106782,3.5,1653959620
32740492,319929,9018,3.5,1469934763
29604929,289257,166643,3.5,1515856802
6787316,66339,2105,4.0,1553843526


In [7]:
df['userId'].value_counts()[:20]

userId
189614    33332
48766      9554
207216     9178
175998     9016
76618      8919
230765     7719
184775     7535
236260     7488
233891     7372
214831     7266
221500     6380
267315     6199
193414     6074
113052     5976
100696     5806
256904     5805
211359     5784
177589     5693
134353     5653
73700      5649
Name: count, dtype: int64

---------------

### SVD MODEL

In [8]:
reader = Reader(rating_scale=(1,5))
data =  Dataset.load_from_df(df_sample[['userId', 'movieId', 'rating']], reader)

In [9]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [10]:
SVD_model = SVD()

In [11]:
SVD_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11eb4c320>

In [12]:
prediction = SVD_model.test(testset)

In [13]:
rmse = accuracy.rmse(prediction)

RMSE: 0.9071


In [14]:
print(f'RMSE: {rmse}')

RMSE: 0.9070680312194619


In [29]:
accuracy.fcp(prediction)

FCP:  0.6175


0.6175236440850813

------------

In [15]:
pred = pd.DataFrame(prediction)
pred

Unnamed: 0,uid,iid,r_ui,est,details
0,204354,364,5.0,4.179530,{'was_impossible': False}
1,26901,1968,2.0,3.426747,{'was_impossible': False}
2,126268,4993,4.5,3.906216,{'was_impossible': False}
3,40578,2125,4.0,3.132345,{'was_impossible': False}
4,173765,344,4.0,3.197198,{'was_impossible': False}
...,...,...,...,...,...
659995,270051,83613,2.5,2.685898,{'was_impossible': False}
659996,303606,248,3.0,3.411349,{'was_impossible': False}
659997,81014,2565,4.0,3.361431,{'was_impossible': False}
659998,101226,8965,3.5,2.874387,{'was_impossible': False}


In [16]:
pred['uid'].value_counts()[:10]

uid
189614    692
48766     205
207216    190
175998    177
184775    160
230765    158
76618     155
233891    155
113052    133
236260    131
Name: count, dtype: int64

In [17]:
user_id = 189614
movie_id = 364

In [18]:
predictions = SVD_model.predict(user_id, movie_id)

In [19]:
print(f"Predicted rating for user {user_id} and movie {movie_id}: {predictions.est}")

Predicted rating for user 189614 and movie 364: 3.155539934237252


-----------

#### top 5

In [26]:
def get_top_n_recommendations(prediction, n=5):
    # Build a dictionary of predictions for each user
    top_n = {}
    for uid, iid, true_r, est, _ in prediction:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Now sort the predictions for each user and get the n highest rated items
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [27]:
top_n_recommendations = get_top_n_recommendations(prediction, n=5)

In [28]:
user_id = 189614
top_5_for_user = top_n_recommendations.get(user_id, [])
print(f"Top 5 recommendations for user {user_id}:")
for movie_id, est_rating in top_5_for_user:
    print(f"Movie ID: {movie_id}, Predicted Rating: {est_rating:.2f}")

Top 5 recommendations for user 189614:
Movie ID: 1193, Predicted Rating: 4.15
Movie ID: 953, Predicted Rating: 3.84
Movie ID: 8014, Predicted Rating: 3.81
Movie ID: 80906, Predicted Rating: 3.80
Movie ID: 134089, Predicted Rating: 3.78


------------