In [24]:
import pandas as pd
import numpy as np

In [25]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [26]:
df = pd.read_csv('/Users/zachariamwaura/Documents/Flatiron/Phase_4/Phase_4_Project/DATA/ratings.csv', index_col=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [54]:
df_sample = df.sample(n=100000, random_state=1)
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 33179850 to 21265524
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100000 non-null  int64  
 1   movieId    100000 non-null  int64  
 2   rating     100000 non-null  float64
 3   timestamp  100000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.8 MB


In [55]:
reader = Reader(rating_scale=(1, 5))  # The scale of the ratings is between 1 and 5
data = Dataset.load_from_df(df_sample[['userId', 'movieId', 'rating']], reader)

In [56]:
trainset, testset = train_test_split(data, test_size=0.2)

In [57]:
model = SVD()

In [58]:
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x107bc8920>

In [59]:
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)

RMSE: 0.9929


In [60]:
print(f"RMSE: {rmse}")

RMSE: 0.9929080946038031


In [61]:
user_id = 52584  # Example user
movie_id = 147   # Example movie

prediction = model.predict(user_id, movie_id)
print(f"Predicted rating for user {user_id} and movie {movie_id}: {prediction.est}")

Predicted rating for user 52584 and movie 147: 3.800889359021879


In [62]:
predictions

[Prediction(uid=294985, iid=296, r_ui=5.0, est=4.231484213683604, details={'was_impossible': False}),
 Prediction(uid=310568, iid=34150, r_ui=2.5, est=2.6718194511012423, details={'was_impossible': False}),
 Prediction(uid=137853, iid=741, r_ui=4.5, est=3.940464429628088, details={'was_impossible': False}),
 Prediction(uid=187362, iid=278824, r_ui=3.5, est=3.588151006625263, details={'was_impossible': False}),
 Prediction(uid=312932, iid=64614, r_ui=4.5, est=3.9397001073059155, details={'was_impossible': False}),
 Prediction(uid=327584, iid=1307, r_ui=2.0, est=4.069673956159506, details={'was_impossible': False}),
 Prediction(uid=139084, iid=2065, r_ui=3.0, est=3.7196660578319602, details={'was_impossible': False}),
 Prediction(uid=251903, iid=3033, r_ui=4.0, est=3.4720441237069117, details={'was_impossible': False}),
 Prediction(uid=207897, iid=293, r_ui=5.0, est=3.971480419628877, details={'was_impossible': False}),
 Prediction(uid=267743, iid=308, r_ui=5.0, est=3.8098865959893087, d

In [63]:
pp = pd.DataFrame(predictions)

In [64]:
pp

Unnamed: 0,uid,iid,r_ui,est,details
0,294985,296,5.0,4.231484,{'was_impossible': False}
1,310568,34150,2.5,2.671819,{'was_impossible': False}
2,137853,741,4.5,3.940464,{'was_impossible': False}
3,187362,278824,3.5,3.588151,{'was_impossible': False}
4,312932,64614,4.5,3.939700,{'was_impossible': False}
...,...,...,...,...,...
19995,10746,183699,3.5,3.742512,{'was_impossible': False}
19996,268560,434,1.0,3.119893,{'was_impossible': False}
19997,52600,3361,5.0,3.664409,{'was_impossible': False}
19998,174862,32139,3.0,3.528233,{'was_impossible': False}


In [65]:
pp['uid'].value_counts()

uid
189614    23
326681    10
216749     9
207216     9
76618      8
          ..
179505     1
246416     1
190668     1
248871     1
146395     1
Name: count, Length: 17266, dtype: int64

In [66]:
pp[pp['uid'] == 189614]

Unnamed: 0,uid,iid,r_ui,est,details
1159,189614,4716,3.5,2.946367,{'was_impossible': False}
1547,189614,203236,3.5,2.946367,{'was_impossible': False}
1720,189614,114329,3.0,3.01431,{'was_impossible': False}
2505,189614,171087,2.5,2.946367,{'was_impossible': False}
2507,189614,26875,4.0,2.921441,{'was_impossible': False}
5241,189614,133089,3.0,2.946367,{'was_impossible': False}
6082,189614,2603,3.5,2.946367,{'was_impossible': False}
6648,189614,84876,4.5,2.946367,{'was_impossible': False}
6997,189614,186057,3.0,2.946367,{'was_impossible': False}
9236,189614,163436,2.0,2.946367,{'was_impossible': False}


In [67]:
from collections import defaultdict

# Step 7: Get top 5 recommendations for a specific user
def get_top_n_recommendations(predictions, n=5):
    # Build a dictionary of predictions for each user
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Now sort the predictions for each user and get the n highest rated items
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [68]:
top_n_recommendations = get_top_n_recommendations(predictions, n=5)

In [69]:
user_id = 189614
top_5_for_user = top_n_recommendations.get(user_id, [])
print(f"Top 5 recommendations for user {user_id}:")
for movie_id, est_rating in top_5_for_user:
    print(f"Movie ID: {movie_id}, Predicted Rating: {est_rating:.2f}")

Top 5 recommendations for user 189614:
Movie ID: 1222, Predicted Rating: 3.37
Movie ID: 114329, Predicted Rating: 3.01
Movie ID: 4716, Predicted Rating: 2.95
Movie ID: 203236, Predicted Rating: 2.95
Movie ID: 171087, Predicted Rating: 2.95


In [70]:
movie_id_list = np.array(
    list(zip(*top_n_recommendations[user_id]))[0], dtype = 'int')
movie_id_list

array([  1222, 114329,   4716, 203236, 171087])

In [None]:
# predict on three users
# watched, rating
# recommend