### cosine_similarity


In [22]:
import pandas as pd
import numpy as np

In [23]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD # implementation of Funk's SVD (gradient descent-based matrix factorization)
from surprise import accuracy

In [24]:
from collections import defaultdict

In [25]:
df = pd.read_csv('/Users/zachariamwaura/Documents/Flatiron/Phase_4/Phase_4_Project/DATA/ratings.csv', index_col=False)
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [27]:
# taking a small sample to work with

df_sample = df.sample(n=100000, random_state=1)
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 33179850 to 21265524
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100000 non-null  int64  
 1   movieId    100000 non-null  int64  
 2   rating     100000 non-null  float64
 3   timestamp  100000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.8 MB


In [28]:
df_sample = df_sample.drop('timestamp', axis=1)

In [29]:
df_sample.sample(n=20, random_state=1)

Unnamed: 0,userId,movieId,rating
5427921,52584,147,2.5
9934350,97735,1221,5.0
22097984,215490,5649,4.0
26803234,261621,1221,2.5
10434975,102685,103688,3.0
7316279,71560,8865,3.5
404602,3968,208,3.0
33585414,328746,337,4.0
7341326,71782,1136,4.5
18241386,178768,3528,3.5


-------------

In [30]:
# Reader Object

reader = Reader(rating_scale=(1,5))
reader

<surprise.reader.Reader at 0x12f3bef90>

In [31]:
# Dataset generator

df_gen = Dataset.load_from_df(df_sample, reader=reader)
df_gen

<surprise.dataset.DatasetAutoFolds at 0x12f3bef60>

In [32]:
# train_test

trainset, testset = train_test_split(df_gen, test_size=.2, random_state=42)

In [33]:
# Funk's SVD

svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x156964ec0>

In [34]:
predictions = svd.test(testset)

In [35]:
rmse = accuracy.rmse(predictions)

RMSE: 0.9870


In [36]:
print(f"RMSE: {rmse}")

RMSE: 0.9869758901618504


In [58]:
predictions[0:6]

[Prediction(uid=320615, iid=457, r_ui=4.0, est=3.966313093488326, details={'was_impossible': False}),
 Prediction(uid=85899, iid=48516, r_ui=4.0, est=3.9890119699729993, details={'was_impossible': False}),
 Prediction(uid=16521, iid=4896, r_ui=3.5, est=3.625512249203569, details={'was_impossible': False}),
 Prediction(uid=199688, iid=67255, r_ui=4.5, est=3.6081617538389814, details={'was_impossible': False}),
 Prediction(uid=274004, iid=1672, r_ui=4.0, est=3.7175531321139395, details={'was_impossible': False}),
 Prediction(uid=276446, iid=2792, r_ui=4.0, est=3.1978018351709827, details={'was_impossible': False})]

In [39]:
def get_top_n_recommendations(predictions, n=5):
    # Build a dictionary of predictions for each user
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Now sort the predictions for each user and get the n highest rated items
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [55]:
top_n_recommendations = get_top_n_recommendations(predictions, n=5)

In [None]:
top_n_recommendations[]

[(457, 3.966313093488326)]

In [69]:
user_id = 320615
top_n_recommendations[user_id]

[(457, 3.966313093488326)]

In [70]:
user_id = 320615

top_5_for_user = top_n_recommendations[user_id]

print(f"Top 5 recommendations for user {user_id}:")
for movie_id, est_rating in top_5_for_user:
    print(f"Movie ID: {movie_id}, Predicted Rating: {est_rating:.2f}")


Top 5 recommendations for user 320615:
Movie ID: 457, Predicted Rating: 3.97


In [60]:
top_n_recommendations.get(user_id, [])

[]

In [57]:
print(top_5_for_user)

[]


In [46]:
top_5_for_user

[]

In [52]:
for movie_id, est_rating in top_5_for_user:
    print(f"Movie ID: {movie_id}, Predicted Rating: {est_rating:.2f}")
    #print('1')

TypeError: 'NoneType' object is not iterable

In [None]:
accuracy.fcp(predictions)

In [None]:
top_n = defaultdict(list)

In [None]:
from collections import defaultdict

# given prediction for a set of users, get the top n ranked for each user 

def get_top_n(predictions, n):

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        # print(f"Adding prediction: uid={uid}, iid={iid}, est={est}") 
        top_n[uid].append((iid, est, true_r))
        
    #print("Top N dictionary before sorting:", dict(top_n))
    
    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
        #print(top_n[uid])
    return top_n

In [None]:
top_n_preds_test = get_top_n(predictions, 3)

In [None]:
top_n_preds_test['10']