##### Trying the BellKor's Pragmatic Chaos algorithm

In [66]:
import pandas as pd
import numpy as np

In [67]:
from sklearn.metrics import r2_score

In [68]:
from surprise import Dataset, Reader
from surprise import SVD, SVDpp, accuracy
from surprise.model_selection import train_test_split, GridSearchCV

In [69]:
from collections import defaultdict

In [70]:
df = pd.read_csv('/Users/zachariamwaura/Documents/Flatiron/Phase_4/Phase_4_Project/DATA/ratings.csv', index_col=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [71]:
df_sample = df.sample(n=100000, random_state=1)
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 33179850 to 21265524
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100000 non-null  int64  
 1   movieId    100000 non-null  int64  
 2   rating     100000 non-null  float64
 3   timestamp  100000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.8 MB


In [72]:
df_sample['userId'].value_counts()

userId
189614    105
207216     40
76618      30
48766      30
233891     29
         ... 
219707      1
313398      1
219351      1
257006      1
207550      1
Name: count, Length: 61540, dtype: int64

In [73]:
#df_sample = df_sample.drop('timestamp', axis=1)

--------------

In [74]:
reader = Reader(rating_scale=(1,5))

In [75]:
data = Dataset.load_from_df(df_sample[['userId', 'movieId', 'rating']], reader)

In [76]:
trainset, testset = train_test_split(data, test_size=.2, random_state=42)

-----------------

In [77]:
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x146d8e090>

In [78]:
svdpp = SVDpp()
svdpp.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x146d8fe60>

In [79]:
svd_predictions = svd.test(testset)
svdpp_predictions = svdpp.test(testset)

In [80]:
svd_rmse = accuracy.rmse(svd_predictions)
svdpp_rmse = accuracy.rmse(svdpp_predictions)

RMSE: 0.9865
RMSE: 0.9863


In [81]:
print(f"SVD RMSE: {svd_rmse}")
print(f"SVD++ RMSE: {svdpp_rmse}")

SVD RMSE: 0.9864541434721968
SVD++ RMSE: 0.9862737619811071


--------------

In [82]:
df_sample.sample(n=1)

Unnamed: 0,userId,movieId,rating,timestamp
29264832,285882,173,3.0,835627114


In [83]:
svd_predictions

[Prediction(uid=320615, iid=457, r_ui=4.0, est=3.9543200042948774, details={'was_impossible': False}),
 Prediction(uid=85899, iid=48516, r_ui=4.0, est=3.9747269375406638, details={'was_impossible': False}),
 Prediction(uid=16521, iid=4896, r_ui=3.5, est=3.4102952265844455, details={'was_impossible': False}),
 Prediction(uid=199688, iid=67255, r_ui=4.5, est=3.5513593898113265, details={'was_impossible': False}),
 Prediction(uid=274004, iid=1672, r_ui=4.0, est=3.7256412372523626, details={'was_impossible': False}),
 Prediction(uid=276446, iid=2792, r_ui=4.0, est=3.214325241658067, details={'was_impossible': False}),
 Prediction(uid=180788, iid=3897, r_ui=3.0, est=3.7198311202390144, details={'was_impossible': False}),
 Prediction(uid=276405, iid=135885, r_ui=3.0, est=3.4154138091163726, details={'was_impossible': False}),
 Prediction(uid=302622, iid=1375, r_ui=3.0, est=3.304457732072382, details={'was_impossible': False}),
 Prediction(uid=214968, iid=4886, r_ui=3.0, est=3.755268744294304

In [84]:
pp = pd.DataFrame(svd_predictions)
pp

Unnamed: 0,uid,iid,r_ui,est,details
0,320615,457,4.0,3.954320,{'was_impossible': False}
1,85899,48516,4.0,3.974727,{'was_impossible': False}
2,16521,4896,3.5,3.410295,{'was_impossible': False}
3,199688,67255,4.5,3.551359,{'was_impossible': False}
4,274004,1672,4.0,3.725641,{'was_impossible': False}
...,...,...,...,...,...
19995,243459,513,2.0,3.505424,{'was_impossible': False}
19996,82623,72369,3.5,3.547431,{'was_impossible': False}
19997,1011,56511,3.0,3.544489,{'was_impossible': False}
19998,226523,318,5.0,4.462517,{'was_impossible': False}


In [85]:
pp['uid'].value_counts()

uid
189614    19
76618     12
207216     9
233891     8
186916     8
          ..
269108     1
5133       1
154915     1
110478     1
281002     1
Name: count, Length: 17352, dtype: int64

In [86]:
def get_top_n_recommendations(predictions, n=5):
    # Build a dictionary of predictions for each user
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Now sort the predictions for each user and get the n highest rated items
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [87]:
testset_full = trainset.all_items()  # This will contain all movieId items
predictions_svd = [svd.predict(uid, iid) for uid in df_sample['userId'] for iid in testset_full]

KeyboardInterrupt: 

In [88]:
svd.predict(238308, 5995)

Prediction(uid=238308, iid=5995, r_ui=None, est=4.2042869794098925, details={'was_impossible': False})

In [89]:
top_n_svd = get_top_n_recommendations(svd_predictions, n=5)

In [91]:
user_id = 207216
print(f"Top 5 recommendations for user {user_id} using SVD:")
for iid, rating in top_n_svd[user_id]:
    print(f"MovieId: {iid}, Predicted Rating: {rating:.2f}")

Top 5 recommendations for user 207216 using SVD:
MovieId: 2183, Predicted Rating: 3.79
MovieId: 8502, Predicted Rating: 3.39
MovieId: 67068, Predicted Rating: 3.39
MovieId: 53554, Predicted Rating: 3.39
MovieId: 4418, Predicted Rating: 3.39


In [None]:
MovieId: 84189, Predicted Rating: 3.30
MovieId: 90071, Predicted Rating: 3.14
MovieId: 105855, Predicted Rating: 3.14
MovieId: 151453, Predicted Rating: 3.14
MovieId: 163839, Predicted Rating: 3.14

In [None]:
# top_n_recommendations = get_top_n_recommendations(svd_predictions, n=5)

In [None]:
# user_id = 192762
# top_5_for_user = top_n_recommendations.get(user_id, [])
# print(f"Top 5 recommendations for user {user_id}:")
# for movie_id, est_rating in top_5_for_user:
#     print(f"Movie ID: {movie_id}, Predicted Rating: {est_rating:.2f}")

Top 5 recommendations for user 192762:
Movie ID: 19, Predicted Rating: 3.25


-------------------------------------

In [None]:
user_id = 190105
movie_id = 1544

In [None]:
svd_prediction = svd.predict(user_id, movie_id)
print(f"SVD prediction for user {user_id} and movie {movie_id}: {svd_prediction.est:.2f}")

In [None]:
svdpp_prediction = svdpp.predict(user_id, movie_id)
print(f"SVD++ prediction for user {user_id} and movie {movie_id}: {svdpp_prediction.est:.2f}")

In [None]:
# Combine SVD and SVD++ predictions (simple average)
def ensemble_predictions(pred_svd, pred_svdpp):
    return (pred_svd.est + pred_svdpp.est) / 2

# Get predictions for the same user and movie
ensemble_prediction = ensemble_predictions(svd_prediction, svdpp_prediction)
print(f"Ensemble prediction for user {user_id} and movie {movie_id}: {ensemble_prediction:.2f}")


--------------

In [None]:
def get_top_n(predictions, n=5):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Sort the predictions for each user and retrieve the top N
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
        
    return top_n

In [None]:
# Generate predictions for all user-item pairs
testset_full = trainset.all_items()  # This will contain all movieId items
predictions_svd = [svd.predict(uid, iid) for uid in df['userId'].unique() for iid in testset_full]

In [None]:
# Get top 5 recommendations for each user
# top_n_svd = get_top_n(predictions_svd, n=5)

In [None]:
# user_id = 138297
# print(f"Top 5 recommendations for user {user_id} using SVD:")
# for iid, rating in top_n_svd[user_id]:
#     print(f"MovieId: {iid}, Predicted Rating: {rating:.2f}")

In [None]:
# mae = accuracy.mae(svd_prediction)