## Packages - Installation & Import

In [3]:
#surprise package installation
!pip install surprise



In [1]:
#import relevant packages
import pandas as pd
import time
import sys
import pickle
from surprise import Dataset, NormalPredictor, Reader,SVDpp,KNNBasic,accuracy,CoClustering
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split



# The CSV is read as header-less, so the column labels are supplied explicitly.
colnames = ['time', 'userid', 'movieid', 'ratings']
df = pd.read_csv(
    'kafka_log_sample.csv',
    header=None,
    names=colnames,
)

In [2]:
# Sanity check: size of the loaded frame as (rows, columns).
df.shape

(1014609, 4)

### Taking the 1000 most popular movies (by number of ratings)

In [3]:
# Count the rows per movie id, order the counts from most to least frequent,
# and keep the ids of the first 1000 entries.
top_1000_movies = (
    df.groupby('movieid')
    .size()
    .sort_values(ascending=False)
    .head(1000)
    .index
    .tolist()
)

# Persist the candidate list to disk as a pickle file.
with open('top_1000_movie_list_final.pkl', 'wb') as out_file:
    pickle.dump(top_1000_movies, out_file)

## Load Data

In [4]:
# A Reader is still needed when loading from a DataFrame, but only its
# rating_scale parameter is required.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
dataset = Dataset.load_from_df(df[["userid", "movieid", "ratings"]], reader)

# Hold out 25% of the ratings for evaluation. The split shuffles the ratings,
# so random_state is pinned to make the train/test partition (and therefore
# the reported metrics) reproducible on Restart & Run All.
trainset, testset = train_test_split(dataset, test_size=0.25, random_state=42)



## Algorithm 1 : Singular Value Decomposition (SVD++)

In [5]:
algo =  SVDpp()

# Train the algorithm on the trainset, and predict ratings for the testset
%time algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

CPU times: user 7.7 s, sys: 42.7 ms, total: 7.74 s
Wall time: 7.75 s
RMSE: 0.6834


0.683399136655341

In [6]:
# Preview the first rows of the loaded ratings frame.
df.head()

Unnamed: 0,time,userid,movieid,ratings
0,2023-02-07T18:10:53,796421,set+it+off+1996,3
1,2023-02-07T18:10:53,7488,it+follows+2015,4
2,2023-02-07T18:10:54,720622,the+matrix+1999,4
3,2023-02-07T18:10:54,425095,harry+potter+and+the+deathly+hallows+part+1+2010,4
4,2023-02-07T18:10:54,731167,taking+care+of+business+1990,3


## Test Recommendations and inference time for a User ID using SVDpp

In [7]:
%%time
# Generate recommendations for user 1
user_id ="309409"
items_to_predict=top_1000_movies
predicted_ratings = []
for item_id in items_to_predict:
    predicted_rating = algo.predict(user_id, item_id).est
    predicted_ratings.append((item_id, predicted_rating))

# Sort the predicted ratings in descending order
predicted_ratings = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)

# Remove already watched movies
predicted_ratings = pd.DataFrame(predicted_ratings,columns = ['movieid','predicted_rating'])
#predicted_ratings['userid'] == user_id
reference = df[['userid','movieid']].copy()
predicted_ratings_unwatched = predicted_ratings[~predicted_ratings['movieid'].isin(reference[reference['userid'] == user_id]['movieid'])]

print('Top 10 movie recommendations')
#predicted_ratings[:10]
predicted_ratings_unwatched[:20]
                            
                                                
    





Top 10 movie recommendations
CPU times: user 30.4 ms, sys: 10.3 ms, total: 40.7 ms
Wall time: 41 ms


Unnamed: 0,movieid,predicted_rating
0,the+shawshank+redemption+1994,4.428108
1,the+godfather+1972,4.323661
2,the+empire+strikes+back+1980,4.284843
3,the+usual+suspects+1995,4.274249
4,wait+until+dark+1967,4.26041
5,the+wrong+trousers+1993,4.249172
6,the+lord+of+the+rings+the+return+of+the+king+2003,4.240475
7,ordet+1955,4.235991
8,rear+window+1954,4.234633
9,casablanca+1942,4.232123


## Algorithm 2 : Co-Clustering

In [8]:
algo2 =  CoClustering()

# Train the algorithm on the trainset, and predict ratings for the testset
%time algo2.fit(trainset)
predictions2 = algo2.test(testset)

# Then compute RMSE
accuracy.rmse(predictions2)

CPU times: user 39.2 s, sys: 82.9 ms, total: 39.3 s
Wall time: 39.3 s
RMSE: 0.7984


0.7984214776212446

## Test Recommendations and inference time for a User ID using CoClustering

In [9]:
%%time
# Generate recommendations for user 1
user_id = 454384
items_to_predict = top_1000_movies
predicted_ratings = []
for item_id in items_to_predict:
    predicted_rating = algo2.predict(user_id, item_id).est
    predicted_ratings.append((item_id, predicted_rating))

# Sort the predicted ratings in descending order
predicted_ratings = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)

# Remove already watched movies
predicted_ratings = pd.DataFrame(predicted_ratings,columns = ['movieid','predicted_rating'])
#predicted_ratings['userid'] == user_id
reference = df[['userid','movieid']].copy()
predicted_ratings_unwatched = predicted_ratings[~predicted_ratings['movieid'].isin(reference[reference['userid'] == user_id]['movieid'])]

print('Top 10 movie recommendations')
#predicted_ratings[:10]
predicted_ratings_unwatched[:10]
                            
                                                
    






Top 10 movie recommendations
CPU times: user 34.7 ms, sys: 7.29 ms, total: 42 ms
Wall time: 41.9 ms


Unnamed: 0,movieid,predicted_rating
0,death+proof+2007,4.918604
1,django+unchained+2012,4.908837
2,jackie+brown+1997,4.890591
3,kill+bill+vol.+2+2004,4.858979
4,pulp+fiction+1994,4.742707
5,the+wrong+trousers+1993,4.71952
6,das+boot+1981,4.687191
7,the+lord+of+the+rings+the+fellowship+of+the+ri...,4.682292
8,the+secret+in+their+eyes+2009,4.681073
9,raiders+of+the+lost+ark+1981,4.678215


## Saving the SVDpp model to a pickle file - Production model

In [10]:
import pickle

# Serialize the fitted SVDpp model (`algo`) to disk.
with open('movie_recommender_model_final.pkl', 'wb') as model_file:
    pickle.dump(algo, model_file)

## Saving the Co-Clustering model to a pickle file

In [12]:
import pickle

# Serialize the fitted Co-Clustering model (`algo2`) to disk.
with open('movie_recommender_model_co_clustering.pkl', 'wb') as model_file:
    pickle.dump(algo2, model_file)