In [None]:
!pip install -r requirements.txt

In [None]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import surprise
from compress_pickle import load, dump

In [None]:
# The ratings are shipped as two CSV parts; read both and stack them.
rating1 = pd.read_csv('../data/data_v2/ratings_v2_1.csv')
rating2 = pd.read_csv('../data/data_v2/ratings_v2_2.csv')
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating the row labels of each part.
ratings = pd.concat([rating1, rating2], ignore_index=True)
# Only the last expression of a cell is rendered, so print the shape
# explicitly and leave head() as the displayed value.
print(ratings.shape)
ratings.head()

In [None]:
# Distinct values per column, printed and then drawn as a bar chart.
unique_per_column = ratings.nunique(axis=0)
print(unique_per_column)
sns.barplot(x=ratings.columns, y=unique_per_column, palette='Blues_d')

In [None]:
# Univariate analysis: one figure per column of the ratings frame.

# Figure 1 - number of ratings for the 50 most frequently rated movies.
plt.figure(1, figsize=(16, 4))
movie_counts = ratings['movie_id'].value_counts()
movie_counts.head(50).plot.bar()

# Figure 2 - number of ratings for the 50 most active users.
plt.figure(2, figsize=(16, 4))
user_counts = ratings['user_id'].value_counts()
user_counts.head(50).plot.bar()

# Figure 3 - distribution of the rating values themselves.
plt.figure(3, figsize=(8, 4))
ratings['rate'].plot.hist()

In [None]:
# How many movies each user has rated, shown as a histogram.
ratings_by_user = ratings.groupby('user_id')
ratings_per_user = ratings_by_user['movie_id'].count()
ratings_per_user.hist()

In [None]:
# How many users have rated each movie, shown as a histogram.
ratings_by_movie = ratings.groupby('movie_id')
ratings_per_movie = ratings_by_movie['user_id'].count()
ratings_per_movie.hist()

# Collaborative Filtering Algorithms

In [None]:
from surprise import KNNWithMeans
from surprise import SVD

# kNN with per-item mean centring; user_based=False selects item-item
# similarity, measured with the cosine metric.
similarity = dict(name="cosine", user_based=False)
algo_KNN = KNNWithMeans(sim_options=similarity)

# SVD matrix factorisation with the library's default hyper-parameters.
algo_SVD = SVD()

In [None]:
# Random subsample of the ratings for quick experiments.
# - n is capped at the frame length so the cell also runs on datasets with
#   fewer than 50,000 rows (sample() raises when n exceeds the population).
# - random_state pins the draw so a Restart & Run All yields the same subset.
ratings_small = ratings.sample(n=min(50000, len(ratings)), random_state=42)

# movie_rating_set = pd.crosstab(index = ratings_small.user_id, columns = ratings_small.movie_id, values = ratings_small.rate, aggfunc = np.mean)

In [None]:
# Report how many distinct users and movies appear in the full ratings frame.
n_unique_users = len(ratings['user_id'].unique())
n_unique_movies = len(ratings['movie_id'].unique())
print("Number of unique users in the dataset:", n_unique_users)
print("Number of unique movies in the dataset:", n_unique_movies)

# Train-Test Split

#### Hold-out Validation (single train/test split)

In [None]:
from surprise import Dataset
from surprise import Reader

# Wrap the ratings frame in a Surprise Dataset. load_from_df is given the
# columns in the order user id, item id, rating; the Reader declares the
# rating scale as 0 to 5.
rating_columns = ['user_id', 'movie_id', 'rate']
reader = Reader(rating_scale=(0, 5))
rating_df = Dataset.load_from_df(ratings[rating_columns], reader)

In [None]:
from surprise.model_selection import train_test_split
from surprise import accuracy

# define train test function
def train_test_algo(algo, label, test_size=0.2, random_state=None):
    """Fit ``algo`` on a random split of ``rating_df`` and score it on the rest.

    The module-level Surprise dataset ``rating_df`` is split into a training
    and a testing part, ``algo`` is fitted on the training part (the fit is
    timed and the duration printed), and RMSE / MAE / MSE on the testing part
    are printed, each tagged with ``label``.

    Parameters
    ----------
    algo
        A Surprise algorithm instance exposing ``fit`` and ``test``.
    label : str
        Name shown next to each printed metric.
    test_size : float, default 0.2
        Share of the ratings held out for testing; forwarded to
        ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an integer to get the same
        split on every run; ``None`` keeps the split random.

    Returns
    -------
    tuple
        ``(algo, test_df)`` - the fitted algorithm and a DataFrame built from
        the predictions returned by ``algo.test`` (one row per test rating).
    """
    training_set, testing_set = train_test_split(
        rating_df, test_size=test_size, random_state=random_state
    )

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    algo.fit(training_set)
    Training_time = time.perf_counter() - start_time
    print("Training Time :",Training_time)

    test_output = algo.test(testing_set)
    test_df = pd.DataFrame(test_output)

    print("RMSE -",label, accuracy.rmse(test_output, verbose = False))
    print("MAE -", label, accuracy.mae(test_output, verbose=False))
    print("MSE -", label, accuracy.mse(test_output, verbose=False))

    return algo,test_df

In [None]:
def _train_and_save(algo, label, model_path):
    """Train/evaluate ``algo``, store it lzma-compressed at ``model_path``.

    Creates the parent directory of ``model_path`` when it is missing (dump
    would otherwise fail on a fresh checkout), prints the head of the test
    predictions, and returns ``(fitted_algo, test_df)``.
    """
    fitted, test_df = train_test_algo(algo, label)

    os.makedirs(os.path.dirname(model_path) or '.', exist_ok=True)
    # compress_pickle writes an lzma-compressed pickle; the file name is kept
    # exactly as given (no compression extension is appended).
    dump(fitted, model_path, compression="lzma", set_default_extension=False)

    print(test_df.head())
    return fitted, test_df


# get test result and persist each fitted model
knn, train_test_KNN = _train_and_save(algo_KNN, "algo_KNN", 'KNN/algo_KNN.pkl')

svd, train_test_SVD = _train_and_save(algo_SVD, "algo_SVD", 'SVD/algo_SVD.pkl')

# Provide Top Recommendations

In [None]:
# Movie metadata; top_recommendations() joins it on 'movie_id' to attach the
# 'title' column to the predicted ratings.
movie_df = pd.read_csv("../data/data_v2/movie_info_v2.csv")

In [None]:
def prediction(algo, user_id, movie_ids=None):
    """Predict the rating ``user_id`` would give to each candidate movie.

    Parameters
    ----------
    algo
        Fitted model exposing ``predict(user_id, movie_id)`` whose result has
        an ``est`` attribute (the estimated rating).
    user_id
        Identifier of the user to score, passed to ``algo.predict`` unchanged.
    movie_ids : iterable, optional
        Movie identifiers to score. Defaults to ``range(26826)``, the id range
        this notebook was written for.
        NOTE(review): that default assumes movie ids are the contiguous
        integers 0..26825 - confirm against the movie catalogue, or pass the
        real ids explicitly.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``movie_id``, ``rate``; one row per scored movie,
        in the order the ids were supplied.
    """
    if movie_ids is None:
        movie_ids = range(26826)

    pred_list = [
        [user_id, movie_id, algo.predict(user_id, movie_id).est]
        for movie_id in movie_ids
    ]
    return pd.DataFrame(pred_list, columns=['user_id', 'movie_id', 'rate'])

In [None]:
def top_recommendations(pred_df, top_N, movies=None):
    """Attach titles to predicted ratings and pick each user's best movies.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Predictions with columns ``user_id``, ``movie_id`` and ``rate``.
    top_N : int
        Number of highest-rated movies to keep per user.
    movies : pandas.DataFrame, optional
        Movie metadata containing ``movie_id`` and ``title``. Defaults to the
        module-level ``movie_df``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sorted_df, top_recommended_movies)``. ``sorted_df`` holds every
        prediction with its title, ordered by user and then by descending
        predicted rating, with a fresh 0..n-1 index.
        ``top_recommended_movies`` is the first ``top_N`` rows of each user
        in that ordering.
    """
    if movies is None:
        movies = movie_df

    # Left join so predictions for movies missing from the catalogue are kept
    # (their title is NaN).
    recommended_movie = pd.merge(pred_df, movies, how='left', on='movie_id')[
        ['user_id', 'movie_id', 'rate', 'title']
    ]

    # A single multi-key sort replaces groupby(...).apply(sort_values):
    # applying over the grouping column is deprecated in recent pandas and can
    # drop 'user_id' from the result, which would break the groupby below.
    sorted_df = recommended_movie.sort_values(
        ['user_id', 'rate'], ascending=[True, False]
    ).reset_index(drop=True)

    top_recommended_movies = sorted_df.groupby('user_id').head(top_N)
    return sorted_df, top_recommended_movies

In [None]:
#uncompress and load the pickle files
# These assignments rebind algo_KNN / algo_SVD (previously the untrained
# instances created in the setup cell) to the models read back from disk.
# NOTE(security): the files are pickles - unpickling can execute arbitrary
# code, so only load model files produced by this notebook or a trusted source.
algo_KNN = load('KNN/algo_KNN.pkl', compression="lzma", set_default_extension=False)

algo_SVD = load('SVD/algo_SVD.pkl', compression="lzma", set_default_extension=False)

In [None]:
import os
# On-disk size of each compressed model, converted from bytes to MiB.
bytes_per_mib = 1024 * 1024

algo_KNN_size = os.path.getsize('KNN/algo_KNN.pkl') / bytes_per_mib
print(algo_KNN_size)

algo_SVD_size = os.path.getsize('SVD/algo_SVD.pkl') / bytes_per_mib
print(algo_SVD_size)

In [None]:
# Inference benchmark for both models: average time to score every movie for
# one user and rank the top recommendations.
N_BENCHMARK_USERS = 10
TOP_N = 20


def _benchmark_inference(algo, user_ids, top_N):
    """Time prediction + ranking with ``algo`` for each id in ``user_ids``.

    Returns ``(total_seconds, pred_df, recommended_movies, top_movies)``; the
    three frames belong to the LAST user processed (``None`` when
    ``user_ids`` is empty).
    """
    total_seconds = 0.0
    pred_df = recommended_movies = top_movies = None
    for query in user_ids:
        # perf_counter: monotonic clock intended for measuring durations.
        start_time = time.perf_counter()
        pred_df = prediction(algo, query)
        recommended_movies, top_movies = top_recommendations(pred_df, top_N)
        total_seconds += time.perf_counter() - start_time
    return total_seconds, pred_df, recommended_movies, top_movies


# Sample from the DISTINCT user ids: sampling rating rows (even without
# replacement) can return the same user several times.
unique_user_ids = ratings['user_id'].drop_duplicates()
random_user_ids = list(
    unique_user_ids.sample(n=min(N_BENCHMARK_USERS, len(unique_user_ids)), replace=False)
)

(total_inference_time_knn, pred_KNN,
 recommended_movies_KNN, top_recommended_movies_KNN) = _benchmark_inference(
    algo_KNN, random_user_ids, TOP_N)

(total_inference_time_svd, pred_SVD,
 recommended_movies_SVD, top_recommended_movies_SVD) = _benchmark_inference(
    algo_SVD, random_user_ids, TOP_N)

average_inference_time_svd = total_inference_time_svd / len(random_user_ids)
average_inference_time_knn = total_inference_time_knn / len(random_user_ids)
print("knn time :",average_inference_time_knn)
print("SVD time :",average_inference_time_svd)

In [None]:
# Top-20 SVD recommendations for the last user processed by the benchmark loop
# (the variable is overwritten on every iteration).
top_recommended_movies_SVD

In [None]:
# Top-20 KNN recommendations for the last user processed by the benchmark loop
# (the variable is overwritten on every iteration).
top_recommended_movies_KNN