In [None]:
# Install Surprise into the *kernel's* environment (%pip, not !pip) under its
# documented distribution name. TODO: pin the version once a known-good one is chosen.
%pip install -q scikit-surprise



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import surprise
from surprise import SVD
from surprise import accuracy
from surprise import KNNBasic, KNNWithMeans, KNNBaseline
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import  Dataset
from surprise.model_selection import cross_validate,GridSearchCV
import re
import random  

In [None]:
# Single place to change when the dataset moves; every CSV below is read from here.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/movie_lens'

movie_df = pd.read_csv(DATA_DIR + '/movie.csv')
rating_df = pd.read_csv(DATA_DIR + '/rating.csv')
link_df = pd.read_csv(DATA_DIR + '/link.csv')
tag_df = pd.read_csv(DATA_DIR + '/tag.csv')
genome_tags_df = pd.read_csv(DATA_DIR + '/genome_tags.csv')
genome_scores_df = pd.read_csv(DATA_DIR + '/genome_scores.csv')

In [None]:
# Attach the link columns that movie_df does not have yet by joining on the
# shared 'movieId' key. Gluing the two frames side by side (concat, axis=1)
# pairs rows purely by position, which silently attaches the wrong ids to a
# movie whenever the two CSVs differ in row order or length.
# Selecting only the missing columns keeps this cell safe to re-run.
new_link_cols = [col for col in link_df.columns if col not in movie_df.columns]
movie_df = movie_df.merge(
    link_df[['movieId'] + new_link_cols],
    on='movieId',
    how='left',              # keep every movie, even one without a link row
    validate='many_to_one',  # fail loudly if link_df repeats a movieId
)

In [None]:
# Preview the first rows of movie_df after the link columns were added.
movie_df.head(5)

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


### Cleaning Data


In [None]:
def movie_rating_sample_data(movie_df, rating_df, sample_size=None, random_state=None):
  """Join every rating to its movie's metadata, optionally down-sampling the result.

  Parameters
  ----------
  movie_df : DataFrame with a 'movieId' column (movie metadata).
  rating_df : DataFrame with a 'movieId' column (one row per rating).
  sample_size : int or None
      When given and smaller than the merged frame, return that many rows drawn
      at random. The default (None) returns the full merge.
  random_state : passed to DataFrame.sample so a sample can be reproduced.

  Returns
  -------
  DataFrame
      Inner join of the two inputs on 'movieId' (or a random subset of it).
  """
  movie_rating_combined = pd.merge(movie_df, rating_df, on='movieId')
  # Only sample when it would actually shrink the frame; asking for more rows
  # than exist would otherwise raise inside DataFrame.sample.
  if sample_size is not None and sample_size < len(movie_rating_combined):
    return movie_rating_combined.sample(n=sample_size, random_state=random_state)
  return movie_rating_combined

In [None]:
def filter_less_thrsh(combined_small, thresh=1000):
  """Keep only the rows whose title has strictly more than `thresh` ratings.

  The count is taken per 'title' over the 'rating' column, so a title with
  exactly `thresh` ratings is dropped as well. The original row index of the
  surviving rows is preserved.
  """
  ratings_per_title = combined_small.groupby('title')['rating'].transform('count')
  popular_enough = ratings_per_title > thresh
  return combined_small[popular_enough]

In [None]:
def remove_rename(combined_small):
  """Reduce the merged frame to a (user, item, rating) table.

  Selects the 'userId', 'movieId' and 'rating' columns, in that order, and
  renames the first two to 'user' and 'item'. The input frame is left untouched.
  """
  wanted_columns = ['userId', 'movieId', 'rating']
  new_names = {'userId': 'user', 'movieId': 'item'}
  return combined_small[wanted_columns].rename(columns=new_names)

In [None]:
# Merge ratings with movie metadata, then drop the titles that do not clear
# filter_less_thrsh's default rating-count threshold.
combined_small = filter_less_thrsh(movie_rating_sample_data(movie_df, rating_df))

In [None]:
# Let's check the last few rows of the filtered data.
combined_small.tail()

Unnamed: 0,user,item,rating
19985523,28195,114240,4.0
19985524,51334,114240,3.0
19985525,120575,114240,2.5
19985526,124998,114240,2.5
19985527,138177,114240,4.0


## Building Collaborative filtering

In [None]:
# Number of distinct movie titles present in combined_small.
distinct_titles = combined_small['title'].unique()
print(len(distinct_titles))

3159


In [None]:
# Keep only userId/movieId/rating and rename the first two to user/item.
# NOTE: this overwrites combined_small, so the cell cannot simply be re-run:
# a second run hands remove_rename a frame that no longer has 'userId' /
# 'movieId' and the column selection raises KeyError. Re-run from the cell
# that builds combined_small instead.
combined_small = remove_rename(combined_small)

In [None]:
# Confirm that only the user/item/rating columns remain after remove_rename.
combined_small.tail(5)

Unnamed: 0,user,item,rating
19985523,28195,114240,4.0
19985524,51334,114240,3.0
19985525,120575,114240,2.5
19985526,124998,114240,2.5
19985527,138177,114240,4.0


In [None]:
# combined_small.to_csv('/content/drive/MyDrive/Colab Notebooks/movie_lens/combined_small_v2.csv')

In [None]:
# Seed both RNGs so the shuffle below, the cross-validation folds and SVD's
# random initialisation are repeatable from one run to the next.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Take the rating scale from the data instead of hard-coding (1, 5): the
# ratings come in half-star steps (e.g. 2.5), and Surprise clips estimates to
# this range, so a scale narrower than the data distorts low-end predictions.
rating_scale = (float(combined_small['rating'].min()),
                float(combined_small['rating'].max()))
reader = Reader(rating_scale=rating_scale)

# load_from_df expects exactly the columns user, item, rating (in that order).
data = Dataset.load_from_df(combined_small, reader)
raw_ratings = data.raw_ratings

# Shuffle in place, then hold out the last 10% as an untouched test set.
random.shuffle(raw_ratings)
threshold = int(.9 * len(raw_ratings))
trainset_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# From here on `data` only contains the 90% train split, so the grid search
# below never sees the held-out ratings.
data.raw_ratings = trainset_raw_ratings

# Select the algorithm's hyper-parameters by cross-validated grid search.
print('grid search ....')

# NOTE(review): lr_all=0.8 is far above SVD's documented default (0.005), and
# only one n_epochs value is searched — confirm this grid is intended.
param_grid = {
    'n_epochs': [5],
    'lr_all': [0.1, 0.8],
}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], joblib_verbose=2)
grid_search.fit(data)

# Unfitted SVD instance configured with the parameters that gave the best RMSE.
algo = grid_search.best_estimator['rmse']


grid search ....


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 61.5min finished


In [None]:
# Retrain the selected configuration on the whole train set (data.raw_ratings
# was narrowed to the 90% train split before the grid search).
trainset = data.build_full_trainset()
algo.fit(trainset)

# Score on the held-out ratings that were set aside before the grid search.
testset = data.construct_testset(test_raw_ratings)
predictions = algo.test(testset)
print('Accuracy on the testset:')
# accuracy.rmse already prints 'RMSE: ...' and returns the value; wrapping it
# in print() echoed the same number a second time.
test_rmse = accuracy.rmse(predictions)

# Persist the fitted algorithm together with its predictions. Previously only
# the predictions were written, so the file named "my_model" held no model.
surprise.dump.dump('/content/drive/MyDrive/Colab Notebooks/movie_lens/my_model_v3',
                   predictions=predictions, algo=algo)

Accuracy on the testset:
RMSE: 0.9801
0.9800557814867045


In [None]:
# Inspect one entry (index 1) of the test-set predictions.
predictions[1]

Prediction(uid=85685, iid=329, r_ui=5.0, est=4.181642435783127, details={'was_impossible': False})