In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
! pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.5MB 2.8MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1678551 sha256=5b3d7060c744bf1c42d93f9322f54b62ce2890052ee190e22350aef23208d8c5
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.0 surprise-0.1


In [0]:
ROOT_DIR = '/content/drive/My Drive/Recommendation Systems/movielens/'
DATA_DIR = '/content/drive/My Drive/Recommendation Systems/movielens/data/'

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [5]:
#Load the u.data file into a dataframe
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(DATA_DIR+'u.data', sep='\t', names=r_cols,encoding='latin-1')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [0]:
#Drop the timestamp column
ratings = ratings.drop('timestamp', axis=1)

In [7]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [8]:
print('No of distinct users: ',ratings['user_id'].nunique())
print('No of distinct movies: ',ratings['movie_id'].nunique())

No of distinct users:  943
No of distinct movies:  1682


In [0]:
'''
X = ratings.copy()
y  = ratings['user_id']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,stratify=y,random_state=42)
'''

In [0]:
'''
print('Train set size: ',X_train.shape)
print('Test set size: ',X_test.shape)
print('No of users in train set: ',X_train['user_id'].nunique())
print('No of users in test set: ',X_test['user_id'].nunique())
'''

# **Model based Collaborative Filtering**

In [0]:
from surprise import Reader,Dataset
from surprise import KNNWithMeans
from surprise.model_selection import GridSearchCV,train_test_split
from surprise import accuracy

In [0]:
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(ratings,reader)
#data = Dataset.load_builtin('ml-100k')

In [0]:
trainset, testset = train_test_split(data, test_size=.25)

In [14]:
param_grid = {'k':[20,30,40,50],
              'sim_options':{
                              'name':['cosine','msd'],
                              'user_based':[True]              
                            }
              }

algo = KNNWithMeans

gs = GridSearchCV(algo,param_grid,measures=['rmse','mae'],cv=3)

gs.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarit

In [15]:
#Grid Search Results
gs.cv_results

{'mean_fit_time': array([0.73962267, 0.23431253, 0.72840341, 0.2367533 , 0.73529212,
        0.24068721, 0.71583096, 0.23812675]),
 'mean_test_mae': array([0.7675249 , 0.76011001, 0.76165387, 0.75577752, 0.7589158 ,
        0.7538079 , 0.75782618, 0.75327059]),
 'mean_test_rmse': array([0.97016999, 0.96351343, 0.96397314, 0.95851224, 0.96137129,
        0.95654228, 0.96031564, 0.95602028]),
 'mean_test_time': array([4.32756154, 4.32173292, 4.81202157, 4.83022936, 5.31991275,
        5.24883731, 5.63941844, 5.6654768 ]),
 'param_k': [20, 20, 30, 30, 40, 40, 50, 50],
 'param_sim_options': [{'name': 'cosine', 'user_based': True},
  {'name': 'msd', 'user_based': True},
  {'name': 'cosine', 'user_based': True},
  {'name': 'msd', 'user_based': True},
  {'name': 'cosine', 'user_based': True},
  {'name': 'msd', 'user_based': True},
  {'name': 'cosine', 'user_based': True},
  {'name': 'msd', 'user_based': True}],
 'params': [{'k': 20, 'sim_options': {'name': 'cosine', 'user_based': True}},
  {'

In [16]:
gs.best_params

{'mae': {'k': 50, 'sim_options': {'name': 'msd', 'user_based': True}},
 'rmse': {'k': 50, 'sim_options': {'name': 'msd', 'user_based': True}}}

In [17]:
#Train the model with best params
algo = KNNWithMeans(k=50,sim_options={'name': 'msd', 'user_based': True})
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f9d1d3545f8>

In [0]:
predictions = algo.test(testset)

In [23]:
accuracy.rmse(predictions)

RMSE: 0.9444


0.9443506314956355

In [41]:
uid = str(176)
iid = str(303)
# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=False).est
pred

3.528653333333333

In [43]:
#Load the u.items file into a dataframe
i_cols = ['movie_id', 'title' ,'release date','video release date', 'IMDbURL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama','Fantasy','Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 
'Sci-Fi','Thriller', 'War', 'Western']
movies_df = pd.read_csv(DATA_DIR+'u.item', sep='|', names=i_cols,encoding='latin-1')
movies_df.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDbURL,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [0]:
#Recommend movies to an user with user_id = 200

from collections import defaultdict

movies_watched = list(ratings[ratings['user_id']==200]['movie_id'].values)
movies = list(ratings['movie_id'].unique())
movies_not_watched = list(set(movies)-set(movies_watched))

pred_movie_ratings = defaultdict(int)

for movie in movies_not_watched:
  pred_movie_ratings[movie]=algo.predict(200,movie,verbose=False).est

pred_movie_ratings = [(k,v) for (k,v) in pred_movie_ratings.items()]
pred_movie_ratings_sorted = sorted(pred_movie_ratings,key=lambda x:x[1],reverse=True)
recommended_movies_ids = [k for (k,v) in pred_movie_ratings_sorted][:20] #Fetch top 20 movies to recommend
recommended_movies = [movies_df[movies_df['movie_id']==movie_id]['title'].values[0] for movie_id in recommended_movies_ids]

In [51]:
recommended_movies

['Maya Lin: A Strong Clear Vision (1994)',
 'Love and Death on Long Island (1997)',
 'Celestial Clockwork (1994)',
 'Prefontaine (1997)',
 'Star Kid (1997)',
 'Story of Xinghua, The (1993)',
 'Search for One-eye Jimmy, The (1996)',
 'Mina Tannenbaum (1994)',
 'Mondo (1996)',
 'Anna (1996)',
 'Pather Panchali (1955)',
 'Saint of Fort Washington, The (1993)',
 'Santa with Muscles (1996)',
 'Aiqing wansui (1994)',
 "Someone Else's America (1995)",
 'Nightwatch (1997)',
 "Some Mother's Son (1996)",
 'Butcher Boy, The (1998)',
 'Butcher Boy, The (1998)',
 'Spanish Prisoner, The (1997)']