In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [2]:
md=pd.read_csv('data/movies.dat',sep="::",header=None,names=['id','title','genres'],index_col='id')
md.head()

Unnamed: 0_level_0,title,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


In [3]:
md['genres']=md['genres'].fillna('[]').apply(lambda x : x.split(sep='|'))

In [4]:
full_ratings=pd.read_csv('data/ratings.dat',sep="::",header=None,names=['user_id','movie_id','stars','timestamp'])

In [5]:
train_ratings=full_ratings[full_ratings['user_id']<6030]

In [6]:
test_ratings=full_ratings[full_ratings['user_id']>6029]

In [7]:
reader=Reader()

In [8]:
data = Dataset.load_from_df(train_ratings[['user_id', 'movie_id', 'stars']], reader)
data.split(n_folds=5)

In [9]:
svd=SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8731
MAE:  0.6854
------------
Fold 2
RMSE: 0.8725
MAE:  0.6837
------------
Fold 3
RMSE: 0.8752
MAE:  0.6874
------------
Fold 4
RMSE: 0.8724
MAE:  0.6851
------------
Fold 5
RMSE: 0.8735
MAE:  0.6859
------------
------------
Mean RMSE: 0.8733
Mean MAE : 0.6855
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8730527384344431,
                             0.8724622172208251,
                             0.8752302195987951,
                             0.8724477058971948,
                             0.8735401291814835],
                            'mae': [0.6853783728990456,
                             0.6836824412445587,
                             0.6874229898616545,
                             0.6850702304327845,
                             0.6858600286171043]})

In [10]:
trainset = data.build_full_trainset()
svd.train(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1d0b07d8128>

In [59]:
##To predict rating for a particular user
uid=6
preds=[]
row_list=[]
sum_e=0
single_user=full_ratings[full_ratings['user_id']==uid]
for j in single_user['movie_id']:
        preds.append(round(svd.predict(uid,j).est,2))
# t=md['title'].iloc[single_user['movie_id'].iloc[5]]
for i in range(len(preds)):
    err=round(abs(preds[i]-single_user['stars'].iloc[i]),2)
    sum_e=sum_e+err
    t=md['title'].iloc[single_user['movie_id'].iloc[i]]
    dict1={'title':t,'predicted':preds[i],'actual':single_user['stars'].iloc[i],'difference':err}
    row_list.append(dict1)
    
print("approx. error:",sum_e/len(preds),"  for user: ",uid )

approx. error: 0.4056338028169013   for user:  6


In [43]:
##To calculate error of all users
all_error=[]
for k in range(1,6000):
    preds=[]
    uid=k
    single_user=full_ratings[full_ratings['user_id'] == uid]
    for j in single_user['movie_id']:
        preds.append(round(svd.predict(uid,j).est,2))
    sum_e=0
    row_list=[]
    for i in range(len(preds)):
        err=round(abs(preds[i]-single_user['stars'].iloc[i]),2)
        sum_e=sum_e+err
    all_error.append(sum_e/len(preds))
#     print("approx. error:",s/len(preds),"  for user: ",k )

In [44]:
min(all_error)

0.17230769230769227

In [60]:
pd.DataFrame(row_list,columns=['title','actual','predicted','difference'])

Unnamed: 0,title,actual,predicted,difference
0,52 Pick-Up (1986),5,4.02,0.98
1,"Eighth Day, The (Le Huitième jour ) (1996)",4,4.21,0.21
2,But I'm a Cheerleader (1999),4,3.66,0.34
3,To Gillian on Her 37th Birthday (1996),4,3.83,0.17
4,3 Ninjas: High Noon On Mega Mountain (1998),5,4.46,0.54
5,"Ghost and the Darkness, The (1996)",5,4.38,0.62
6,Running Free (2000),4,4.34,0.34
7,Maverick (1994),4,4.21,0.21
8,"Last September, The (1999)",5,4.18,0.82
9,"Two Jakes, The (1990)",4,3.97,0.03


In [56]:
svd.predict(6,2406)

Prediction(uid=6, iid=2406, r_ui=None, est=4.017133367591548, details={'was_impossible': False})