In [4]:
%matplotlib inline
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

### Loading Dataset

In [5]:
md=pd.read_csv('data/movies.dat',sep="::",header=None,names=['id','title','genres'],index_col='id')
md.head()

Unnamed: 0_level_0,title,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


In [6]:
full_ratings=pd.read_csv('data/ratings.dat',sep="::",header=None,names=['user_id','movie_id','stars','timestamp'])

In [7]:
md['genres']=md['genres'].fillna('[]').apply(lambda x : x.split(sep='|'))

### Spliting into trainset and testset

In [8]:
train_ratings=full_ratings[full_ratings['user_id']<6030]

In [9]:
test_ratings=full_ratings[full_ratings['user_id']>6029]

In [10]:
reader=Reader()

Here we will split data into n folds for cross validation

In [11]:
data = Dataset.load_from_df(train_ratings[['user_id', 'movie_id', 'stars']], reader)
data.split(n_folds=5)

Loading Single Value Decomposition Model

In [12]:
svd=SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8738
MAE:  0.6852
------------
Fold 2
RMSE: 0.8770
MAE:  0.6881
------------
Fold 3
RMSE: 0.8722
MAE:  0.6848
------------
Fold 4
RMSE: 0.8724
MAE:  0.6851
------------
Fold 5
RMSE: 0.8750
MAE:  0.6863
------------
------------
Mean RMSE: 0.8741
Mean MAE : 0.6859
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.873776494121988,
                             0.8770076218138065,
                             0.8721562473763003,
                             0.8724106311458024,
                             0.8749977111557727],
                            'mae': [0.6851812054880649,
                             0.6881366058748285,
                             0.6848222166691932,
                             0.685104961126699,
                             0.6863256256401152]})

Applying on trainset

In [13]:
trainset = data.build_full_trainset()
svd.train(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1788be80240>

### Prediction

In [14]:
##To predict rating for a particular user
uid=5
preds=[]
row_list=[]
sum_e=0
single_user=full_ratings[full_ratings['user_id']==uid]
for j in single_user['movie_id']:
        preds.append(round(svd.predict(uid,j).est,2))
# t=md['title'].iloc[single_user['movie_id'].iloc[5]]
for i in range(len(preds)):
    err=round(abs(preds[i]-single_user['stars'].iloc[i]),2)
    sum_e=sum_e+err
    t=md['title'].iloc[single_user['movie_id'].iloc[i]]
    dict1={'title':t,'predicted':preds[i],'actual':single_user['stars'].iloc[i],'difference':err}
    row_list.append(dict1)
    
print("approx. error:",sum_e/len(preds),"  for user: ",uid )

approx. error: 0.5910606060606063   for user:  5


In [15]:
pd.DataFrame(row_list,columns=['title','actual','predicted','difference'])[:25]

Unnamed: 0,title,actual,predicted,difference
0,Oxygen (1999),4,3.57,0.43
1,Rambo: First Blood Part II (1985),4,3.57,0.43
2,Paris Is Burning (1990),5,4.44,0.56
3,"Cry, the Beloved Country (1995)",3,3.4,0.4
4,Poison Ivy II (1995),2,2.99,0.99
5,Romancing the Stone (1984),5,3.36,1.64
6,Gabbeh (1996),4,2.95,1.05
7,"Thieves (Voleurs, Les) (1996)",4,3.5,0.5
8,Velvet Goldmine (1998),2,2.07,0.07
9,Inventing the Abbotts (1997),3,3.0,0.0


In [16]:
##To calculate error of all users
all_error=[]
for k in range(1,6000):
    preds=[]
    uid=k
    single_user=full_ratings[full_ratings['user_id'] == uid]
    for j in single_user['movie_id']:
        preds.append(round(svd.predict(uid,j).est,2))
    sum_e=0
    row_list=[]
    for i in range(len(preds)):
        err=round(abs(preds[i]-single_user['stars'].iloc[i]),2)
        sum_e=sum_e+err
    all_error.append(sum_e/len(preds))
#     print("approx. error:",s/len(preds),"  for user: ",k )

In [18]:
max(all_error)

1.2311111111111108

In [19]:
svd.predict(6,2406)

Prediction(uid=6, iid=2406, r_ui=None, est=3.995279106775284, details={'was_impossible': False})