## Surprise library

In [4]:
import random

import pandas as pd
import numpy as np
from surprise import Dataset, Reader, KNNWithMeans, accuracy, SVD
from plotly.offline import plot
from plotly.graph_objs import *


In [5]:
# Seed the global RNGs before anything stochastic runs so that Restart & Run All
# reproduces the same folds and scores. The Surprise FAQ recommends seeding both
# Python's `random` and NumPy for reproducible experiments.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Ratings file: one tab-separated "user item rating timestamp" record per line.
filepath = 'data/u.data'
reader = Reader(line_format='user item rating timestamp', sep='\t')
movie_data = Dataset.load_from_file(filepath, reader=reader)

# Partition the ratings into 5 folds; later cells iterate over them with movie_data.folds().
movie_data.split(n_folds=5)

In [6]:
# Cross-validate a KNNWithMeans model (default settings) over the prepared folds.
algo = KNNWithMeans()
for fold in movie_data.folds():
    trainset, testset = fold
    algo.train(trainset)
    # Rebound on every pass, so after the loop `predictions` and `rmse`
    # describe the last fold only.
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9528
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9478
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9500
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9487
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9501


In [7]:
# Tabulate the predictions left over from the KNN loop, one row per prediction.
predict_df = pd.DataFrame(predictions)

# Show any rows whose recorded true rating (r_ui) equals 0.
zero_rating_mask = predict_df['r_ui'] == 0
predict_df[zero_rating_mask]

Unnamed: 0,uid,iid,r_ui,est,details


In [8]:
# Signed prediction error: estimate minus true rating (positive means over-prediction).
predict_df['error'] = predict_df['est'].sub(predict_df['r_ui'])

In [9]:
# Box plot of the signed KNNWithMeans errors (est - r_ui), with every individual
# point drawn to the left of the box.
# plotly restricts `pointpos` to [-2, 2]; the earlier value of -3 was outside that
# range, so use -2, the leftmost valid offset.
data = [Box(y=predict_df['error'], name='KNNWithMeans', boxpoints='all', jitter=0.3, pointpos=-2)]

# Pass a figure dict so the chart carries a title and axis label of its own.
plot({
    'data': data,
    'layout': {
        'title': 'KNNWithMeans prediction error',
        'yaxis': {'title': 'est - r_ui'},
    },
})

'file:///home/user/ml_regression/RecommendationMovieRatings/temp-plot.html'

In [10]:
# SVD model built with no arguments, i.e. whatever defaults the library supplies.
algo_svd = SVD()

In [11]:
# Run the SVD model through the same folds and print the RMSE of each one.
for fold in movie_data.folds():
    trainset, testset = fold
    algo_svd.train(trainset)
    # Rebound on every pass, so after the loop `predictions_svd` and `rmse`
    # describe the last fold only.
    predictions_svd = algo_svd.test(testset)
    rmse = accuracy.rmse(predictions_svd, verbose=True)


RMSE: 0.9380
RMSE: 0.9360
RMSE: 0.9407
RMSE: 0.9390
RMSE: 0.9378


In [12]:
# Tabulate the SVD predictions left over from the cross-validation loop and add
# the signed error (estimate minus true rating).
svd_df = pd.DataFrame(predictions_svd)
svd_df['error'] = svd_df.est - svd_df.r_ui

# Preview the first rows only; echoing the whole frame floods the notebook output.
svd_df.head(10)

Unnamed: 0,uid,iid,r_ui,est,details,error
0,222,379,1.0,2.837505,{'was_impossible': False},1.837505
1,693,939,4.0,2.858925,{'was_impossible': False},-1.141075
2,244,208,5.0,4.228797,{'was_impossible': False},-0.771203
3,224,699,4.0,3.049328,{'was_impossible': False},-0.950672
4,502,294,3.0,3.443030,{'was_impossible': False},0.443030
5,727,1615,1.0,3.050635,{'was_impossible': False},2.050635
6,645,168,4.0,4.219484,{'was_impossible': False},0.219484
7,184,401,3.0,2.713879,{'was_impossible': False},-0.286121
8,271,83,4.0,3.958219,{'was_impossible': False},-0.041781
9,780,511,5.0,4.191922,{'was_impossible': False},-0.808078


In [13]:
# Compare the signed error distributions of the two models in one figure.
# Each trace is named so the legend and axis identify which model it belongs to.
# plotly restricts `pointpos` to [-2, 2]; the earlier value of -3 on the first trace
# was outside that range, so use -2, the leftmost valid offset.
trace0 = Box(y=predict_df['error'], name='KNNWithMeans', boxpoints='all', jitter=0.3, pointpos=-2)
trace1 = Box(y=svd_df['error'], name='SVD', boxpoints='all', jitter=0.3, pointpos=0)
data = [trace0, trace1]

# Pass a figure dict so the chart carries a title and axis label of its own.
plot({
    'data': data,
    'layout': {
        'title': 'Prediction error by model',
        'yaxis': {'title': 'est - r_ui'},
    },
})


'file:///home/user/ml_regression/RecommendationMovieRatings/temp-plot.html'

In [14]:
# Quick look at the first five SVD prediction rows.
svd_df.head(n=5)

Unnamed: 0,uid,iid,r_ui,est,details,error
0,222,379,1.0,2.837505,{'was_impossible': False},1.837505
1,693,939,4.0,2.858925,{'was_impossible': False},-1.141075
2,244,208,5.0,4.228797,{'was_impossible': False},-0.771203
3,224,699,4.0,3.049328,{'was_impossible': False},-0.950672
4,502,294,3.0,3.44303,{'was_impossible': False},0.44303


In [15]:
# Ask the fitted SVD model for one estimate; user and item ids are passed as strings.
# r_ui=0 is presumably a placeholder for an unknown true rating - TODO confirm.
algo_svd.predict(
    uid='1',
    iid='1672',
    r_ui=0,
    verbose=True,
)

user: 1          item: 1672       r_ui = 0.00   est = 3.65   {'was_impossible': False}


Prediction(uid='1', iid='1672', r_ui=0, est=3.6507151352643006, details={'was_impossible': False})