In [1]:
# load from file
import pandas as pd
import numpy as np
filename = 'ml-latest-small/ratings.csv'

#headers = ['userId', 'movieId', 'rating', 'timestamp']
#original data 
rating_data = pd.read_csv(filename)

In [2]:
# extract key information
users = rating_data['userId'].values
movies = rating_data['movieId'].values
ratings = rating_data['rating'].values
movie_set = set(movies)
user_set = set(users)

movieId_map_col = {} # key is the movieId, value is column index
def create_R(users, movies, ratings):
    user_num = len(user_set)
    movie_num = len(movie_set)

    for i in range(movie_num):
        movieId_map_col[list(movie_set)[i]] = i
        
    matrix_R = np.zeros((user_num, movie_num),dtype = float)
    for i in range(len(users)):
        matrix_R[users[i]-1][movieId_map_col[movies[i]]] = ratings[i]
    return matrix_R

matrix_R = create_R(users, movies, ratings)
print(matrix_R.shape)

(610, 9724)


In [4]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import KFold

pred_matrix = [] # save the prediction values(the mean of each users rating)

for row in matrix_R:
    valid = np.nonzero(row)
    pred_matrix.append(np.mean(row[valid]))

pred_RMSE1 = [] # RMSE in each fold

def get_test_result(pred_RMSE,dataset):
    kf = KFold(n_splits = 10, random_state = None, shuffle = False)
    for train_test, test_index in kf.split(dataset):
        pred_results = []
        gt_results = dataset.loc[test_index]['rating'].as_matrix()
        for index in test_index:
            test_userid = int(dataset.loc[index]['userId']) - 1
            pred_results.append(float(pred_matrix[test_userid]))
        pred_RMSE.append(sqrt(mean_squared_error(gt_results,pred_results)))
        avg_RMSE = np.mean(pred_RMSE)
    return avg_RMSE

results = get_test_result(pred_RMSE1,rating_data)
print("The Average RMSE for Original Test Set:")
print(results)



The Average RMSE for Original Test Set:
0.9341694523212585


In [5]:
rare_movies = [] # save unpopular movies
movies = list(movies)
for movie in movie_set:
    if(movies.count(movie) <= 2):
        rare_movies.append(movie)

pop_rating_data = pd.DataFrame(columns = rating_data.columns)     
for index in rating_data.index:
    if(rating_data.loc[index]['movieId'] not in rare_movies):
        pop_rating_data.loc[index] = rating_data.loc[index]

pop_rating_data.index = range(len(pop_rating_data)) # reset index from 0-len(pop_rating_data)

pred_RMSE2 = []    
results = get_test_result(pred_RMSE2, pop_rating_data)
print("The Average RMSE for Popular Movie Trimmed Test Set:")
print(results)



The Average RMSE for Popular Movie Trimmed Test Set:
0.931652900928051


In [6]:
unpop_rating_data = pd.DataFrame(columns = rating_data.columns)     
for index in rating_data.index:
    if(rating_data.loc[index]['movieId'] in rare_movies):
        unpop_rating_data.loc[index] = rating_data.loc[index]
unpop_rating_data.index = range(len(unpop_rating_data)) # reset index from 0-len(pop_rating_data)

pred_RMSE3 = []    
results = get_test_result(pred_RMSE3, unpop_rating_data)
print("The Average RMSE for Unpopular Movie Trimmed Test Set:")
print(results)



The Average RMSE for Unpopular Movie Trimmed Test Set:
0.9651697469437426


In [9]:
import matplotlib.pyplot as plt
var_matrix = []  # save the variance of each movies

for i in range(matrix_R.shape[1]):
    valid = np.nonzero(matrix_R[:,i])
    var_matrix.append(np.var(matrix_R[valid,i]))


print("max variance of the rating score: %f" % max(var_matrix))
print("min variance of the rating score: %f" % min(var_matrix))

plt.hist(var_matrix, bins = [0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5], align='mid')

plt.title("Frequency of Movie Rating Variance")
plt.ylabel("The Number of movies")

plt.show()

max variance of the rating score: 5.062500
min variance of the rating score: 0.000000


<Figure size 640x480 with 1 Axes>

In [10]:
filtered_movies = []
movies = list(movies)
for movie in movie_set:
    if(movies.count(movie) >= 5 and var_matrix[movieId_map_col[movie]] >= 2):
        filtered_movies.append(movie)

highvar_rating_data = pd.DataFrame(columns = rating_data.columns)     
for index in rating_data.index:
    if(rating_data.loc[index]['movieId'] in filtered_movies) :
        highvar_rating_data .loc[index] = rating_data.loc[index]
highvar_rating_data .index = range(len(highvar_rating_data )) 

pred_RMSE4 = []    
results = get_test_result(pred_RMSE4, highvar_rating_data)
print("The Average RMSE for High Variance Movie Trimmed Test Set:")
print(results)


The Average RMSE for High Variance Movie Trimmed Test Set:
1.4676561249836158




In [4]:
import plotly.offline as py
import plotly.graph_objs as go

py.init_notebook_mode(connected=True)

def draw_curve(x, y, name, title='', xlabel='', ylabel='', ROC=True):
    data = []
    width, height = 800, 600
    
    if type(name) != list or len(name) == 1:
        width, height = 600, 450
        trace1 = go.Scatter(x=x, y=y, 
                            mode='lines', 
                            line=dict(color='darkorange', width=2),
                            name=name
                           )
        data.append(trace1)
    else:
        for i in range(len(name)):
            trace1 = go.Scatter(x=x[i], y=y[i], 
                                mode='lines', 
                                line=dict(width=2),
                                name=name[i]
                               )
            data.append(trace1)
    
    if ROC:
        title = 'Receiver Operating Characteristic'
        xlabel = 'False Positive Rate'
        ylabel = 'True Positive Rate' 
        trace2 = go.Scatter(x=[0, 1], y=[0, 1], 
                            mode='lines', 
                            line=dict(color='navy', width=2, dash='dash'),
                            showlegend=False)
        data.append(trace2)
    
    layout = go.Layout(title=title,
                       autosize=False,
                       width=width,
                       height=height,
                       xaxis=dict(title=xlabel, ticks='outside', mirror=True, linewidth=1),
                       yaxis=dict(title=ylabel, ticks='outside', mirror=True, linewidth=1),
                       legend=dict(x=.5, y=.2, bordercolor='#D3D3D3', borderwidth=1))
    
    if ROC:
        layout.update(yaxis=dict(title=ylabel, ticks='outside', mirror=True, linewidth=1, range=[0, 1.05]))
    
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)

In [3]:
from surprise import Dataset
from surprise import Reader

file_path = './ml-latest-small/ratings.csv'

reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0.5, 5), skip_lines=1)

data = Dataset.load_from_file(file_path, reader=reader)
# data.raw_ratings

In [8]:
import pandas as pd
from surprise.model_selection import train_test_split
from collections import namedtuple
from sklearn import metrics
from surprise import KNNWithMeans
import numpy as np
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate

sim_options = {'name': 'pearson', 'user_based': True}
k_best = 20
threshold = [2.5, 3, 3.5, 4]
kNN_threshold_3_fpr = None
kNN_threshold_3_tpr = None

#savedStdout = sys.stdout
for theta in threshold:
    # load using pandas
    df = pd.read_csv('./ml-latest-small/ratings.csv')
    # df['rating'] = df['rating'].apply(lambda x: 0 if x < theta else 1)
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

    # split into train set and test set
    trainset, testset = train_test_split(data, test_size=.1)

    algo = KNNWithMeans(k=k_best, sim_options=sim_options)
    algo.fit(trainset)
    predictions = algo.test(testset)
    #sys.stdout.flush()
    #sys.stdout = savedStdout

    trues = [0 if getattr(row, 'r_ui') < theta else 1 for row in predictions]
    scores = [getattr(row, 'est') for row in predictions]
    fpr, tpr, thresholds = metrics.roc_curve(trues, scores)
    if theta == 3:
        kNN_threshold_3_fpr = fpr
        kNN_threshold_3_tpr = tpr
    roc_auc = metrics.auc(fpr, tpr)

    name = 'K-NN θ=%.1f (area = %0.2f)' % (theta, roc_auc)
    draw_curve(fpr, tpr, name, ROC=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.


Computing the pearson similarity matrix...
Done computing similarity matrix.


Computing the pearson similarity matrix...
Done computing similarity matrix.


Computing the pearson similarity matrix...
Done computing similarity matrix.


In [16]:
from surprise.model_selection import train_test_split
from collections import namedtuple
from sklearn import metrics
from surprise.prediction_algorithms import NMF
from surprise.prediction_algorithms import SVD
from surprise import Reader, Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise import accuracy

bestLF = 20
thresholds = [2.5, 3, 3.5, 4]
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)
NNMF_threshold_3_fpr = None
NNMF_threshold_3_tpr = None
for threshold in thresholds:
    train_set, test_set = train_test_split(data, test_size=0.1)
    algo = NMF(n_factors=bestLF, biased=False)
    algo.fit(train_set)
    predictions = algo.test(test_set)
    trues = [0 if getattr(row, 'r_ui') < threshold else 1 for row in predictions]
    scores = [getattr(row, 'est') for row in predictions]
    fpr, tpr, _ = metrics.roc_curve(trues, scores)
    if threshold == 3:
        NNMF_threshold_3_fpr = fpr
        NNMF_threshold_3_tpr = tpr
    roc_auc = metrics.auc(fpr, tpr)
    name = 'NNMF theta=%.1f (area = %0.2f)' % (threshold, roc_auc)
    draw_curve(fpr, tpr, name, ROC=True)

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise.prediction_algorithms import NMF
from surprise.prediction_algorithms import SVD
from surprise import Reader, Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise import accuracy
df = pd.read_csv('ml-latest-small/ratings.csv', names = ['userId', 'itemId', 'rating', 'timestamp'],header=0)
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)
kf = KFold(n_splits=10)

In [13]:
from surprise.model_selection import train_test_split
from collections import namedtuple
from sklearn import metrics
bestLF = 20
thresholds = [2.5, 3, 3.5, 4]
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)
NNMF_threshold_3_fpr = None
NNMF_threshold_3_tpr = None
for threshold in thresholds:
    train_set, test_set = train_test_split(data, test_size=0.1)
    algo = NMF(n_factors=bestLF, biased=False)
    algo.fit(train_set)
    predictions = algo.test(test_set)
    trues = [0 if getattr(row, 'r_ui') < threshold else 1 for row in predictions]
    scores = [getattr(row, 'est') for row in predictions]
    fpr, tpr, _ = metrics.roc_curve(trues, scores)
    if threshold == 3:
        NNMF_threshold_3_fpr = fpr
        NNMF_threshold_3_tpr = tpr
    roc_auc = metrics.auc(fpr, tpr)
    name = 'NNMF theta=%.1f (area = %0.2f)' % (threshold, roc_auc)
    draw_curve(fpr, tpr, name, ROC=True)

KeyError: "['itemId'] not in index"

In [9]:
draw_curve([kNN_threshold_3_fpr, NNMF_threshold_3_fpr, MF_threshold_3_fpr], [kNN_threshold_3_tpr, NNMF_threshold_3_tpr, MF_threshold_3_tpr], ['kNN', 'NNMF', 'MF'], ROC=True)

NameError: name 'NNMF_threshold_3_fpr' is not defined