In [1]:
import matplotlib.pyplot as plt
import pickle
import time

from helpers import load_data
from utils import split_data

from baselines import *
from matrix_factorization import matrix_factorization_sgd, write_sgd_prediction, matrix_factorization_als, \
    write_als_prediction

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Location of the training ratings CSV (relative path, resolved against the
# notebook's working directory).
path_dataset = "../data/data_train.csv"
# Load the ratings used by every section below; the split cell's output
# reports the loaded matrix as having shape (10000, 1000).
ratings = load_data(path_dataset)

In [3]:
# Hold out part of the ratings for evaluation; the first returned value is
# not needed here. The positional 10 is presumably the minimum number of
# ratings (cf. min_num_ratings=10 in the baseline cells) -- TODO confirm.
_, train, test = split_data(ratings, 10, verbose=True)

# Cache both splits on disk so the next cell can reload them without
# recomputing the split.
for pickle_path, split in (('../data/pickle/train.pickle', train),
                           ('../data/pickle/test.pickle', test)):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(split, handle)

Shape of original ratings : (10000, 1000)
Shape of valid ratings (and of train and test data) : (9990, 999)
Total number of nonzero elements in original data : 1176952
Total number of nonzero elements in train data : 1068523
Total number of nonzero elements in test data : 108350


In [3]:
# Reload the cached train/test split instead of recomputing it.
# NOTE: unpickling can execute arbitrary code -- only load files that were
# produced locally by this notebook.
with open('../data/pickle/train.pickle', 'rb') as train_handle:
    train = pickle.load(train_handle)
with open('../data/pickle/test.pickle', 'rb') as test_handle:
    test = pickle.load(test_handle)

## Global Mean baseline

#### Test of the global mean baseline using the split training set

In [5]:
# Evaluate the global-mean baseline and report how long the evaluation took.
start_time = time.time()
global_mean_rmse = global_mean_test(ratings, min_num_ratings=10)
elapsed = time.time() - start_time

print("--- %s seconds ---" % elapsed)
print('Global mean RMSE : {}'.format(global_mean_rmse))

--- 25.886598825454712 seconds ---
Global mean RMSE : 1.1183506557779654


#### Output prediction to test on kaggle

In [6]:
# Compute the global mean over all ratings; only this computation is timed.
start_time = time.time()
mean = global_mean(ratings)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Presumably writes the Kaggle submission file from the mean -- see baselines.
write_global_mean_prediction(mean)

--- 16.67897653579712 seconds ---


Score on kaggle for this baseline : 1.11785

## User Mean baseline

#### Test of the user mean baseline using the split training set

In [8]:
# Evaluate the per-user mean baseline and report how long the evaluation took.
start_time = time.time()
user_mean_rmse = user_mean_test(ratings, min_num_ratings=10)
elapsed = time.time() - start_time

print("--- %s seconds ---" % elapsed)
print('User mean RMSE : {}'.format(user_mean_rmse))

--- 286.12546944618225 seconds ---
User mean RMSE : 1.0289888944873853


#### Output prediction to test on kaggle

In [9]:
# Compute the per-user means over all ratings; only this computation is timed.
start_time = time.time()
means = compute_user_means(ratings)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Presumably writes the Kaggle submission file from the means -- see baselines.
write_user_mean_prediction(means)

--- 110.12382626533508 seconds ---


Score on kaggle for this baseline : 1.02982

## Item Mean baseline

#### Test of the item mean baseline using the split training set

In [10]:
# Evaluate the per-item mean baseline and report how long the evaluation took.
start_time = time.time()
item_mean_rmse = item_mean_test(ratings, min_num_ratings=10)
elapsed = time.time() - start_time

print("--- %s seconds ---" % elapsed)
print('Item mean RMSE : {}'.format(item_mean_rmse))

--- 33.12778425216675 seconds ---
Item mean RMSE : 1.0938352842783858


#### Output prediction to test on kaggle

In [11]:
# Compute the per-item means over all ratings; only this computation is timed.
start_time = time.time()
means = compute_item_means(ratings)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Presumably writes the Kaggle submission file from the means -- see baselines.
write_item_mean_prediction(means)

--- 24.0707004070282 seconds ---


Score on kaggle for this baseline : 1.09267

## Matrix Factorization using SGD

#### Test of the model

In [5]:
# Hyper-parameters used for the SGD matrix factorization in this cell.
sgd_kwargs = dict(gamma=0.012, lambda_user=0.011, lambda_item=0.25, num_epochs=50)

# Train on the train split and evaluate on the held-out test split; the two
# trailing return values are not needed here.
start_time = time.time()
train_rmse, test_rmse, _, _ = matrix_factorization_sgd(train, test, verbose=True, **sgd_kwargs)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Show both errors as the cell's output.
train_rmse, test_rmse

Learning the matrix factorization using SGD...
Final RMSE on train data: 0.9912955138845942
Final RMSE on test data: 1.0001463113122229.
--- 1706.9034678936005 seconds ---


(0.99129551388459425, 1.0001463113122229)

#### Building a prediction

In [8]:
start_time = time.time()
_, user_features, item_features = matrix_factorization_sgd(ratings, None, gamma=0.012, verbose=True, 
                                                           lambda_user=0.011, lambda_item=0.25,
                                                           num_epochs=50)
print("--- %s seconds ---" % (time.time() - start_time))
write_sgd_prediction(user_features, item_features)

Learning the matrix factorization using SGD...
--- 1662.4102020263672 seconds ---


Score on kaggle for this prediction method : 1.00032

In [None]:
# Persist the SGD factors under the blending directory.
#
# The SGD "Building a prediction" cell binds its results to `user_features`
# and `item_features`; the previously used names `item_features_sgd` and
# `user_features_sgd` are not defined anywhere in this notebook, so this cell
# raised NameError.
#
# NOTE: run this cell right after the SGD prediction cell. The ALS prediction
# cell rebinds the same two names, and running this afterwards would store the
# ALS factors in the SGD files.
with open(b'../data/pickle/blending/sgd_item_ratings.pickle', 'wb') as f:
    pickle.dump(item_features, f)
with open(b'../data/pickle/blending/sgd_user_ratings.pickle', 'wb') as f:
    pickle.dump(user_features, f)

## Matrix Factorization using Alternating Least Squares

#### Test of the model

In [6]:
start_time = time.time()
train_rmse, test_rmse, _, _ = matrix_factorization_als(train, test, verbose=True, stop_criterion=0.00001,
                                                       lambda_user=0.014, lambda_item=0.575, num_features=20)
print("--- %s seconds ---" % (time.time() - start_time))
train_rmse, test_rmse

Learning the matrix factorization using ALS...
Final RMSE on train data: 0.9050427076545173
Final RMSE on test data: 0.983912165799094.
--- 803.992819070816 seconds ---


(0.90504270765451733, 0.98391216579909402)

#### Building a prediction

In [10]:
start_time = time.time()
_, user_features, item_features = matrix_factorization_als(ratings, None, verbose=True, stop_criterion=0.00001, 
                                                              lambda_user=0.014, lambda_item=0.575, num_features=20)
print("--- %s seconds ---" % (time.time() - start_time))
write_als_prediction(user_features, item_features)

Learning the matrix factorization using ALS...
--- 939.8009564876556 seconds ---


Score on kaggle for this prediction method : 0.98194

In [11]:
# Persist the ALS factors under the blending directory: item factors first,
# then user factors.
for blend_path, factors in ((b'../data/pickle/blending/als_item_ratings.pickle', item_features),
                            (b'../data/pickle/blending/als_user_ratings.pickle', user_features)):
    with open(blend_path, 'wb') as f:
        pickle.dump(factors, f)