In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%load_ext autoreload
%autoreload 2

# 1. Data Read
## 1.1 Create dataframe and clean it

In [3]:
data = pd.read_csv('data/data_train.csv')
print(data)

                   Id  Prediction
0              r44_c1           4
1              r61_c1           3
2              r67_c1           4
3              r72_c1           3
4              r86_c1           5
5              r90_c1           4
6             r108_c1           3
7             r114_c1           3
8             r120_c1           2
9             r135_c1           5
10            r152_c1           4
11            r165_c1           3
12            r182_c1           3
13            r310_c1           3
14            r318_c1           1
15            r333_c1           3
16            r355_c1           2
17            r390_c1           4
18            r401_c1           4
19            r410_c1           2
20            r418_c1           3
21            r457_c1           2
22            r470_c1           4
23            r497_c1           3
24            r516_c1           3
25            r566_c1           3
26            r595_c1           2
27            r670_c1           3
28            

In [4]:
data1 = data.copy()
data2 = data.copy()

data1['Id'] = data1['Id'].apply(lambda x: x.split('_')[0])
data2['Id'] = data2['Id'].apply(lambda x: x.split('_')[1])
data1['user_Id'] = data1['Id'].apply(lambda x: x.replace("r",""))
data2['item_Id'] = data2['Id'].apply(lambda x: x.replace("c",""))

In [5]:
data_final = pd.concat([data1['user_Id'],data2['item_Id'],data1['Prediction']],axis=1)
data_final.head()

Unnamed: 0,user_Id,item_Id,Prediction
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


## 1.2 Load custom Surprise Dataset

In [7]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import accuracy

In [9]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data_final[['user_Id','item_Id', 'Prediction']], reader)

In [10]:
#Test loading with a simple SVD based prediction

from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25)
algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.rmse(predictions)

RMSE: 1.0298


1.0298230457076354

# 2. SVD Model 

In [196]:
#first we're going to chose the compare some parameters such as k, learning rate
#then we will do a cross_validation analysis

from surprise.model_selection import GridSearchCV

param_grid = {'n_epochs':[10],'n_factors':[40,50], 'lr_all': [0.005],
              'reg_all': [0.4]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.01748609883
{'n_epochs': 10, 'n_factors': 40, 'lr_all': 0.005, 'reg_all': 0.4}


In [197]:
# 80/20 % split with k=50 factors

trainset, testset = train_test_split(data, test_size=.20)
algo = SVD()
algo.n_factors=50
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.rmse(predictions)

RMSE: 1.0195


1.0194742667613892

In [64]:
from surprise.model_selection import cross_validate

algo.n_epochs=10
cross_validate(algo,data, measures=['rmse'], cv=5, return_train_measures=True)

{'test_rmse': array([1.0057343 , 1.00590485, 1.00414723, 1.00610889, 1.00656548]),
 'train_rmse': array([0.95014636, 0.95113164, 0.95125446, 0.95076042, 0.95077297]),
 'fit_time': (22.478854656219482,
  22.728386402130127,
  24.341591119766235,
  25.603251218795776,
  25.84378147125244),
 'test_time': (2.7198452949523926,
  2.6083309650421143,
  2.649336338043213,
  2.6043307781219482,
  2.0342586040496826)}

# 3. NMF Model

In [198]:
#first we're going to chose the compare some parameters such as k, learning rate
#then we will do a cross_validation analysis

from surprise.model_selection import GridSearchCV

param_grid = {'n_epochs':[50],'n_factors':[2], 'biased': [True],'reg_pu':[0.06,0.1]}
gs = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=4)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.00378330181
{'n_epochs': 50, 'n_factors': 2, 'biased': True, 'reg_pu': 0.06}


In [36]:
trainset, testset = train_test_split(data, test_size=.50)
algo = NMF()
algo.n_factors=2
algo.biased=True
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.rmse(predictions)

RMSE: 1.0048


1.0048211848844557

# 4. Output submission 

## 4.1 Load and preprocessing of sampleSubmission

In [168]:
sub_data = pd.read_csv('sampleSubmission.csv')
sub_data.head()

Unnamed: 0,Id,Prediction
0,r37_c1,3
1,r73_c1,3
2,r156_c1,3
3,r160_c1,3
4,r248_c1,3


In [136]:
sub_data1 = sub_data.copy()
sub_data2 = sub_data.copy()

sub_data1['Id'] = sub_data1['Id'].apply(lambda x: x.split('_')[0])
sub_data2['Id'] = sub_data2['Id'].apply(lambda x: x.split('_')[1])
sub_data1['user_Id'] = sub_data1['Id'].apply(lambda x: x.replace("r",""))
sub_data2['item_Id'] = sub_data2['Id'].apply(lambda x: x.replace("c",""))

In [137]:
sub_data_final = pd.concat([sub_data1['user_Id'],sub_data2['item_Id'],sub_data1['Prediction']],axis=1)
sub_data_final.head()


Unnamed: 0,user_Id,item_Id,Prediction
0,37,1,3
1,73,1,3
2,156,1,3
3,160,1,3
4,248,1,3


In [138]:
reader = Reader(rating_scale=(1, 5))
sub_dataset = Dataset.load_from_df(sub_data_final[['user_Id','item_Id', 'Prediction']], reader)

## 4.2 Final Model

In [139]:
fullset=data.build_full_trainset()
final_algo = NMF()
final_algo.n_factors=2
final_algo.biased=True
final_algo.fit(fullset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x204fc7be630>

In [142]:
sub_data_set = sub_dataset.build_full_trainset()
pred = final_algo.test(sub_data_set.build_testset())

## 4.3 Write output submission to CSV

In [167]:
import csv
with open('submit.csv', 'w',newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id','Prediction'])
    for p in pred:
        writer.writerow(['r'+p[0]+'_c'+p[1],round(p[3])])