In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as matlab

import warnings 
warnings.filterwarnings("ignore")


In [23]:
#Reading Data from CSV
df = pd.read_csv('train.csv', error_bad_lines=False, sep ='\\t', engine ='python')
df.head()

Unnamed: 0,user_id,book_id,rating
0,12726,7784,5
1,23770,104293,4
2,15669,29291,4
3,649,420180,0
4,10980,7089179,0


In [24]:
#Checking shape of the data 
df.shape

(700000, 3)

In [25]:
#Checking the for NULL Values
df.book_id.isnull().sum()

0

In [26]:
#Checking the for NULL Values
df.user_id.isnull().sum()

0

In [27]:
#Checking the for NULL Values
df.rating.isnull().sum()

0

In [29]:
#Dropping the duplicate data
df = df.drop_duplicates()
df

Unnamed: 0,user_id,book_id,rating
0,12726,7784,5
1,23770,104293,4
2,15669,29291,4
3,649,420180,0
4,10980,7089179,0
...,...,...,...
699995,723,370493,0
699996,18071,5,0
699997,37159,423156,5
699998,31306,34002075,4


In [30]:
 #Getting counts
df["rating"].value_counts()

0    239027
5    169506
4    160219
3    105017
2     20821
1      4443
Name: rating, dtype: int64

In [31]:
print('No. users had read at least one book from my list : ', df['user_id'].nunique())

No. users had read at least one book from my list :  35280


#Surprise Lib - Attempt with Surprise Library


In [32]:
!pip install surprise
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy
import surprise



In [33]:
#Creating data for surprise library
reader = Reader(rating_scale=(0,5))  #invoke reader instance of surprise library
data=Dataset.load_from_df(df,reader)

In [15]:
#create training set
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=None, shuffle=True)

In [16]:
#Setting up similary option
sim_options = {
    'name': 'cosine', #similarity measure default is MSD
    'user_based': False #user-based CF
}

In [None]:
#Apply KNN model for predicting Recommenndationn
knn = KNNBasic(sim_options=sim_options,k=3,min_k=1) #k=neighbours=3, other parameters set as above
knn.fit(trainingSet) #fit model to the training set
predictions_knn = knn.test(testSet)

In [None]:
accuracy.rmse(predictions_knn, verbose=True) 

In [None]:
#KNN takes longer time So trying with SVD

In [34]:
#SVD VERSION 1
# https://towardsdatascience.com/how-you-can-build-simple-recommender-systems-with-surprise-b0d32a8e4802 
svd = SVD(verbose=True, n_epochs=10)
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.6408  1.6419  1.6418  1.6415  0.0005  
MAE (testset)     1.3628  1.3645  1.3633  1.3635  0.0007  
Fit time          13.35   13.67   13.38   13.47   0.14    
Test time         2.54    2.25    1.75    2.18    0.33    


{'test_rmse': array([1.64082875, 1.64187001, 1.64182289]),
 'test_mae': array([1.36275061, 1.36448038, 1.36331458]),
 'fit_time': (13.353708982467651, 13.673139810562134, 13.381335973739624),
 'test_time': (2.540102243423462, 2.2503907680511475, 1.7519710063934326)}

In [63]:
#Checking for best model in suprise library
#https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b
from surprise import SVD, NMF, KNNBaseline,KNNWithZScore,KNNWithMeans, CoClustering
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader

benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    print(algorithm)
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=True)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')    

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.6339  1.6314  1.6313  1.6322  0.0012  
Fit time          25.65   25.25   25.13   25.34   0.22    
Test time         2.48    2.17    2.15    2.27    0.15    
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.6502  1.6530  1.6517  1.6516  0.0011  
Fit time          178.85  202.42  220.80  200.69  17.17   
Test time         110.91  114.79  130.03  118.58  8.25    
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix..

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,1.632175,25.344996,2.266957
BaselineOnly,1.642057,2.843603,2.447097
KNNBaseline,1.651626,200.690036,118.577888
KNNWithZScore,1.685833,240.489401,206.457502
KNNWithMeans,1.690201,209.337468,153.887509
CoClustering,1.718008,19.24077,2.155857
KNNBasic,1.809834,194.757202,116.322995


In [48]:
#Applyinng ASL model to 
bsl_options = {'method': 'als',
               'n_epochs': 30,
               'reg_u': 0.00125
               #'reg_i': 50
               }
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([1.61307671, 1.61936998, 1.61394231, 1.61283887, 1.61285122]),
 'fit_time': (8.529582262039185,
  8.334342002868652,
  8.685697793960571,
  8.526430130004883,
  8.692172050476074),
 'test_time': (0.6529779434204102,
  1.2903947830200195,
  0.636268138885498,
  0.5827772617340088,
  0.6037192344665527)}

In [49]:
test_df= pd.read_csv('test.csv', error_bad_lines=False, sep ='\\t', engine ='python')

In [68]:
with open('output_svd_norounding_1.csv', 'w') as fw:
    fieldnames = ['user_id-book_id', 'rating']
    writer = csv.DictWriter(fw, fieldnames=fieldnames)
    writer.writeheader()
    for i in range(len(test_df)):
        userId = test_df['user_id'][i]
        bookId = test_df['book_id'][i]
       #var1 = userId-bookId
        rating = algo.predict(uid=userId, iid=bookId)
        prediction = rating.est
        writer.writerow({'user_id-book_id': "{0}{1}{2}".format(userId,'-',bookId), 'rating': prediction})

In [None]:
#Did not improve - 1.62765

In [80]:
#SVD Version - 2
#https://surprise.readthedocs.io/en/stable/matrix_factorization.html
algo = SVD(n_epochs=20, lr_all=0.1, reg_all=0.1, random_state=122, verbose=True)
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

Using ALS for SVD
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing e

{'test_rmse': array([1.63170798, 1.62707737, 1.63292774]),
 'fit_time': (26.09475827217102, 26.567713022232056, 27.150519132614136),
 'test_time': (2.416621208190918, 2.9454798698425293, 2.2639827728271484)}

In [64]:
#SVD version - three

In [70]:
from surprise import SVD, NMF
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader

reader = Reader()

data = Dataset.load_from_df(df[['user_id', 'book_id', 'rating']], reader)

In [71]:
#Setting up the parameters.
svd = SVD(verbose=True, n_epochs=40,  reg_all=0.125 )
svd.fit(trainingSet)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a6d43b210>

In [72]:
#Predicting for the validation dataset
predictions = svd.test(testSet)

In [73]:
#Checking RMSE for the validation dataset.
accuracy.rmse(predictions, verbose=True)

RMSE: 1.5858


1.585755941369008

In [74]:
#Fiiting the model for whole data.
svd.fit(data.build_full_trainset())

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a6d43b210>

In [75]:
#Getting the test data 
test_df= pd.read_csv('test.csv', error_bad_lines=False, sep ='\\t', engine ='python')

In [64]:
#Printing the csv files for submission
import csv
with open('output_svd_retrying.csv', 'w') as fw:
    fieldnames = ['user_id-book_id', 'rating']
    writer = csv.DictWriter(fw, fieldnames=fieldnames)
    writer.writeheader()
    for i in range(len(test_df)):
        userId = test_df['user_id'][i]
        bookId = test_df['book_id'][i]
       #var1 = userId-bookId
        rating = svd.predict(uid=userId, iid=bookId)
        prediction = rating.est
        writer.writerow({'user_id-book_id': "{0}{1}{2}".format(userId,'-',bookId), 'rating': prediction})

In [35]:
#SVD version- Four. Trying with some extra paramter 
#https://surprise.readthedocs.io/en/stable/matrix_factorization.html
svd = SVD(verbose=True, n_epochs=100,  reg_all=0.125, n_factors =150)
cross_validate(svd, data, measures=['RMSE','MAE'], cv=5, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
Processing

Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
Processing epoch 50
Processing epoch 51
Processing epoch 52
Processing epoch 53
Processing epoch 54
Processing epoch 55
Processing epoch 56
Processing epoch 57
Processing epoch 58
Processing epoch 59
Processing epoch 60
Processing epoch 61
Processing epoch 62


{'test_rmse': array([1.55388724, 1.55250831, 1.55374882, 1.55414564, 1.55370437]),
 'test_mae': array([1.24104202, 1.23959824, 1.24062624, 1.23925389, 1.24230883]),
 'fit_time': (202.007581949234,
  211.99237704277039,
  208.33990788459778,
  212.6008710861206,
  206.5236039161682),
 'test_time': (1.3159286975860596,
  1.3819084167480469,
  1.427799940109253,
  1.1654610633850098,
  1.4618620872497559)}

In [36]:
#Fiiting the model for whole data.
svd.fit(data.build_full_trainset())

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
Processing

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x13bec24f0>

In [37]:
#Getting the test data 
test_df= pd.read_csv('test.csv', error_bad_lines=False, sep ='\\t', engine ='python')

In [38]:
test_df

Unnamed: 0,user_id,book_id
0,20989,1832332
1,37040,191139
2,36167,28449164
3,9398,24693869
4,29848,8127
...,...,...
299601,15976,38709
299602,24853,11312
299603,29982,10697427
299604,6324,157993


In [40]:
#Printing the csv files for submission
import csv
with open('output_svd_E_100_r_0.125_n_factor_150.csv', 'w') as fw:
    fieldnames = ['user_id-book_id', 'rating']
    writer = csv.DictWriter(fw, fieldnames=fieldnames)
    writer.writeheader()
    for i in range(len(test_df)):
        userId = test_df['user_id'][i]
        bookId = test_df['book_id'][i]
        rating = svd.predict(uid=userId, iid=bookId)
        prediction = rating.est
        writer.writerow({'user_id-book_id': "{0}{1}{2}".format(userId,'-',bookId), 'rating': prediction})