In [59]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import accuracy
from surprise import SVDpp
from surprise import SVD

# Load and preprocess the data

In [60]:
train_df = pd.read_csv('../data/train.csv') # import training data
test_df = pd.read_csv('../data/test.csv') # import testing data

train_df.dropna() # drop N/A in dataset
train_df.drop_duplicates(subset=['user_id', 'item_id'], keep='last') # drop duplicate user-item pairs
test_df.dropna()
test_df.drop_duplicates(subset=['user_id', 'item_id'], keep='last')

# merging meta data to train data
# meta_df = pd.read_csv('../data/metadata.csv')
# merged_df = pd.merge(train_df, meta_df, on=['user_id','item_id'], how='left')

Unnamed: 0,user_id,item_id
0,52491,1455
1,27265,91
2,226,5725
3,26368,1072
4,103122,2953
...,...,...
20993,90123,1547
20994,82829,4807
20995,96521,1526
20996,89904,1363


# Reading a file

In [61]:
reader = Reader(rating_scale=(1,10)) # invoke reader instance of surprise library
data = Dataset.load_from_df(train_df,reader) # load dataset into Surprise datastructure Dataset
trainset = data.build_full_trainset() # take the whole train.csv as training data

# Grid Search

In [65]:
# The result of grid search is kind of weird so I spend a lot of time changing the params manually to get the best parameters.

In [66]:
param_grid = {'k': [5, 10, 20, 25, 30, 35, 40, 45, 50],
              'sim_options': {'name': ['pearson', 'cosine','pearson_baseline'],
                              'min_support': [1, 5, 10, 15, 20, 25],   #the minimum number of common items needed between users to consider them for similarity. For the item-based approach, this corresponds to the minimum number of common users for two items.
                              'user_based': [False]}
              }

In [67]:
from surprise.model_selection import cross_validate, GridSearchCV

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=5) 

In [68]:
gs.fit(data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.

In [69]:
print(gs.best_score['rmse'])

2.8721813284436704


In [70]:
print(gs.best_params['rmse'])

{'k': 5, 'sim_options': {'name': 'cosine', 'min_support': 10, 'user_based': False}}


# Training the model

In [71]:
sim_options = {
    'name': 'cosine', # Cosine similarity measure 
    'user_based': False # item-based CF
}

In [72]:
#KNN
knn = KNNBasic(sim_options=sim_options,k=25,min_k=15) # neighbours = 25, min neighbours = 15
knn.fit(trainset) #fit model to the training set

#SVD
#svd = SVD(n_factors = 150, n_epochs = 30)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7ff30dda40a0>

# Predict the unknown rating and output as a csv file

In [74]:
# Output the predictions of rating for each user-item pair 

ratings = {'Id': [],
        'Rating': []
        }  # creating columns same as columns in sample.csv

for i in range(len(test_df)):
    predict = knn.predict(test_df.loc[i, "user_id"], test_df.loc[i, "item_id"]) # prediction for each pair
    id_ = ""
    id_ = str(predict[0]) + "-" + str(predict[1]) # creating ID (user_id - item_id)
    ratings['Id'].append(id_)
    ratings['Rating'].append(predict[3])
output_df = pd.DataFrame(ratings, columns = ['Id','Rating']) # forming the dataframe

output_df.to_csv(r'/Users/brian80433/Desktop/hwRS_014337284/src/solution.csv', index = False, header=True) # output file