# import Library 

In [1]:
from surprise import KNNBasic, KNNWithMeans
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.model_selection.split import train_test_split
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from surprise import dataset
from surprise import get_dataset_dir
from surprise import accuracy

# load in data & preprocessing

In [2]:
df  = pd.read_csv('data/automotive.csv',delimiter=",")# read csv into ratings_df dataframe

## remove duplicates( same uid + iid but different rating) 

In [3]:
df = df.drop_duplicates(['reviewerID','asin'])

In [4]:
# we had to reduce the size drastically because of not enough computer memory( using hpc was a failure)
df = df.iloc[:50000,:]

In [5]:
# two datasets; one for surprise library ( no unixReviewTime) and the other for my own time based knn (includes unixReviewTime)
df_time = df.copy()

In [6]:
#clean up unused columns leaving only uid,iid,rating, and unix time for my use
df_time = df_time.drop(['Unnamed: 0', 'reviewTime','style','verified'], axis=1)

In [7]:
#clean up unused columns leaving only uid,iid,rating for surprise
df = df.drop(['Unnamed: 0', 'reviewTime','style','verified','unixReviewTime'], axis=1)

In [8]:
print(df.shape)

(50000, 3)


In [9]:
print(df_time.shape)

(50000, 4)


In [10]:
# change name and order of column for better readability
df = df[['reviewerID','asin','overall']]

In [11]:
df_time =df_time[['reviewerID','asin','overall','unixReviewTime']]

In [12]:
df.columns = ['uid', 'iid','rating']

In [13]:
df_time.columns = ['uid', 'iid','rating','time']

In [None]:
#run dataset through reader to use it for surprise

In [14]:
reader = Reader(rating_scale=(1,5))
data=Dataset.load_from_df(df,reader)

In [16]:
#split data into training and test(8:2)

In [15]:
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=None, shuffle=True)
#fulltrainset = data.build_full_trainset()

In [17]:
# using user-based and similarity measure of cosine

In [18]:
sim_options = {'name':'cosine','user_based':True}

In [19]:
#run through knn basic algorithm (fit)

In [20]:
algo = KNNBasic( k = 3, sim_options = sim_options)
algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fe71d1104c0>

In [21]:
#get predictions and rmse from library to compaer to our custom time-based knn

In [22]:
predictions = algo.test(testSet)

In [23]:
accuracy.rmse(predictions, verbose=True) 

RMSE: 0.9942


0.9942284422080769

# my own algorithm encompassing time into finding better neighbor and estimate rating  

In [31]:
fulltrainset = data.build_full_trainset()

In [33]:
## this knn algorithm from surprise will only be used to find neighbors of the testSet rows and not to predict ratings
algo_custonknn = KNNBasic( k = 3, sim_options = sim_options)

In [34]:
algo_custonknn.fit(fulltrainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fdfb946d6a0>

In [63]:
# start looking into neighbors and sorting result based on unix time and choosing top k neighbor to find average rating on 

In [None]:
# this will be the defualt global_mean used for predicted rating when finding estimate is not possible 
#(not enough neighbors have ratings for item we want to predict rating of). This is also how surprise deals with 
#ratings it is not able to predict.

In [87]:
default = trainingSet.global_mean

In [88]:
myPredictions=[]

In [89]:
rmse =0

In [92]:
k=3

In [94]:
for i in range(len(testSet)):
#find inner id of user to use to retrieve its neighbors
    uid = fulltrainset.to_inner_uid(testSet[i][0])
    iid = testSet[i][1]
    real_rating = testSet[i][2]
    # retrieve inner id of users nearest to id
    neighbors = algo_custonknn.get_neighbors(uid, k=(k+5))
    neighbors_rawid =[]
    # change inner neighbor id from surprise to raw id to be used by us 
    for id in neighbors:
        neighbors_rawid.append(fulltrainset.to_raw_uid(id))
    #new dataframe that only holds the k nearest neighbor's rating of corresponding item
    temp = df_time.loc[df_time['uid'].isin(neighbors_rawid)]
    temp = temp.loc[temp['iid'] == iid]
    # if there is enough neighbors, we will find basic average to make our rating prediction
    rating =0
    if(len(temp)>=k):
        for j in range(k):
            rating += temp['rating'].values[j]
        rating = rating/k
    # if there is not enough neighbors to make a prediction, we use global average ( how surprise library does it)
    else:
        rating = default
    #calculate square error of each prediction
    rmse += ((real_rating - rating)**2)
    


In [95]:
# finally compute rmse by finding mean and squaring it
rmse= rmse/len(testSet)
rmse = rmse**0.5

In [96]:
print(rmse)

1.0459295705729792
