Get the k nearest neighbours (KNN) of a given item

In [2]:
# basics
import argparse
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix as sparse_matrix

# sklearn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

# our code
import linear_model
import utils

# URL template for an Amazon product page; "%s" is filled in with an item id.
url_amazon = "https://www.amazon.com/dp/%s"

def load_dataset(filename):
    """Unpickle and return the object stored in ``../data/<filename>``.

    The path is resolved relative to the current working directory.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so only
    use this on files from a trusted source.
    """
    dataset_path = os.path.join('..', 'data', filename)
    with open(dataset_path, 'rb') as handle:
        dataset = pickle.load(handle)
    return dataset

In [3]:
# Read the ratings CSV; the column names are supplied here rather than taken from the file.
filename = "ratings_Patio_Lawn_and_Garden.csv"
ratings_path = os.path.join("..", "data", filename)
with open(ratings_path, "rb") as f:
    ratings = pd.read_csv(f, names=("user", "item", "rating", "timestamp"))

# Build the ratings matrix X plus the lookup tables returned alongside it
# (presumably id -> index mappers and their inverses; see utils.create_user_item_matrix).
(
    X,
    user_mapper,
    item_mapper,
    user_inverse_mapper,
    item_inverse_mapper,
    user_ind,
    item_ind,
) = utils.create_user_item_matrix(ratings)
X_binary = (X != 0)

# The query item: find its column index in X and keep that column for the KNN queries.
grill_brush = "B00CFM0P7Y"
grill_brush_ind = item_mapper[grill_brush]
grill_brush_vec = X[:, grill_brush_ind]

print(url_amazon % grill_brush)

https://www.amazon.com/dp/B00CFM0P7Y


In [4]:
def nearest_ids(nbrs_list, inverse_mapper=None):
    """Translate matrix indices into item ids.

    Parameters
    ----------
    nbrs_list : sequence of int
        Item indices, e.g. one row of the array returned by
        ``NearestNeighbors.kneighbors(..., return_distance=False)``.
    inverse_mapper : dict, optional
        Mapping from item index to item id. Defaults to the notebook-level
        ``item_inverse_mapper``.

    Returns
    -------
    numpy.ndarray
        Array of dtype ``'S15'`` (byte strings) with the same length and order
        as ``nbrs_list``. An index that is absent from the mapper yields
        ``b''``. Ids longer than 15 bytes are truncated by the dtype.
    """
    if inverse_mapper is None:
        inverse_mapper = item_inverse_mapper

    # Zero-initialise so unmatched slots are b'' rather than whatever bytes
    # np.empty happened to leave in memory.
    nearest_id = np.zeros(len(nbrs_list), dtype='S15')

    # Direct dictionary lookup per neighbour instead of scanning every entry
    # of the mapper for every neighbour.
    for i, idx in enumerate(nbrs_list):
        if idx in inverse_mapper:
            nearest_id[i] = inverse_mapper[idx]
    return nearest_id

## Euclidean distance

In [5]:
# NearestNeighbors treats every row as one sample, so flip X to put the items on the rows.
X_train = X.T
print(X_train.shape)
print(X.shape)

(105984, 714791)
(714791, 105984)


In [6]:
# Euclidean KNN over the item rows. Six neighbours are requested because the
# first hit is dropped below (presumably it is the query item itself).
model = NearestNeighbors(n_neighbors=6).fit(X_train)

# Query with the grill brush's ratings as a single row; only the indices are needed.
grill_brush_row = np.transpose(grill_brush_vec)
nbrs = model.kneighbors(grill_brush_row, n_neighbors=6, return_distance=False)

# Translate the matrix indices back to item ids and skip the first hit.
nbrs_idx = nbrs[0]
nearest_id = nearest_ids(nbrs_idx)
item_list = nearest_id[1:]
print(item_list)

[b'B00IJB5MCS' b'B00IJB4MLA' b'B00EXE4O42' b'B00743MZCM' b'B00HVXQY9A']


## Normalized Euclidean Distance

In [7]:
# Scale every item row to unit L2 norm (normalize's default), then fit Euclidean KNN on it.
X_train_norm = normalize(X_train)
model_norm = NearestNeighbors(n_neighbors=6)
model_norm.fit(X_train_norm)

# The query has to live in the same normalized space as the fitted rows.
# Querying with the raw rating vector happens to give the same ranking (for
# unit-length rows the distance is monotone in the cosine), but the distances
# themselves would be on the wrong scale.
grill_brush_norm = normalize(np.transpose(grill_brush_vec))

# KNN on the normalized data to get the indices of the nearest items
nbrs_normed = model_norm.kneighbors(grill_brush_norm, n_neighbors=6, return_distance=False)

# Get the item ids of the nearest items; the first hit is dropped
# (presumably the query item itself).
nbrs_idx_norm = nbrs_normed[0]
nearest_norm_id = nearest_ids(nbrs_idx_norm)
norm_item_list = nearest_norm_id[1:]
print(norm_item_list)

[b'B00IJB5MCS' b'B00IJB8F3G' b'B00IJB4MLA' b'B00EF45AHU' b'B00EF3YF0Y']


## Cosine similarity

In [8]:
# Same item-neighbour search, but ranked by cosine distance instead of Euclidean.
model_cosine = NearestNeighbors(n_neighbors=6, metric='cosine').fit(X_train)

# Indices of the six items closest to the grill brush.
nbrs_cosine = model_cosine.kneighbors(
    np.transpose(grill_brush_vec), n_neighbors=6, return_distance=False
)

# Map the indices to item ids and drop the first hit.
nbrs_idx_cosine = nbrs_cosine[0]
nearest_cosine_id = nearest_ids(nbrs_idx_cosine)
cosine_item_list = nearest_cosine_id[1:]
print(cosine_item_list)

[b'B00IJB5MCS' b'B00IJB8F3G' b'B00IJB4MLA' b'B00EF45AHU' b'B00EF3YF0Y']


In [9]:
# Element-wise check that cosine and normalized Euclidean picked the same items in the same order.
neighbours_match = cosine_item_list == norm_item_list
print(neighbours_match)

[ True  True  True  True  True]


## Finding the number of reviews for each item

In [16]:
# The id arrays hold byte strings; convert each one into a list of ordinary str ids.
list1 = list(map(bytes.decode, item_list))
list2 = list(map(bytes.decode, cosine_item_list))
list3 = list(map(bytes.decode, norm_item_list))
print(list1)
print(list2)
print(list3)

['B00IJB5MCS', 'B00IJB4MLA', 'B00EXE4O42', 'B00743MZCM', 'B00HVXQY9A']
['B00IJB5MCS', 'B00IJB8F3G', 'B00IJB4MLA', 'B00EF45AHU', 'B00EF3YF0Y']
['B00IJB5MCS', 'B00IJB8F3G', 'B00IJB4MLA', 'B00EF45AHU', 'B00EF3YF0Y']


In [11]:
# Number of ratings per item, as a two-column frame (item, count).
items_groups = (
    ratings
    .groupby(['item'])
    .size()
    .reset_index(name='count')
)
# Show the five items with the most ratings.
items_groups.sort_values(by='count', ascending=False).head(5)

Unnamed: 0,item,count
10959,B000HCLLMM,3180
1622,B000071NUS,2348
400,B00004R9VV,2091
895,B00004SD7B,1707
19184,B000WEOQV8,1466


In [12]:
# Look up the rating-count row of every neighbour found by the Euclidean (list1)
# and cosine (list2) searches.
count_L1 = [items_groups[items_groups['item'] == item_id] for item_id in list1]
count_L2 = [items_groups[items_groups['item'] == item_id] for item_id in list2]
print(count_L1)
print(count_L2)
# This makes sense: cosine gives the same neighbours as normalized Euclidean.
# Plain Euclidean looks for the products bought by the most similar users, whereas
# cosine benefits from normalization: it still reflects items bought by similar
# users, but also takes the relative popularity of an item into account.

[              item  count
103866  B00IJB5MCS     55,               item  count
103865  B00IJB4MLA     45,              item  count
98897  B00EXE4O42      1,              item  count
72226  B00743MZCM      1,               item  count
102810  B00HVXQY9A      1]
[              item  count
103866  B00IJB5MCS     55,               item  count
103867  B00IJB8F3G     91,               item  count
103865  B00IJB4MLA     45,              item  count
98068  B00EF45AHU     66,              item  count
98066  B00EF3YF0Y    110]


In [13]:
# 400 ones followed by 100 entries of 0.1, joined into a single 1-D vector.
v1 = np.ones(400)
v2 = np.full(100, 0.1)
v = np.concatenate((v1, v2))

In [14]:
# Rebuild v from v1 and v2 (same result as the assignment to v in the cell before this one).
v = np.concatenate((v1, v2))

In [15]:
# Display the shape of the combined vector.
v.shape

(500,)