# Nearest Neighbor product based Collaborative Filtering

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [2]:
combined=pd.read_csv("Combined.csv")

In [3]:
combined.head()

Unnamed: 0.1,Unnamed: 0,order_id,order_item_id,product_id,seller_id,price,freight_value,product_category_name,customer_id,review_id,review_score,review_comment_title,review_creation_date,review_answer_timestamp,customer_unique_id,customer_city,customer_state,seller_city,seller_state,product_category_name_english
0,0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,58.9,13.29,cool_stuff,3ce436f183e68e07877b285a838db11a,97ca439bc427b48bc1cd7177abe71365,5,Super recomended,2017-09-21 00:00:00,2017-09-22 10:57:03,871766c5855e863f6eccc05f988b23cb,campos dos goytacazes,RJ,volta redonda,SP,cool_stuff
1,1,130898c0987d1801452a8ed92a670612,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,55.9,17.96,cool_stuff,e6eecc5a77de221464d1c4eaff0a9b64,b11cba360bbe71410c291b764753d37f,5,Super recomended,2017-07-14 00:00:00,2017-07-17 12:50:07,0fb8e3eab2d3e79d92bb3fffbb97f188,jatai,GO,volta redonda,SP,cool_stuff
2,2,532ed5e14e24ae1f0d735b91524b98b9,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,64.9,18.33,cool_stuff,4ef55bf80f711b372afebcb7c715344a,af01c4017c5ab46df6cc810e069e654a,4,Recomended,2018-06-05 00:00:00,2018-06-06 21:41:12,3419052c8c6b45daf79c1e426f9e9bcb,belo horizonte,MG,volta redonda,SP,cool_stuff
3,3,6f8c31653edb8c83e1a739408b5ff750,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,58.9,16.17,cool_stuff,30407a72ad8b3f4df4d15369126b20c9,8304ff37d8b16b57086fa283fe0c44f8,5,Super recomended,2017-08-10 00:00:00,2017-08-13 03:35:17,e7c828d22c0682c1565252deefbe334d,sao jose dos pinhais,PR,volta redonda,SP,cool_stuff
4,4,7d19f4ef4d04461989632411b7e588b9,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,58.9,13.29,cool_stuff,91a792fef70ecd8cc69d3c7feb3d12da,426f43a82185969503fb3c86241a9535,5,Super recomended,2017-08-25 00:00:00,2017-08-28 00:51:18,0bb98ba72dcc08e95f9d8cc434e9a2cc,conselheiro lafaiete,MG,volta redonda,SP,cool_stuff


In [4]:
df1=pd.read_csv("combined.csv",usecols=['product_id','customer_unique_id','product_category_name_english','review_score'])

In [5]:
df1.head()

Unnamed: 0,product_id,review_score,customer_unique_id,product_category_name_english
0,4244733e06e7ecb4970a6e2683c13e61,5,871766c5855e863f6eccc05f988b23cb,cool_stuff
1,4244733e06e7ecb4970a6e2683c13e61,5,0fb8e3eab2d3e79d92bb3fffbb97f188,cool_stuff
2,4244733e06e7ecb4970a6e2683c13e61,4,3419052c8c6b45daf79c1e426f9e9bcb,cool_stuff
3,4244733e06e7ecb4970a6e2683c13e61,5,e7c828d22c0682c1565252deefbe334d,cool_stuff
4,4244733e06e7ecb4970a6e2683c13e61,5,0bb98ba72dcc08e95f9d8cc434e9a2cc,cool_stuff


Combine total product review and count the total reviews

In [6]:
combine_product_review = df1.dropna(axis = 0, subset = ['product_category_name_english'])
product_reviewCount = (combine_product_review.
     groupby(by = ['product_category_name_english'])['review_score'].
     count().
     reset_index().
     rename(columns = {'review_score': 'totalReviewCount'})
     [['product_category_name_english','totalReviewCount']]
    )
product_reviewCount.head()

Unnamed: 0,product_category_name_english,totalReviewCount
0,agro_industry_and_commerce,241
1,air_conditioning,279
2,art,209
3,arts_and_craftmanship,19
4,audio,362


# After merging Total review count column into dataset

In [7]:
review_with_totalReviewCount  = combine_product_review.merge(product_reviewCount, left_on = 'product_category_name_english', right_on = 'product_category_name_english', how = 'left')
review_with_totalReviewCount.head()

Unnamed: 0,product_id,review_score,customer_unique_id,product_category_name_english,totalReviewCount
0,4244733e06e7ecb4970a6e2683c13e61,5,871766c5855e863f6eccc05f988b23cb,cool_stuff,3847
1,4244733e06e7ecb4970a6e2683c13e61,5,0fb8e3eab2d3e79d92bb3fffbb97f188,cool_stuff,3847
2,4244733e06e7ecb4970a6e2683c13e61,4,3419052c8c6b45daf79c1e426f9e9bcb,cool_stuff,3847
3,4244733e06e7ecb4970a6e2683c13e61,5,e7c828d22c0682c1565252deefbe334d,cool_stuff,3847
4,4244733e06e7ecb4970a6e2683c13e61,5,0bb98ba72dcc08e95f9d8cc434e9a2cc,cool_stuff,3847


In [8]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(product_reviewCount['totalReviewCount'].describe())

count      71.000
mean     1522.141
std      2518.406
min         2.000
25%        92.000
50%       279.000
75%      1788.000
max     10647.000
Name: totalReviewCount, dtype: float64


filtering out products that have less than 100 reviews 

In [9]:
popularity_threshold =100
review_popular_product= review_with_totalReviewCount.query('totalReviewCount >= @popularity_threshold')
review_popular_product.head()

Unnamed: 0,product_id,review_score,customer_unique_id,product_category_name_english,totalReviewCount
0,4244733e06e7ecb4970a6e2683c13e61,5,871766c5855e863f6eccc05f988b23cb,cool_stuff,3847
1,4244733e06e7ecb4970a6e2683c13e61,5,0fb8e3eab2d3e79d92bb3fffbb97f188,cool_stuff,3847
2,4244733e06e7ecb4970a6e2683c13e61,4,3419052c8c6b45daf79c1e426f9e9bcb,cool_stuff,3847
3,4244733e06e7ecb4970a6e2683c13e61,5,e7c828d22c0682c1565252deefbe334d,cool_stuff,3847
4,4244733e06e7ecb4970a6e2683c13e61,5,0bb98ba72dcc08e95f9d8cc434e9a2cc,cool_stuff,3847


In [10]:
review_popular_product.shape

(107383, 5)

# create a Pivot matrix

In [11]:
product_features_df=review_popular_product.pivot_table(index='product_category_name_english',columns='customer_unique_id',values='review_score').fillna(0)
product_features_df.head()

customer_unique_id,0000366f3b9a7992bf8c76cfdf3221e2,0000b849f77a49e4a4ce2b2a4ca5be3f,0000f46a3911fa3c0805444483337064,0000f6ccb0745a6a4b88665a16c9f078,0004aac84e0df4da2b147fca70cf8255,0004bd2a26a76fe21f786e4fbd80607f,00050ab1314c0e55a6ca13cf7181fecf,00053a61a98854899e70ed204dd4bafe,0005e1862207bf6ccc02e4228effd9a0,0005ef4cd20d2893f0d9fbd94d3c0d97,...,fff7219c86179ca6441b8f37823ba3d3,fff96bc586f78b1f070da28c4977e810,fffa431dd3fcdefea4b1777d114144f2,fffb09418989a0dbff854a28163e47c6,fffcc512b7dfecaffd80f13614af1d16,fffcf5a5ff07b0908bd4e2dbc735a684,fffea47cd6d3cc0a88bd621562a9d061,ffff371b4d645b6ecea244b27531430a,ffff5962728ec6157033ef9805bacc48,ffffd2657e2aad2907e67c3e9daecbeb
product_category_name_english,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
agro_industry_and_commerce,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
air_conditioning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
art,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
audio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
auto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0


In [12]:
from scipy.sparse import csr_matrix

product_features_df_matrix = csr_matrix(product_features_df.values)

In [13]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(product_features_df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [14]:
product_features_df.shape

(53, 90515)

# choose a random product

In [15]:
query_index = np.random.choice(product_features_df.shape[0])
print(query_index)
distances, indices = model_knn.kneighbors(product_features_df.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)

34


# Based on that product what we have choosen 5 nearest distance product get recommended

In [16]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(product_features_df.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, product_features_df.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for home_appliances_2:

1: bed_bath_table, with distance of 0.9996340565324598:
2: home_construction, with distance of 1.0:
3: industry_commerce_and_business, with distance of 1.0:
4: housewares, with distance of 1.0:
5: home_confort, with distance of 1.0:
