In [1]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats
# Visualization
import seaborn as sns
# Similarity
from sklearn.metrics.pairwise import cosine_similarity
from product_data import ProductData
from save_file import save_file

In [2]:
# Build the project's ProductData wrapper from the pivot CSV.
# NOTE(review): presumably the CSV is a customer x product purchase pivot
# (the preview below shows a Customer_ID column followed by product-code columns).
file_name = "recom_pivot.csv"
prod = ProductData(file_name)
# Project helper; the .info() output further down reports Int64/string dtypes after this call.
prod.set_pivot_dataframe_data_types()
# Preview the first rows of the recommender frame.
prod.df_recommender.head()

Unnamed: 0,Customer_ID,5000.5,5001.0,5001.5,5002.0,5002.5,5003.0,5004.0,5004.5,5005.0,...,200035.5,200037.0,200037.5,200038.0,200038.5,200045.5,200046.0,200046.5,200047.0,350027.5
0,00024de6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,00084856,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0008e848,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,00096930,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,000c66b7,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Summarise row count, column count, dtypes and memory usage of the recommender frame.
prod.df_recommender.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28514 entries, 0 to 28513
Columns: 334 entries, Customer_ID to 350027.5
dtypes: Int64(333), string(1)
memory usage: 81.7 MB


In [4]:
# Preview df_products; per the displayed output it holds the product columns only (no Customer_ID).
prod.df_products.head()

Unnamed: 0,5000.5,5001.0,5001.5,5002.0,5002.5,5003.0,5004.0,5004.5,5005.0,5007.0,...,200035.5,200037.0,200037.5,200038.0,200038.5,200045.5,200046.0,200046.5,200047.0,350027.5
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Pearson customer similarity computed directly with pandas.
# This takes a long time to run, so it is disabled by default; set the flag
# below to True only when the result actually needs to be regenerated.

is_execute_long_time_code = False
if is_execute_long_time_code:
    # Customer similarity matrix using Pearson correlation.
    # DataFrame.corr() correlates columns, hence the transpose so that the
    # original rows become the variables being compared.
    df_customer_similarity = prod.df_products.T.corr()

    # Hand the result to the project's save_file helper so the slow
    # computation does not have to be repeated (presumably writes this CSV).
    output_file = "customer_similarity.csv"
    save_file(output_file, df_customer_similarity)

In [10]:
# Convert the dataframe to a float NumPy array; the DataFrame-based
# correlation is too slow.
# np.corrcoef treats each row of its input as one variable, so the result is a
# square matrix with one row/column per row of df_products.
# NOTE(review): a row with zero variance would produce NaN correlations —
# confirm whether such rows exist, since later cells compare these values.
x1 = prod.df_products.astype(float).to_numpy()
np_customer_similarity = np.corrcoef(x1)


In [21]:
# Show the top-left corner of the Pearson similarity matrix, one labelled
# line per customer.
n_preview = 12
for i in range(n_preview):
    customer_id = prod.dict_customers[i]
    # Slice before rounding so only the displayed values are processed
    # (the full row has one entry per customer). .tolist() yields plain
    # Python floats, so the printed form does not depend on NumPy's scalar repr.
    row = np.round(np_customer_similarity[i, :n_preview], 2).tolist()
    print(customer_id, row)

00024de6 [1.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0]
00084856 [-0.0, 1.0, -0.0, 0.71, -0.01, 0.95, -0.0, -0.0, -0.01, -0.0, -0.0, -0.0]
0008e848 [-0.0, -0.0, 1.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0]
00096930 [-0.0, 0.71, -0.0, 1.0, -0.0, 0.89, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0]
000c66b7 [-0.0, -0.01, -0.0, -0.0, 1.0, -0.01, -0.0, -0.0, -0.01, -0.0, -0.0, -0.0]
000e98ee [-0.0, 0.95, -0.0, 0.89, -0.01, 1.0, -0.0, -0.0, -0.01, -0.0, -0.0, -0.0]
00165e2e [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0, -0.0, -0.0, -0.0, -0.0]
00171ffd [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0, -0.0, -0.0, -0.0]
0019e439 [-0.0, -0.01, -0.0, -0.0, -0.01, -0.01, -0.0, -0.0, 1.0, -0.0, -0.0, -0.0]
001a93a4 [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0, -0.0]
001b1dab [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0]
001dfadd [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0]


In [12]:
# Customer similarity matrix using cosine similarity.
# cosine_similarity compares the rows of its input pairwise, so the result has
# one row/column per row of df_products.
np_customer_similarity_cosine = cosine_similarity(prod.df_products)


In [22]:
# Show the top-left corner of the cosine similarity matrix, one labelled
# line per customer.
n_preview = 12
for i in range(n_preview):
    customer_id = prod.dict_customers[i]
    # Slice before rounding so only the displayed values are processed
    # (the full row has one entry per customer). .tolist() yields plain
    # Python floats, so the printed form does not depend on NumPy's scalar repr.
    row = np.round(np_customer_similarity_cosine[i, :n_preview], 2).tolist()
    print(customer_id, row)

00024de6 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
00084856 [0.0, 1.0, 0.0, 0.71, 0.0, 0.95, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0008e848 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
00096930 [0.0, 0.71, 0.0, 1.0, 0.0, 0.89, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
000c66b7 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
000e98ee [0.0, 0.95, 0.0, 0.89, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
00165e2e [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
00171ffd [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
0019e439 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
001a93a4 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
001b1dab [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
001dfadd [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]


In [34]:
# Find the customers most similar to one picked customer, using the Pearson
# similarity matrix computed earlier in the notebook.

# Number of similar users to report
n = 10
# Minimum correlation for a customer to count as "similar"
customer_similarity_threshold = 0.3
picked_customer_id = "000e98ee"
picked_customer_ind = prod.dict_customers_rvs[picked_customer_id]

# Correlations between the picked customer and every customer
customer_row = np_customer_similarity[picked_customer_ind]

# (customer id, raw correlation) pairs that pass the threshold
list_results = []
for customer_ind, corr_val in enumerate(customer_row):
    # Written as `not (x >= t)` rather than `x < t` so that NaN correlations
    # are rejected too: every ordering comparison with NaN is False, so the
    # `<` form would silently let NaN through and corrupt the sort below.
    if not corr_val >= customer_similarity_threshold:
        continue

    customer = prod.dict_customers[customer_ind]
    # A customer is always perfectly correlated with itself; skip it.
    if customer == picked_customer_id:
        continue

    list_results.append((customer, corr_val))

# Rank on the unrounded correlation so that near-ties (e.g. 0.996 vs 1.0)
# are ordered correctly; rounding is applied only when displaying.
list_results.sort(key=lambda pair: pair[1], reverse=True)

# Print out top n similar users
print(f'The similar customers for customer {picked_customer_id} are:')
for customer, corr_val in list_results[:n]:
    print(customer, round(corr_val, 2))

The similar customers for customer 000e98ee are:
15b48f75 1.0
26678038 1.0
29e44035 1.0
4326cc2b 1.0
4bf15238 1.0
50b4a246 1.0
64ff0aba 1.0
6ded3f04 1.0
73b89da1 1.0
783372ab 1.0


In [None]:
# TODO: recommend products for the picked customer.
# - Candidate products are those that the similar customers have bought.
# - It is OK to recommend products that the customer has already bought.
