In [55]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats
# Visualization
import seaborn as sns
# Similarity
from sklearn.metrics.pairwise import cosine_similarity
from product_data import ProductData
from save_file import save_file
from collections import OrderedDict

In [56]:
file_name = "recom_pivot.csv"
prod = ProductData(file_name)
prod.set_pivot_dataframe_data_types()
prod.df_recommender.head()

Unnamed: 0,Customer_ID,5000.5,5001.0,5001.5,5002.0,5002.5,5003.0,5004.0,5004.5,5005.0,...,200035.5,200037.0,200037.5,200038.0,200038.5,200045.5,200046.0,200046.5,200047.0,350027.5
0,00024de6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,00084856,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0008e848,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,00096930,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,000c66b7,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
prod.df_recommender.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28514 entries, 0 to 28513
Columns: 334 entries, Customer_ID to 350027.5
dtypes: Int64(333), string(1)
memory usage: 81.7 MB


In [58]:
prod.df_products.head()

Unnamed: 0,5000.5,5001.0,5001.5,5002.0,5002.5,5003.0,5004.0,5004.5,5005.0,5007.0,...,200035.5,200037.0,200037.5,200038.0,200038.5,200045.5,200046.0,200046.5,200047.0,350027.5
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# This code takes a long time to run, so better not to run it.

is_execute_long_time_code = False
if is_execute_long_time_code:
    # Customer similarity matrix using Pearson correlation
    df_customer_similarity = prod.df_products.T.corr()

    # This code takes a long time to run, so save the dataframe to file
    output_file = "customer_similarity.csv"
    save_file(output_file, df_customer_similarity)

In [60]:
# Convert the dataframe to a numpy array. Processing is too slow for dataframes
x1 = prod.df_products.astype(float).to_numpy()
np_customer_similarity = np.corrcoef(x1)


In [61]:
for i in range(0,12):   
    customer_id = prod.dict_customers[i]
    row = [round(item, 2) for item in np_customer_similarity[i]]
    print(customer_id, row[0:12])

00024de6 [1.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0]
00084856 [-0.0, 1.0, -0.0, 0.71, -0.01, 0.95, -0.0, -0.0, -0.01, -0.0, -0.0, -0.0]
0008e848 [-0.0, -0.0, 1.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0]
00096930 [-0.0, 0.71, -0.0, 1.0, -0.0, 0.89, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0]
000c66b7 [-0.0, -0.01, -0.0, -0.0, 1.0, -0.01, -0.0, -0.0, -0.01, -0.0, -0.0, -0.0]
000e98ee [-0.0, 0.95, -0.0, 0.89, -0.01, 1.0, -0.0, -0.0, -0.01, -0.0, -0.0, -0.0]
00165e2e [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0, -0.0, -0.0, -0.0, -0.0]
00171ffd [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0, -0.0, -0.0, -0.0]
0019e439 [-0.0, -0.01, -0.0, -0.0, -0.01, -0.01, -0.0, -0.0, 1.0, -0.0, -0.0, -0.0]
001a93a4 [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0, -0.0]
001b1dab [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0, -0.0]
001dfadd [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 1.0]


In [62]:
# Customer similarity matrix using cosine similarity
# This takes up memory and produces a very similar result, so no need to execute it
get_customer_similarity_cosine = False
if get_customer_similarity_cosine:
    np_customer_similarity_cosine = cosine_similarity(prod.df_products)
    for i in range(0,12):   
        customer_id = prod.dict_customers[i]
        row = [round(item, 2) for item in np_customer_similarity_cosine[i]]
        print(customer_id, row[0:12])


In [63]:
# Number of similar users
n = 10
# User similarity threshold
customer_similarity_threshold = 0.8
picked_customer_id = "000e98ee"
picked_customer_ind = prod.dict_customers_rvs[picked_customer_id]

# Get top n similar users
# Also exclude similar users where the correlation = 1.
# Correlation = 1 means the customer record is precisely the same as the picked customer.
# In which case no new products will be recommended from that customer
customer_row = np_customer_similarity[picked_customer_ind]
list_customer_results = []
for customer_ind, corr_val in enumerate(customer_row):
    if corr_val < customer_similarity_threshold and corr_val < 1:
        continue

    customer = prod.dict_customers[customer_ind]
    if customer == picked_customer_id:
        continue

    correlation = round(corr_val, 2)
    list_customer_results.append((customer, correlation))

list_customer_results.sort(key = lambda x: x[1], reverse=True)

# Print out top n similar users
print(f'The similar customers for customer {picked_customer_id} are:', )
for customer, correlation in list_customer_results[:n]:
    print(customer, correlation)

The similar customers for customer 000e98ee are:
15b48f75 1.0
26678038 1.0
29e44035 1.0
4326cc2b 1.0
4bf15238 1.0
50b4a246 1.0
64ff0aba 1.0
6ded3f04 1.0
73b89da1 1.0
783372ab 1.0


In [64]:
np_products = prod.df_products.to_numpy()
np_products.shape

(28514, 333)

In [74]:
# Need to recommend products for the customer
# Do not recommend products that the customer has already bought (ignore where correlation = 1)
# Need to recommend products that similar customers have bought
# Remove products that none of the customers have bought or the picked customer has already bought

np_products = prod.df_products.to_numpy()
product_names = prod.df_products.columns.values
# convert product names list to dictionary
product_dict = {key: i for i, key in enumerate(product_names)}
product_dict_rvs = dict([(value, key) for key, value in product_dict.items()])
# get product_ids for the picked customer
picked_customer_products = prod.df_products.iloc[picked_customer_ind]
dict_picked_customer_products = {}
list_picked_customer_products = []
for i, val in enumerate(picked_customer_products.values):
    if val > 0:
        product_id = product_dict_rvs[i]
        dict_picked_customer_products[product_id] = val
        list_picked_customer_products.append(product_id)

print("Products already bought by the picked customer")
dict_picked_customer_products



Products already bought by the picked customer


{'49291.5': 2, '49292.0': 1}

In [69]:
np_selected_customers = np.empty((0, prod.df_products.shape[1]))
for customer, correlation in list_customer_results:
    customer_ind = prod.dict_customers_rvs[customer]
    row = np_products[customer_ind]
    np_selected_customers = np.vstack([np_selected_customers, row])

print(np_selected_customers)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 1 0 ... 0 0 0]
 [2 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [72]:
product_totals = np_selected_customers.sum(axis=0)
print("These are the total number of sales per product for all the similar customers")
product_totals

These are the total number of sales per product for all the similar customers


array([20, 4, 5, 10, 0, 0, 0, 0, 0, 0, 0, 6, 0, 4, 9, 0, 0, 0, 4, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 8, 3, 0, 0, 0, 0, 0, 1, 2, 1,
       0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 9, 6, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 2, 1, 0, 0, 0, 0, 1,
       2, 0, 2, 4, 1, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 2, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 7, 3,
       4, 2, 0, 12, 1, 0, 0, 12, 0, 0, 0, 9, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0,
       0, 0, 2, 0, 2, 0, 1, 0, 0, 3483, 159, 1, 8, 0, 0, 4, 0, 0, 0, 0, 0,
       2, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Locate the best selling products for all the similar customers, these will be the recommended products for the selected customer. Exclude products that the customer has already bought.

In [73]:
dict_product_totals = dict((product_dict_rvs[key], value) for key, value in enumerate(product_totals, 0) if value > 3 and product_dict_rvs[key] not in list_picked_customer_products)
dict_product_totals = sorted(dict_product_totals.items(), key=lambda x:x[1], reverse=True)
print(f"Products recommended for the picked customer {picked_customer_id}")
dict_product_totals

Products recommended for the picked customer 000e98ee


[('5000.5', 20),
 ('45004.0', 12),
 ('48504.5', 12),
 ('5002.0', 10),
 ('5012.0', 9),
 ('25003.0', 9),
 ('48513.0', 9),
 ('10013.0', 8),
 ('49356.0', 8),
 ('45001.0', 7),
 ('5009.0', 6),
 ('25003.5', 6),
 ('5001.5', 5),
 ('49567.5', 5),
 ('5001.0', 4),
 ('5011.5', 4),
 ('5017.0', 4),
 ('35000.5', 4),
 ('35076.0', 4),
 ('45002.5', 4),
 ('49427.5', 4)]