In [20]:
# Import libraries
import pandas as pd
import pickle

# Load the feature-engineered sales transactions; the relative path is
# resolved against the notebook's current working directory.
sales_df = pd.read_csv(filepath_or_buffer='Sales-Transactions-Edited.csv')

## Build Correlation Matrix for the Product-Customer relation

In [21]:
# Total quantity of each product bought by each customer: one row per
# (Product, Party) pair, with both grouping keys returned as ordinary columns.
prod_cust_qty_df = (
    sales_df
    .groupby(['Product', 'Party'])
    .agg({'Qty': 'sum'})
    .reset_index()
)

# Number of distinct customers (Party) that purchased each product, with
# Product returned as an ordinary column.
prod_cust_count_df = (
    sales_df
    .groupby(['Product'])
    .agg({'Party': 'nunique'})
    .rename(columns={'Party': 'No_of_Customers'})
    .reset_index()
)

# Attach each product's customer count to its per-customer quantities.
# No_of_Customers is carried along in prod_cust_df but is not read by the
# pivot below.
prod_cust_df = pd.merge(
    prod_cust_qty_df,
    prod_cust_count_df,
    how='inner',
    on='Product',
)

# Customer x product matrix: customers on rows, products on columns, total
# quantity as values. (Party, Product) pairs with no purchase are filled with 0.
prod_cust_pivot_df = (
    prod_cust_df
    .pivot(index='Party', columns='Product', values='Qty')
    .fillna(0)
)

# Product-to-product correlation of the per-customer quantity columns, using
# the Pearson method.
# NOTE(review): the previous comment here said Spearman had been chosen (and
# that Kendall was too slow / needed extra libraries), but the call passes
# method='pearson' - confirm which method is intended.
# NOTE(review): missing values were already replaced by 0 above, so
# min_periods=5 presumably only matters when fewer than 5 customers exist.
prod_correlation_df = prod_cust_pivot_df.corr(method='pearson', min_periods=5)
prod_correlation_df.head(10)

Product,1.25 COOLDRINKS,"10"" CLASSIFOAM-1200","10"" ESSFOAM LOOSE","10"" GREEN","10"" SILVER HEAVY","10"" THERMOCOL PRINT",10*10 CITIZEN,10*10 DHAVAT,10*10 JANATHA,10*10 MORE,...,WATER DISPENSERS,WATER GLASS,WATER GLASS(300),WELCOME GLASS,WINE GLASS,ZEN-D CHEAP,ZEN-REALPACK,ZEND-1ST,ZEND-CLASSIC,ZEND-PREMIUM
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.25 COOLDRINKS,1.0,-0.002911,-0.001453,-0.001406,-0.001406,-0.001752,0.060341,-0.001912,-0.001406,0.197877,...,-0.001406,0.014849,-0.003028,-0.002202,0.274311,-0.001829,-0.002407,-0.004745,-0.001837,-0.004961
"10"" CLASSIFOAM-1200",-0.002911,1.0,-0.003007,-0.002911,-0.002911,-0.003626,-0.004583,-0.003958,-0.002911,-0.008453,...,-0.002911,0.026177,-0.006267,0.42887,0.112072,-0.003786,-0.004982,0.093742,-0.003802,-0.002773
"10"" ESSFOAM LOOSE",-0.001453,-0.003007,1.0,-0.001453,0.999445,-0.001809,0.935564,0.958691,-0.001453,0.894049,...,-0.001453,0.657887,0.24816,-0.002274,0.526687,-0.001889,-0.002486,0.257225,0.156396,-0.005124
"10"" GREEN",-0.001406,-0.002911,-0.001453,1.0,-0.001406,-0.001752,-0.002214,-0.001912,-0.001406,-0.004084,...,-0.001406,-0.004977,-0.003028,0.182266,-0.006074,-0.001829,-0.002407,-0.004745,-0.001837,-0.004961
"10"" SILVER HEAVY",-0.001406,-0.002911,0.999445,-0.001406,1.0,-0.001752,0.936113,0.959242,-0.001406,0.89464,...,-0.001406,0.658387,0.248387,-0.002202,0.527157,-0.001829,-0.002407,0.257514,0.156537,-0.004961
"10"" THERMOCOL PRINT",-0.001752,-0.003626,-0.001809,-0.001752,-0.001752,1.0,-0.002758,-0.002382,-0.001752,0.004589,...,-0.001752,-0.006199,-0.003771,-0.002743,-0.007566,-0.002278,-0.002998,0.289732,-0.002288,0.036621
10*10 CITIZEN,0.060341,-0.004583,0.935564,-0.002214,0.936113,-0.002758,1.0,0.922618,-0.002214,0.850823,...,-0.002214,0.628333,0.230812,-0.003467,0.507596,-0.002879,-0.003789,0.240197,0.145506,0.001119
10*10 DHAVAT,-0.001912,-0.003958,0.958691,-0.001912,0.959242,-0.002382,0.922618,1.0,-0.001912,0.879257,...,-0.001912,0.651301,0.237192,-0.002994,0.521059,-0.002487,-0.003273,0.248967,0.14951,0.007545
10*10 JANATHA,-0.001406,-0.002911,-0.001453,-0.001406,-0.001406,-0.001752,-0.002214,-0.001912,1.0,0.012746,...,-0.001406,-0.004977,-0.003028,-0.002202,-0.006074,-0.001829,-0.002407,-0.004745,-0.001837,-0.004961
10*10 MORE,0.197877,-0.008453,0.894049,-0.004084,0.89464,0.004589,0.850823,0.879257,0.012746,1.0,...,-0.004084,0.59232,0.239477,-0.006394,0.565211,0.01345,-0.006988,0.222098,0.137397,0.026506


## Write the Product to Product Correlation Matrix to a (.csv) file

In [22]:
# Persist the product-to-product correlation matrix as CSV (index included).
prod_correlation_df.to_csv(path_or_buf='Product-Product-Correlation-Matrix.csv')

## Create a Pickle (.pkl) file with the Correlation Matrix dataframe

In [23]:
# Serialise the correlation matrix DataFrame to a pickle file. The context
# manager guarantees the file handle is flushed and closed even if dump() raises.
with open('prod_correlation_model.pkl', 'wb') as pkl_file:
    pickle.dump(prod_correlation_df, pkl_file)