In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

## Data Loading

In [5]:
# Read the three raw tables (customers, products, transactions) from the
# local data/ directory, in that order.
customers, products, transactions = map(
    pd.read_csv,
    ('data/Customers.csv', 'data/Products.csv', 'data/Transactions.csv'),
)

In [7]:
# Build one wide transaction-level table: each transaction row is enriched
# with its customer's attributes and its product's attributes (left joins,
# so every transaction row is kept).
merged_data = (
    transactions
    .merge(customers, how='left', on='CustomerID')
    .merge(products, how='left', on='ProductID')
)

# The product join leaves two price columns (Price_x from the left-hand
# frame, Price_y from products); keep the left-hand one under the plain name.
merged_data['Price'] = merged_data['Price_x']
merged_data = merged_data.drop(columns=['Price_x', 'Price_y'])
merged_data.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'CustomerName', 'Region', 'SignupDate',
       'ProductName', 'Category', 'Price'],
      dtype='object')

## Feature Engineering:
* ### Existing features - Region
* ### Category-wise total spend
* ### Category-wise quantity


In [34]:
# Start the per-customer feature table from a copy of the customers table and
# add one "<Category>_TotalValue" column per product category holding the
# customer's summed TotalValue in that category.
customer_new = customers.copy()
value_columns = []
for cat in products.Category.unique():
    column = f'{cat}_TotalValue'
    spend = (
        merged_data.loc[merged_data.Category == cat]
        .groupby('CustomerID')['TotalValue']
        .sum()
        .rename(column)
        .reset_index()
    )
    # Left join keeps customers with no purchases in this category (as NaN).
    customer_new = customer_new.merge(spend, how='left', on='CustomerID')
    value_columns.append(column)

# Customers with no purchases in a category get 0 spend. Only the generated
# columns are filled: a frame-wide fillna(0) would also overwrite missing
# values in Region / SignupDate / CustomerName with 0.
customer_new = customer_new.fillna(dict.fromkeys(value_columns, 0))
customer_new.head(4)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,Books_TotalValue,Electronics_TotalValue,Home Decor_TotalValue,Clothing_TotalValue
0,C0001,Lawrence Carroll,South America,2022-07-10,114.6,2827.3,412.62,0.0
1,C0002,Elizabeth Lutz,Asia,2022-02-13,0.0,0.0,837.28,1025.46
2,C0003,Michael Rivera,South America,2024-03-07,0.0,1385.2,1217.82,122.36
3,C0004,Kathleen Rodriguez,South America,2022-10-09,1888.48,1355.74,2110.66,0.0


In [40]:
# Add one "<Category>_Quantity" column per product category holding the
# customer's summed Quantity in that category.
quantity_columns = []
for cat in products.Category.unique():
    column = f'{cat}_Quantity'
    units = (
        merged_data.loc[merged_data.Category == cat]
        .groupby('CustomerID')['Quantity']
        .sum()
        .rename(column)
        .reset_index()
    )
    # Left join keeps customers with no purchases in this category (as NaN).
    customer_new = customer_new.merge(units, how='left', on='CustomerID')
    quantity_columns.append(column)

# Customers with no purchases in a category get quantity 0. Only the generated
# columns are filled so that missing values in other columns are not silently
# replaced by 0.
customer_new = customer_new.fillna(dict.fromkeys(quantity_columns, 0))
customer_new.head(4)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,Books_TotalValue,Electronics_TotalValue,Home Decor_TotalValue,Clothing_TotalValue,Books_Quantity,Electronics_Quantity,Home Decor_Quantity,Clothing_Quantity
0,C0001,Lawrence Carroll,South America,2022-07-10,114.6,2827.3,412.62,0.0,2.0,7.0,3.0,0.0
1,C0002,Elizabeth Lutz,Asia,2022-02-13,0.0,0.0,837.28,1025.46,0.0,0.0,6.0,4.0
2,C0003,Michael Rivera,South America,2024-03-07,0.0,1385.2,1217.82,122.36,0.0,4.0,6.0,4.0
3,C0004,Kathleen Rodriguez,South America,2022-10-09,1888.48,1355.74,2110.66,0.0,8.0,6.0,9.0,0.0


In [51]:
# CustomerName and SignupDate are removed here; what remains is the ID, the
# region and the engineered per-category columns.
unused_columns = ['CustomerName', 'SignupDate']
customer_new = customer_new.drop(columns=unused_columns)
customer_new.columns

Index(['CustomerID', 'Region', 'Books_TotalValue', 'Electronics_TotalValue',
       'Home Decor_TotalValue', 'Clothing_TotalValue', 'Books_Quantity',
       'Electronics_Quantity', 'Home Decor_Quantity', 'Clothing_Quantity'],
      dtype='object')

In [59]:
# One Hot Encoding
customer_new = pd.get_dummies(customer_new, columns=['Region'], dtype='int')
customer_new.head(4)

Unnamed: 0,CustomerID,Books_TotalValue,Electronics_TotalValue,Home Decor_TotalValue,Clothing_TotalValue,Books_Quantity,Electronics_Quantity,Home Decor_Quantity,Clothing_Quantity,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,114.6,2827.3,412.62,0.0,2.0,7.0,3.0,0.0,0,0,0,1
1,C0002,0.0,0.0,837.28,1025.46,0.0,0.0,6.0,4.0,1,0,0,0
2,C0003,0.0,1385.2,1217.82,122.36,0.0,4.0,6.0,4.0,0,0,0,1
3,C0004,1888.48,1355.74,2110.66,0.0,8.0,6.0,9.0,0.0,0,0,0,1


In [67]:
# Separate the identifiers from the feature columns: customer_ids[i] is the
# ID of the customer described by row i of customer_features.
customer_ids = customer_new['CustomerID'].to_numpy()
feature_frame = customer_new.drop(columns='CustomerID')
customer_features = feature_frame.to_numpy()
print(f"Feature matrix has shape - {customer_features.shape}")

Feature matrix has shape - (200, 12)


In [71]:
# standardisation
# Standardise every feature column (fit and transform in one step on the
# full customer matrix).
scaler = StandardScaler()
customer_features = scaler.fit_transform(customer_features)

## Cosine-similarity-based retrieval

In [91]:
# Normalised customer feature
# Pairwise cosine similarity between customers.
# Each row is scaled to unit L2 length, so the dot product of two rows is the
# cosine of the angle between the two customers' feature vectors.
norms = np.linalg.norm(customer_features, axis=1, keepdims=True)  # shape (n, 1)

# Fix: the original divided by the undefined name `normalised_feature`
# (NameError on a fresh kernel); the divisor is the per-row norm.
# An all-zero row has norm 0; dividing by 1 instead leaves it as zeros
# (similarity 0 to everyone) rather than producing NaN.
safe_norms = np.where(norms == 0, 1.0, norms)
normalised_features = customer_features / safe_norms

# similarity_matrix[i, j] is the cosine similarity of customers i and j.
similarity_matrix = normalised_features @ normalised_features.T

In [127]:
# Indices of the 3 most similar *other* customers for every customer,
# ordered from most to least similar.
# The original slice [:, -2:-5:-1] assumed each customer's own entry always
# sorts last. With duplicate feature vectors (similarity tied at 1.0) or
# floating-point rounding that is not guaranteed, and the customer could show
# up in their own match list. Masking the diagonal removes self-matches
# explicitly; the copy keeps similarity_matrix intact for score look-ups.
masked_similarity = similarity_matrix.copy()
np.fill_diagonal(masked_similarity, -np.inf)
top_3_matches = np.argsort(masked_similarity, axis=1)[:, :-4:-1]

In [155]:
# Map each of the first 20 customers to a list of
# (matched CustomerID, similarity score) pairs, in the order given by
# top_3_matches.
id_column = customer_new['CustomerID']
output_dict = {
    id_column[row]: [
        (id_column[neighbour], similarity_matrix[row, neighbour])
        for neighbour in top_3_matches[row]
    ]
    for row in range(20)
}
output_dict
    
    

{'C0001': [('C0120', 0.9266967728447603),
  ('C0184', 0.890560235547983),
  ('C0181', 0.8699657227756472)],
 'C0002': [('C0159', 0.9843876963185766),
  ('C0178', 0.9439721552035605),
  ('C0134', 0.8885396667929267)],
 'C0003': [('C0031', 0.9015732784600285),
  ('C0195', 0.8432421192137682),
  ('C0152', 0.8326792257386539)],
 'C0004': [('C0148', 0.8555820718946241),
  ('C0113', 0.8513036099683192),
  ('C0012', 0.8375567950393362)],
 'C0005': [('C0007', 0.9803202745178001),
  ('C0140', 0.9196436453350602),
  ('C0146', 0.8342877259478129)],
 'C0006': [('C0108', 0.8950571311017018),
  ('C0169', 0.8282567083827392),
  ('C0187', 0.7424143772571247)],
 'C0007': [('C0005', 0.9803202745178001),
  ('C0140', 0.8477782114789901),
  ('C0146', 0.8075833540171649)],
 'C0008': [('C0109', 0.7845598957715225),
  ('C0059', 0.7301303414640772),
  ('C0079', 0.7283303847831789)],
 'C0009': [('C0198', 0.978137225566066),
  ('C0060', 0.9311144855033424),
  ('C0014', 0.9205202859033832)],
 'C0010': [('C0111', 

In [157]:
# One row per customer (the dict key becomes the index); each column holds
# one (CustomerID, score) tuple from that customer's match list.
output = pd.DataFrame.from_dict(data=output_dict, orient='index')

# Persist the lookalike table next to the notebook.
output.to_csv(path_or_buf='Lookalike.csv')