# Task 2: Lookalike Model

### Step 1: Data preparation

Import and explore fields which can be used as features to recommend similar customers

In [3]:
import pandas as pd

In [4]:
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [5]:
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [6]:
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [7]:
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


Merge fields to create our main dataframe

In [8]:
merged_df = transactions.merge(customers, how="inner", on="CustomerID").merge(products, how="inner", on="ProductID")
merged_df.head

<bound method NDFrame.head of     TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0          T00001      C0199      P067  2024-08-25 12:38:23         1   
1          T00112      C0146      P067  2024-05-27 22:23:54         1   
2          T00166      C0127      P067  2024-04-25 07:38:55         1   
3          T00272      C0087      P067  2024-03-26 22:55:37         2   
4          T00363      C0070      P067  2024-03-21 15:10:10         3   
..            ...        ...       ...                  ...       ...   
995        T00630      C0031      P093  2024-10-08 23:58:14         2   
996        T00672      C0165      P044  2024-07-28 00:09:49         4   
997        T00711      C0165      P044  2024-06-11 15:51:14         4   
998        T00878      C0165      P044  2024-09-24 21:15:21         3   
999        T00157      C0169      P044  2024-11-09 09:07:36         2   

     TotalValue  Price_x     CustomerName         Region  SignupDate  \
0        300.68   300

### Step 2: Feature Engineering

We need to build a customer profile that allows model to calculate similarity between customers. To build this profile we need features that can used to match similarities.

Features I've considered from Customers & Transactions dataset:
1. Total amount spent by each customer across all transactions.
2. Number of transactions made by each customer.

Features I've considered from Products dataset:
1. Number of purchases by each customer in each product category

From Customer Dataset

In [9]:
# Total Spend per customer
total_spend_per_customer = merged_df.groupby('CustomerID')['TotalValue'].sum().reset_index()
total_spend_per_customer

Unnamed: 0,CustomerID,TotalValue
0,C0001,3354.52
1,C0002,1862.74
2,C0003,2725.38
3,C0004,5354.88
4,C0005,2034.24
...,...,...
194,C0196,4982.88
195,C0197,1928.65
196,C0198,931.83
197,C0199,1979.28


In [10]:
# No. of transactions made by each customer
transaction_frequency_per_customer = merged_df.groupby('CustomerID').size().reset_index(name='TransactionFrequency')
transaction_frequency_per_customer

Unnamed: 0,CustomerID,TransactionFrequency
0,C0001,5
1,C0002,4
2,C0003,4
3,C0004,8
4,C0005,3
...,...,...
194,C0196,4
195,C0197,3
196,C0198,2
197,C0199,4


From Products dataset

In [11]:
# Creating a pivot table to count the number of purchases by each customer in each product category

category_purchases = pd.pivot_table(
    merged_df, 
    index='CustomerID', 
    columns='Category', 
    values='Quantity', 
    aggfunc='sum', 
    fill_value=0
)
category_purchases.reset_index(inplace=True)
category_purchases

Category,CustomerID,Books,Clothing,Electronics,Home Decor
0,C0001,2,0,7,3
1,C0002,0,4,0,6
2,C0003,0,4,4,6
3,C0004,8,0,6,9
4,C0005,0,0,4,3
...,...,...,...,...,...
194,C0196,3,4,0,5
195,C0197,0,0,6,3
196,C0198,0,2,1,0
197,C0199,0,0,3,6


Merging the feature columns to form our complete profile for a customer

In [12]:
customer_features = pd.merge(total_spend_per_customer, transaction_frequency_per_customer, on='CustomerID', how='left')
customer_profile = pd.merge(customer_features, category_purchases, on='CustomerID', how='left')

customer_profile

Unnamed: 0,CustomerID,TotalValue,TransactionFrequency,Books,Clothing,Electronics,Home Decor
0,C0001,3354.52,5,2,0,7,3
1,C0002,1862.74,4,0,4,0,6
2,C0003,2725.38,4,0,4,4,6
3,C0004,5354.88,8,8,0,6,9
4,C0005,2034.24,3,0,0,4,3
...,...,...,...,...,...,...,...
194,C0196,4982.88,4,3,4,0,5
195,C0197,1928.65,3,0,0,6,3
196,C0198,931.83,2,0,2,1,0
197,C0199,1979.28,4,0,0,3,6


### Step 3: Feature Scaling

Since I will be using distance-based similarity methods, I will first use feature scaling as the similarity methods are sensitive to the magnitude of the features and it also ensures all features contribute equally to the calculation.

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

features_to_scale = ['TotalValue', 'TransactionFrequency', 'Books', 'Electronics', 'Clothing', 'Home Decor']
customer_profile[features_to_scale] = scaler.fit_transform(customer_profile[features_to_scale])

customer_profile.head()


Unnamed: 0,CustomerID,TotalValue,TransactionFrequency,Books,Clothing,Electronics,Home Decor
0,C0001,-0.061701,-0.011458,-0.464594,-0.963893,1.255863,-0.069051
1,C0002,-0.877744,-0.467494,-1.117981,0.336546,-1.027971,0.912454
2,C0003,-0.405857,-0.467494,-1.117981,0.336546,0.277077,0.912454
3,C0004,1.032547,1.35665,1.495566,-0.963893,0.929601,1.893958
4,C0005,-0.783929,-0.92353,-1.117981,-0.963893,0.277077,-0.069051


### Step 4: Calculating Similarity

I will be using Cosine Similarity between the customer profiles as we have a high-dimentional data to work with. This will essentially give a matrix similar to a correlation matrix.

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(customer_profile[features_to_scale])

# Convert the similarity matrix into a DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profile['CustomerID'], columns=customer_profile['CustomerID'])

similarity_df

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,-0.323057,0.189517,0.222293,0.583006,-0.836235,0.596059,-0.109688,0.027800,-0.643218,...,-0.077121,0.449518,-0.278544,0.215080,-0.085935,-0.634708,0.756302,0.026611,0.422915,-0.827686
C0002,-0.323057,1.000000,0.740619,-0.413772,0.426889,0.137827,0.455898,0.031128,0.519927,0.401687,...,-0.556709,-0.130186,-0.402595,-0.242688,0.569862,0.419573,0.243675,0.516771,0.674523,0.165733
C0003,0.189517,0.740619,1.000000,-0.201259,0.536835,-0.345058,0.620168,0.233947,0.253767,0.135473,...,-0.862575,-0.163888,-0.799409,-0.024242,0.741657,0.156716,0.541170,0.282780,0.713044,-0.003583
C0004,0.222293,-0.413772,-0.201259,1.000000,-0.427631,-0.192366,-0.292203,0.300643,-0.861581,-0.778517,...,0.135306,-0.545208,0.079346,0.603137,0.281570,-0.030786,-0.306051,-0.866601,-0.083595,-0.079335
C0005,0.583006,0.426889,0.536835,-0.427631,1.000000,-0.411362,0.973490,-0.500065,0.729690,-0.125315,...,-0.203782,0.650493,-0.239080,-0.410871,-0.040999,-0.236742,0.953873,0.754683,0.832463,-0.688715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,-0.634708,0.419573,0.156716,-0.030786,-0.236742,0.767494,-0.095277,-0.024904,-0.154349,0.151788,...,-0.274920,-0.623100,-0.044287,-0.221552,0.430400,1.000000,-0.427319,-0.086120,-0.013601,0.717306
C0197,0.756302,0.243675,0.541170,-0.306051,0.953873,-0.609053,0.928162,-0.395049,0.583230,-0.209660,...,-0.228731,0.652505,-0.314196,-0.292057,-0.046753,-0.427319,1.000000,0.611092,0.752184,-0.757833
C0198,0.026611,0.516771,0.282780,-0.866601,0.754683,0.051145,0.631322,-0.581932,0.986509,0.467836,...,0.000672,0.734725,0.043898,-0.707962,-0.318752,-0.086120,0.611092,1.000000,0.467730,-0.312720
C0199,0.422915,0.674523,0.713044,-0.083595,0.832463,-0.386748,0.885537,-0.211418,0.464839,-0.273427,...,-0.394069,0.208061,-0.382079,-0.117749,0.403881,-0.013601,0.752184,0.467730,1.000000,-0.536114


### Step 5: Testing Similarity Model and Saving Results

Now that we have a similarity model, we will loop through each of the first 20 customers to find their top 3 lookalikes.

In [15]:
# Get first 20 customers
customer_ids = [f'C000{i}' for i in range(1,10)] + [f'C00{i}' for i in range(10,21)]
customer_ids

['C0001',
 'C0002',
 'C0003',
 'C0004',
 'C0005',
 'C0006',
 'C0007',
 'C0008',
 'C0009',
 'C0010',
 'C0011',
 'C0012',
 'C0013',
 'C0014',
 'C0015',
 'C0016',
 'C0017',
 'C0018',
 'C0019',
 'C0020']

In [47]:
# Dictionary to hold the lookalike data
lookalike_data = {}

for customer_id in customer_ids:
    # Get the similarity scores for the current customer
    similarity_scores = similarity_df[customer_id]

    # Sort the similarity scores in descending order (excluding the customer itself)
    sorted_similarity_scores = similarity_scores.drop(customer_id).sort_values(ascending=False)

    # Get the top 3 most similar customers (excluding the customer itself)
    top_3_similar_customers = sorted_similarity_scores.head(3)
    
    # Store result
    lookalike_data[customer_id] = [
        {"CustomerID": similar_customer, "Score": score} 
        for similar_customer, score in top_3_similar_customers.items()
    ]

Convert the data into the desired format and save result

In [59]:
lookalike_csv_data = []

# Loop over the `lookalike_data` dictionary and format the output
for customer_id, similar_customers in lookalike_data.items():
    lookalike_list = '[{}]'.format(', '.join([f"{item['CustomerID']},{item['Score']:.2f}" for item in similar_customers]))
    lookalike_csv_data.append([f"{customer_id}, {lookalike_list}"])

# Convert the list of data into a DataFrame
lookalike_df = pd.DataFrame(lookalike_csv_data, columns=['Lookalikes'])

In [60]:
# Save results
lookalike_df.to_csv('Yash_Jagdale_Lookalike.csv', index=False)