In [40]:
import pandas as pd 

In [41]:
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

In [42]:
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

Create Features for customers

In [43]:
merged_df= transactions_df.merge(customers_df,on="CustomerID").merge(products_df,on="ProductID")

In [44]:
merged_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [45]:
total_Spending = merged_df.groupby('CustomerID')['TotalValue'].sum().reset_index(name='TotalSpending')

In [46]:
total_Spending.head()

Unnamed: 0,CustomerID,TotalSpending
0,C0001,3354.52
1,C0002,1862.74
2,C0003,2725.38
3,C0004,5354.88
4,C0005,2034.24


In [47]:
category_pref = merged_df.groupby(['CustomerID','Category'])['Quantity'].sum().unstack(fill_value=0)
category_pref.columns = [f'Category_{col}' for col in category_pref.columns]
category_pref.reset_index(inplace=True)
category_pref

Unnamed: 0,CustomerID,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,2,0,7,3
1,C0002,0,4,0,6
2,C0003,0,4,4,6
3,C0004,8,0,6,9
4,C0005,0,0,4,3
...,...,...,...,...,...
194,C0196,3,4,0,5
195,C0197,0,0,6,3
196,C0198,0,2,1,0
197,C0199,0,0,3,6


In [48]:
avg_Transaction_val = merged_df.groupby('CustomerID')['TotalValue'].mean().reset_index(name='AverageTransactionValue')
avg_Transaction_val

Unnamed: 0,CustomerID,AverageTransactionValue
0,C0001,670.904000
1,C0002,465.685000
2,C0003,681.345000
3,C0004,669.360000
4,C0005,678.080000
...,...,...
194,C0196,1245.720000
195,C0197,642.883333
196,C0198,465.915000
197,C0199,494.820000


In [49]:
latest_transaction_date = transactions_df.groupby('CustomerID')['TransactionDate'].max()
# latest_transaction_date
recent = (latest_transaction_date.max()-latest_transaction_date).dt.days.reset_index(name='Recency')
recent

Unnamed: 0,CustomerID,Recency
0,C0001,55
1,C0002,25
2,C0003,125
3,C0004,4
4,C0005,54
...,...,...
194,C0196,13
195,C0197,0
196,C0198,84
197,C0199,63


Combining all the features in a single data frame 

In [50]:
customer_features = customers_df[['CustomerID','Region']].merge(total_Spending,on='CustomerID',how='left')\
.merge(category_pref,on='CustomerID',how ='left')\
.merge(avg_Transaction_val,on='CustomerID',how ='left')\
.merge(recent,on='CustomerID',how ='left')
customer_features

Unnamed: 0,CustomerID,Region,TotalSpending,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor,AverageTransactionValue,Recency
0,C0001,South America,3354.52,2.0,0.0,7.0,3.0,670.904000,55.0
1,C0002,Asia,1862.74,0.0,4.0,0.0,6.0,465.685000,25.0
2,C0003,South America,2725.38,0.0,4.0,4.0,6.0,681.345000,125.0
3,C0004,South America,5354.88,8.0,0.0,6.0,9.0,669.360000,4.0
4,C0005,Asia,2034.24,0.0,0.0,4.0,3.0,678.080000,54.0
...,...,...,...,...,...,...,...,...,...
195,C0196,Europe,4982.88,3.0,4.0,0.0,5.0,1245.720000,13.0
196,C0197,Europe,1928.65,0.0,0.0,6.0,3.0,642.883333,0.0
197,C0198,Europe,931.83,0.0,2.0,1.0,0.0,465.915000,84.0
198,C0199,Europe,1979.28,0.0,0.0,3.0,6.0,494.820000,63.0


One-hot encode the region for categorical data


In [51]:
customer_features = pd.get_dummies(customer_features, columns=['Region'],prefix='Region')

In [52]:
customer_features

Unnamed: 0,CustomerID,TotalSpending,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor,AverageTransactionValue,Recency,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,2.0,0.0,7.0,3.0,670.904000,55.0,0,0,0,1
1,C0002,1862.74,0.0,4.0,0.0,6.0,465.685000,25.0,1,0,0,0
2,C0003,2725.38,0.0,4.0,4.0,6.0,681.345000,125.0,0,0,0,1
3,C0004,5354.88,8.0,0.0,6.0,9.0,669.360000,4.0,0,0,0,1
4,C0005,2034.24,0.0,0.0,4.0,3.0,678.080000,54.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
195,C0196,4982.88,3.0,4.0,0.0,5.0,1245.720000,13.0,0,1,0,0
196,C0197,1928.65,0.0,0.0,6.0,3.0,642.883333,0.0,0,1,0,0
197,C0198,931.83,0.0,2.0,1.0,0.0,465.915000,84.0,0,1,0,0
198,C0199,1979.28,0.0,0.0,3.0,6.0,494.820000,63.0,0,1,0,0


Customer Features Include:

1.Total Spending:The total amount spent by each customer.

2.Category Preferences:Quantities purchased in each product category (Books, Clothing, Electronics, Home Decor).

3.Average Transaction Value:Average value of transactions per customer.

4.Recency:The number of days since the customer's last transaction.

5.Region:One-hot encoded regions to account for geographic differences.


In [55]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [58]:
numeric_features = customer_features.drop(columns=['CustomerID'])
numeric_features

Unnamed: 0,TotalSpending,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor,AverageTransactionValue,Recency,Region_Asia,Region_Europe,Region_North America,Region_South America
0,3354.52,2.0,0.0,7.0,3.0,670.904000,55.0,0,0,0,1
1,1862.74,0.0,4.0,0.0,6.0,465.685000,25.0,1,0,0,0
2,2725.38,0.0,4.0,4.0,6.0,681.345000,125.0,0,0,0,1
3,5354.88,8.0,0.0,6.0,9.0,669.360000,4.0,0,0,0,1
4,2034.24,0.0,0.0,4.0,3.0,678.080000,54.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
195,4982.88,3.0,4.0,0.0,5.0,1245.720000,13.0,0,1,0,0
196,1928.65,0.0,0.0,6.0,3.0,642.883333,0.0,0,1,0,0
197,931.83,0.0,2.0,1.0,0.0,465.915000,84.0,0,1,0,0
198,1979.28,0.0,0.0,3.0,6.0,494.820000,63.0,0,1,0,0


In [60]:
numeric_features = numeric_features.fillna(0)

In [62]:
similarity_matrix = cosine_similarity(numeric_features)
similarity_matrix

array([[1.        , 0.99885658, 0.99846426, ..., 0.9628075 , 0.99875678,
        0.99991814],
       [0.99885658, 1.        , 0.99950368, ..., 0.97398756, 0.99983717,
        0.99881971],
       [0.99846426, 0.99950368, 1.        , ..., 0.9756372 , 0.99990604,
        0.99802837],
       ...,
       [0.9628075 , 0.97398756, 0.9756372 , ..., 1.        , 0.97504014,
        0.96191735],
       [0.99875678, 0.99983717, 0.99990604, ..., 0.97504014, 1.        ,
        0.9984897 ],
       [0.99991814, 0.99881971, 0.99802837, ..., 0.96191735, 0.9984897 ,
        1.        ]])

In [68]:
customer_ids = customer_features['CustomerID']
# customer_id
top_20_customers = customer_ids[:20]
# top_20_customers

In [69]:
lookalike_results = {}

In [70]:
for i, customer_id in enumerate(top_20_customers):
    similarity_scores = list(enumerate(similarity_matrix[i]))
    sorted_Scores = sorted(similarity_scores, key= lambda x: x[1], reverse=True)
    top_3 = sorted_Scores[1:4]

    lookalike_results[customer_id] = [
        (customer_ids[idx], round(score, 4)) for idx, score in top_3
    ]

In [72]:
lookalike_df = pd.DataFrame([{"CustomerID":customer_id,"Lookalikes":lookalikes} for customer_id, lookalikes in lookalike_results.items()])
lookalike_df

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0114, 1.0), (C0148, 1.0), (C0187, 1.0)]"
1,C0002,"[(C0031, 1.0), (C0168, 1.0), (C0133, 1.0)]"
2,C0003,"[(C0052, 1.0), (C0094, 1.0), (C0125, 1.0)]"
3,C0004,"[(C0028, 1.0), (C0188, 1.0), (C0068, 1.0)]"
4,C0005,"[(C0085, 1.0), (C0036, 1.0), (C0066, 1.0)]"
5,C0006,"[(C0177, 1.0), (C0031, 1.0), (C0026, 1.0)]"
6,C0007,"[(C0074, 1.0), (C0032, 1.0), (C0040, 1.0)]"
7,C0008,"[(C0141, 1.0), (C0065, 1.0), (C0175, 1.0)]"
8,C0009,"[(C0043, 1.0), (C0112, 1.0), (C0071, 1.0)]"
9,C0010,"[(C0135, 1.0), (C0026, 1.0), (C0177, 1.0)]"


In [73]:
output_path = "Anubhav_Sood_Lookalike.csv"
lookalike_df.to_csv(output_path,index=False)
