In [1]:
import pandas as pd
from sklearn.preprocessing import RobustScaler, TargetEncoder
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import manhattan_distances
import numpy as np

In [2]:
# Directory holding the three raw CSV exports. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = '/content/drive/MyDrive/zeo_tap'

# Customers and transactions carry a date column that is parsed to datetime on load.
df_cust = pd.read_csv(f'{DATA_DIR}/Customers.csv', parse_dates=['SignupDate'])
df_prod = pd.read_csv(f'{DATA_DIR}/Products.csv')
df_trans = pd.read_csv(f'{DATA_DIR}/Transactions.csv', parse_dates=['TransactionDate'])

In [3]:
# Attach product attributes to every transaction row (left join keeps all
# transactions). The displayed head of the merged frame shows `Price_x` and
# `Price_y`, i.e. pandas' default suffixes for a column name present on both sides.
df_trans_prod = df_trans.merge(df_prod, how='left', on='ProductID')

In [4]:
# Attach customer attributes to every transaction/product row (left join on CustomerID).
df_trans_prod_cust = df_trans_prod.merge(df_cust, how='left', on='CustomerID')

In [5]:
# Preview the first rows of the fully merged transaction/product/customer frame.
df_trans_prod_cust.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [6]:
# Per-customer spend: sum of TotalValue over all of the customer's rows,
# returned as a two-column frame (CustomerID, TotalSpending).
Total_spending = (
    df_trans_prod_cust
    .groupby('CustomerID')['TotalValue']
    .sum()
    .rename("TotalSpending")
    .reset_index()
)



In [7]:
# Per-customer number of distinct transactions, returned as a two-column
# frame (CustomerID, TransactionCount).
Transaction_count = (
    df_trans_prod_cust
    .groupby("CustomerID")["TransactionID"]
    .nunique()
    .rename("TransactionCount")
    .reset_index()
)

In [8]:
# Count rows per (customer, category) pair.
category_counts = (
    df_trans_prod_cust
    .groupby(["CustomerID", "Category"])
    .size()
    .reset_index(name="Count")
)

# For each customer, the row label of the highest count; idxmax returns the
# first such row when several categories tie.
top_row_labels = category_counts.groupby("CustomerID")["Count"].idxmax()

# Keep only the winning category per customer and give it its feature name.
favorite_category = (
    category_counts
    .loc[top_row_labels, ["CustomerID", "Category"]]
    .rename(columns={"Category": "FavoriteCategory"})
)

In [9]:
# One row per customer: start from the customer table and left-join each
# aggregate so customers without any transaction are kept, then fill the
# gaps those customers leave with neutral defaults.
data = (
    df_cust
    .merge(Total_spending, how="left", on="CustomerID")
    .merge(Transaction_count, how="left", on="CustomerID")
    .merge(favorite_category, how="left", on="CustomerID")
    .fillna({"TotalSpending": 0, "TransactionCount": 0, "FavoriteCategory": "Unknown"})
)

In [10]:
# Split the customer table into the two feature groups used for similarity:
# one-hot encoded categoricals and raw numeric aggregates.
categorical_cols = ["Region", "FavoriteCategory"]
numeric_cols = ["TotalSpending", "TransactionCount"]

encoded_data = pd.get_dummies(data[categorical_cols])
numerical_data = data[numeric_cols]

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Standardise the numeric features; the fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(numerical_data)
numerical_data_scaled = scaler.transform(numerical_data)

In [13]:
# Final feature matrix: scaled numeric columns followed by the one-hot columns,
# joined column-wise (one row per customer).
features = np.concatenate((numerical_data_scaled, encoded_data.values), axis=1)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# Pairwise cosine similarity between all rows of the feature matrix
# (entry [i, j] compares customer row i with customer row j).
similarity_matrix = cosine_similarity(X=features)

In [16]:
# Customer IDs in the same row order as `data`, used to translate row
# positions back to IDs.
customer_ids = data["CustomerID"].values
# Filled later with {CustomerID: [(LookalikeID, score), ...]}.
lookalike_map = dict()

In [17]:
# How many customers to score (the first rows of `data`, C0001 - C0020) and
# how many lookalikes to keep for each of them.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

for i, cust_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    row_scores = similarity_matrix[i]

    # Rank all customers by descending similarity. A stable sort keeps the
    # original row order among equal scores.
    ranked = np.argsort(-row_scores, kind="stable")

    # Drop the customer itself by position rather than assuming it is ranked
    # first: another customer with an identical feature vector (or a float
    # rounding difference) can tie with or outrank the self-similarity, in
    # which case slicing off element 0 would discard a real match and report
    # the customer as its own lookalike.
    ranked = ranked[ranked != i][:TOP_K]

    lookalike_map[cust_id] = [(customer_ids[j], row_scores[j]) for j in ranked]

In [21]:
# Single source of truth for the output file name, so the file that is written
# and the file named in the confirmation message cannot drift apart.
OUTPUT_CSV = "Aditya_Tiwari_Lookalike.csv"

# Flatten {CustomerID: [(LookalikeID, score), ...]} into one record per pair.
lookalike_list = [
    {"CustomerID": cust_id, "LookalikeID": lookalike_id, "SimilarityScore": score}
    for cust_id, lookalikes in lookalike_map.items()
    for lookalike_id, score in lookalikes
]

# Explicit columns keep the CSV header intact even if no pairs were produced.
lookalike_df = pd.DataFrame(
    lookalike_list, columns=["CustomerID", "LookalikeID", "SimilarityScore"]
)
lookalike_df.to_csv(OUTPUT_CSV, index=False)

print(f"Lookalike model completed and results saved to {OUTPUT_CSV}.")


Lookalike model completed and results saved to Lookalike.csv.


In [23]:
# Read the export back from the same relative path it was written to, so this
# cell does not depend on the working directory being /content.
df_lookalike = pd.read_csv('Aditya_Tiwari_Lookalike.csv')

Display the contents of the generated lookalike CSV file.


In [24]:
# Display the lookalike table that was read back from the CSV.
df_lookalike

Unnamed: 0,CustomerID,LookalikeID,SimilarityScore
0,C0001,C0190,0.990089
1,C0001,C0048,0.982085
2,C0001,C0181,0.952734
3,C0002,C0088,0.961758
4,C0002,C0092,0.932742
5,C0002,C0134,0.932462
6,C0003,C0052,0.996856
7,C0003,C0031,0.974303
8,C0003,C0076,0.947301
9,C0004,C0155,0.983131
