In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the three raw input tables. The file names are relative paths, so the
# CSVs are looked up in the notebook's current working directory.
customers, transactions, products = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Transactions.csv", "Products.csv")
)


In [3]:
# Attach customer attributes to each transaction. The inner join keeps only
# transactions whose CustomerID also appears in `customers`.
df = transactions.merge(customers, how="inner", on="CustomerID")

# Attach product attributes the same way; rows whose ProductID is missing from
# `products` are dropped by the inner join.
data = df.merge(products, how="inner", on="ProductID")

In [4]:
# Per-customer purchase summary: one output column per entry, written as
# output_name -> (source column, aggregation).
profile_aggregations = {
    'total_spent': ('TotalValue', 'sum'),
    'total_quantity': ('Quantity', 'sum'),
    'avg_transaction_value': ('TotalValue', 'mean'),
    'num_transactions': ('TransactionID', 'nunique'),
    # Category that occurs on the most rows for the customer; when several
    # categories tie, the first value returned by mode() is taken.
    'most_purchased_category': ('Category', lambda categories: categories.mode()[0]),
}

# reset_index() turns the CustomerID group key back into a regular column.
customer_profile = (
    data
    .groupby('CustomerID')
    .agg(**profile_aggregations)
    .reset_index()
)

In [5]:
# Put CustomerName next to the per-customer aggregates. The default inner join
# keeps only customers that have a row in `customer_profile`.
#
# This cell overwrites `customer_profile` with a merge of itself, so running it
# a second time would otherwise merge a frame that already has CustomerName and
# produce CustomerName_x / CustomerName_y. Dropping any existing CustomerName
# first (errors='ignore' makes this a no-op on the first run) keeps the cell
# idempotent.
customer_profile = pd.merge(
    customers[['CustomerID', 'CustomerName']],
    customer_profile.drop(columns=['CustomerName'], errors='ignore'),
    on='CustomerID',
)

In [6]:
# Numeric behaviour columns used for the similarity computation;
# most_purchased_category is not part of the feature set.
feature_columns = [
    'total_spent',
    'total_quantity',
    'avg_transaction_value',
    'num_transactions',
]

# Fit the scaler on these columns and transform them in one step. Rows of
# `scaled_features` are in the same order as the rows of `customer_profile`.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_profile[feature_columns])


In [7]:
# Pairwise cosine similarity between all rows of `scaled_features`: a square
# matrix in which entry [i, j] compares row i with row j.
cos_sim = cosine_similarity(scaled_features)

In [8]:
# Build {CustomerID: [(lookalike CustomerID, similarity score), ...]} for the
# first customers in `customer_profile`, keeping the highest-scoring other
# customers for each.
NUM_TARGET_CUSTOMERS = 20     # how many leading rows of customer_profile get lookalikes
LOOKALIKES_PER_CUSTOMER = 3   # how many best matches are kept per target customer

# Customer IDs in row order. `cos_sim` is a NumPy array, so its rows/columns
# are addressed by *position*; pairing them with this list by position keeps
# the lookup correct even if `customer_profile` does not have a default
# 0..n-1 index (the index labels yielded by iterrows() are not positions).
all_customer_ids = customer_profile['CustomerID'].tolist()

lookalike_map = {}
target_customers = customer_profile.head(NUM_TARGET_CUSTOMERS)

# head() returns the leading rows, so the k-th target is also the k-th row of
# customer_profile and therefore row k of cos_sim.
for row_pos, customer_id in enumerate(target_customers['CustomerID']):
    # Every (other customer, score) pair, skipping the customer itself.
    similarity_pairs = [
        (other_id, score)
        for other_id, score in zip(all_customer_ids, cos_sim[row_pos])
        if other_id != customer_id
    ]

    # Highest similarity first; the sort is stable, so ties keep row order.
    similarity_pairs.sort(key=lambda pair: pair[1], reverse=True)

    lookalike_map[customer_id] = similarity_pairs[:LOOKALIKES_PER_CUSTOMER]

In [9]:
# Flatten the {customer: [(lookalike, score), ...]} mapping into one record
# per (customer, lookalike) pair, in the mapping's iteration order.
lookalike_data = [
    {
        'CustomerID': source_id,
        'Lookalike_CustomerID': match_id,
        'Similarity_Score': match_score,
    }
    for source_id, matches in lookalike_map.items()
    for match_id, match_score in matches
]

# One row per record; the columns follow the dict keys above.
lookalike_df = pd.DataFrame(lookalike_data)

In [10]:
# Write the lookalike table to a CSV file in the current working directory;
# index=False leaves the DataFrame's row index out of the file.
lookalike_df.to_csv('Anuj_Deshmukh_Lookalike.csv', index=False)

In [11]:
# Display the lookalike table as the cell's output.
lookalike_df

Unnamed: 0,CustomerID,Lookalike_CustomerID,Similarity_Score
0,C0001,C0164,0.996031
1,C0001,C0103,0.981548
2,C0001,C0069,0.963423
3,C0002,C0029,0.999525
4,C0002,C0031,0.997756
5,C0002,C0077,0.996656
6,C0003,C0027,0.851884
7,C0003,C0176,0.842155
8,C0003,C0073,0.78002
9,C0004,C0075,0.997675
