In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
# Read the three raw tables (customers, products, transactions) from the
# working directory, in that order.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# One row per transaction, enriched with the buyer's and the product's columns.
# Both joins are inner joins, so transactions without a matching customer or
# product are dropped; columns shared with the product table get the
# '_transaction' / '_product' suffixes.
data = (
    transactions
    .merge(customers, on='CustomerID', how='inner')
    .merge(products, on='ProductID', how='inner', suffixes=('_transaction', '_product'))
)

# Feature engineering: collapse the transaction-level table to one row per customer.
def _most_frequent(values):
    """Return the first entry of ``values.mode()`` for one customer's group."""
    return values.mode()[0]


customer_features = (
    data.groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),          # total spend
        Quantity=('Quantity', 'sum'),              # total units bought
        TransactionID=('TransactionID', 'count'),  # number of transactions
        SignupDate=('SignupDate', 'first'),        # first value seen in the group
        Region=('Region', 'first'),                # first value seen in the group
        Category=('Category', _most_frequent),     # most frequently bought category
    )
    .reset_index()
)

# Replace the raw signup date with a numeric "days since signup" (tenure)
# column, measured against the moment this cell is executed.
signup_dates = pd.to_datetime(customer_features['SignupDate'])
customer_features = (
    customer_features
    .assign(DaysSinceSignup=(pd.Timestamp.now() - signup_dates).dt.days)
    .drop(columns=['SignupDate'])
)

# One-hot encode the categorical columns, dropping the first level of each.
customer_features = pd.get_dummies(
    customer_features,
    columns=['Region', 'Category'],
    drop_first=True,
)

# Standardise every feature except the identifier, then compute the pairwise
# cosine similarity between customers (row i / column j follow the row order
# of customer_features).
feature_matrix = customer_features.drop(columns=['CustomerID'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_matrix)
similarity_matrix = cosine_similarity(scaled_features)
# Lookalike search: for each of the first N_TARGET_CUSTOMERS rows of
# customer_features, keep the TOP_K most similar *other* customers as a list
# of (customer_id, similarity_score) pairs.
# NOTE: targets are picked by row position, so this relies on the rows of
# customer_features being in the same order as the rows of similarity_matrix.
N_TARGET_CUSTOMERS = 20  # how many leading customers get a lookalike list
TOP_K = 3                # lookalikes kept per customer

# Plain Python values (not NumPy/pandas scalars) so the result serialises cleanly.
customer_ids = customer_features['CustomerID'].tolist()
lookalike_map = {}

# min(...) guards against datasets with fewer customers than requested.
for i in range(min(N_TARGET_CUSTOMERS, len(customer_ids))):
    customer_id = customer_ids[i]

    # Copy the row: similarity_matrix[i] is a view, and overwriting the
    # self-similarity through it would silently modify similarity_matrix.
    similarity_scores = similarity_matrix[i].copy()
    similarity_scores[i] = -np.inf  # push the customer itself to the bottom

    # Indices from most to least similar; drop the customer itself explicitly
    # so it can never be reported, even when there are fewer than TOP_K others.
    ranked = np.argsort(similarity_scores)[::-1]
    top_indices = ranked[ranked != i][:TOP_K].tolist()

    # float(...) avoids NumPy scalar reprs (e.g. "np.float64(0.95)") ending up
    # in the stringified list when the map is later written out.
    lookalike_map[customer_id] = [
        (customer_ids[j], float(similarity_scores[j])) for j in top_indices
    ]


# Flatten the map into a two-column table: one row per target customer, with
# that customer's list of (lookalike_id, score) pairs kept in a single cell.
lookalike_rows = [(customer, matches) for customer, matches in lookalike_map.items()]
lookalike_df = pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes'])

# Persist the table without the index column.
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)

# Show the first five rows as plain text.
preview = lookalike_df.head(n=5)
print(preview)

  CustomerID                                         Lookalikes
0      C0001  [(C0184, 0.9530107530794122), (C0192, 0.935088...
1      C0002  [(C0106, 0.9321228466590178), (C0134, 0.919797...
2      C0003  [(C0052, 0.9735448208255614), (C0076, 0.969706...
3      C0004  [(C0165, 0.9752419725283313), (C0155, 0.941427...
4      C0005  [(C0007, 0.9912746459372063), (C0140, 0.928471...
