In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the three input CSV files, in the same order as the names on the left.
transactions, products, customers = (
    pd.read_csv(csv_name)
    for csv_name in ('Transactions.csv', 'Products.csv', 'Customers.csv')
)

# Step 1: Merge datasets to create a unified dataset.
# Each transaction row is first extended with the columns of its product (matched on
# ProductID), then with the columns of its customer (matched on CustomerID). Both joins
# use pandas' default inner join, so rows without a match on the key are dropped.
transactions_with_products = transactions.merge(products, on='ProductID')
merged_data = transactions_with_products.merge(customers, on='CustomerID')

# Step 2: Feature engineering - collapse the merged rows into one row per CustomerID.
# Named aggregation produces the final column names directly, in this order:
# CustomerID, TotalSpending, AvgSpending, TotalQuantity, Categories, Region.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),   # sum of TotalValue over the customer's rows
        AvgSpending=('TotalValue', 'mean'),    # mean TotalValue per row
        TotalQuantity=('Quantity', 'sum'),     # sum of Quantity over the customer's rows
        # distinct Category values, comma-joined in order of first appearance
        Categories=('Category', lambda values: ','.join(values.unique())),
        Region=('Region', 'first'),            # Region taken from the customer's first row
    )
    .reset_index()
)

# Step 3: Encode categorical variables
# Region -> one-hot columns; drop_first=True leaves out the first level so it acts as
# the baseline. get_dummies returns a new frame, so customer_features is not modified.
region_encoded = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)

# Categories -> one 0/1 indicator column per category name found in the comma-separated
# string. These are presence flags (bought / not bought), not purchase counts.
category_dummies = customer_features['Categories'].str.get_dummies(sep=',')

# Place both encodings side by side, then remove the raw Categories string column.
customer_features_encoded = pd.concat([region_encoded, category_dummies], axis=1)
customer_features_encoded = customer_features_encoded.drop(columns=['Categories'])

# Step 4: Standardize numerical features
# Only the three continuous columns are rescaled; the indicator columns produced by the
# encoding step are left as they are.
numerical_features = ['TotalSpending', 'AvgSpending', 'TotalQuantity']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features_encoded[numerical_features])
# Write the scaled values back over the original columns.
customer_features_encoded[numerical_features] = scaled_values

# Step 5: Calculate similarity matrix using cosine similarity
# Drop the identifier column and request a float matrix explicitly. On pandas >= 2.0
# get_dummies emits bool columns, and a frame that mixes bool with numeric columns turns
# into an object-dtype array through `.values`; asking for float up front hands
# cosine_similarity a plain numeric array instead of relying on implicit coercion.
feature_matrix = customer_features_encoded.drop(columns=['CustomerID']).to_numpy(dtype=float)

# Square matrix: entry [i, j] is the cosine similarity between the feature rows at
# positions i and j of customer_features_encoded.
similarity_matrix = cosine_similarity(feature_matrix)

# Step 6: Find top 3 similar customers for each customer
# Customer ids as a positional array: row i of similarity_matrix belongs to
# customer_ids[i], independent of whatever labels the DataFrame index carries.
customer_ids = customer_features_encoded['CustomerID'].to_numpy()

lookalike_results = {}
for idx, customer_id in enumerate(customer_ids[:20]):  # First 20 customers
    similarity_scores = similarity_matrix[idx]
    # Rank all customers from most to least similar.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    # Remove the customer's own position explicitly rather than assuming it is ranked
    # first: with tied scores (e.g. another customer with identical features) or an
    # all-zero feature row, the customer is not guaranteed to sit at position 0.
    top_indices = ranked_indices[ranked_indices != idx][:3]
    lookalike_results[customer_id] = [
        # float(...) yields a built-in float, so the pair is written to the CSV as
        # 0.987 and not as the NumPy >= 2 scalar repr np.float64(0.987).
        (customer_ids[i], round(float(similarity_scores[i]), 3)) for i in top_indices
    ]

# Step 7: Save results as Lookalike.csv
# One record per customer: its id plus the list of (lookalike id, score) pairs.
lookalike_rows = []
for cust_id, matches in lookalike_results.items():
    lookalike_rows.append({'cust_id': cust_id, 'lookalikes': matches})

lookalike_df = pd.DataFrame(lookalike_rows)
lookalike_df.to_csv('Lookalike.csv', index=False)  # index=False: no row-number column in the file

# Output example
print("Top 3 lookalikes for the first 20 customers saved in Lookalike.csv")
lookalike_df.head(20)


Top 3 lookalikes for the first 20 customers saved in Lookalike.csv


Unnamed: 0,cust_id,lookalikes
0,C0001,"[(C0152, 0.987), (C0174, 0.975), (C0118, 0.874)]"
1,C0002,"[(C0043, 0.898), (C0062, 0.866), (C0159, 0.859)]"
2,C0003,"[(C0195, 0.915), (C0129, 0.906), (C0031, 0.888)]"
3,C0004,"[(C0012, 0.97), (C0113, 0.924), (C0102, 0.916)]"
4,C0005,"[(C0007, 0.898), (C0095, 0.897), (C0140, 0.87)]"
5,C0006,"[(C0187, 0.972), (C0196, 0.854), (C0079, 0.85)]"
6,C0007,"[(C0140, 0.972), (C0005, 0.898), (C0085, 0.802)]"
7,C0008,"[(C0024, 0.921), (C0194, 0.918), (C0098, 0.91)]"
8,C0009,"[(C0198, 0.978), (C0033, 0.857), (C0032, 0.855)]"
9,C0010,"[(C0132, 0.892), (C0142, 0.859), (C0103, 0.829)]"
