In [2]:
# Import necessary libraries
from pathlib import Path

import pandas as pd

# Load the datasets
# Single source of truth for where the three input CSVs live; change this one
# line to run the notebook on another machine.
DATA_DIR = Path(r"C:\Users\Abhay Thakur\Downloads")

customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")

# Extract brand name and clean ProductName in products dataset
# Split each raw name once, on the first run of whitespace: the first token is
# the brand, the remainder is the cleaned product name. Splitting (instead of
# str.replace on "brand + space") strips only the leading brand token, so a
# brand string that reappears later in the name is left intact.
name_tokens = products['ProductName'].str.split(n=1)
products['Brand'] = name_tokens.str[0]
# Single-word names have no remainder; keep the original name for those rows.
products['ProductName'] = name_tokens.str[1].fillna(products['ProductName'])

# Build the transaction-level frame: attach product brand/name and customer
# region to every transaction (left joins keep all transactions), then parse
# TransactionDate into a datetime column.
product_lookup = products[['ProductID', 'Brand', 'ProductName']]
region_lookup = customers[['CustomerID', 'Region']]

tf = (
    transactions
    .merge(product_lookup, on='ProductID', how='left')
    .merge(region_lookup, on='CustomerID', how='left')
    .assign(TransactionDate=lambda frame: pd.to_datetime(frame['TransactionDate']))
)

# Preview the first few rows of the merged dataset
display(tf.head())


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,Brand,ProductName,Region
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving,Bluetooth Speaker,Europe
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving,Bluetooth Speaker,Asia
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving,Bluetooth Speaker,Europe
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving,Bluetooth Speaker,South America
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving,Bluetooth Speaker,Europe


In [4]:
# Task 2 Lookalike Model

# Import libraries
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np


# Feature Engineering
# Per-customer aggregates from the merged transactions: total spend and number
# of transactions. Only customers that appear in tf get a row here.
spend_and_frequency = tf.groupby('CustomerID').agg(
    TotalSpend=('TotalValue', 'sum'),
    PurchaseFrequency=('TransactionID', 'count'),
)
customer_features = spend_and_frequency.reset_index()

# Account age in days, measured against a fixed reference date so the result
# does not depend on when the notebook is run. Stored on `customers` itself.
signup_dates = pd.to_datetime(customers['SignupDate'])
customers['DaysSinceSignup'] = (pd.Timestamp('2025-01-01') - signup_dates).dt.days

# Attach region and account age to the aggregates (default inner join).
profile_columns = ['CustomerID', 'Region', 'DaysSinceSignup']
customer_features = customer_features.merge(customers[profile_columns], on='CustomerID')

# Region is categorical: expand it into one indicator column per region.
customer_features = pd.get_dummies(customer_features, columns=['Region'])

# Standardize every feature column; CustomerID is an identifier, not a feature.
scaler = StandardScaler()
feature_matrix = customer_features.drop('CustomerID', axis=1)
customer_features_scaled = scaler.fit_transform(feature_matrix)

# Similarity Calculation
# Pairwise cosine similarity between the scaled customer feature rows; row and
# column order follow customer_features.
similarity_matrix = cosine_similarity(customer_features_scaled)

# Find top 3 similar customers for C0001 - C0020
customer_ids = customer_features['CustomerID'].tolist()
lookalike_results = {}

# NOTE(review): this takes the first 20 rows of customer_features, which are
# C0001 - C0020 only if the rows are sorted by CustomerID and each of those
# customers is present - confirm against the data.
for i, customer_id in enumerate(customer_ids[:20]):
    # Rank all customers by descending similarity. A stable sort keeps tied
    # scores in row order, so the output is reproducible.
    ranked = np.argsort(-similarity_matrix[i], kind='stable')
    # Drop the customer's own row by index rather than assuming it sorts first:
    # a score tie with an identical customer, or floating-point rounding, can
    # put another row ahead of it.
    similar_indices = ranked[ranked != i][:3]
    similar_customers = [(customer_ids[idx], similarity_matrix[i][idx]) for idx in similar_indices]
    lookalike_results[customer_id] = similar_customers

#  Save Results
# Flatten {customer: [(lookalike_id, score), ...]} into one row per
# (customer, lookalike) pair.
lookalike_rows = []
for source_id, matches in lookalike_results.items():
    for match_id, match_score in matches:
        lookalike_rows.append({
            'CustomerID': source_id,
            'LookalikeID': match_id,
            'SimilarityScore': match_score,
        })
lookalike_df = pd.DataFrame(lookalike_rows)

# Write the pairs to disk without the positional index column.
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display results
display(lookalike_df.head(10))


Unnamed: 0,CustomerID,LookalikeID,SimilarityScore
0,C0001,C0152,0.995638
1,C0001,C0174,0.980218
2,C0001,C0011,0.972929
3,C0002,C0134,0.975045
4,C0002,C0159,0.968818
5,C0002,C0007,0.968667
6,C0003,C0052,0.997733
7,C0003,C0031,0.983983
8,C0003,C0129,0.977579
9,C0004,C0102,0.990755
