In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three input tables (customers, transactions, products) from the
# working directory; each CSV becomes its own DataFrame.
customers_df, transactions_df, product_df = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

# Pre-processing

In [6]:
# Attach product attributes to every transaction (left join keeps all transactions).
# validate="many_to_one" makes pandas raise a MergeError if ProductID is not unique
# in product_df; without it a duplicated key would silently duplicate transaction
# rows and inflate every per-customer aggregate computed later.
merged_data = pd.merge(
    transactions_df,
    product_df,
    on='ProductID',
    how='left',
    validate='many_to_one',
)

# Attach customer attributes the same way, with the same uniqueness guard on CustomerID.
df = pd.merge(
    merged_data,
    customers_df,
    on='CustomerID',
    how='left',
    validate='many_to_one',
)

df.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [7]:
# The merged frame carries two price columns (Price_x and Price_y, identical in
# the rows displayed above). Keep a single copy under the name 'Price'.
# The frame is rebound rather than mutated with inplace=True, and errors='ignore'
# is passed to drop, so re-running this cell is a no-op instead of a KeyError
# once 'Price_y' has already been removed.
df = (
    df
    .drop(columns='Price_y', errors='ignore')
    .rename(columns={'Price_x': 'Price'})
)

In [8]:
# Re-inspect the first rows after the column cleanup: 'Price_y' should be gone
# and 'Price_x' should now appear as 'Price'.
df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,ProductName,Category,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,Timothy Perez,Europe,2022-03-15


# Building a Lookalike Model

In [10]:
# Build one row of behavioural features per customer:
#   total_spend   - sum of TotalValue, i.e. the money actually spent. Summing the
#                   unit Price instead would ignore Quantity (in the sample rows
#                   TotalValue equals Quantity * Price).
#   num_purchases - number of transaction rows with a non-null ProductID
#   avg_price     - mean unit price of the products purchased
feature_cols = ['total_spend', 'num_purchases', 'avg_price']

customer_features = df.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    num_purchases=('ProductID', 'count'),
    avg_price=('Price', 'mean'),
).reset_index()

# Standardize each feature (zero mean, unit variance) so that no single feature
# dominates the similarity computation because of its scale.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[feature_cols])

# Keep the standardized values alongside CustomerID for later inspection.
customer_features[feature_cols] = scaled_features

In [11]:
# Pairwise cosine similarity between the standardized customer feature vectors,
# labelled on both axes by CustomerID.
similarity_matrix = cosine_similarity(scaled_features)
id_labels = customer_features['CustomerID']
similarity_df = pd.DataFrame(similarity_matrix, index=id_labels, columns=id_labels)


def _three_closest(own_id):
    """Return [(other_id, score), ...] for the three customers most similar to own_id."""
    # Remove the customer's own entry, then rank the rest from most to least similar.
    ranked = similarity_df.loc[own_id].drop(own_id).sort_values(ascending=False)
    best = ranked.head(3)
    return list(zip(best.index.tolist(), best.values.tolist()))


# Map every customer to their top-3 lookalikes with similarity scores.
lookalike_recommendations = {
    own_id: _three_closest(own_id) for own_id in similarity_df.index
}

In [12]:
# Flatten each customer's (lookalike, score) pairs into one wide row:
# Lookalike_1, Score_1, ..., Lookalike_3, Score_3. Customers with fewer than
# three matches are padded with None.
output_columns = ["Lookalike_1", "Score_1", "Lookalike_2", "Score_2", "Lookalike_3", "Score_3"]

wide_rows = {}
for cust_id, matches in lookalike_recommendations.items():
    first_three = list(matches[:3])
    padding = [(None, None)] * (3 - len(first_three))
    wide_rows[cust_id] = [
        field for pair in first_three + padding for field in pair
    ]

lookalike_df = pd.DataFrame.from_dict(wide_rows, orient="index", columns=output_columns)

# Keep only the first 20 customers (label slice C0001 through C0020, inclusive)
filtered_lookalike_df = lookalike_df.loc['C0001':'C0020']

# Write the result file; the CustomerID index is written as the first column
filtered_lookalike_df.to_csv('Anithasri_M_Lookalike.csv')