Task 2: Lookalike Model

In [62]:
import numpy as np
import pandas as pd

# Load the three raw tables (paths are relative to the working directory).
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Attach customer and product attributes to every transaction row.
# Both joins use pandas' default inner join, so a transaction whose
# CustomerID or ProductID has no match in the lookup table is dropped.
merged_data = (
    transactions
    .merge(customers, on="CustomerID")
    .merge(products, on="ProductID")
)

In [63]:
# Step 1: Feature Engineering
def _most_frequent(values):
    """Return the first entry of ``values.mode()`` (the most frequent value)."""
    return values.mode()[0]


# One row per customer: spend totals plus the category they buy most often.
transactions_by_customer = merged_data.groupby("CustomerID")
customer_features = transactions_by_customer.agg(
    total_spend=("TotalValue", "sum"),
    avg_order_value=("TotalValue", "mean"),
    favorite_category=("Category", _most_frequent),
).reset_index()

In [64]:
# Merge with customer profile data
# Joined on CustomerID with the default inner join; the suffixes only take
# effect if both frames share a non-key column name.
profile_suffixes = ("_features", "_customers")
customer_features = customer_features.merge(
    customers,
    on="CustomerID",
    suffixes=profile_suffixes,
)
customer_features

Unnamed: 0,CustomerID,total_spend,avg_order_value,favorite_category,CustomerName,Region,SignupDate
0,C0001,3354.52,670.904000,Electronics,Lawrence Carroll,South America,2022-07-10
1,C0002,1862.74,465.685000,Clothing,Elizabeth Lutz,Asia,2022-02-13
2,C0003,2725.38,681.345000,Home Decor,Michael Rivera,South America,2024-03-07
3,C0004,5354.88,669.360000,Books,Kathleen Rodriguez,South America,2022-10-09
4,C0005,2034.24,678.080000,Electronics,Laura Weber,Asia,2022-08-15
...,...,...,...,...,...,...,...
194,C0196,4982.88,1245.720000,Home Decor,Laura Watts,Europe,2022-06-07
195,C0197,1928.65,642.883333,Electronics,Christina Harvey,Europe,2023-03-21
196,C0198,931.83,465.915000,Clothing,Rebecca Ray,Europe,2022-02-27
197,C0199,1979.28,494.820000,Electronics,Andrea Jenkins,Europe,2022-12-03


In [65]:
# Encode categorical variables
# Each listed column is replaced by one indicator column per distinct value.
categorical_columns = ["Region", "favorite_category"]
customer_features = pd.get_dummies(
    customer_features,
    columns=categorical_columns,
)
customer_features

Unnamed: 0,CustomerID,total_spend,avg_order_value,CustomerName,SignupDate,Region_Asia,Region_Europe,Region_North America,Region_South America,favorite_category_Books,favorite_category_Clothing,favorite_category_Electronics,favorite_category_Home Decor
0,C0001,3354.52,670.904000,Lawrence Carroll,2022-07-10,False,False,False,True,False,False,True,False
1,C0002,1862.74,465.685000,Elizabeth Lutz,2022-02-13,True,False,False,False,False,True,False,False
2,C0003,2725.38,681.345000,Michael Rivera,2024-03-07,False,False,False,True,False,False,False,True
3,C0004,5354.88,669.360000,Kathleen Rodriguez,2022-10-09,False,False,False,True,True,False,False,False
4,C0005,2034.24,678.080000,Laura Weber,2022-08-15,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,C0196,4982.88,1245.720000,Laura Watts,2022-06-07,False,True,False,False,False,False,False,True
195,C0197,1928.65,642.883333,Christina Harvey,2023-03-21,False,True,False,False,False,False,True,False
196,C0198,931.83,465.915000,Rebecca Ray,2022-02-27,False,True,False,False,False,True,False,False
197,C0199,1979.28,494.820000,Andrea Jenkins,2022-12-03,False,True,False,False,False,False,True,False


In [66]:
# Step 2: Normalize Features
from sklearn.preprocessing import MinMaxScaler

# Identifier / profile columns that are not fed to the model.
non_feature_columns = ["CustomerID", "CustomerName", "SignupDate"]
feature_frame = customer_features.drop(columns=non_feature_columns)

# Rescale every remaining column (spend figures and one-hot indicators) with
# MinMaxScaler's default range so all features share a common scale.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(feature_frame)
scaled_features

array([[0.30894178, 0.47433644, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.16809501, 0.30894039, 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.24954138, 0.48275135, 0.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.08020292, 0.30912576, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.17909816, 0.33242172, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.44150834, 0.7006598 , 1.        , ..., 1.        , 0.        ,
        0.        ]], shape=(199, 10))

In [68]:
# Step 3: Compute Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all customer feature rows:
# entry [i, j] compares row i of scaled_features with row j.
similarity_matrix = cosine_similarity(X=scaled_features)
similarity_matrix


array([[1.        , 0.08940673, 0.56593051, ..., 0.07761212, 0.544015  ,
        0.18776466],
       [0.08940673, 1.        , 0.08654965, ..., 0.52488348, 0.06225812,
        0.95912711],
       [0.56593051, 0.08654965, 1.        , ..., 0.07705101, 0.0925172 ,
        0.18060147],
       ...,
       [0.07761212, 0.52488348, 0.07705101, ..., 1.        , 0.52640195,
        0.5269246 ],
       [0.544015  , 0.06225812, 0.0925172 , ..., 0.52640195, 1.        ,
        0.13005527],
       [0.18776466, 0.95912711, 0.18060147, ..., 0.5269246 , 0.13005527,
        1.        ]], shape=(199, 199))

In [69]:
# Step 4: Find Top 3 Lookalikes for First 20 Customers
TOP_K = 3                 # lookalikes reported per customer
N_TARGET_CUSTOMERS = 20   # only the first N customers (in row order) are scored

# Row i of similarity_matrix corresponds to row i of customer_features, so a
# positional array of IDs lets us translate indices without rebuilding a row
# Series on every lookup.
customer_ids = customer_features["CustomerID"].to_numpy()

lookalikes = {}
# Clamp the bound so the cell also works when fewer than N customers exist.
for i in range(min(N_TARGET_CUSTOMERS, len(customer_ids))):
    similarity_scores = similarity_matrix[i]

    # Rank all customers from most to least similar. A stable sort keeps tied
    # scores in index order, so reruns produce the same lookalikes.
    ranked_indices = np.argsort(-similarity_scores, kind="stable")

    # Exclude the customer itself by index, not by position: if another
    # customer has an identical feature vector (or rounding nudges the
    # self-similarity just below a neighbour's), self is not guaranteed to
    # sit at the end of an ascending sort, and slicing [-4:-1] would then
    # return self as a lookalike while dropping a genuine one.
    ranked_indices = ranked_indices[ranked_indices != i]

    # Best match first.
    top_3_indices = ranked_indices[:TOP_K]
    top_3 = [
        (customer_ids[idx], float(similarity_scores[idx]))
        for idx in top_3_indices
    ]
    lookalikes[customer_ids[i]] = top_3

In [70]:
# Step 5: Save Results to CSV
import csv

# Flatten the mapping into one row per (customer, lookalike) pair, following
# the dictionary's insertion order and each list's existing order.
lookalike_rows = (
    [cust_id, lookalike_id, score]
    for cust_id, lookalike_list in lookalikes.items()
    for lookalike_id, score in lookalike_list
)

# newline="" lets the csv module control line endings itself.
with open("Lookalike.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["CustomerID", "LookalikeID", "SimilarityScore"])
    writer.writerows(lookalike_rows)