In [1]:
# Importing all required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Read the three input tables from CSV files in the working directory
customers, products, transactions = map(
    pd.read_csv,
    ("Customers.csv", "Products.csv", "Transactions.csv"),
)

In [3]:
# Join transactions to customer rows (on CustomerID), then to product rows
# (on ProductID). Both joins use pandas' default join type.
merged = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)


In [4]:
# Feature engineering: collapse the merged transactions to one row per customer
customer_features = (
    merged.groupby("CustomerID")
    .agg(
        TotalSpending=('TotalValue', 'sum'),              # summed TotalValue
        TotalQuantity=('Quantity', 'sum'),                # summed Quantity
        TransactionFrequency=('TransactionID', 'count'),  # number of transactions
        UniqueCategories=('Category', 'nunique'),         # distinct categories purchased
    )
    .reset_index()  # turn CustomerID back into a regular column
)

In [5]:
# Attach each customer's Region from the customer table to the aggregated features
customer_features = pd.merge(
    customer_features,
    customers.loc[:, ['CustomerID', 'Region']],
    on="CustomerID",
)


In [6]:
# Replace the categorical Region column with one indicator column per region
# (columns are named Region_<value>)
customer_features = pd.get_dummies(
    customer_features,
    columns=['Region'],
    prefix='Region',
    prefix_sep='_',
)

In [7]:
# Standardise the numeric features with StandardScaler.
# The one-hot Region columns are not in this list and are left as they are.
numerical_cols = ['TotalSpending', 'TotalQuantity', 'TransactionFrequency', 'UniqueCategories']
scaler = StandardScaler().fit(customer_features[numerical_cols])
customer_features[numerical_cols] = scaler.transform(customer_features[numerical_cols])


In [8]:
# Cosine similarity between every pair of customer feature rows
features = customer_features.drop('CustomerID', axis=1)   # everything except the ID column
customer_ids = customer_features.loc[:, 'CustomerID']     # kept aside to label matrix rows
similarity_matrix = cosine_similarity(features)

In [9]:
# Recommending top 3 lookalikes for each customer in C0001 - C0020.
# NOTE(review): the selection below is positional (first 20 rows of customer_ids);
# it equals C0001 - C0020 only if those IDs occupy the first 20 rows - confirm upstream.
lookalikes = {}
for i, cust_id in enumerate(customer_ids.iloc[:20]):  # First 20 customers
    scores = similarity_matrix[i]
    # Stable descending sort: tied scores keep their row order, the same
    # tie-breaking as sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the customer's own row by position rather than assuming it sorts
    # first: when another row ties the top score, "skip index 0" would drop a
    # real match and report the customer as their own lookalike.
    ranked = ranked[ranked != i][:3]
    # .iloc because `ranked` holds matrix row positions, not index labels.
    # float() stores plain Python floats so that str() of this list does not
    # depend on how the installed NumPy version prints its scalar types.
    lookalikes[cust_id] = [(customer_ids.iloc[j], float(scores[j])) for j in ranked]

In [10]:
# Two-column table: the customer ID and the string form of its
# [(lookalike_id, score), ...] list
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalikes),
    "Lookalikes": [str(matches) for matches in lookalikes.values()],
})

In [11]:
# Write the lookalike table to CSV, omitting the DataFrame index column
lookalike_df.to_csv(path_or_buf="Alaina_Hafiza_Lookalike.csv", index=False)


In [12]:
# Show the first five rows of the result as a quick sanity check
print(lookalike_df.head(n=5))

  CustomerID                                         Lookalikes
0      C0001  [('C0107', 0.989623839001654), ('C0174', 0.971...
1      C0002  [('C0142', 0.9907053654945355), ('C0159', 0.95...
2      C0003  [('C0174', 0.8818652346061687), ('C0031', 0.84...
3      C0004  [('C0165', 0.9852902527327856), ('C0012', 0.97...
4      C0005  [('C0186', 0.9978532023250041), ('C0159', 0.99...
