In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Read the three input tables (customers, products, transactions) from CSV
# files located in the current working directory.
customers_df, products_df, transactions_df = map(
    pd.read_csv, ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Enrich every transaction with its product attributes, then with the
# purchasing customer's attributes. Both joins use pandas' default inner join.
# `Price` is present in both the transaction and product tables, so pandas
# disambiguates them as `Price_x` (transactions) and `Price_y` (products).
transactions_df = pd.merge(transactions_df, products_df, on="ProductID")
merged_df = pd.merge(transactions_df, customers_df, on="CustomerID")

# Preview the first rows of the combined table
print(merged_df.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName         Region  SignupDate  
0   Andrea Jenkins         Europe  202

In [19]:
# Summarize transaction data per customer.
# Named aggregation gives each output column its final name directly, so the
# labels cannot drift out of sync with the aggregation spec (the previous
# dict-style agg produced MultiIndex columns that were renamed by position).
transaction_summary = merged_df.groupby('CustomerID').agg(
    TotalSpending=('TotalValue', 'sum'),        # Total spending
    AvgSpending=('TotalValue', 'mean'),         # Average spending per transaction
    TransactionCount=('TransactionID', 'count'),  # Number of transactions
    # Most purchased category; on ties this takes the first value returned by
    # Series.mode().
    FavCategory=('Category', lambda x: x.mode().iat[0]),
).reset_index()

# Merge with customer profiles (default inner join: customers without any
# transaction summary row are not carried forward).
customer_profiles = customers_df.merge(transaction_summary, on='CustomerID')

# One-hot encode categorical features (Region and FavCategory).
# A single encoder is fit on both columns so that the fitted `encoder` object
# describes every categorical column in the feature matrix; fitting the same
# encoder twice would discard the Region categories. Output columns are laid
# out feature by feature, i.e. all Region columns followed by all FavCategory
# columns.
encoder = OneHotEncoder(sparse_output=False)
encoded_categoricals = encoder.fit_transform(customer_profiles[['Region', 'FavCategory']])
n_region_cols = len(encoder.categories_[0])
encoded_region = encoded_categoricals[:, :n_region_cols]
encoded_category = encoded_categoricals[:, n_region_cols:]

print(customer_profiles.columns)

# Feature matrix: one row per customer in `customer_profiles` order;
# columns = one-hot Region, one-hot FavCategory, then the numeric summaries.
features = np.hstack([
    encoded_region,
    encoded_category,
    customer_profiles[['TotalSpending', 'AvgSpending', 'TransactionCount']].values
])

# Standardize every feature column (including the one-hot columns) with
# StandardScaler so that all columns are on a comparable scale.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)


Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate', 'TotalSpending',
       'AvgSpending', 'TransactionCount', 'FavCategory'],
      dtype='object')


In [20]:
# Compute the pairwise cosine similarity matrix of the scaled feature rows.
# Mathematical formula: cos(a, b) = (a . b) / (||a|| * ||b||), i.e. the dot
# product divided by the product of the two vectors' magnitudes.
# Entry [i, j] compares row i with row j of `features_scaled`.
similarity_matrix = cosine_similarity(features_scaled)

# Customer IDs, in the same row order as `customer_profiles`
customer_ids = customer_profiles['CustomerID'].values

# Task parameters: produce the top-3 lookalikes for the first 20 customers.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

lookalikes = {}

for i, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    scores = similarity_matrix[i]
    # Rank all customers by descending similarity. The stable sort keeps
    # equal scores in ascending row order.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the customer's own row by index. Relying on "self is always
    # ranked first" breaks when another customer has an identical feature
    # vector (score tie), when rounding puts the self-score marginally below
    # a duplicate's, or when a zero-norm row yields a self-similarity of 0.
    ranked = ranked[ranked != i][:TOP_K]
    lookalikes[customer_id] = [(customer_ids[j], scores[j]) for j in ranked]

# Flatten to one record per (customer, lookalike) pair for CSV output
lookalike_list = [
    {'CustomerID': cust_id, 'SimilarCustomerID': sim_cust_id, 'Score': score}
    for cust_id, similar in lookalikes.items()
    for sim_cust_id, score in similar
]

# Explicit columns keep the CSV header intact even if no records were produced
lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'SimilarCustomerID', 'Score'])
lookalike_df.to_csv("Lookalike.csv", index=False)
