In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
# Read the three source tables (customers, products, transactions) from CSV
# files in the current working directory.
customers, products, transactions = map(
    pd.read_csv,
    ("Customers.csv", "Products.csv", "Transactions.csv"),
)

In [19]:
# Parse the date columns into datetime64 so they can be used for date arithmetic.
for frame, date_col in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [20]:
# Build one analysis frame: each transaction row enriched with its customer's
# profile and its product's details (inner joins on the shared ID columns).
# Both transactions and products carry a `Price` column, so the merged frame
# exposes them as `Price_x` (transaction) and `Price_y` (product).
df = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)

In [21]:
# Preview the first five rows of the merged frame to sanity-check the joins.
df.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [22]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [23]:
# Build per-customer features, then add customer profile info.
#
# The original cell merged into a `customer_features` frame that no visible cell
# creates, so it raised NameError on a fresh kernel (Restart & Run All) and, being
# self-referential, duplicated the profile columns when re-run. Deriving the frame
# from `df` here makes the cell self-contained and idempotent.
#
# NOTE(review): the aggregation is a reconstruction based on the columns the
# scaling step consumes ('TotalValue', 'Quantity', 'Price_x'): total spend,
# total units, and mean transaction price per customer. Confirm this matches
# the intended feature definition.
customer_features = (
    df.groupby("CustomerID", as_index=False)
    .agg({"TotalValue": "sum", "Quantity": "sum", "Price_x": "mean"})
    .merge(customers, on="CustomerID")
)

In [24]:
# Standardise the numeric behaviour features (zero mean, unit variance per
# column) so that no single feature dominates the similarity computation.
feature_columns = ['TotalValue', 'Quantity', 'Price_x']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[feature_columns])

In [25]:
# Pairwise cosine similarity between every pair of customers, wrapped in a
# DataFrame labelled by CustomerID on both axes for lookup by customer.
customer_ids = customer_features['CustomerID']
similarity_matrix = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [26]:
# Find the top 3 lookalikes for each of the first 20 customers in the matrix.
#
# `lookalikes` maps a customer ID to a list of (similar_customer_id, score)
# tuples ordered from most to least similar.
lookalikes = {}
for customer in similarity_df.index[:20]:
    # Remove the customer's own entry by label instead of assuming it sorts to
    # position 0: when another customer ties for the top score, the customer
    # itself is not guaranteed to come first, and a positional slice would then
    # list the customer as its own lookalike while discarding a real neighbour.
    similar_customers = similarity_df[customer].drop(labels=customer).nlargest(3)
    lookalikes[customer] = list(similar_customers.items())

In [27]:
# Flatten the {customer: [(lookalike, score), ...]} mapping into one row per
# (customer, lookalike) pair and write it out as Lookalike.csv.
lookalike_csv = [
    {"cust_id": cust_id, "similar_cust_id": sim_cust_id, "score": score}
    for cust_id, sim_list in lookalikes.items()
    for sim_cust_id, score in sim_list
]

lookalike_df = pd.DataFrame(lookalike_csv)
lookalike_df.to_csv("Lookalike.csv", index=False)