In [46]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')


In [8]:
customers['Region'].fillna(customers['Region'].mode()[0])  
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'], errors='coerce')


In [12]:
products['Price'].fillna(products['Price'].mean())

0     169.30
1     346.30
2      44.12
3      95.69
4     429.31
       ...  
95    307.47
96    319.34
97    299.93
98    354.29
99    126.34
Name: Price, Length: 100, dtype: float64

In [14]:
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'], errors='coerce')
transactions.dropna(subset=['TransactionDate', 'Quantity', 'TotalValue'])

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68
...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86


In [16]:
# Customer-Level Features from Transaction Data
customer_features = transactions.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'nunique'),
    product_count=('ProductID', 'nunique')
).reset_index()


In [30]:
customer_features = pd.merge(customer_features, customers[['CustomerID', 'Region','SignupDate']], on="CustomerID", how="left")

In [32]:
customer_features['signup_year'] = customer_features['SignupDate'].dt.year
customer_features['signup_month'] = customer_features['SignupDate'].dt.month


In [34]:
customer_features.head()

Unnamed: 0,CustomerID,total_spend,transaction_count,product_count,Region_x,Region_y,SignupDate,signup_year,signup_month
0,C0001,3354.52,5,5,South America,South America,2022-07-10,2022,7
1,C0002,1862.74,4,4,Asia,Asia,2022-02-13,2022,2
2,C0003,2725.38,4,4,South America,South America,2024-03-07,2024,3
3,C0004,5354.88,8,8,South America,South America,2022-10-09,2022,10
4,C0005,2034.24,3,3,Asia,Asia,2022-08-15,2022,8


In [36]:
 #Normalize the Features
scaler = StandardScaler()
features_to_scale = ['total_spend', 'transaction_count', 'product_count', 'signup_year', 'signup_month']
customer_features[features_to_scale] = scaler.fit_transform(customer_features[features_to_scale])


In [69]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(customer_features[features_to_scale])

similar_customers = {}

for i, customer_id in enumerate(customer_features['CustomerID']):
    similarity_scores = similarity_matrix[i]

    similarity_scores[i] = -1
  
    top_3_indices = similarity_scores.argsort()[-3:][::-1] 
    
    similar_customers[customer_id] = [(customer_features['CustomerID'].iloc[j], similarity_scores[j]) for j in top_3_indices]
