In [5]:
import pandas as pd

# Read the three raw tables from the project's data directory.
customers = pd.read_csv('../ecommerce-analysis/data/Customers.csv')
transactions = pd.read_csv('../ecommerce-analysis/data/Transactions.csv')
products = pd.read_csv('../ecommerce-analysis/data/Products.csv')

# Build one row per transaction: first attach the product columns via
# ProductID, then the customer columns via CustomerID. Both joins are left
# joins, so every transaction row is kept even when a lookup finds no match.
merged = (
    transactions
    .merge(products, on="ProductID", how='left')
    .merge(customers, on='CustomerID', how='left')
)


In [6]:
from datetime import datetime

# Calculate days since signup, measured against the most recent transaction.
merged['SignupDate'] = pd.to_datetime(merged['SignupDate'])
# Parse the transaction dates BEFORE taking the max: max() on the raw string
# column compares lexicographically, which only matches chronological order
# for zero-padded ISO-style strings.
latest_date = pd.to_datetime(merged['TransactionDate']).max()
merged['Tenure'] = (latest_date - merged['SignupDate']).dt.days


def _favourite_category(categories):
    """Return the most frequent category in `categories`, or NaN if there is none.

    Series.mode() ignores missing values and returns its results sorted, so ties
    resolve to the first value in sorted order (same as the previous `mode()[0]`).
    When every value is missing, mode() is empty and indexing it would raise;
    NaN is returned instead so the aggregation still succeeds.
    """
    modes = categories.mode()
    return modes.iloc[0] if not modes.empty else float('nan')


# Aggregate transaction features: one row per customer.
customer_features = merged.groupby('CustomerID').agg(
    Region=('Region', 'first'),
    Tenure=('Tenure', 'first'),
    TotalPurchase=('TransactionID', 'count'),
    AvgTransactionValue=('TotalValue', 'mean'),
    FavouriteCategory=('Category', _favourite_category),
).reset_index()

In [7]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two categorical columns, 'Region' and 'FavouriteCategory'.
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(
    customer_features.loc[:, ['Region', 'FavouriteCategory']]
)
# fit_transform's result is densified with toarray() and labelled with the
# encoder's generated feature names.
encoded_df = pd.DataFrame(
    data=encoded_features.toarray(),
    columns=encoder.get_feature_names_out(),
)

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric customer features.
numerical_features = customer_features.loc[:, ['Tenure', 'TotalPurchase', 'AvgTransactionValue']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numerical_features)
scaled_df = pd.DataFrame(
    data=scaled_features,
    columns=list(numerical_features.columns),
)

# Place the one-hot columns and the scaled numeric columns side by side.
final_features = pd.concat([encoded_df, scaled_df], axis='columns')

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the pairwise cosine similarity between all rows of final_features.
# With a single input, cosine_similarity compares the rows against each other,
# so similarity_matrix[i][j] is the similarity of row i to row j.
similarity_matrix = cosine_similarity(final_features)

In [10]:
# Get the row labels of the first 20 customers (C0001..C0020) that are present
# in customer_features. The labels are used below as positions (with .iloc and
# as similarity_matrix row numbers); that is valid because customer_features
# carries the default 0..n-1 index produced by reset_index().
target_customers = customer_features[customer_features['CustomerID'].isin([f'C{i:04d}' for i in range(1, 21)])].index

# Create lookalike map: CustomerID -> [(lookalike CustomerID, rounded score), ...]
lookalike_map = {}
for idx in target_customers:
    customer_id = customer_features.iloc[idx]['CustomerID']
    # Exclude the customer's own entry by position instead of assuming it sorts
    # first: another customer with a tied top score (identical features, or
    # floating-point rounding) can outrank it in the stable sort, in which case
    # slicing off element 0 would drop a real lookalike and keep self.
    scores = [
        (i, score)
        for i, score in enumerate(similarity_matrix[idx])
        if i != idx
    ]
    # Keep the three most similar other customers.
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)[:3]
    lookalike_map[customer_id] = [
        (customer_features.iloc[i]['CustomerID'], round(score, 2))
        for i, score in sorted_scores
    ]

In [11]:
import csv

# Write one CSV row per target customer: the customer's ID followed by one
# "<lookalike id>, <score>" cell for each of its recommendations.
with open('Avneet_Kaur_Lookalike.csv', 'w', newline="") as file:
    writer = csv.writer(file)
    for customer_id, recommendations in lookalike_map.items():
        row = [customer_id]
        row.extend(f"{match_id}, {score}" for match_id, score in recommendations)
        writer.writerow(row)