# Libraries

In [4]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from datetime import datetime

# Load Data

In [3]:
# Read the three source tables (customers, products, transactions) from CSV.
customers, products, transactions = map(
    pd.read_csv,
    ('Customers.csv', 'Products.csv', 'Transactions.csv'),
)

# Preprocessing

In [5]:
# Convert the date columns to pandas datetime values, in place on each frame.
for frame, date_column in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [6]:
# Enrich each transaction with its product attributes, then with its customer
# attributes. Both joins are inner joins (the pandas default), so a transaction
# whose ProductID / CustomerID has no match in the lookup table is dropped.
#
# validate='many_to_one' makes pandas raise MergeError if a key is duplicated in
# the lookup table. Without it, a duplicated key would silently repeat the
# matching transactions and inflate the per-customer totals derived from this frame.
transactions_with_details = transactions.merge(
    products, on='ProductID', validate='many_to_one'
)
transactions_with_customers = transactions_with_details.merge(
    customers, on='CustomerID', validate='many_to_one'
)

# Feature Engineering

In [7]:
def _most_frequent(values):
    """Return the most frequent non-null value of a Series, or None if there is none.

    ``Series.mode()`` returns its result in sorted order, so a tie resolves to the
    first value in sort order. Taking the first element by position with an
    explicit empty check avoids the lookup error that ``mode()[0]`` raises when
    every value in the group is null.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


# One row per customer: total spend, mean order value, number of transactions
# and the category the customer bought from most often.
customer_features = (
    transactions_with_customers
    .groupby('CustomerID')
    .agg(
        total_spending=('TotalValue', 'sum'),
        avg_order_value=('TotalValue', 'mean'),
        transaction_count=('TransactionID', 'count'),
        favorite_category=('Category', _most_frequent),
    )
    .reset_index()
)

In [8]:
# Attach each customer's region, then one-hot encode the two categorical
# columns, dropping the first level of each.
region_lookup = customers[['CustomerID', 'Region']]
customer_features = pd.get_dummies(
    customer_features.merge(region_lookup, on='CustomerID'),
    columns=['favorite_category', 'Region'],
    drop_first=True,
)

In [9]:
# Standardize every feature column; the ID column is not a feature, so it is
# removed before scaling.
feature_columns = customer_features.drop(columns=['CustomerID'])
scaler = StandardScaler()
normalized_features = scaler.fit_transform(feature_columns)

# Similarity Score

In [10]:
# Pairwise cosine similarity between the rows of normalized_features: entry
# [i][j] compares row i with row j, in the same row order as customer_features.
similarity_matrix = cosine_similarity(normalized_features)

In [11]:
# Map every customer ID to its three most similar *other* customers as
# (customer_id, score) pairs, best match first.
customer_ids = customer_features['CustomerID'].tolist()
similar_customers = {}
for i, customer_id in enumerate(customer_ids):
    scores = similarity_matrix[i]
    # Exclude the customer by position instead of slicing off the first sorted
    # entry: when another customer ties with the self-similarity score (identical
    # feature rows, or floating-point rounding), the customer is not guaranteed
    # to sort first and would otherwise be reported as its own lookalike.
    ranked = sorted(
        (j for j in range(len(scores)) if j != i),
        key=lambda j: scores[j],
        reverse=True,
    )
    # IDs are read from the positional list so matrix row j always pairs with
    # the j-th customer, whatever index labels the DataFrame carries.
    similar_customers[customer_id] = [(customer_ids[j], scores[j]) for j in ranked[:3]]

In [12]:
# Keep only customers C0001..C0020, in customer_features order. The target IDs
# are built once as a set rather than being regenerated as a list for every
# customer inside the comprehension.
target_ids = {f'C{i:04d}' for i in range(1, 21)}
similar_customers_subset = {
    customer_id: similar_customers[customer_id]
    for customer_id in customer_features['CustomerID']
    if customer_id in target_ids
}

# Save Similarity Map

In [19]:
# Write one row per customer: its ID and its list of (lookalike_id, score) pairs.
# Each list is written to the CSV through its string form, which embeds the repr
# of every element. Scores are therefore converted to built-in floats first, so
# the file holds plain numbers (e.g. 0.995) regardless of how the installed NumPy
# version reprs its scalars (NumPy 2 prints them as `np.float64(...)`).
# The dict views are materialized as lists so each column is a plain sequence.
lookalike_df = pd.DataFrame({
    'cust_id': list(similar_customers_subset.keys()),
    'lookalikes': [
        [(lookalike_id, float(score)) for lookalike_id, score in lookalikes]
        for lookalikes in similar_customers_subset.values()
    ],
})
lookalike_df.to_csv('Janvi_Bhatt_Lookalike.csv', index=False)

# Extra: Dynamically return lookalikes
### for any customer

In [14]:
def get_top_3_lookalikes(customer_id):
    """Return the three customers most similar to ``customer_id``.

    Reads the notebook-level ``customer_features`` (for the order of customer
    IDs) and ``similarity_matrix`` (row/column ``i`` refers to the ``i``-th ID).

    Parameters
    ----------
    customer_id
        Value to look up in ``customer_features['CustomerID']``.

    Returns
    -------
    list of (customer_id, score) tuples
        The three highest-scoring other customers, best match first. Ties keep
        the order in which the customers appear in ``customer_features``.
    str
        The message ``"CustomerID <id> not found."`` when the ID is unknown.
    """
    customer_ids = customer_features['CustomerID'].tolist()
    if customer_id not in customer_ids:
        return f"CustomerID {customer_id} not found."

    # Use the row *position*, not the DataFrame index label: similarity_matrix
    # is addressed by position, and the two only coincide for a default index.
    position = customer_ids.index(customer_id)
    scores = similarity_matrix[position]

    # Exclude the customer by position instead of slicing off the first sorted
    # entry: when another customer ties with the self-similarity score, the
    # customer is not guaranteed to sort first and would otherwise be returned
    # as its own lookalike.
    ranked = sorted(
        (j for j in range(len(scores)) if j != position),
        key=lambda j: scores[j],
        reverse=True,
    )
    return [(customer_ids[j], scores[j]) for j in ranked[:3]]

In [17]:
# Example lookup: print the three closest matches for customer C0003.
print(get_top_3_lookalikes('C0003'))

[('C0052', 0.995097046978372), ('C0031', 0.9605101553121962), ('C0076', 0.9382684470986701)]


In [16]:
# Example lookup: print the three closest matches for customer C0011.
print(get_top_3_lookalikes('C0011'))

[('C0169', 0.9395786532149496), ('C0174', 0.9222322478387583), ('C0153', 0.9159318817104083)]
