In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [17]:
# Read the three input tables from CSV files in the current working directory.
customers, products, transactions = map(
    pd.read_csv,
    ("Customers.csv", "Products.csv", "Transactions.csv"),
)

In [18]:
# Attach each transaction's product Category (pandas' default inner join on ProductID).
#
# `transactions` is reassigned from itself, so running this cell a second time on an
# already-merged frame would otherwise produce Category_x / Category_y columns and
# break every later step that groups by 'Category'. Dropping any existing Category
# column first makes the cell safe to re-run; on a fresh kernel the drop is a no-op.
transactions = (
    transactions
    .drop(columns='Category', errors='ignore')
    .merge(products[['ProductID', 'Category']], on='ProductID')
)

In [26]:
# Preview the first five rows of the category-enriched transactions
transactions.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Electronics


In [19]:
# Add each customer's Region to the transaction rows (joined on CustomerID)
region_lookup = customers[['CustomerID', 'Region']]
full_data = transactions.merge(region_lookup, on='CustomerID')

In [27]:
# Preview the first five rows of the combined transaction / customer table
full_data.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,Category,Region
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Electronics,Europe
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Electronics,Asia
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Electronics,Europe
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Electronics,South America
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Electronics,Europe


# Generate customer profile vectors from transaction history (per-category spending); Region is kept in the index as demographic info but is not used as a similarity feature

In [20]:
# Step 1: build one spending-profile row per customer

# Total spend and total units per (customer, region, category) combination
category_aggregations = {
    'total_spending': ('TotalValue', 'sum'),
    'total_quantity': ('Quantity', 'sum'),
}
customer_profiles = (
    full_data
    .groupby(['CustomerID', 'Region', 'Category'])
    .agg(**category_aggregations)
    .reset_index()
)

# Spread the categories into columns: one row per (CustomerID, Region), one column
# per Category holding total_spending, with 0 where a customer bought nothing in
# that category. Region is part of the index only, so it is not a feature column.
profile_matrix = customer_profiles.pivot_table(
    values='total_spending',
    index=['CustomerID', 'Region'],
    columns='Category',
    fill_value=0,
)

# Standardise every category column before comparing customers, so that no single
# category dominates the cosine similarity purely because of its scale.
scaler = StandardScaler()
profile_matrix_scaled = scaler.fit_transform(profile_matrix)

In [21]:
# Step 2: pairwise customer similarity

# Cosine similarity between every pair of rows of the scaled profile matrix;
# entry [i, j] compares the i-th and j-th rows of profile_matrix.
similarity_matrix = cosine_similarity(X=profile_matrix_scaled)

In [22]:
# Step 3: top-3 lookalikes for customers C0001 to C0020

# Number of most-similar customers to keep for each target customer
top_n = 3

# Filled by the loop below: target customer -> list of (lookalike ID, similarity score)
lookalike_data = dict()

# Loop through the first 20 customers (C0001 to C0020)

In [23]:
# Walk the first 20 rows of profile_matrix. These correspond to C0001-C0020 only if
# each of those customers has at least one transaction (assumption - verify on the data).
for idx, index_entry in enumerate(profile_matrix.index[:20]):
    # profile_matrix is indexed by (CustomerID, Region); keep just the ID so the
    # dictionary keys - and therefore the exported CustomerID column - are plain
    # customer IDs, consistent with how the lookalike IDs are extracted below.
    customer_id = index_entry[0]

    # Copy the row: indexing a NumPy array row returns a view, and writing the
    # sentinel into a view would overwrite the diagonal of similarity_matrix.
    similarity_scores = similarity_matrix[idx].copy()

    # Exclude self-similarity. -inf sorts strictly below every real cosine score,
    # whereas -1 is itself a valid cosine similarity and could tie with a real one.
    similarity_scores[idx] = -np.inf

    # Never ask for more neighbours than there are other customers, so the
    # customer can never be returned as their own lookalike.
    n_neighbors = min(top_n, len(similarity_scores) - 1)

    # argsort is ascending: reverse it and keep the leading n_neighbors positions
    # (most similar first).
    top_n_indices = np.argsort(similarity_scores)[::-1][:n_neighbors]

    # Map the selected rows back to customer IDs, paired with their similarity scores
    top_customers = [(profile_matrix.index[i][0], similarity_scores[i]) for i in top_n_indices]
    lookalike_data[customer_id] = top_customers

# Convert the lookalike data to DataFrame

In [24]:
# Flatten the {customer: [(lookalike, score), ...]} mapping into one row per pair
lookalike_list = [
    [customer_id, similar_customer_id, score]
    for customer_id, top_customers in lookalike_data.items()
    for similar_customer_id, score in top_customers
]

output_columns = ['CustomerID', 'LookalikeCustomerID', 'SimilarityScore']
lookalike_df = pd.DataFrame(lookalike_list, columns=output_columns)

# Save the lookalikes to a CSV file

In [25]:
# Write the lookalike table to disk without the DataFrame's row index
output_path = 'Lookalike.csv'
lookalike_df.to_csv(output_path, index=False)