In [7]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
# Read the three input tables from the working directory, in the order
# customers -> transactions -> products.
customers, transactions, products = [
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
]

In [15]:
# Join every transaction with its customer and product attributes.
# Both merges use the default join type on the named key column.
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)


def most_frequent(values):
    """Return the first entry of ``values.mode()`` (the most common value)."""
    return values.mode()[0]


# Per-customer profile: total spend, total units bought, and the product
# category that appears most often in the customer's transactions.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        Category=('Category', most_frequent),
    )
    .reset_index()  # bring CustomerID back as a regular column
)

# Preview both tables; the "\n\n" item plus the newline separator leaves the
# same blank gap between them as printing the three pieces one by one.
print(merged_data.head(), "\n\n", customer_features.head(), sep="\n")

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [16]:
# One-hot encode the 'Category' column.
# NOTE: this rebinds `customer_features` to a frame that no longer has a
# 'Category' column, so this cell cannot be re-run on its own; re-run the
# feature-engineering cell first.
# NOTE(review): drop_first=True omits the indicator column of one category,
# so that category is only represented implicitly. Confirm this asymmetry is
# intended for a similarity computation.
customer_features = pd.get_dummies(customer_features, columns=['Category'], drop_first=True)

# Standardize every feature column (all columns except the ID) so that no
# single feature dominates the similarity purely because of its scale.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop(columns=['CustomerID']))

# Pairwise cosine similarity: entry [i, j] compares customer row i with row j.
similarity_matrix = cosine_similarity(scaled_features)

# Number of lookalikes kept per customer.
TOP_N = 3

# Positional array of IDs: row i of the similarity matrix belongs to
# customer_ids[i]. Using an array avoids label-based Series lookups that only
# work while the frame's index happens to be 0..n-1.
customer_ids = customer_features['CustomerID'].to_numpy()

# Map each customer ID to its TOP_N most similar customers as
# (other_customer_id, similarity_score) tuples, best match first.
lookalike_map = {}

for idx, customer_id in enumerate(customer_ids):
    # Rank all customers by similarity to the current one, highest first.
    ranked = sorted(enumerate(similarity_matrix[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the customer's own row (self-similarity) and keep the best TOP_N.
    lookalike_map[customer_id] = [
        (customer_ids[i], score) for i, score in ranked if i != idx
    ][:TOP_N]

# Keep only customers C0001 .. C0020. The target IDs are built once, as a set,
# instead of being regenerated as a list for every customer.
target_ids = {f'C{i:04d}' for i in range(1, 21)}
lookalike_map_filtered = {
    cust_id: lookalike_map[cust_id]
    for cust_id in customer_ids
    if cust_id in target_ids
}


In [17]:
# Flatten the filtered lookalike map into one record per customer, with the
# (customer_id, score) pairs rendered as a single string for CSV export.
# The scores come from the cosine-similarity matrix, so they are NumPy
# scalars; they are cast to built-in float before rounding because, under
# NumPy >= 2, str() of a list holding NumPy scalars would write
# "np.float64(...)" into the text instead of the bare number.
lookalike_list = [
    {
        'CustomerID': cust_id,
        'Lookalikes': str([(sim_id, round(float(score), 4)) for sim_id, score in similar_custs]),
    }
    for cust_id, similar_custs in lookalike_map_filtered.items()
]

In [18]:
# Build the export table with an explicit column list so the frame (and the
# CSV header) always has CustomerID and Lookalikes, even when lookalike_list
# is empty because no customer matched the filter.
lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'Lookalikes'])
# index=False keeps the positional row index out of the file.
lookalike_df.to_csv('Ankit_Kumar_Lookalike.csv', index=False)