 # Build the Lookalike Model

In [42]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the data

In [43]:
# Read the three input tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Merging the transactions data with customers (on CustomerID) and products (on ProductID)

In [44]:
# Left-join every transaction to its customer row and its product row.
# The result is bound to `merged_data` -- the name the aggregation cell below
# reads -- and the raw `transactions` frame is left untouched, so this cell
# gives the same result however many times it is re-run.
# NOTE(review): any column present in more than one input file comes out of
# merge() with _x/_y suffixes; confirm 'Price' is not duplicated across files.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)

# Aggregating transactions data for each customer (total spend, unique product count, etc.)

In [45]:
# One row per customer, keeping the source column names for the aggregates.
# NOTE(review): if Price is a per-unit price, its plain sum is not weighted by
# Quantity -- confirm this is the intended "total spend".
agg_data = (
    merged_data
    .groupby('CustomerID')
    .agg(
        Price=('Price', 'sum'),            # Total spend
        Quantity=('Quantity', 'sum'),      # Total quantity purchased
        ProductID=('ProductID', 'nunique'),  # Unique products purchased
    )
    .reset_index()
)

# Merging the aggregated data with customer demographics

In [46]:
# Attach each customer's Region to their aggregated purchase metrics.
# merge() defaults to an inner join, so only customers found in both frames remain.
customer_region = customers[['CustomerID', 'Region']]
customer_features = customer_region.merge(agg_data, on='CustomerID')

# Encoding categorical variables (Region)

In [47]:
# One-hot encode Region; drop_first=True omits the indicator for the first level.
# This rebinds customer_features, so re-running the cell on its own fails
# (the Region column no longer exists after the first run).
customer_features = pd.get_dummies(
    data=customer_features,
    columns=['Region'],
    drop_first=True,
)

# Standardizing the data (important for distance-based metrics)

In [48]:
# Standardize every feature column; CustomerID is an identifier, not a feature,
# so it is dropped before fitting.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(
    customer_features.drop('CustomerID', axis=1)
)

# Calculate cosine similarity matrix

In [49]:
# Pairwise cosine similarity between all rows of the scaled feature matrix;
# row/column k corresponds to row k of customer_features.
similarity_matrix = cosine_similarity(X=scaled_features)

# Function to get the top 3 lookalike customers for each customer

In [50]:
def get_top_lookalikes(similarity_matrix, customer_ids, top_n=3):
    """Return the ``top_n`` most similar customers for each ID in ``customer_ids``.

    Args:
        similarity_matrix: Indexable as ``similarity_matrix[row][col]``. Row and
            column ``k`` are taken to describe the ``k``-th entry of
            ``customer_ids``.
        customer_ids: Iterable of customer identifiers (list, pandas Series, ...).
            It is read positionally, so a Series with a non-default index works.
        top_n: Maximum number of lookalikes kept per customer.

    Returns:
        dict mapping each customer ID to a list of ``(other_customer_id, score)``
        tuples in descending score order. Candidates are limited to the other
        entries of ``customer_ids``; the customer itself is excluded.
    """
    # NOTE(review): a later cell defines get_top_lookalikes again; the later
    # definition is the one in effect when the function is called.

    # Materialize once so lookups below are positional rather than label-based
    # (indexing a pandas Series with [i] goes by index label).
    ids = list(customer_ids)

    lookalikes = {}
    for idx, customer_id in enumerate(ids):
        similarity_scores = similarity_matrix[idx]

        # Pair every other customer with its score, skipping self-similarity.
        similar_customers = [
            (other_id, similarity_scores[pos])
            for pos, other_id in enumerate(ids)
            if pos != idx
        ]

        # Stable sort: ties keep their original relative order.
        similar_customers.sort(key=lambda pair: pair[1], reverse=True)
        lookalikes[customer_id] = similar_customers[:top_n]

    return lookalikes

# Get customer IDs for the first 20 customers

In [51]:
# Number of lookalikes to keep per customer.
top_n = 3
# IDs from the first 20 rows of customer_features (positional selection).
customer_ids = customer_features['CustomerID'].head(20)

# Define the lookalike helper, then get the top lookalikes for each of the first 20 customers

In [52]:
def get_top_lookalikes(similarity_matrix, customer_ids, top_n=3):
    """Rank, for every customer in ``customer_ids``, the other customers by similarity.

    Args:
        similarity_matrix: Indexable as ``similarity_matrix[row][col]``. Row and
            column ``k`` are taken to describe the ``k``-th entry of
            ``customer_ids``.
        customer_ids: Iterable of customer identifiers (list, pandas Series, ...).
            It is read positionally, so a Series with a non-default index works.
        top_n: Maximum number of lookalikes kept per customer.

    Returns:
        dict mapping each customer ID to a list of ``(other_customer_id, score)``
        tuples in descending score order. Only the other entries of
        ``customer_ids`` are considered as candidates; the customer itself is
        excluded.
    """
    # NOTE(review): this re-defines get_top_lookalikes from an earlier cell and
    # shadows it; consider keeping a single definition.

    # Indexing a pandas Series with [i] is label-based, which raises KeyError
    # when the index is not 0..n-1. A plain list makes every lookup positional.
    ids = list(customer_ids)

    lookalikes = {}  # customer ID -> list of (lookalike ID, score)
    for idx, customer_id in enumerate(ids):
        similarity_scores = similarity_matrix[idx]

        # Pair every other customer with its score, skipping self-similarity.
        similar_customers = [
            (other_id, similarity_scores[pos])
            for pos, other_id in enumerate(ids)
            if pos != idx
        ]

        # Stable sort: ties keep their original relative order.
        similar_customers.sort(key=lambda pair: pair[1], reverse=True)

        lookalikes[customer_id] = similar_customers[:top_n]

    return lookalikes


In [53]:
# Rank lookalikes for the selected customers; matrix row k is matched to the
# k-th entry of customer_ids.
top_lookalikes = get_top_lookalikes(
    similarity_matrix,
    customer_ids,
    top_n=top_n,
)

# Create the Lookalike.csv with the results

In [54]:
# Flatten {customer: [(lookalike, score), ...]} into one row per (customer, lookalike) pair.
lookalike_data = [
    [source_id, match_id, match_score]
    for source_id, matches in top_lookalikes.items()
    for match_id, match_score in matches
]

output_columns = ['cust_id', 'lookalike_cust_id', 'similarity_score']
lookalike_df = pd.DataFrame(lookalike_data, columns=output_columns)

# Persist without the DataFrame index so the CSV holds only the three columns.
lookalike_df.to_csv('Soumyadeep_Das_Lookalike.csv', index=False)

print("Soumyadeep_Das_Lookalike.csv created with top 3 lookalikes for the first 20 customers.")

Soumyadeep_Das_Lookalike.csv created with top 3 lookalikes for the first 20 customers.


In [55]:
# Read the exported file back and print summary statistics as a sanity check.
lookalike = pd.read_csv('Soumyadeep_Das_Lookalike.csv')
lookalike_summary = lookalike.describe()
print(lookalike_summary)

       similarity_score
count         60.000000
mean           0.779467
std            0.171447
min            0.389820
25%            0.637248
50%            0.813024
75%            0.937302
max            0.993693
