In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

def build_lookalike_model(customers_df, products_df, transactions_df, top_n=3):
    """Find the most similar customers ("lookalikes") for every customer.

    Each customer is described by a feature vector made of profile data
    (days since signup, one-hot region), transaction aggregates (count,
    total/average spend, total/average quantity) and the share of their
    transactions falling in each product category. The vectors are
    standardised and compared with cosine similarity.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Reads the columns 'CustomerID', 'SignupDate' and 'Region'.
        The frame is not modified; a copy is used internally.
    products_df : pandas.DataFrame
        Reads the columns 'ProductID' and 'Category'.
    transactions_df : pandas.DataFrame
        Reads the columns 'TransactionID', 'CustomerID', 'ProductID',
        'Quantity' and 'TotalValue'.
    top_n : int, optional
        Maximum number of lookalikes returned per customer (default 3).

    Returns
    -------
    dict
        Maps each CustomerID to a list of at most ``top_n`` dicts of the
        form ``{'customer_id': ..., 'similarity_score': float}``, ordered
        from most to least similar. The customer itself is never listed.
        Fewer than ``top_n`` entries are returned when there are not
        enough other customers.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Nothing to compare; also avoids fitting the scaler on zero rows.
    if len(customers_df) == 0:
        return {}

    # 1. Feature Engineering

    # Work on a copy so the caller's frame does not silently gain columns.
    customers = customers_df.copy()

    # Customer profile features: tenure in whole days relative to today
    customers['DaysSinceSignup'] = (
        pd.to_datetime('today') - pd.to_datetime(customers['SignupDate'])
    ).dt.days

    # Region dummies; forced to float so the feature matrix is purely numeric
    # regardless of the pandas version's default dummy dtype.
    region_dummies = pd.get_dummies(customers['Region'], prefix='Region', dtype=float)
    customers = pd.concat([customers, region_dummies], axis=1)

    # Transaction behavior features (named aggregation keeps each output
    # column tied to its definition instead of relying on column order)
    customer_metrics = transactions_df.groupby('CustomerID').agg(
        NumTransactions=('TransactionID', 'count'),
        TotalSpend=('TotalValue', 'sum'),
        AvgTransactionValue=('TotalValue', 'mean'),
        TotalQuantity=('Quantity', 'sum'),
        AvgQuantityPerTransaction=('Quantity', 'mean'),
    ).reset_index()

    # Product category preferences: per-customer share of transactions
    # in each category (each row sums to 1)
    transactions_with_products = transactions_df.merge(
        products_df[['ProductID', 'Category']], on='ProductID'
    )
    category_preferences = pd.crosstab(
        transactions_with_products['CustomerID'],
        transactions_with_products['Category'],
    )
    category_preferences = category_preferences.div(category_preferences.sum(axis=1), axis=0)
    category_columns = category_preferences.columns.tolist()

    # 2. Combine Features
    # Left joins keep every customer, including those without transactions.
    feature_df = customers.merge(customer_metrics, on='CustomerID', how='left')
    feature_df = feature_df.merge(category_preferences.reset_index(), on='CustomerID', how='left')

    # Select features for similarity calculation
    feature_columns = (['DaysSinceSignup', 'NumTransactions', 'TotalSpend',
                        'AvgTransactionValue', 'TotalQuantity', 'AvgQuantityPerTransaction'] +
                       region_dummies.columns.tolist() +
                       category_columns)

    # Customers with no transactions have NaN metrics/preferences; treat as 0.
    # Only the feature columns are filled, other columns are left untouched.
    features = feature_df[feature_columns].fillna(0)

    # 3. Calculate Similarity
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    # Pairwise cosine similarity; row i holds customer i against everyone
    similarity_matrix = cosine_similarity(scaled_features)

    # 4. Get Top Lookalikes
    customer_ids = feature_df['CustomerID'].tolist()
    lookalike_map = {}

    for idx, customer_id in enumerate(customer_ids):
        customer_similarities = similarity_matrix[idx]

        # Rank everyone from most to least similar, drop the customer
        # itself by position, then keep the best top_n.
        ranked = np.argsort(customer_similarities)[::-1]
        ranked = ranked[ranked != idx][:top_n]

        lookalike_map[customer_id] = [
            {'customer_id': customer_ids[other],
             'similarity_score': float(customer_similarities[other])}
            for other in ranked
        ]

    return lookalike_map




In [2]:
def save_lookalikes_to_csv(lookalike_map, output_file='Lookalike.csv'):
    """Write a lookalike map to a CSV file, one row per customer.

    Parameters
    ----------
    lookalike_map : dict
        Maps a customer ID to a list of dicts with the keys
        'customer_id' and 'similarity_score'. Only the first three
        entries of each list are written.
    output_file : str, optional
        Destination path of the CSV (default 'Lookalike.csv').

    Notes
    -----
    The file always has the columns CustomerID, Lookalike1, Score1,
    Lookalike2, Score2, Lookalike3, Score3. Scores are rounded to three
    decimals. When a customer has fewer than three lookalikes the
    remaining cells are left empty instead of raising IndexError, and an
    empty map still produces a file containing the header row.
    """
    num_slots = 3

    # Fixed column layout so the header is stable even for sparse or empty input.
    columns = ['CustomerID']
    for rank in range(1, num_slots + 1):
        columns.extend([f'Lookalike{rank}', f'Score{rank}'])

    rows = []
    for customer_id, lookalikes in lookalike_map.items():
        row = {'CustomerID': customer_id}
        for rank in range(1, num_slots + 1):
            if rank <= len(lookalikes):
                match = lookalikes[rank - 1]
                row[f'Lookalike{rank}'] = match['customer_id']
                row[f'Score{rank}'] = round(match['similarity_score'], 3)
            else:
                # Not enough lookalikes: leave the slot blank in the CSV.
                row[f'Lookalike{rank}'] = None
                row[f'Score{rank}'] = None
        rows.append(row)

    result_df = pd.DataFrame(rows, columns=columns)
    result_df.to_csv(output_file, index=False)





In [3]:
# Number of leading customers (in Customers.csv row order) written to the report
NUM_REPORT_CUSTOMERS = 20

# Read data
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Build lookalike model for every customer
lookalike_map = build_lookalike_model(customers_df, products_df, transactions_df)

# IDs of the first customers, gathered once into a set so the filter below
# does not rebuild the list (and scan it linearly) for every map entry.
first_customer_ids = set(customers_df['CustomerID'].iloc[:NUM_REPORT_CUSTOMERS])

# Filter for first 20 customers
first_20_customers = {
    customer_id: lookalikes
    for customer_id, lookalikes in lookalike_map.items()
    if customer_id in first_customer_ids
}

# Save results to CSV (default output file of save_lookalikes_to_csv)
save_lookalikes_to_csv(first_20_customers)