In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist
from datetime import datetime

In [2]:
#Loading DataSets
def load_data():
    customers=pd.read_csv('Customers.csv')
    products=pd.read_csv('Products.csv')
    transactions=pd.read_csv('Transactions.csv')
    # Convert date columns to datetime
    customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
    transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])
    
    return customers,products,transactions

In [3]:
def create_customer_features():
    
    customers, products, transactions = load_data()
    
    # customer transaction features
    customer_features = transactions.groupby('CustomerID').agg({
        'TransactionID': 'count',
        'TotalValue': ['sum', 'mean', 'std'],
        'Quantity': ['sum', 'mean', 'std']
    })
    
    # Handle NaN values for std columns (customers with single transaction)
    customer_features = customer_features.fillna({
        ('TotalValue', 'std'): 0,
        ('Quantity', 'std'): 0
    })
    
    # Flatten column names
    customer_features.columns = ['_'.join(col).strip() for col in customer_features.columns.values]
    
    # Adding customer profile features
    customer_features['signup_age'] = (datetime.now() - 
                                     pd.to_datetime(customers['SignupDate'])).dt.days
    
    # Adding region as dummy variables
    region_dummies = pd.get_dummies(customers['Region'], prefix='region')
    customer_features = customer_features.join(region_dummies)
    
    # Adding category preferences
    category_preferences = transactions.merge(products, on='ProductID')\
        .groupby(['CustomerID', 'Category'])['TotalValue'].sum()\
        .unstack(fill_value=0)
    category_preferences.columns = [f'category_spend_{col}' for col in category_preferences.columns]
    
    customer_features = customer_features.join(category_preferences)
    
    # Final check for any remaining NaN values
    customer_features = customer_features.fillna(0)
    
    # Removing any features with zero variance
    variance = customer_features.var()
    customer_features = customer_features.loc[:, variance > 0]
    
    return customer_features

def find_lookalikes(customer_features, target_customer_id, n_recommendations=3):
    # Standardize features
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(customer_features)
    
    # Calculating similarity scores
    similarity_matrix = cosine_similarity(scaled_features)
    
    # To Get target customer index
    target_idx = customer_features.index.get_loc(target_customer_id)
    
    #To  Get similarity scores for target customer
    customer_similarities = similarity_matrix[target_idx]
    
    #To Get indices of top similar customers (excluding self)
    similar_indices = np.argsort(customer_similarities)[::-1][1:n_recommendations+1]
    
    #To Get customer IDs and scores
    recommendations = [
        (customer_features.index[idx], float(customer_similarities[idx]))
        for idx in similar_indices
    ]
    
    return recommendations

def generate_lookalike_recommendations():
    # Creating customer features
    customer_features = create_customer_features()
    
    # Generating recommendations for first 20 customers
    recommendations = {}
    for customer_id in customer_features.index[:20]:
        lookalikes = find_lookalikes(customer_features, customer_id)
        recommendations[customer_id] = lookalikes
    
    # Creating output DataFrame
    output_rows = []
    for cust_id, lookalikes in recommendations.items():
        row = {
            'customer_id': cust_id,
            'lookalike_1': lookalikes[0][0],
            'score_1': round(lookalikes[0][1], 4),
            'lookalike_2': lookalikes[1][0],
            'score_2': round(lookalikes[1][1], 4),
            'lookalike_3': lookalikes[2][0],
            'score_3': round(lookalikes[2][1], 4)
        }
        output_rows.append(row)
    
    output_df = pd.DataFrame(output_rows)
    output_df.to_csv('Ankita_Jaiswal_Lookalike.csv', index=False)
    
    return output_df

if __name__ == "__main__":
    recommendations_df = generate_lookalike_recommendations()
    print("Generated recommendations for the first 20 customers:")
    print(recommendations_df)

Generated recommendations for the first 20 customers:
   customer_id lookalike_1  score_1 lookalike_2  score_2 lookalike_3  score_3
0        C0001       C0069   0.9143       C0125   0.7773       C0183   0.6674
1        C0002       C0031   0.8653       C0077   0.8346       C0121   0.8124
2        C0003       C0144   0.8360       C0091   0.6860       C0148   0.6427
3        C0004       C0075   0.9471       C0065   0.8552       C0041   0.8381
4        C0005       C0130   0.8907       C0014   0.8416       C0150   0.8245
5        C0006       C0196   0.8054       C0079   0.7779       C0200   0.7739
6        C0007       C0085   0.8440       C0026   0.8061       C0166   0.7961
7        C0008       C0109   0.7937       C0175   0.7420       C0162   0.7408
8        C0009       C0097   0.9752       C0058   0.9738       C0083   0.9501
9        C0010       C0142   0.8874       C0030   0.8670       C0062   0.8630
10       C0011       C0153   0.7959       C0013   0.7436       C0099   0.7423
11       C