# Customer Segmentation using Clustering Techniques
This notebook performs customer segmentation using clustering techniques. We will use both profile information and transaction information to segment the customers.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns
# Set seaborn's global plot style to "whitegrid".
sns.set(style="whitegrid")

## Load Data
Load and preprocess the datasets.

In [2]:
def load_data():
    """Read the customer, product and transaction CSVs from the working directory.

    Returns:
        tuple: ``(customers, products, transactions)`` DataFrames. The
        transaction columns ``customer_id`` / ``product_id`` are renamed to
        ``CustomerID`` / ``ProductID`` when present. If anything fails, the
        error is printed and ``(None, None, None)`` is returned instead.
    """
    canonical_names = {'customer_id': 'CustomerID', 'product_id': 'ProductID'}
    try:
        customers, products, transactions = (
            pd.read_csv(csv_name)
            for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
        )
        # Align the transaction key columns with the naming used by the other tables.
        transactions = transactions.rename(columns=canonical_names)
    except Exception as err:
        print(f"Error loading data: {err}")
        return None, None, None
    return customers, products, transactions

customers, products, transactions = load_data()
# load_data() signals failure by returning (None, None, None). Stop here with a
# clear message instead of reporting success and then failing on `.shape`.
if customers is None or products is None or transactions is None:
    raise RuntimeError("Data loading failed; see the error printed above.")
print("Data loaded successfully")
print("Transactions shape:", transactions.shape)
print("Products shape:", products.shape)
print("Customers shape:", customers.shape)

## Create Customer Features
Create feature vectors for each customer based on their transaction history and profile.

In [3]:
def create_customer_features(customers, transactions, products):
    """Build one feature row per customer that appears in ``transactions``.

    The features are:
      * purchase statistics: ``TotalValue_sum``, ``TotalValue_mean``,
        ``TotalValue_count``, ``Quantity_sum``, ``Quantity_mean``;
      * one column per product ``Category`` holding the share of the
        customer's transactions that fall in that category;
      * ``recency``: whole days between the customer's last transaction and
        the latest transaction in the dataset;
      * one ``region_<Region>`` indicator column per customer region.

    All merges are left joins starting from the purchase statistics, so
    customers without any transaction are not part of the result.

    Args:
        customers: DataFrame with ``CustomerID`` and ``Region`` columns.
        transactions: DataFrame with ``CustomerID``, ``ProductID``,
            ``TotalValue``, ``Quantity`` and ``TransactionDate`` columns.
        products: DataFrame with ``ProductID`` and ``Category`` columns.

    Returns:
        The feature DataFrame (missing values filled with 0), or ``None`` if
        any step fails; in that case the error is printed.
    """
    # Pre-bind the intermediates so the error handler can report whichever
    # ones were built without raising a NameError that hides the real failure.
    purchase_stats = category_pivoted = region_dummies = None
    try:
        # Basic purchase statistics (named aggregation yields flat column names)
        purchase_stats = transactions.groupby('CustomerID').agg(
            TotalValue_sum=('TotalValue', 'sum'),
            TotalValue_mean=('TotalValue', 'mean'),
            TotalValue_count=('TotalValue', 'count'),
            Quantity_sum=('Quantity', 'sum'),
            Quantity_mean=('Quantity', 'mean'),
        ).reset_index()

        # Category preferences: per-customer share of transactions by category
        product_categories = transactions.merge(products[['ProductID', 'Category']], on='ProductID')
        category_pivoted = pd.crosstab(product_categories['CustomerID'],
                                       product_categories['Category'],
                                       normalize='index')

        # Recency: parse the dates once, then measure each customer's last
        # purchase against the most recent transaction in the dataset.
        transaction_dates = pd.to_datetime(transactions['TransactionDate'])
        latest_date = transaction_dates.max()
        last_purchase = transaction_dates.groupby(transactions['CustomerID']).max()
        recency = (latest_date - last_purchase).dt.days.rename('recency').reset_index()

        # Region indicator columns
        region_dummies = pd.get_dummies(customers[['CustomerID', 'Region']],
                                        prefix='region',
                                        columns=['Region'])

        # Merge all features onto the purchase statistics
        final_features = (purchase_stats
                          .merge(category_pivoted.reset_index(), on='CustomerID', how='left')
                          .merge(recency, on='CustomerID', how='left')
                          .merge(region_dummies, on='CustomerID', how='left'))

        # Fill missing values (e.g. customers absent from the customers table)
        return final_features.fillna(0)

    except Exception as e:
        print(f"Error creating features: {e}")
        print("\nDebugging info:")
        available = (
            ("Purchase stats columns:", purchase_stats),
            ("Category pivot columns:", category_pivoted),
            ("Region dummy columns:", region_dummies),
        )
        for label, frame in available:
            if frame is not None:
                print(label, frame.columns.tolist())
        return None

features = create_customer_features(customers, transactions, products)
# create_customer_features() returns None on failure; stop with a clear message
# instead of reporting success and then failing on `.shape`.
if features is None:
    raise RuntimeError("Feature creation failed; see the error printed above.")
print("\nFeatures created successfully")
print("Feature matrix shape:", features.shape)

## Clustering
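Standardize the features, cluster the customers with K-Means, and evaluate the result using the Davies-Bouldin Index (lower values indicate better-separated clusters).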

In [4]:
def perform_clustering(features, n_clusters=5):
    """Cluster customers with K-Means and score the result.

    Every column except ``CustomerID`` (and a ``Cluster`` column left over
    from a previous run, if present) is standardized and used as input.

    Args:
        features: DataFrame of per-customer features.
        n_clusters: Number of clusters to form.

    Returns:
        tuple: ``(clusters, db_index)`` where ``clusters`` holds one label per
        row of ``features`` and ``db_index`` is the Davies-Bouldin score of
        the clustering. On failure the error is printed and ``(None, None)``
        is returned.
    """
    try:
        # Drop the identifier, and also a previously assigned 'Cluster' label
        # so re-running this step does not feed old labels back in as a feature.
        feature_cols = features.columns.difference(['CustomerID', 'Cluster'])

        # Scale features so no single column dominates the distance metric
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(features[feature_cols])

        # Pin n_init explicitly: the library default differs between
        # scikit-learn versions, which would otherwise change the result.
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        clusters = kmeans.fit_predict(scaled_features)

        # Calculate Davies-Bouldin Index on the same scaled data
        db_index = davies_bouldin_score(scaled_features, clusters)

        return clusters, db_index

    except Exception as e:
        print(f"Error performing clustering: {e}")
        return None, None

# Segment the customers into five groups and attach each customer's label.
clusters, db_index = perform_clustering(features, n_clusters=5)
clustering_succeeded = clusters is not None  # (None, None) is returned on failure
if clustering_succeeded:
    print(f"\nClustering performed successfully with DB Index: {db_index:.4f}")
    features['Cluster'] = clusters

## Visualize Clusters
Visualize the clusters with two scatter plots: total spend vs. recency, and number of transactions vs. average transaction value.

In [5]:
def visualize_clusters(features):
    """Show scatter plots of the customer segments, colored by cluster.

    Two figures are drawn: total spend vs. recency, and number of
    transactions vs. average transaction value.

    Args:
        features: DataFrame containing the ``TotalValue_sum``, ``recency``,
            ``TotalValue_count``, ``TotalValue_mean`` and ``Cluster`` columns.

    Returns:
        None. Any plotting error is printed rather than raised.
    """
    # (x column, y column, title) for each figure. The spend column is
    # 'TotalValue_sum', the name produced when the features are built.
    plot_specs = (
        ('TotalValue_sum', 'recency',
         'Customer Segments: Total Spend vs. Recency'),
        ('TotalValue_count', 'TotalValue_mean',
         'Customer Segments: Total Transactions vs. Average Transaction Value'),
    )
    try:
        for x_col, y_col, title in plot_specs:
            plt.figure(figsize=(12, 6))
            sns.scatterplot(x=x_col, y=y_col, hue='Cluster', data=features, palette='viridis')
            plt.title(title)
            plt.show()

    except Exception as e:
        print(f"Error visualizing clusters: {e}")

# Plot the segments found above.
visualize_clusters(features=features)

## Save Results
Save the clustering results to a CSV file.

In [6]:
def save_results(features, output_file):
    """Write the feature table (including cluster labels) to a CSV file.

    Args:
        features: DataFrame to export; the index is not written.
        output_file: Destination path of the CSV file.

    Returns:
        None. A confirmation is printed on success; on failure the error is
        printed instead of being raised.
    """
    try:
        features.to_csv(output_file, index=False)
        confirmation = f"Clustering results saved to {output_file}"
        print(confirmation)
    except Exception as err:
        print(f"Error saving results: {err}")

# Export the segmented customers.
save_results(features=features, output_file="Customer_Segments.csv")