# Customer Segmentation - Model Building

This notebook focuses on building and evaluating clustering models to identify customer segments:

1. Find the optimal number of clusters
2. Apply different clustering algorithms
3. Compare clustering results
4. Select the best model
5. Visualize the identified clusters

In [None]:
# Import necessary libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# Set plotting style
%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

# Increase default figure size
plt.rcParams['figure.figsize'] = [12, 8]

# Create output directory if it doesn't exist
os.makedirs('./output', exist_ok=True)

## 1. Load Preprocessed Data

In [None]:
# Input files, both read from the ./output folder
preprocessed_file = './output/preprocessed_data.csv'
unscaled_file = './output/cleaned_data_unscaled.csv'

# Preprocessed data is what gets clustered; the unscaled copy is kept for interpretation
df = pd.read_csv(preprocessed_file)
df_unscaled = pd.read_csv(unscaled_file)

# Quick look at what was loaded
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Build the clustering matrix: every column except the customer identifier.
# When there is no 'customer_id' column the frame is simply copied.
has_customer_id = 'customer_id' in df.columns
features_df = df.drop(columns=['customer_id']) if has_customer_id else df.copy()

feature_names = list(features_df.columns)
print(f"Features used for clustering: {feature_names}")

## 2. Finding the Optimal Number of Clusters

In [None]:
# Define a function to find the optimal number of clusters
def find_optimal_k(data, max_clusters=10, random_state=42, n_init=10):
    """Score K-Means solutions for k = 2 .. max_clusters.

    A KMeans model is fitted on ``data`` for every candidate k and four
    quality measures are recorded for it.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix to cluster (DataFrame, ndarray or nested list).
    max_clusters : int, default=10
        Largest k to try. It is capped at ``n_samples - 1`` because the
        silhouette score is only defined for 2 <= k <= n_samples - 1.
    random_state : int, default=42
        Seed passed to every KMeans fit.
    n_init : int, default=10
        Number of KMeans initialisations per k.

    Returns
    -------
    dict
        Keys ``'k_values'``, ``'inertia'``, ``'silhouette'``,
        ``'calinski_harabasz'`` and ``'davies_bouldin'``; each maps to a
        list with one entry per evaluated k. All lists are empty when no
        k in the requested range can be evaluated.
    """
    import warnings

    n_samples = data.shape[0] if hasattr(data, 'shape') else len(data)

    # silhouette_score rejects k >= n_samples, so never go past n_samples - 1
    largest_k = min(max_clusters, n_samples - 1)
    if max_clusters >= 2 and largest_k < max_clusters:
        warnings.warn(
            f"max_clusters={max_clusters} reduced to {largest_k}: "
            f"only {n_samples} samples are available"
        )

    # Range of clusters to try
    K = range(2, largest_k + 1)

    inertia = []
    silhouette = []
    calinski_harabasz = []
    davies_bouldin = []

    for k in K:
        kmeans = KMeans(n_clusters=k, random_state=random_state, n_init=n_init)
        kmeans.fit(data)
        labels = kmeans.labels_

        # Inertia is the within-cluster sum of squares reported by the fitted model
        inertia.append(kmeans.inertia_)
        silhouette.append(silhouette_score(data, labels))
        calinski_harabasz.append(calinski_harabasz_score(data, labels))
        davies_bouldin.append(davies_bouldin_score(data, labels))

    return {
        'k_values': list(K),
        'inertia': inertia,
        'silhouette': silhouette,
        'calinski_harabasz': calinski_harabasz,
        'davies_bouldin': davies_bouldin
    }

In [None]:
# Score every candidate k up to 10 with K-Means
results = find_optimal_k(features_df, max_clusters=10)

# One panel per metric: (results key, line format, y-axis label, panel title)
metric_panels = [
    ('inertia', 'bo-', 'Inertia', 'Elbow Method'),
    ('silhouette', 'go-', 'Silhouette Score', 'Silhouette Score (higher is better)'),
    ('calinski_harabasz', 'ro-', 'Calinski-Harabasz Index', 'Calinski-Harabasz Index (higher is better)'),
    ('davies_bouldin', 'mo-', 'Davies-Bouldin Index', 'Davies-Bouldin Index (lower is better)'),
]

# Draw the four metrics on a 2x2 grid that shares the same x values
plt.figure(figsize=(15, 12))
for panel_position, (metric_key, line_format, y_label, panel_title) in enumerate(metric_panels, start=1):
    plt.subplot(2, 2, panel_position)
    plt.plot(results['k_values'], results[metric_key], line_format)
    plt.xlabel('Number of clusters')
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.grid(True)

plt.tight_layout()
plt.savefig('./output/optimal_k_metrics.png', dpi=300)
plt.show()

In [None]:
# Use yellowbrick for elbow visualization
# yellowbrick treats a 2-tuple as a half-open range, so (2, 11) scans k = 2..10,
# the same candidates that find_optimal_k(features_df, max_clusters=10) scores.
# n_init=10 matches the K-Means settings used for the other fits in this notebook.
plt.figure(figsize=(10, 6))
visualizer = KElbowVisualizer(KMeans(random_state=42, n_init=10), k=(2, 11), timings=False)
visualizer.fit(features_df)
visualizer.finalize()
plt.savefig('./output/elbow_visualizer.png', dpi=300)
plt.show()

In [None]:
# Use yellowbrick for silhouette visualization for k=3 (based on domain knowledge)
# The estimator uses the same cluster count, seed and n_init as the K-Means model
# fitted later in section 3, so this plot describes that same configuration.
plt.figure(figsize=(10, 6))
model = KMeans(n_clusters=3, random_state=42, n_init=10)
visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
visualizer.fit(features_df)
visualizer.finalize()
plt.title('Silhouette Plot for KMeans with 3 clusters')
plt.savefig('./output/silhouette_k3.png', dpi=300)
plt.show()

In [None]:
# Report which k each validity index favours
k_values = np.asarray(results['k_values'])
silhouette_scores = np.asarray(results['silhouette'])
ch_scores = np.asarray(results['calinski_harabasz'])
db_scores = np.asarray(results['davies_bouldin'])

# Silhouette and Calinski-Harabasz are maximised, Davies-Bouldin is minimised
optimal_k_lines = [
    f"Optimal k based on Silhouette Score: {k_values[np.argmax(silhouette_scores)]} (score: {np.max(silhouette_scores):.4f})",
    f"Optimal k based on Calinski-Harabasz Index: {k_values[np.argmax(ch_scores)]} (score: {np.max(ch_scores):.4f})",
    f"Optimal k based on Davies-Bouldin Index: {k_values[np.argmin(db_scores)]} (score: {np.min(db_scores):.4f})",
]
print("\n".join(optimal_k_lines))
print("\nNote: Based on domain knowledge, we expect 3 customer segments (Bargain Hunters, High Spenders, Window Shoppers)")

## 3. Apply Different Clustering Algorithms

In [None]:
# The segment count is fixed at 3 from domain knowledge (three expected customer segments)
n_clusters = 3

# Per-algorithm results, keyed by algorithm name and filled in by the cells below
clustering_results = dict()

In [None]:
# 1. K-Means clustering
print("\nApplying K-Means clustering...")
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(features_df)
kmeans_labels, kmeans_centroids = kmeans.labels_, kmeans.cluster_centers_

# Validity indices computed on the fitted partition
kmeans_silhouette = silhouette_score(features_df, kmeans_labels)
kmeans_calinski = calinski_harabasz_score(features_df, kmeans_labels)
kmeans_davies = davies_bouldin_score(features_df, kmeans_labels)

print(f"K-Means Silhouette Score: {kmeans_silhouette:.4f}")
print(f"K-Means Calinski-Harabasz Index: {kmeans_calinski:.4f}")
print(f"K-Means Davies-Bouldin Index: {kmeans_davies:.4f}")

# Keep the model, its assignments and its scores for the comparison section
clustering_results['kmeans'] = dict(
    model=kmeans,
    labels=kmeans_labels,
    centroids=kmeans_centroids,
    silhouette=kmeans_silhouette,
    calinski_harabasz=kmeans_calinski,
    davies_bouldin=kmeans_davies,
)

In [None]:
# 2. Gaussian Mixture Model (GMM)
print("\nApplying Gaussian Mixture Model...")
gmm = GaussianMixture(n_components=n_clusters, random_state=42, n_init=10)
gmm.fit(features_df)

# Component assignments for each row; the component means are stored as centroids
gmm_labels = gmm.predict(features_df)
gmm_centroids = gmm.means_

# Validity indices computed on those assignments
gmm_silhouette = silhouette_score(features_df, gmm_labels)
gmm_calinski = calinski_harabasz_score(features_df, gmm_labels)
gmm_davies = davies_bouldin_score(features_df, gmm_labels)

print(f"GMM Silhouette Score: {gmm_silhouette:.4f}")
print(f"GMM Calinski-Harabasz Index: {gmm_calinski:.4f}")
print(f"GMM Davies-Bouldin Index: {gmm_davies:.4f}")

# Keep the model, its assignments and its scores for the comparison section
clustering_results['gmm'] = dict(
    model=gmm,
    labels=gmm_labels,
    centroids=gmm_centroids,
    silhouette=gmm_silhouette,
    calinski_harabasz=gmm_calinski,
    davies_bouldin=gmm_davies,
)

In [None]:
# 3. Hierarchical Clustering
print("\nApplying Hierarchical Clustering...")
hierarchical = AgglomerativeClustering(n_clusters=n_clusters)
hierarchical_labels = hierarchical.fit_predict(features_df)

# Validity indices computed on the resulting partition
hierarchical_silhouette = silhouette_score(features_df, hierarchical_labels)
hierarchical_calinski = calinski_harabasz_score(features_df, hierarchical_labels)
hierarchical_davies = davies_bouldin_score(features_df, hierarchical_labels)

print(f"Hierarchical Silhouette Score: {hierarchical_silhouette:.4f}")
print(f"Hierarchical Calinski-Harabasz Index: {hierarchical_calinski:.4f}")
print(f"Hierarchical Davies-Bouldin Index: {hierarchical_davies:.4f}")

# No centroids entry here, unlike the K-Means and GMM results
clustering_results['hierarchical'] = dict(
    model=hierarchical,
    labels=hierarchical_labels,
    silhouette=hierarchical_silhouette,
    calinski_harabasz=hierarchical_calinski,
    davies_bouldin=hierarchical_davies,
)

In [None]:
# 4. DBSCAN
print("\nApplying DBSCAN...")
# DBSCAN requires careful parameter tuning
# We'll use a simple heuristic for eps (the maximum distance between samples)

# Determine eps value using k-distance graph (k=min_samples-1)
k = 5  # min_samples will be k+1 = 6
neigh = NearestNeighbors(n_neighbors=k)
neigh.fit(features_df)
# Query the fitted points by passing no argument: scikit-learn then leaves each point
# out of its own neighbour list. Passing features_df explicitly would return the point
# itself at distance 0 as the first neighbour, so the last column would be the
# (k-1)-th neighbour distance and would not match the axis label below.
distances, indices = neigh.kneighbors()
distances = np.sort(distances[:, -1])

# Plot k-distance graph
plt.figure(figsize=(10, 6))
plt.plot(distances)
plt.xlabel('Data Points (sorted by distance)')
plt.ylabel(f'{k}-th Nearest Neighbor Distance')
plt.title('K-Distance Graph for DBSCAN Parameter Selection')
plt.grid(True)
plt.savefig('./output/dbscan_kdistance.png', dpi=300)
plt.show()

# Choose eps where the curve shows an "elbow"
# This is a visual inspection, but you can also automate this
eps = 0.5  # This value should be adjusted based on the k-distance graph
min_samples = k + 1

print(f"Using eps={eps} and min_samples={min_samples} for DBSCAN")

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan_labels = dbscan.fit_predict(features_df)

# Count the number of clusters and noise points (DBSCAN labels noise as -1)
n_clusters_dbscan = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
n_noise = list(dbscan_labels).count(-1)
print(f"DBSCAN found {n_clusters_dbscan} clusters and {n_noise} noise points")

# The metrics need at least two clusters; they are computed on non-noise points only.
# When there is no noise the mask keeps every row, so nothing is excluded.
dbscan_silhouette = dbscan_calinski = dbscan_davies = float('nan')
mask = dbscan_labels != -1
if n_clusters_dbscan > 1 and mask.sum() > 1:
    clustered_features = features_df[mask]
    clustered_labels = dbscan_labels[mask]
    dbscan_silhouette = silhouette_score(clustered_features, clustered_labels)
    dbscan_calinski = calinski_harabasz_score(clustered_features, clustered_labels)
    dbscan_davies = davies_bouldin_score(clustered_features, clustered_labels)

print(f"DBSCAN Silhouette Score: {dbscan_silhouette:.4f}")
print(f"DBSCAN Calinski-Harabasz Index: {dbscan_calinski:.4f}")
print(f"DBSCAN Davies-Bouldin Index: {dbscan_davies:.4f}")

# Store the results
clustering_results['dbscan'] = {
    'model': dbscan,
    'labels': dbscan_labels,
    'n_clusters': n_clusters_dbscan,
    'n_noise': n_noise,
    'silhouette': dbscan_silhouette,
    'calinski_harabasz': dbscan_calinski,
    'davies_bouldin': dbscan_davies
}

## 4. Compare Clustering Results

In [None]:
# Tabulate the three validity indices for every algorithm.
# Display name -> key in clustering_results
algorithm_keys = {
    'K-Means': 'kmeans',
    'GMM': 'gmm',
    'Hierarchical': 'hierarchical',
    'DBSCAN': 'dbscan',
}
# Column title -> key inside each algorithm's result entry
metric_columns = {
    'Silhouette Score': 'silhouette',
    'Calinski-Harabasz Index': 'calinski_harabasz',
    'Davies-Bouldin Index': 'davies_bouldin',
}

comparison_columns = {'Algorithm': list(algorithm_keys)}
for column_title, result_key in metric_columns.items():
    comparison_columns[column_title] = [
        clustering_results[algorithm_key][result_key]
        for algorithm_key in algorithm_keys.values()
    ]

metrics_comparison = pd.DataFrame(comparison_columns)

metrics_comparison

In [None]:
# One bar chart per metric, stacked vertically: (column in metrics_comparison, panel title)
comparison_panels = [
    ('Silhouette Score', 'Silhouette Score Comparison (Higher is Better)'),
    ('Calinski-Harabasz Index', 'Calinski-Harabasz Index Comparison (Higher is Better)'),
    ('Davies-Bouldin Index', 'Davies-Bouldin Index Comparison (Lower is Better)'),
]

plt.figure(figsize=(15, 10))
for panel_row, (metric_column, panel_title) in enumerate(comparison_panels, start=1):
    plt.subplot(3, 1, panel_row)
    sns.barplot(x='Algorithm', y=metric_column, data=metrics_comparison)
    plt.title(panel_title)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig('./output/clustering_comparison.png', dpi=300)
plt.show()

In [None]:
# Rank all four algorithms by Silhouette Score
candidate_algorithms = ['kmeans', 'gmm', 'hierarchical', 'dbscan']
silhouette_scores = [clustering_results[alg]['silhouette'] for alg in candidate_algorithms]

# nanargmax skips NaN entries, which DBSCAN produces when its metrics are undefined
best_algorithm_idx = np.nanargmax(silhouette_scores)
best_algorithm = candidate_algorithms[best_algorithm_idx]

print(f"Best clustering algorithm based on Silhouette Score: {best_algorithm}")
print(f"Silhouette Score: {clustering_results[best_algorithm]['silhouette']:.4f}")

In [None]:
# Three segments are expected, so the final model is chosen only among the algorithms
# that were run with n_clusters=3 (K-Means, GMM, Hierarchical), by highest Silhouette Score
three_cluster_algorithms = ['kmeans', 'gmm', 'hierarchical']
silhouette_3_clusters = [clustering_results[alg]['silhouette'] for alg in three_cluster_algorithms]
best_3_cluster_idx = np.argmax(silhouette_3_clusters)
best_3_cluster_algorithm = three_cluster_algorithms[best_3_cluster_idx]

print(f"Best algorithm for 3 clusters: {best_3_cluster_algorithm}")
print(f"Silhouette Score: {clustering_results[best_3_cluster_algorithm]['silhouette']:.4f}")

# The winner's model and labels are what the remaining sections work with
best_3_cluster_result = clustering_results[best_3_cluster_algorithm]
final_model, final_labels = best_3_cluster_result['model'], best_3_cluster_result['labels']

## 5. Visualize the Identified Clusters

Next, we visualize the clusters identified by the candidate models by projecting the features onto their first principal components.

In [None]:
# Function to visualize clusters in 2D using PCA
def visualize_clusters_2d(data, labels, title="Cluster Visualization", save_path=None):
    """Plot cluster assignments on the first two principal components.

    A static seaborn scatter plot and an interactive plotly scatter plot are
    both shown.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix that was clustered.
    labels : array-like of length n_samples
        Cluster label of every row in ``data``.
    title : str
        Title used for both plots.
    save_path : str or None
        Where to save the static image. The interactive plot is written next
        to it with the file extension replaced by ``.html``. Nothing is saved
        when this is None or empty.

    Returns
    -------
    tuple
        ``(pca_df, pca)``: a DataFrame with columns ``PC1``, ``PC2`` and
        ``Cluster``, and the fitted PCA object.
    """
    # Seeded so the projection is repeatable if the randomized solver is selected
    pca = PCA(n_components=2, random_state=42)
    principal_components = pca.fit_transform(data)

    # Create a dataframe with principal components and cluster labels
    pca_df = pd.DataFrame(
        data=principal_components,
        columns=['PC1', 'PC2']
    )
    pca_df['Cluster'] = labels

    # Create scatter plot
    plt.figure(figsize=(12, 8))
    sns.scatterplot(x='PC1', y='PC2', hue='Cluster', data=pca_df, palette='viridis', s=80, alpha=0.8)
    plt.title(title, fontsize=16)
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)', fontsize=12)
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)', fontsize=12)
    plt.grid(linestyle='--', alpha=0.5)
    plt.legend(title='Cluster', fontsize=12)
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300)

    plt.show()

    # Create interactive scatter plot using plotly
    fig = px.scatter(
        pca_df,
        x='PC1',
        y='PC2',
        color='Cluster',
        title=title,
        labels={
            'PC1': f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)',
            'PC2': f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)'
        }
    )
    fig.update_traces(marker=dict(size=10))

    if save_path:
        # Swap only the extension. A plain str.replace('.png', '.html') leaves the path
        # unchanged when it has no '.png', and the HTML would overwrite the saved image.
        html_path = os.path.splitext(save_path)[0] + '.html'
        fig.write_html(html_path)

    fig.show()

    return pca_df, pca

In [None]:
# Function to visualize clusters in 3D using PCA
def visualize_clusters_3d(data, labels, title="3D Cluster Visualization", save_path=None):
    """Show cluster assignments on the first three principal components.

    Only an interactive plotly figure is produced; no static image is written.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix that was clustered.
    labels : array-like of length n_samples
        Cluster label of every row in ``data``.
    title : str
        Figure title.
    save_path : str or None
        Used only to derive the output file name: its extension is replaced
        by ``.html`` and the figure is written there. Nothing is saved when
        this is None or empty.

    Returns
    -------
    tuple
        ``(pca_df, pca)``: a DataFrame with columns ``PC1``, ``PC2``, ``PC3``
        and ``Cluster``, and the fitted PCA object.
    """
    # Seeded so the projection is repeatable if the randomized solver is selected
    pca = PCA(n_components=3, random_state=42)
    principal_components = pca.fit_transform(data)

    # Create a dataframe with principal components and cluster labels
    pca_df = pd.DataFrame(
        data=principal_components,
        columns=['PC1', 'PC2', 'PC3']
    )
    pca_df['Cluster'] = labels

    # Create interactive 3D scatter plot
    fig = px.scatter_3d(
        pca_df,
        x='PC1',
        y='PC2',
        z='PC3',
        color='Cluster',
        title=title,
        labels={
            'PC1': f'PC1 ({pca.explained_variance_ratio_[0]:.2%})',
            'PC2': f'PC2 ({pca.explained_variance_ratio_[1]:.2%})',
            'PC3': f'PC3 ({pca.explained_variance_ratio_[2]:.2%})'
        }
    )
    fig.update_traces(marker=dict(size=5))

    if save_path:
        # Swap only the extension so any save_path yields a distinct '.html' file
        html_path = os.path.splitext(save_path)[0] + '.html'
        fig.write_html(html_path)

    fig.show()

    return pca_df, pca

In [None]:
# Draw one 2D PCA scatter per algorithm that was run with a fixed cluster count
for alg in ('kmeans', 'gmm', 'hierarchical'):
    labels = clustering_results[alg]['labels']
    title, save_path = f"{alg.capitalize()} Clustering (k=3)", f"./output/{alg}_clusters.png"
    pca_df, pca = visualize_clusters_2d(
        features_df,
        labels,
        title=title,
        save_path=save_path,
    )

In [None]:
# 3D PCA view of the algorithm selected for the final model
best_labels = clustering_results[best_3_cluster_algorithm]['labels']
title, save_path = (
    f"{best_3_cluster_algorithm.capitalize()} Clustering 3D (k=3)",
    f"./output/{best_3_cluster_algorithm}_clusters_3d.png",
)
pca_3d_df, pca_3d = visualize_clusters_3d(
    features_df,
    best_labels,
    title=title,
    save_path=save_path,
)

## 6. Examine Cluster Characteristics

In [None]:
# Add cluster labels to the original unscaled dataframe for interpretation
# NOTE(review): this assumes df_unscaled holds the same rows, in the same order, as the
# preprocessed data the labels were computed from — confirm against the preprocessing step.
df_with_clusters = df_unscaled.copy()
df_with_clusters['Cluster'] = final_labels

# Get the distribution of clusters
cluster_counts = df_with_clusters['Cluster'].value_counts()
cluster_percentages = cluster_counts / len(df_with_clusters) * 100

# Print cluster distribution
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
print("Cluster Distribution:")
for cluster, count in cluster_counts.items():
    print(f"Cluster {cluster}: {count} customers ({cluster_percentages[cluster]:.2f}%)")

# Plot cluster distribution
plt.figure(figsize=(10, 6))
ax = cluster_counts.plot(kind='bar', color='skyblue')
plt.title('Cluster Size Distribution', fontsize=14)
plt.xlabel('Cluster', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Add data labels on top of each bar
for i, v in enumerate(cluster_counts):
    ax.text(i, v+5, str(v), ha='center', fontsize=10)

plt.tight_layout()
plt.savefig('./output/cluster_distribution.png', dpi=300)
plt.show()

In [None]:
# Calculate cluster profiles (mean of each feature for each cluster)
# customer_id is an identifier, so averaging it is meaningless; it is dropped first.
# numeric_only=True skips any non-numeric column, which would otherwise raise in pandas 2.x.
profile_source = df_with_clusters.drop(columns=['customer_id'], errors='ignore')
cluster_profiles = profile_source.groupby('Cluster').mean(numeric_only=True)

# Display cluster profiles
print("Cluster Profiles (Mean Values):")
display(cluster_profiles)

# Visualize cluster profiles with a heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(cluster_profiles, annot=True, cmap='viridis', fmt='.2f')
plt.title('Cluster Profiles (Mean Values)', fontsize=16)
plt.tight_layout()
plt.savefig('./output/cluster_profiles_heatmap.png', dpi=300)
plt.show()

In [None]:
# Create box plots for each feature by cluster
# Every column except the identifier and the cluster label is treated as a feature
feature_cols = [col for col in df_with_clusters.columns if col not in ('customer_id', 'Cluster')]
num_features = len(feature_cols)

# One full-width panel per feature, 4 inches of height each
plt.figure(figsize=(15, num_features * 4))

for i, feature in enumerate(feature_cols):
    plt.subplot(num_features, 1, i+1)
    sns.boxplot(x='Cluster', y=feature, data=df_with_clusters, palette='viridis')
    plt.title(f'Distribution of {feature} by Cluster', fontsize=14)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

# Lay the figure out once, after all panels exist, instead of on every iteration
plt.tight_layout()
plt.savefig('./output/cluster_feature_distributions.png', dpi=300)
plt.show()

In [None]:
# Create radar chart to visualize cluster profiles
# An identifier column, if present in the profiles, is kept off the chart
radar_profiles = cluster_profiles.drop(columns=['customer_id'], errors='ignore')

# Normalize each feature to [0, 1] across clusters for better visualization
scaler = MinMaxScaler()
cluster_profiles_scaled = pd.DataFrame(
    scaler.fit_transform(radar_profiles),
    index=radar_profiles.index,
    columns=radar_profiles.columns
)

# Take the axis names from the frame being plotted so labels and values always line up
categories = cluster_profiles_scaled.columns.tolist()
fig = go.Figure()

for i, cluster in enumerate(cluster_profiles_scaled.index):
    values = cluster_profiles_scaled.loc[cluster].values.tolist()
    values.append(values[0])  # Close the loop

    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=categories + [categories[0]],  # Close the loop
        fill='toself',
        name=f'Cluster {cluster}'
    ))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 1]
        )
    ),
    title='Cluster Profiles (Normalized)',
    showlegend=True
)

fig.write_html('./output/cluster_profiles_radar.html')
fig.show()

## 7. Save Clustering Results for Further Analysis

In [None]:
# Customer-to-cluster lookup, written only when the data carries a customer_id column
if 'customer_id' in df_unscaled.columns:
    customer_df = df_with_clusters.loc[:, ['customer_id', 'Cluster']]
    customer_df.to_csv('./output/customer_clusters.csv', index=False)
    print(f"Saved customer cluster assignments to './output/customer_clusters.csv'")

# Full unscaled dataset plus the Cluster column
df_with_clusters.to_csv('./output/customer_data_with_clusters.csv', index=False)
print(f"Saved full dataset with cluster labels to './output/customer_data_with_clusters.csv'")

# Per-cluster mean profiles; the index (cluster label) is written as the first column
cluster_profiles.to_csv('./output/cluster_profiles.csv')
print(f"Saved cluster profiles to './output/cluster_profiles.csv'")

## 8. Initial Segment Interpretation

Based on the cluster profiles and our domain knowledge of the expected customer segments, we can begin to interpret what each cluster represents.

In [None]:
# Calculate relative feature importance for each cluster (compared to overall mean)
# Only features that have a cluster profile are compared, so identifier or non-numeric
# columns never reach the table or the heatmap.
profiled_features = [col for col in feature_cols if col in cluster_profiles.columns]
overall_mean = df_unscaled[profiled_features].mean()

# A feature whose overall mean is 0 has no defined relative difference: use NaN, not +/-inf
safe_mean = overall_mean.replace(0, np.nan)
relative_importance = (cluster_profiles[profiled_features] - overall_mean) / safe_mean

# Display relative importance as percentages
# Series.map per column avoids DataFrame.applymap, which is deprecated in recent pandas
print("Relative Feature Importance (% difference from overall mean):")
relative_importance_pct = relative_importance.apply(
    lambda column: column.map(lambda x: f"{x*100:.1f}%")
)
display(relative_importance_pct)

# Visualize relative importance
plt.figure(figsize=(12, 8))
sns.heatmap(relative_importance, annot=True, cmap='RdYlGn', center=0, fmt='.2f')
plt.title('Relative Feature Importance by Cluster', fontsize=16)
plt.tight_layout()
plt.savefig('./output/relative_feature_importance.png', dpi=300)
plt.show()

In [None]:
# Based on the cluster profiles, match clusters to the expected segments
# Expected segments: Bargain Hunters, High Spenders, Window Shoppers
#
# This is a simplistic rule-based example: each segment has a list of yes/no criteria on
# the relative feature differences, and a cluster gets the segment whose criteria it meets
# most often. Adapt the rules to the actual cluster profiles.
# NOTE(review): Window Shoppers has four criteria while the other segments have three,
# so raw counts can favour it — confirm this weighting is intended.

# Create a dictionary to store interpretations
segment_interpretations = {}

# Either 'discount_counts' or 'discount_count' is accepted as the discount column
discount_col = 'discount_counts' if 'discount_counts' in relative_importance.columns else 'discount_count'

# Fail early with one clear message instead of a bare KeyError in the middle of the loop
required_columns = ['total_purchases', 'avg_cart_value', 'total_time_spent', 'product_click', discount_col]
missing_columns = [col for col in required_columns if col not in relative_importance.columns]
if missing_columns:
    raise KeyError(f"Segment interpretation needs these columns in relative_importance: {missing_columns}")


def _describe_level(relative_value):
    """Label a relative difference as 'High' (> 0.2), 'Low' (< -0.2) or 'Moderate'."""
    if relative_value > 0.2:
        return 'High'
    if relative_value < -0.2:
        return 'Low'
    return 'Moderate'


# For each cluster, determine the matching segment based on feature values
for cluster_id in cluster_profiles.index:
    rel_profile = relative_importance.loc[cluster_id]

    segment_rules = {
        # High total_purchases, low avg_cart_value, high discount usage
        'Bargain Hunters': [
            rel_profile['total_purchases'] > 0,
            rel_profile['avg_cart_value'] < 0,
            rel_profile[discount_col] > 0,
        ],
        # Moderate total_purchases (within 20% of average), high avg_cart_value, low discount usage
        'High Spenders': [
            abs(rel_profile['total_purchases']) < 0.2,
            rel_profile['avg_cart_value'] > 0,
            rel_profile[discount_col] < 0,
        ],
        # Low total_purchases, high total_time_spent, high product_click, low discount usage
        'Window Shoppers': [
            rel_profile['total_purchases'] < 0,
            rel_profile['total_time_spent'] > 0,
            rel_profile['product_click'] > 0,
            rel_profile[discount_col] < 0,
        ],
    }

    # Score = number of criteria met; ties go to the segment listed first
    scores = {segment: int(sum(checks)) for segment, checks in segment_rules.items()}
    best_match = max(scores, key=scores.get)

    segment_interpretations[cluster_id] = {
        'segment': best_match,
        'scores': scores,
        'key_characteristics': {
            'total_purchases': _describe_level(rel_profile['total_purchases']),
            'avg_cart_value': _describe_level(rel_profile['avg_cart_value']),
            'total_time_spent': _describe_level(rel_profile['total_time_spent']),
            'product_click': _describe_level(rel_profile['product_click']),
            'discount_usage': _describe_level(rel_profile[discount_col])
        }
    }

# Print interpretations
for cluster_id, interpretation in segment_interpretations.items():
    print(f"\nCluster {cluster_id} → {interpretation['segment']}")
    print("Segment match scores:", interpretation['scores'])
    print("Key characteristics:")
    for characteristic, level in interpretation['key_characteristics'].items():
        print(f"  - {characteristic}: {level}")

# Each cluster is matched independently, so two clusters can end up with the same segment
assigned_segments = [interpretation['segment'] for interpretation in segment_interpretations.values()]
duplicated_segments = sorted({segment for segment in assigned_segments if assigned_segments.count(segment) > 1})
if duplicated_segments:
    print(f"\nWarning: more than one cluster was matched to {duplicated_segments}; "
          "review the cluster profiles before relying on these segment labels.")

In [None]:
# Cluster id -> segment name, taken from the interpretations built above
segment_mapping = {
    cluster_id: interpretation['segment']
    for cluster_id, interpretation in segment_interpretations.items()
}
segment_df = pd.DataFrame({
    'Cluster': list(segment_mapping),
    'Segment': list(segment_mapping.values())
})

# Attach the segment name to every customer row without touching df_with_clusters
df_with_segments = df_with_clusters.assign(
    Segment=df_with_clusters['Cluster'].map(segment_mapping)
)

# Persist the lookup table first, then the labelled customer data
segment_df.to_csv('./output/segment_mapping.csv', index=False)
print(f"Saved segment mapping to './output/segment_mapping.csv'")

df_with_segments.to_csv('./output/customer_data_with_segments.csv', index=False)
print(f"Saved customer data with segments to './output/customer_data_with_segments.csv'")

## Summary

In this notebook, we have:

1. Evaluated candidate numbers of clusters using multiple metrics (the final models use k=3, based on domain knowledge)
2. Applied different clustering algorithms (K-Means, GMM, Hierarchical, DBSCAN)
3. Compared the performance of these algorithms
4. Selected the best model based on evaluation metrics
5. Visualized the identified clusters
6. Examined cluster characteristics
7. Interpreted the clusters as customer segments

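With the number of clusters fixed at three, each cluster is matched by the rule-based heuristic above to one of the segments we expect from domain knowledge (check the printed mapping to confirm that all three segments are actually represented):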

1. **Bargain Hunters**: Customers who make frequent purchases of low-value items and rely heavily on discounts
2. **High Spenders**: Customers who make moderate purchases of high-value items without relying on discounts
3. **Window Shoppers**: Customers who spend significant time browsing many products but rarely make purchases

These segments provide valuable insights for targeted marketing strategies. In the next notebook, we'll perform a more in-depth analysis of these segments.