## 5. Visualize the Identified Clusters

Continuing from the previous notebook, we'll visualize the clusters found by each algorithm in 2D, and then the best model's clusters in 3D.

In [None]:
# Function to visualize clusters in 2D using PCA
def visualize_clusters_2d(data, labels, title="Cluster Visualization", save_path=None):
    """Project ``data`` onto two principal components and plot it coloured by cluster.

    Shows a static seaborn scatter plot followed by an interactive plotly
    scatter plot of the same projection.

    Parameters
    ----------
    data : array-like
        Feature matrix handed to ``PCA.fit_transform``.
    labels : array-like
        One cluster label per row of ``data``. Labels are matched to rows by
        position, so a pandas Series with a non-default index is safe to pass.
    title : str, optional
        Title used for both figures.
    save_path : str or path-like, optional
        Destination of the static figure. The interactive figure is written
        next to it with the file extension swapped for ``.html``. Nothing is
        saved when this is ``None`` or empty.

    Returns
    -------
    tuple
        ``(pca_df, pca)``: a DataFrame with columns ``PC1``, ``PC2`` and
        ``Cluster``, and the fitted PCA object.
    """
    import os  # local import: the notebook's import cell is not part of this section

    # Apply PCA for visualization
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(data)

    # Create a dataframe with principal components and cluster labels
    pca_df = pd.DataFrame(
        data=principal_components,
        columns=['PC1', 'PC2']
    )
    # Assign by position: a Series would otherwise be aligned on its index and
    # could silently yield NaN / misassigned clusters.
    pca_df['Cluster'] = pd.Series(labels).to_numpy()

    pc1_label = f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)'
    pc2_label = f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)'

    # Create scatter plot
    plt.figure(figsize=(12, 8))
    sns.scatterplot(x='PC1', y='PC2', hue='Cluster', data=pca_df, palette='viridis', s=80, alpha=0.8)
    plt.title(title, fontsize=16)
    plt.xlabel(pc1_label, fontsize=12)
    plt.ylabel(pc2_label, fontsize=12)
    plt.grid(linestyle='--', alpha=0.5)
    plt.legend(title='Cluster', fontsize=12)
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300)

    plt.show()

    # Create interactive scatter plot using plotly.
    # Plotly treats numeric colour values as continuous (colour bar); cluster ids
    # are categories, so plot a copy with string labels and a sorted legend.
    # The returned pca_df keeps the original label dtype.
    plot_df = pca_df.assign(Cluster=pca_df['Cluster'].astype(str))
    cluster_order = pca_df['Cluster'].drop_duplicates().sort_values().astype(str).tolist()
    fig = px.scatter(
        plot_df,
        x='PC1',
        y='PC2',
        color='Cluster',
        category_orders={'Cluster': cluster_order},
        title=title,
        labels={'PC1': pc1_label, 'PC2': pc2_label}
    )
    fig.update_traces(marker=dict(size=10))

    if save_path:
        # Swap only the real extension. A plain str.replace('.png', '.html') leaves
        # non-.png paths unchanged (the HTML would overwrite the image) and also
        # rewrites '.png' occurring elsewhere in the path.
        html_path = os.path.splitext(os.fspath(save_path))[0] + '.html'
        fig.write_html(html_path)

    fig.show()

    return pca_df, pca

In [None]:
# Function to visualize clusters in 3D using PCA
def visualize_clusters_3d(data, labels, title="3D Cluster Visualization", save_path=None):
    """Project ``data`` onto three principal components and plot it coloured by cluster.

    Shows an interactive plotly 3D scatter plot; no static image is produced.

    Parameters
    ----------
    data : array-like
        Feature matrix handed to ``PCA.fit_transform`` with ``n_components=3``.
    labels : array-like
        One cluster label per row of ``data``. Labels are matched to rows by
        position, so a pandas Series with a non-default index is safe to pass.
    title : str, optional
        Figure title.
    save_path : str or path-like, optional
        Base path for the output. The figure is written as HTML to this path
        with its file extension swapped for ``.html``. Nothing is saved when
        this is ``None`` or empty.

    Returns
    -------
    tuple
        ``(pca_df, pca)``: a DataFrame with columns ``PC1``, ``PC2``, ``PC3``
        and ``Cluster``, and the fitted PCA object.
    """
    import os  # local import: the notebook's import cell is not part of this section

    # Apply PCA for visualization
    pca = PCA(n_components=3)
    principal_components = pca.fit_transform(data)

    # Create a dataframe with principal components and cluster labels
    pca_df = pd.DataFrame(
        data=principal_components,
        columns=['PC1', 'PC2', 'PC3']
    )
    # Assign by position: a Series would otherwise be aligned on its index and
    # could silently yield NaN / misassigned clusters.
    pca_df['Cluster'] = pd.Series(labels).to_numpy()

    # Plotly treats numeric colour values as continuous (colour bar); cluster ids
    # are categories, so plot a copy with string labels and a sorted legend.
    # The returned pca_df keeps the original label dtype.
    plot_df = pca_df.assign(Cluster=pca_df['Cluster'].astype(str))
    cluster_order = pca_df['Cluster'].drop_duplicates().sort_values().astype(str).tolist()

    # Create interactive 3D scatter plot
    variance = pca.explained_variance_ratio_
    fig = px.scatter_3d(
        plot_df,
        x='PC1',
        y='PC2',
        z='PC3',
        color='Cluster',
        category_orders={'Cluster': cluster_order},
        title=title,
        labels={
            'PC1': f'PC1 ({variance[0]:.2%})',
            'PC2': f'PC2 ({variance[1]:.2%})',
            'PC3': f'PC3 ({variance[2]:.2%})'
        }
    )
    fig.update_traces(marker=dict(size=5))

    if save_path:
        # Swap only the real extension; str.replace('.png', '.html') would leave
        # non-.png paths unchanged and rewrite '.png' anywhere in the path.
        html_path = os.path.splitext(os.fspath(save_path))[0] + '.html'
        fig.write_html(html_path)

    fig.show()

    return pca_df, pca

In [None]:
# Plot every algorithm's cluster assignment on the first two principal components.
# Each pass saves the static figure under ./output (visualize_clusters_2d also
# writes the interactive HTML version next to it).
for alg in ('kmeans', 'gmm', 'hierarchical'):
    labels = clustering_results[alg]['labels']
    title = "{} Clustering (k=3)".format(alg.capitalize())
    save_path = "./output/" + alg + "_clusters.png"
    pca_df, pca = visualize_clusters_2d(
        features_df,
        labels,
        title=title,
        save_path=save_path,
    )

In [None]:
# Render the selected algorithm's clusters on the first three principal components
best_labels = clustering_results[best_3_cluster_algorithm]['labels']
title = "{} Clustering 3D (k=3)".format(best_3_cluster_algorithm.capitalize())
save_path = "./output/{}_clusters_3d.png".format(best_3_cluster_algorithm)
pca_3d_df, pca_3d = visualize_clusters_3d(
    features_df,
    best_labels,
    title=title,
    save_path=save_path,
)

## 6. Examine Cluster Characteristics

In [None]:
# Add cluster labels to the original unscaled dataframe for interpretation
df_with_clusters = df_unscaled.copy()
# NOTE(review): `final_labels` is defined outside this section; presumably it holds
# the selected model's labels in the same row order as `df_unscaled` — confirm.
df_with_clusters['Cluster'] = final_labels

# Get the distribution of clusters (value_counts sorts by size, largest first)
cluster_counts = df_with_clusters['Cluster'].value_counts()
cluster_percentages = cluster_counts / len(df_with_clusters) * 100

# Print cluster distribution
print("Cluster Distribution:")
# Series.iteritems() was removed in pandas 2.0; items() works on every version.
for cluster, count in cluster_counts.items():
    # .loc makes the lookup explicitly label-based (cluster ids are integers)
    print(f"Cluster {cluster}: {count} customers ({cluster_percentages.loc[cluster]:.2f}%)")

# Plot cluster distribution
plt.figure(figsize=(10, 6))
ax = cluster_counts.plot(kind='bar', color='skyblue')
plt.title('Cluster Size Distribution', fontsize=14)
plt.xlabel('Cluster', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Add data labels on top of each bar (bars are positioned at 0, 1, 2, ...)
for i, v in enumerate(cluster_counts):
    ax.text(i, v+5, str(v), ha='center', fontsize=10)

plt.tight_layout()
plt.savefig('./output/cluster_distribution.png', dpi=300)
plt.show()

In [None]:
# Calculate cluster profiles (mean of each feature for each cluster).
# customer_id is an identifier, not a feature: averaging it is meaningless and
# would add a bogus column to the heatmap, the saved CSV and the charts built
# from these profiles. numeric_only=True keeps the aggregation from raising on
# non-numeric columns (pandas >= 2.0 no longer drops them silently).
cluster_profiles = (
    df_with_clusters
    .drop(columns=['customer_id'], errors='ignore')
    .groupby('Cluster')
    .mean(numeric_only=True)
)

# Display cluster profiles
print("Cluster Profiles (Mean Values):")
display(cluster_profiles)

# Visualize cluster profiles with a heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(cluster_profiles, annot=True, cmap='viridis', fmt='.2f')
plt.title('Cluster Profiles (Mean Values)', fontsize=16)
plt.tight_layout()
plt.savefig('./output/cluster_profiles_heatmap.png', dpi=300)
plt.show()

In [None]:
# Create box plots for each feature by cluster
# Everything except the identifier and the cluster label is treated as a feature.
feature_cols = [col for col in df_with_clusters.columns if col not in ('customer_id', 'Cluster')]
num_features = len(feature_cols)

# One stacked panel per feature; squeeze=False keeps `axes` 2-D even for a single feature.
fig, axes = plt.subplots(num_features, 1, figsize=(15, num_features * 4), squeeze=False)

for ax, feature in zip(axes.ravel(), feature_cols):
    sns.boxplot(x='Cluster', y=feature, data=df_with_clusters, palette='viridis', ax=ax)
    ax.set_title(f'Distribution of {feature} by Cluster', fontsize=14)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

# Lay out once after all panels exist instead of re-running it on every iteration.
fig.tight_layout()

fig.savefig('./output/cluster_feature_distributions.png', dpi=300)
plt.show()

In [None]:
# Create radar chart to visualize cluster profiles
# First, normalize the values for better visualization
from sklearn.preprocessing import MinMaxScaler

# Radar axes and plotted values must come from the same column list. Using
# feature_cols for the axes but every column of cluster_profiles for the values
# misaligns the chart whenever the two differ (e.g. a customer_id column in the
# profiles), so restrict both to the features present in the profiles.
categories = [col for col in feature_cols if col in cluster_profiles.columns]
radar_profiles = cluster_profiles[categories]

# NOTE(review): this rebinds the notebook-level name `scaler`; if an earlier cell
# keeps the feature scaler under the same name it is overwritten here — confirm.
scaler = MinMaxScaler()
# Each feature is rescaled to [0, 1] across clusters (min cluster -> 0, max -> 1).
cluster_profiles_scaled = pd.DataFrame(
    scaler.fit_transform(radar_profiles),
    index=radar_profiles.index,
    columns=radar_profiles.columns
)

# Create radar chart
fig = go.Figure()

for cluster in cluster_profiles_scaled.index:
    values = cluster_profiles_scaled.loc[cluster].tolist()

    fig.add_trace(go.Scatterpolar(
        r=values + values[:1],              # repeat the first point to close the loop
        theta=categories + categories[:1],  # repeat the first axis to close the loop
        fill='toself',
        name=f'Cluster {cluster}'
    ))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 1]
        )
    ),
    title='Cluster Profiles (Normalized)',
    showlegend=True
)

fig.write_html('./output/cluster_profiles_radar.html')
fig.show()

## 7. Save Clustering Results for Further Analysis

In [None]:
# Persist the clustering outputs as CSV files under ./output

# 1) customer_id -> cluster lookup, written only when the dataset has an id column
if 'customer_id' in df_unscaled:
    customer_df = df_with_clusters.loc[:, ['customer_id', 'Cluster']]
    customer_df.to_csv(
        './output/customer_clusters.csv',
        index=False,
    )
    print("Saved customer cluster assignments to './output/customer_clusters.csv'")

# 2) every original column plus the cluster label
df_with_clusters.to_csv(
    './output/customer_data_with_clusters.csv',
    index=False,
)
print("Saved full dataset with cluster labels to './output/customer_data_with_clusters.csv'")

# 3) per-cluster feature means (the cluster id is kept as the index column)
cluster_profiles.to_csv('./output/cluster_profiles.csv')
print("Saved cluster profiles to './output/cluster_profiles.csv'")

## 8. Initial Segment Interpretation

Based on the cluster profiles and our domain knowledge of the expected customer segments, we can begin to interpret what each cluster represents.

In [None]:
# Calculate relative feature importance for each cluster (compared to overall mean):
#     (cluster mean - overall mean) / overall mean
# Only feature columns that exist in the profiles are used, so a non-feature
# column in cluster_profiles is not carried into the result as a raw value.
profile_features = [col for col in feature_cols if col in cluster_profiles.columns]
overall_mean = df_unscaled[profile_features].mean()

# A feature whose overall mean is 0 has no meaningful relative difference;
# mask it to NaN rather than dividing by zero and producing +/-inf.
nonzero_mean = overall_mean.where(overall_mean != 0)

# Vectorised over all features: the Series is aligned to the DataFrame columns.
relative_importance = (cluster_profiles[profile_features] - overall_mean) / nonzero_mean

# Display relative importance as percentages
print("Relative Feature Importance (% difference from overall mean):")
# Column-wise Series.map instead of DataFrame.applymap, which is deprecated
# since pandas 2.1; this form works on old and new pandas alike.
relative_importance_pct = relative_importance.apply(
    lambda column: column.map(lambda x: f"{x*100:.1f}%")
)
display(relative_importance_pct)

# Visualize relative importance
plt.figure(figsize=(12, 8))
sns.heatmap(relative_importance, annot=True, cmap='RdYlGn', center=0, fmt='.2f')
plt.title('Relative Feature Importance by Cluster', fontsize=16)
plt.tight_layout()
plt.savefig('./output/relative_feature_importance.png', dpi=300)
plt.show()

In [None]:
# Based on the cluster profiles, match clusters to the expected segments
# Expected segments: Bargain Hunters, High Spenders, Window Shoppers
#
# This is a simplistic rule-based example: each segment gets one point per
# matching characteristic and the cluster takes the segment with the most points.
# Adapt the rules to your actual cluster profiles.

# Relative deviation from the overall mean beyond which a feature is High / Low
LEVEL_THRESHOLD = 0.2


def _describe_level(rel_value, threshold=LEVEL_THRESHOLD):
    """Label a relative deviation as 'High', 'Low' or 'Moderate'."""
    if rel_value > threshold:
        return 'High'
    if rel_value < -threshold:
        return 'Low'
    return 'Moderate'


# Either spelling of the discount column is accepted; resolve it once, not per cluster.
discount_col = 'discount_counts' if 'discount_counts' in relative_importance.columns else 'discount_count'

# Create a dictionary to store interpretations
segment_interpretations = {}

# For each cluster, determine the matching segment based on feature values
for cluster_id in cluster_profiles.index:
    rel_profile = relative_importance.loc[cluster_id]

    purchases = rel_profile['total_purchases']
    cart_value = rel_profile['avg_cart_value']
    time_spent = rel_profile['total_time_spent']
    clicks = rel_profile['product_click']
    discounts = rel_profile[discount_col]

    scores = {
        # Bargain Hunters: high total_purchases, low avg_cart_value, high discount use
        'Bargain Hunters': int(purchases > 0) + int(cart_value < 0) + int(discounts > 0),
        # High Spenders: total_purchases close to average, high avg_cart_value,
        # low discount use
        'High Spenders': (
            int(abs(purchases) < LEVEL_THRESHOLD) + int(cart_value > 0) + int(discounts < 0)
        ),
        # Window Shoppers: low total_purchases, high total_time_spent,
        # high product_click, low discount use
        'Window Shoppers': (
            int(purchases < 0) + int(time_spent > 0) + int(clicks > 0) + int(discounts < 0)
        ),
    }

    # Determine the best match.
    # NOTE(review): Window Shoppers has four criteria and the others three, so raw
    # counts favour it; ties go to the first segment listed above; and nothing
    # stops two clusters from mapping to the same segment. Check the printed
    # scores before trusting the mapping.
    best_match = max(scores, key=scores.get)

    segment_interpretations[cluster_id] = {
        'segment': best_match,
        'scores': scores,
        'key_characteristics': {
            'total_purchases': _describe_level(purchases),
            'avg_cart_value': _describe_level(cart_value),
            'total_time_spent': _describe_level(time_spent),
            'product_click': _describe_level(clicks),
            'discount_usage': _describe_level(discounts)
        }
    }

# Print interpretations
for cluster_id, interpretation in segment_interpretations.items():
    # The arrow was mis-encoded ("â†’") in the original cell; restored to U+2192.
    print(f"\nCluster {cluster_id} → {interpretation['segment']}")
    print("Segment match scores:", interpretation['scores'])
    print("Key characteristics:")
    for characteristic, level in interpretation['key_characteristics'].items():
        print(f"  - {characteristic}: {level}")

In [None]:
# Reduce the interpretations to a cluster -> segment lookup and tabulate it
segment_mapping = {
    cluster_id: interp['segment']
    for cluster_id, interp in segment_interpretations.items()
}
segment_df = pd.DataFrame({
    'Cluster': [*segment_mapping.keys()],
    'Segment': [*segment_mapping.values()],
})

# Attach the segment name to every customer row (on a copy; df_with_clusters is untouched)
df_with_segments = df_with_clusters.assign(
    Segment=df_with_clusters['Cluster'].map(segment_mapping)
)

# Write the lookup table
segment_df.to_csv('./output/segment_mapping.csv', index=False)
print("Saved segment mapping to './output/segment_mapping.csv'")

# Write the customer data including the segment labels
df_with_segments.to_csv('./output/customer_data_with_segments.csv', index=False)
print("Saved customer data with segments to './output/customer_data_with_segments.csv'")

## Summary

In this notebook, we have:

1. Found the optimal number of clusters using multiple metrics
2. Applied different clustering algorithms (K-Means, GMM, Hierarchical, DBSCAN)
3. Compared the performance of these algorithms
4. Selected the best model based on evaluation metrics
5. Visualized the identified clusters
6. Examined cluster characteristics
7. Interpreted the clusters as customer segments

We have successfully identified three distinct customer segments that align with our domain knowledge:

1. **Bargain Hunters**: Customers who make frequent purchases of low-value items and rely heavily on discounts
2. **High Spenders**: Customers who make moderate purchases of high-value items without relying on discounts
3. **Window Shoppers**: Customers who spend significant time browsing many products but rarely make purchases

These segments provide valuable insights for targeted marketing strategies. In the next notebook, we'll perform a more in-depth analysis of these segments.