In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import numpy as np
from sklearn import preprocessing
import math
# Directory that save_fig() writes its PNG files into.
# NOTE(review): this is an absolute, machine-specific Windows path — the notebook
# will not run unchanged on another machine; consider a project-relative path.
FIGURES_PATH = Path(r"E:\ml_projects\e2e-customer-segmentation-rfm\reports\figures")


In [None]:
# Read the customer history CSV into the working DataFrame and preview the first rows.
customer_history_csv = r"E:\ml_projects\e2e-customer-segmentation-rfm\data\interim\customer_history.csv"
rfm_df = pd.read_csv(customer_history_csv)
rfm_df.head()

In [None]:
def save_fig(fig_name, dpi=300):
    """
    Save the current Matplotlib figure as a PNG file under FIGURES_PATH.

    The target directory is created on demand, so the first save on a fresh
    checkout does not fail because the figures folder is missing.

    Args:
        fig_name (str): The name of the file (without extension).
        dpi (int): The resolution for the saved image.
    """
    path = FIGURES_PATH / f"{fig_name}.png"

    # plt.savefig does not create missing parent directories itself.
    path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Saving figure to: {path}")
    plt.savefig(path, format='png', dpi=dpi, bbox_inches='tight')

In [None]:
# Histogram of the 'recency' column (30 bins, KDE overlay); saved, then displayed.
recency_ax = sns.histplot(rfm_df['recency'], bins=30, kde=True)
recency_ax.set_title('Distribution of Recency')
save_fig('recency_distribution')
plt.show()

In [None]:
# Histogram of the natural log of the 'frequency' column (30 bins, KDE overlay).
# NOTE(review): np.log gives -inf for 0 and NaN for negative inputs — confirm
# that 'frequency' is strictly positive.
log_frequency = np.log(rfm_df['frequency'])
frequency_ax = sns.histplot(log_frequency, bins=30, kde=True)
frequency_ax.set_title('Distribution of Log(Frequency)')
save_fig('frequency_log_distribution')
plt.show()

In [None]:
# Histogram of the natural log of the 'monetary' column (30 bins, KDE overlay).
# NOTE(review): np.log gives -inf for 0 and NaN for negative inputs — confirm
# that 'monetary' is strictly positive.
log_monetary = np.log(rfm_df['monetary'])
monetary_ax = sns.histplot(log_monetary, bins=30, kde=True)
monetary_ax.set_title('Distribution of Log(Monetary)')
save_fig('monetary_log_distribution')
plt.show()

In [None]:
# Scatter of raw 'frequency' (x) against raw 'monetary' (y), one point per row.
freq_mon_ax = sns.scatterplot(x='frequency', y='monetary', data=rfm_df)
freq_mon_ax.set_title('Frequency vs. Monetary')
save_fig('frequency_vs_monetary_scatter')
plt.show()

In [None]:
# Scatter of raw 'recency' (x) against raw 'monetary' (y), one point per row.
rec_mon_ax = sns.scatterplot(x='recency', y='monetary', data=rfm_df)
rec_mon_ax.set_title('Recency vs. Monetary')
save_fig('recency_vs_monetary_scatter')
plt.show()

In [None]:
# One box plot per raw RFM column, side by side in a single 1x3 figure.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

boxplot_panels = [
    ('recency', 'Recency Boxplot'),
    ('frequency', 'Frequency Boxplot'),
    ('monetary', 'Monetary Boxplot'),
]
for ax, (column, panel_title) in zip(axes, boxplot_panels):
    sns.boxplot(y=rfm_df[column], ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
save_fig('rfm_boxplots')
plt.show()

In [None]:
import numpy as np
from sklearn import preprocessing


# Apply log(1 + x) to each RFM column; results are stored in '<column>_log'.
for metric in ('recency', 'frequency', 'monetary'):
    rfm_df[f'{metric}_log'] = np.log1p(rfm_df[metric])

# Column order here defines the column order of X_subset / X_scaled.
feature_vector = ['monetary_log', 'recency_log', 'frequency_log']
X_subset = rfm_df[feature_vector].to_numpy()

# Standardise the features; the fitted scaler is kept so cluster centres
# can be mapped back with inverse_transform later in the notebook.
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X_subset)

print("Data successfully transformed and scaled.")

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of the three log1p-transformed RFM columns, one point per row of rfm_df.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

xs = rfm_df['recency_log']
ys = rfm_df['frequency_log']
zs = rfm_df['monetary_log']
ax.scatter(xs, ys, zs, s=5)

# The plotted values are the *_log columns, so the axis labels state the
# transformation instead of implying raw units.
ax.set_xlabel('log(1 + Recency)')
ax.set_ylabel('log(1 + Frequency)')
ax.set_zlabel('log(1 + Monetary)')
ax.set_title('RFM features (log1p scale)')

save_fig('rfm_3d_scatter')
plt.show()

In [None]:
# Flag every row with a NaN in any feature, then keep only the complete rows.
nan_rows = np.any(np.isnan(X_scaled), axis=1)
X_scaled = X_scaled[np.logical_not(nan_rows)]

# Sanity check: report the resulting shape and confirm no NaN is left.
print(f"X_scaled now has shape: {X_scaled.shape}")
print("NaN check:", np.isnan(X_scaled).any())

In [None]:
from sklearn.cluster import KMeans


# Elbow method: fit K-Means for each candidate k and record the inertia
# (within-cluster sum of squares) of the fitted model.
inertia = []
K_range = range(2, 11)

for k in K_range:
    # n_init is pinned to 10, the value used by the other KMeans fits in this
    # notebook, so the curve does not depend on the installed scikit-learn
    # version's default for n_init.
    kmeans = KMeans(n_clusters=k, random_state=10, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot inertia against k; the "elbow" is read off visually.
plt.figure(figsize=(8, 5))
plt.plot(K_range, inertia, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia (WCSS)')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
save_fig('kmeans_elbow_plot')
plt.show()

In [None]:
import matplotlib.cm as cm
from sklearn.metrics import silhouette_samples, silhouette_score

# Cluster counts evaluated by the silhouette analysis (k = 3, 4, 5, 6).
k_range = range(3, 7)

# Results per k, keyed by n_clusters. Each value is a dict with
# 'cluster_center', 'silhouette_score' and 'labels'.
cluster_results = dict()

# Colour map shared by the silhouette bands and the 3D scatter.
spectral = plt.get_cmap('Spectral')

print(f"Running Silhouette Analysis for k in {list(k_range)}...")

for n_clusters in k_range:
    # Left panel (ax1): silhouette plot. Right panel (ax2): 3D cluster scatter.
    fig = plt.figure(figsize=(20, 9))
    ax1 = fig.add_subplot(1, 2, 1)
    ax2 = fig.add_subplot(1, 2, 2, projection='3d')

    # --- Silhouette plot (ax1) ---
    ax1.set_xlim([-0.2, 1])
    # One y-unit per sample plus a 10-unit gap around each cluster band.
    ax1.set_ylim([0, len(X_scaled) + (n_clusters + 1) * 10])

    clusterer = KMeans(n_clusters=n_clusters, random_state=10, n_init=10)
    cluster_labels = clusterer.fit_predict(X_scaled)

    # Per-sample silhouette values are computed once; their mean is the
    # average silhouette score, so no second pairwise-distance pass is needed.
    sample_silhouette_values = silhouette_samples(X_scaled, cluster_labels)
    silhouette_avg = sample_silhouette_values.mean()
    print(f"For n_clusters = {n_clusters}, the average silhouette_score is : {silhouette_avg:.4f}")

    cluster_results[n_clusters] = {
        'cluster_center': clusterer.cluster_centers_,
        'silhouette_score': silhouette_avg,
        'labels': cluster_labels,
    }

    y_lower = 10
    for i in range(n_clusters):
        # Sorted silhouette values of the samples assigned to cluster i.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i]
        )

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the band with its cluster number at mid-height.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10  # gap before the next cluster band

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical line at the average silhouette score.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # --- 3D cluster visualisation (ax2) ---
    colors = spectral(cluster_labels.astype(float) / n_clusters)

    # X_scaled columns follow feature_vector: 0=monetary_log, 1=recency_log, 2=frequency_log.
    ax2.scatter3D(X_scaled[:, 0], X_scaled[:, 1], X_scaled[:, 2], marker='.', s=30, lw=0, alpha=0.7,
                  c=colors, edgecolor='k')

    # Cluster centres as white discs, each overlaid with its cluster number.
    centers = clusterer.cluster_centers_
    ax2.scatter3D(centers[:, 0], centers[:, 1], centers[:, 2], marker='o',
                  c="white", alpha=1, s=200, edgecolor='k')

    for i, c in enumerate(centers):
        ax2.scatter3D(c[0], c[1], c[2], marker='$%d$' % i, alpha=1,
                      s=50, edgecolor='k')

    ax2.set_title("3D visualization of the clustered data.")
    ax2.set_xlabel("Feature 1 (Monetary_log)")
    ax2.set_ylabel("Feature 2 (Recency_log)")
    ax2.set_zlabel("Feature 3 (Frequency_log)")

    # --- Finalise, save and show ---
    plt.suptitle((f"Silhouette analysis for k = {n_clusters}"),
                 fontsize=14, fontweight='bold')

    save_fig(f"silhouette_analysis_k_{n_clusters}")
    plt.show()

    # Release the figure so repeated iterations do not accumulate open figures.
    plt.close(fig)

print("Silhouette analysis complete.")

In [None]:
print("\n--- Cluster Center Analysis ---")

# The centres are converted back to original units below, so the report
# columns drop the '_log' suffix carried by feature_vector.
center_columns = [
    name[:-len('_log')] if name.endswith('_log') else name
    for name in feature_vector
]

# Iterate over the k values actually stored by the silhouette analysis,
# so this cell stays in sync if that loop's range changes.
for i in sorted(cluster_results):
    print("="*30)
    print(f"FOR {i} NUMBER OF CLUSTERS")
    print("="*30)

    centers_scaled = cluster_results[i]['cluster_center']

    # Undo the standardisation, then undo log1p with expm1.
    centers_log_transformed = scaler.inverse_transform(centers_scaled)
    centers_original = np.expm1(centers_log_transformed)

    # One row per cluster centre, in original units.
    centers_df = pd.DataFrame(centers_original, columns=center_columns)
    centers_df['cluster'] = [f'Cluster {j}' for j in range(len(centers_df))]
    print(centers_df.to_string())

    score = cluster_results[i]['silhouette_score']
    print(f"\nSilhouette score for {i} clusters: {score:.4f}\n")

In [None]:
# Take the three log features and keep only complete rows; dropna() preserves
# the original index, which is used below to write labels back.
rfm_features = rfm_df[['monetary_log', 'recency_log', 'frequency_log']]
rfm_features_clean = rfm_features.dropna()

# Standardise the complete rows (this rebinds `scaler` and `X_scaled`).
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(rfm_features_clean)

# Fit the k=4 model on the complete rows only.
kmeans_final = KMeans(n_clusters=4, random_state=10, n_init=10)
labels_k4 = kmeans_final.fit(X_scaled).labels_

# Start from an all-NaN column, then fill in labels for the rows that were clustered.
rfm_df['cluster_4_labels'] = np.nan
rfm_df.loc[rfm_features_clean.index, 'cluster_4_labels'] = labels_k4

print("Labels assigned successfully!")
print(rfm_df['cluster_4_labels'].isna().sum())


In [None]:
import numpy as np

# --- 1. Index of the rows that have all three log features ---
# The labels stored in cluster_results were produced on X_scaled after its
# NaN rows were removed; they are written back to these rows in order.
feature_vector = ['monetary_log', 'recency_log', 'frequency_log']
clean_data_index = rfm_df[feature_vector].dropna().index

# --- 2. Retrieve and assign the stored labels for k=3 and k=5 ---
labels_by_k = {k: cluster_results[k]['labels'] for k in (3, 5)}
labels_k3, labels_k5 = labels_by_k[3], labels_by_k[5]

for k, labels in labels_by_k.items():
    # Fail with a clear message if the stored labels no longer match the
    # number of complete rows (e.g. cells were run out of order).
    if len(labels) != len(clean_data_index):
        raise ValueError(
            f"cluster_results[{k}]['labels'] has {len(labels)} entries but "
            f"{len(clean_data_index)} rows have complete features; "
            "re-run the silhouette analysis cell."
        )

    label_column = f'num_cluster{k}_labels'
    # Rows without complete features keep NaN.
    rfm_df[label_column] = np.nan
    rfm_df.loc[clean_data_index, label_column] = labels

# --- 3. Verify the result ---
print("Cluster labels for k=3 and k=5 assigned successfully.")
print(rfm_df.head())

# Check the counts
print(f"\nCluster 3 NaN count: {rfm_df['num_cluster3_labels'].isnull().sum()}")
print(f"Cluster 5 NaN count: {rfm_df['num_cluster5_labels'].isnull().sum()}")

In [None]:
import plotly.graph_objs as go
import plotly.io as pio
import numpy as np
import pandas as pd

# Set the default Plotly renderer used when figures are displayed with fig.show()
pio.renderers.default = "notebook_connected" 

def plot_segment_distribution(df, cluster_col, field_col, cutoff_percentile=100):
    """
    Show an interactive Plotly box plot of one metric, with one box per cluster.

    Args:
        df (pd.DataFrame): DataFrame holding the cluster labels and the metric.
        cluster_col (str): Column with the cluster labels (e.g., 'num_cluster5_labels').
            Missing labels are ignored.
        field_col (str): Metric to plot (e.g., 'recency', 'monetary').
        cutoff_percentile (int): Per-cluster percentile above which values are
            left out of the plot. Default is 100 (no cutoff); 99 removes the
            top 1% of each cluster.

    Returns:
        None. The figure is displayed with fig.show().
    """
    print(f"Generating plot for: {field_col} across {cluster_col}")

    # sorted() handles both NumPy arrays and pandas extension arrays; the
    # array returned by .unique() on a nullable 'Int64' column has no
    # in-place .sort() method.
    cluster_labels = sorted(df[cluster_col].dropna().unique())

    traces = []
    for label in cluster_labels:
        # Metric values of this cluster, without missing entries.
        cluster_data = df.loc[df[cluster_col] == label, field_col].dropna()

        if cluster_data.empty:
            # np.percentile cannot be applied to an empty input; keep an
            # empty box so the cluster still appears on the x-axis.
            filtered_data = cluster_data
        else:
            cutoff_value = np.percentile(cluster_data, cutoff_percentile)
            filtered_data = cluster_data[cluster_data <= cutoff_value]

        traces.append(go.Box(
            y=filtered_data,
            name=f'Cluster {int(label)}',
            boxpoints=False, # Use 'all' or 'outliers' if you want to see points
            jitter=0.5,
            whiskerwidth=0.2,
            marker=dict(size=2),
            line=dict(width=1),
        ))

    layout = go.Layout(
        title=f'Distribution of {field_col} by {cluster_col}',
        yaxis=dict(
            title=f'{field_col} (Outliers capped at {cutoff_percentile}th percentile)',
            autorange=True,
            showgrid=True,
            zeroline=True,
            gridcolor='rgb(230, 230, 230)',
            gridwidth=1,
            zerolinecolor='rgb(0, 0, 0)',
            zerolinewidth=2,
        ),
        xaxis=dict(
            title='Cluster'
        ),
        margin=dict(l=60, r=30, b=80, t=100),
        paper_bgcolor='white',
        plot_bgcolor='white',
        showlegend=False
    )

    fig = go.Figure(data=traces, layout=layout)
    fig.show()


In [None]:
# Convert the float label columns (NaN for unclustered rows) to pandas'
# nullable integer dtype.
for label_col in ('num_cluster5_labels', 'num_cluster3_labels', 'cluster_4_labels'):
    rfm_df[label_col] = rfm_df[label_col].astype('Int64')

In [None]:
import plotly.graph_objs as go
import plotly.io as pio
import numpy as np
import pandas as pd

# Set the default Plotly renderer used when figures are displayed with fig.show()
pio.renderers.default = "notebook_connected" 

def plot_segment_distribution(df, cluster_col, field_col, cutoff_percentile=100):
    """
    Display a Plotly box plot comparing one metric across clusters.

    Args:
        df (pd.DataFrame): DataFrame containing the label and metric columns.
        cluster_col (str): Name of the cluster-label column
            (e.g., 'num_cluster5_labels'); rows with a missing label are skipped.
        field_col (str): Name of the metric column (e.g., 'recency', 'monetary').
        cutoff_percentile (int): Values above this per-cluster percentile are
            excluded from the box. 100 (default) keeps everything; 99 drops
            the top 1%.
    """
    print(f"Generating plot for: {field_col} across {cluster_col}")

    # Distinct non-missing cluster ids, in ascending order.
    ordered_labels = np.sort(df[cluster_col].dropna().unique())

    box_traces = []
    for label in ordered_labels:
        values = df[df[cluster_col] == label][field_col]

        # A label with no rows at all contributes no box.
        if values.empty:
            continue

        values = values.dropna()
        if values.empty:
            # Only missing metric values: plot an empty box for this cluster.
            capped = values
        else:
            threshold = np.percentile(values, cutoff_percentile)
            capped = values[values <= threshold]

        box_traces.append(
            go.Box(
                y=capped,
                name=f'Cluster {int(label)}',
                boxpoints=False,  # 'all' or 'outliers' would draw individual points
                jitter=0.5,
                whiskerwidth=0.2,
                marker=dict(size=2),
                line=dict(width=1),
            )
        )

    y_axis = dict(
        title=f'{field_col} (Outliers capped at {cutoff_percentile}th percentile)',
        autorange=True,
        showgrid=True,
        zeroline=True,
        gridcolor='rgb(230, 230, 230)',
        gridwidth=1,
        zerolinecolor='rgb(0, 0, 0)',
        zerolinewidth=2,
    )
    layout = go.Layout(
        title=f'Distribution of {field_col} by {cluster_col}',
        yaxis=y_axis,
        xaxis=dict(title='Cluster'),
        margin=dict(l=60, r=30, b=80, t=100),
        paper_bgcolor='white',
        plot_bgcolor='white',
        showlegend=False,
    )

    go.Figure(data=box_traces, layout=layout).show()

# --- How to use the function ---

# Store cluster labels with pandas' nullable integer dtype. Only columns that
# are still present are cast: a later cell drops the k=3 / k=5 label columns,
# and this keeps the cell re-runnable afterwards. If rfm_df itself is missing,
# the resulting NameError is left to surface rather than plotting made-up data.
for label_col in ('num_cluster5_labels', 'num_cluster3_labels', 'cluster_4_labels'):
    if label_col in rfm_df.columns:
        rfm_df[label_col] = rfm_df[label_col].astype('Int64')

# --- Analyze the k=4 model ---
# A 99th-percentile cutoff hides the most extreme values of each cluster
# so the boxes remain readable.
for metric in ('recency', 'frequency', 'monetary'):
    plot_segment_distribution(rfm_df, 'cluster_4_labels', metric, cutoff_percentile=99)

In [None]:
# Columns summarised per cluster: the ORIGINAL (non-log) RFM values.
rfm_columns = ['recency', 'frequency', 'monetary']

# Customers per cluster; value_counts() skips missing labels by default.
segment_size = rfm_df['cluster_4_labels'].value_counts().sort_index()

# Mean RFM per cluster, plus cluster size and share, ordered by mean monetary value.
segment_summary = (
    rfm_df.groupby('cluster_4_labels')[rfm_columns]
    .mean()
    .assign(Count=segment_size)
    .assign(Percent=lambda table: (table['Count'] / table['Count'].sum()) * 100)
    .sort_values(by='monetary', ascending=False)
)

print("--- Segment Summary (k=4) ---")
print(segment_summary.to_markdown(floatfmt=",.2f"))

In [None]:
# Human-readable names for the k=4 cluster ids stored in 'cluster_4_labels'.
# NOTE(review): K-Means cluster ids are arbitrary integers — confirm that each
# id still matches its name (e.g. against the segment summary table) whenever
# the data or the model settings change.
segment_name_map = {
    0: 'Champions',
    1: 'Lost Customers',
    2: 'New Customers',
    3: 'At-Risk' 
}

# Add the human-readable names to the DataFrame; ids that are not keys of
# the map (including missing labels) get a missing segment_name.
rfm_df['segment_name'] = rfm_df['cluster_4_labels'].map(segment_name_map)
rfm_df.head()

In [None]:
# Remove the k=3 / k=5 label columns. errors="ignore" keeps the cell
# re-runnable once the columns are already gone (a plain drop raises KeyError).
rfm_df = rfm_df.drop(columns=["num_cluster3_labels", "num_cluster5_labels"], errors="ignore")

In [None]:
# Preview the first rows of rfm_df.
rfm_df.head()

In [None]:
# Customers per named segment (value_counts() skips rows without a segment name),
# with one viridis colour per segment.
segment_counts = rfm_df['segment_name'].value_counts()
colors = sns.color_palette('viridis', len(segment_counts))

# Pie chart of the segment shares on an explicit 8x8 figure.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    segment_counts,
    labels=segment_counts.index,
    autopct='%1.1f%%',                 # percentage with one decimal
    startangle=140,                    # rotate the first wedge
    colors=colors,
    wedgeprops={'edgecolor': 'white'}, # thin white line between wedges
)

ax.set_title('Customer Segment Proportions', fontsize=16, fontweight='bold')
ax.axis('equal')  # draw the pie as a circle

save_fig('customer_segment_proportions_pie')
plt.show()