# Customer Segmentation Analysis Project

This notebook demonstrates customer segmentation analysis on synthetically generated transaction data, including:
- Data preprocessing and feature engineering
- RFM (Recency, Frequency, Monetary) analysis
- K-means clustering
- Customer segment profiling
- Visualization of results

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from datetime import datetime, timedelta
from sklearn.metrics import silhouette_score

# Set style for visualizations.
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and removed in
# 3.8, where it is called 'seaborn-v0_8'. Try the new name first and fall back
# to the old one so this cell runs on both older and current matplotlib.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette('deep')

In [None]:
def generate_sample_data(n_customers=1000, transactions_per_customer=5,
                         seed=None, end_date=None):
    """
    Generate sample customer transaction data

    Every customer receives the same number of transactions. Transaction
    dates are drawn uniformly (whole days) from the 365 days before
    ``end_date`` and amounts are drawn from a log-normal distribution.

    Parameters:
    -----------
    n_customers : int
        Number of customers to generate
    transactions_per_customer : int, default 5
        Number of transactions generated for each customer
    seed : int or None, default None
        Seed for the random generator. With the same seed (and the same
        ``end_date``) the function returns identical data; ``None`` gives
        different data on every call.
    end_date : datetime or None, default None
        Upper bound of the one-year transaction window. ``None`` uses the
        current time, so absolute dates shift between runs while the day
        offsets stay reproducible for a fixed seed.

    Returns:
    --------
    pandas.DataFrame
        One row per transaction with the columns ``customer_id``,
        ``transaction_date`` and ``amount``.
    """
    # A dedicated generator keeps the function reproducible without touching
    # NumPy's global random state.
    rng = np.random.default_rng(seed)
    n_transactions = n_customers * transactions_per_customer

    # Generate customer IDs
    customer_ids = [f'CUST_{i:04d}' for i in range(n_customers)]

    # Generate random dates: one vectorized draw of day offsets in [0, 365)
    if end_date is None:
        end_date = datetime.now()
    start_date = end_date - timedelta(days=365)
    day_offsets = rng.integers(0, 365, size=n_transactions)
    dates = pd.Timestamp(start_date) + pd.to_timedelta(day_offsets, unit='D')

    # Generate transaction amounts
    amounts = rng.lognormal(3, 1, n_transactions)

    # Create DataFrame (each customer id repeated once per transaction)
    return pd.DataFrame({
        'customer_id': np.repeat(customer_ids, transactions_per_customer),
        'transaction_date': dates,
        'amount': amounts,
    })

# Generate sample data with a fixed seed so that Restart & Run All reproduces
# the same customers, RFM values and clusters.
df = generate_sample_data(seed=42)
df.head()

In [None]:
def calculate_rfm(df, reference_date=None):
    """
    Calculate RFM metrics for each customer

    Parameters:
    -----------
    df : pandas.DataFrame
        Transaction-level data with the columns ``customer_id``,
        ``transaction_date`` (datetime-like) and ``amount``.
    reference_date : datetime-like or None, default None
        Date that recency is measured against. ``None`` uses the latest
        ``transaction_date`` in ``df``.

    Returns:
    --------
    pandas.DataFrame
        Indexed by ``customer_id`` with the columns ``recency`` (whole days
        between the reference date and the customer's latest transaction),
        ``frequency`` (number of transaction rows) and ``monetary`` (sum of
        ``amount``). An empty ``df`` yields an empty frame with these columns.
    """
    rfm_columns = ['recency', 'frequency', 'monetary']
    if df.empty:
        # Nothing to aggregate; return the expected layout instead of failing
        # on date arithmetic against a missing maximum.
        return pd.DataFrame(columns=rfm_columns,
                            index=pd.Index([], name='customer_id'))

    # Calculate current date (last date in the dataset) unless one was given
    if reference_date is None:
        reference_date = df['transaction_date'].max()
    reference_date = pd.Timestamp(reference_date)

    per_customer = df.groupby('customer_id')
    last_purchase = per_customer['transaction_date'].max()

    # Vectorized reductions instead of a per-group Python lambda. size()
    # counts the rows of each group directly rather than aggregating the
    # grouping key itself.
    return pd.DataFrame({
        'recency': (reference_date - last_purchase).dt.days,
        'frequency': per_customer.size(),
        'monetary': per_customer['amount'].sum(),
    })

# Build the per-customer RFM table from the transaction data
rfm_df = calculate_rfm(df)

# Standardise the RFM columns, then wrap the resulting array back into a
# DataFrame that keeps the original customer index and column names
scaler = StandardScaler()
rfm_scaled = pd.DataFrame(
    scaler.fit_transform(rfm_df),
    index=rfm_df.index,
    columns=rfm_df.columns,
)

In [None]:
# Find optimal number of clusters: for each candidate k record the inertia
# (used for the elbow plot) and the silhouette score
inertias = []
silhouette_scores = []
K = range(2, 11)

for k in K:
    # n_init is set explicitly because scikit-learn changed its default from
    # 10 to 'auto' (with a FutureWarning in the releases before the switch).
    # Pinning it keeps the curves identical across library versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(rfm_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(rfm_scaled, kmeans.labels_))

# Plot the two model-selection curves side by side: inertia (elbow) on the
# left, silhouette score on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

panels = [
    (ax1, inertias, 'bx-', 'Inertia', 'Elbow Method'),
    (ax2, silhouette_scores, 'rx-', 'Silhouette Score', 'Silhouette Method'),
]
for panel_ax, values, line_format, y_label, panel_title in panels:
    panel_ax.plot(K, values, line_format)
    panel_ax.set_xlabel('k')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)

plt.show()

In [None]:
# Perform k-means clustering
optimal_k = 4  # Based on elbow method and silhouette score
# n_init is set explicitly because scikit-learn changed its default from 10 to
# 'auto'; pinning it keeps the segmentation stable across library versions.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
rfm_df['Segment'] = kmeans.fit_predict(rfm_scaled)

# Calculate segment profiles (mean RFM values per cluster)
segment_profiles = rfm_df.groupby('Segment').agg({
    'recency': 'mean',
    'frequency': 'mean',
    'monetary': 'mean',
}).round(2)


def label_segments(profiles):
    """
    Derive a business label for every cluster from its mean RFM profile.

    K-means cluster ids are arbitrary, so a fixed id -> label table can attach
    a label to a cluster whose profile does not match it. The labels are
    assigned here by a simple heuristic, applied in this order to the clusters
    that are still unlabelled:

    1. highest mean monetary  -> 'Big Spenders'
    2. lowest mean recency    -> 'Best Customers'
    3. highest mean monetary  -> 'Lost Customers'
    4. lowest mean monetary   -> 'Lost Cheap Customers'

    Any clusters beyond four are named 'Segment <id>'. The rules are a
    heuristic; review them against the printed profiles before relying on the
    labels for real data.

    Parameters:
    -----------
    profiles : pandas.DataFrame
        Indexed by cluster id with 'recency' and 'monetary' columns.

    Returns:
    --------
    dict
        Mapping of cluster id (int) to label (str), ordered by cluster id.
    """
    rules = [
        ('Big Spenders', 'monetary', max),
        ('Best Customers', 'recency', min),
        ('Lost Customers', 'monetary', max),
        ('Lost Cheap Customers', 'monetary', min),
    ]
    unassigned = [int(segment_id) for segment_id in profiles.index]
    labels = {}
    for label, column, pick in rules:
        if not unassigned:
            break
        chosen = pick(unassigned, key=lambda segment_id: profiles.loc[segment_id, column])
        labels[chosen] = label
        unassigned.remove(chosen)
    for segment_id in unassigned:
        labels[segment_id] = f'Segment {segment_id}'
    return dict(sorted(labels.items()))


# Assign segment labels
segment_labels = label_segments(segment_profiles)

rfm_df['Segment Label'] = rfm_df['Segment'].map(segment_labels)

In [None]:
# Visualize segments: 3D RFM scatter on the left, segment shares on the right
fig = plt.figure(figsize=(15, 10))

# Left panel: customers in recency / frequency / monetary space, coloured by
# their cluster id
ax = fig.add_subplot(121, projection='3d')
scatter = ax.scatter(
    rfm_df['recency'],
    rfm_df['frequency'],
    rfm_df['monetary'],
    c=rfm_df['Segment'],
    cmap='viridis',
)
ax.set_xlabel('Recency')
ax.set_ylabel('Frequency')
ax.set_zlabel('Monetary')
ax.set_title('Customer Segments in 3D')
fig.colorbar(scatter)

# Right panel: share of customers that falls into each labelled segment
pie_ax = fig.add_subplot(122)
segment_sizes = rfm_df['Segment Label'].value_counts()
pie_ax.pie(segment_sizes, labels=segment_sizes.index, autopct='%1.1f%%')
pie_ax.set_title('Segment Size Distribution')

fig.tight_layout()
plt.show()

# Print segment profiles
print('\nSegment Profiles:')
print(segment_profiles)

# Generate segment insights
for segment, label in segment_labels.items():
    segment_data = rfm_df[rfm_df['Segment'] == segment]
    print(f'\n{label}:')
    print(f'Number of customers: {len(segment_data)}')
    if segment_data.empty:
        # No customers in this segment: the averages below are undefined
        continue
    print(f'Average purchase frequency: {segment_data["frequency"].mean():.1f}')
    # 'monetary' is each customer's TOTAL spend, so the amount per purchase is
    # total spend divided by the total number of purchases in the segment,
    # not the mean of 'monetary'.
    total_purchases = segment_data['frequency'].sum()
    print(f'Average purchase amount: ${segment_data["monetary"].sum() / total_purchases:.2f}')
    print(f'Average total spend per customer: ${segment_data["monetary"].mean():.2f}')