K-means clustering

In [1]:
import pandas as pd

# Load the per-household summary table (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="data/household_summary.csv")

# Report the (rows, columns) shape, then preview the first six rows.
print(df.shape, df.head(6), sep="\n")

(509935, 6)
   household  total_spend  total_transactions  total_units  \
0          1       253.00                  11          187   
1          2     29217.90                  88        13175   
2          3         5.28                   1            2   
3          4         5.70                   2            6   
4          5        29.49                   3           18   
5          6         0.79                   1            1   

   avg_days_between_purchases  couponed_usage_rate  
0                       69.30                 0.00  
1                        8.36                 2.27  
2                         NaN                 0.00  
3                      164.00                 0.00  
4                       97.00                 0.00  
5                         NaN                 0.00  


In [None]:
# Replace missing avg_days_between_purchases values with a fixed sentinel gap.
# In the preview above, the NaN rows are the households with a single
# transaction, i.e. there is no gap between purchases to average.
# NOTE(review): 728 presumably matches the length of the observation window in
# days -- confirm against the data source.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df[col]: the chained in-place form operates on a
# temporary Series, emits a FutureWarning in pandas 2.x and leaves df unchanged
# once Copy-on-Write is enabled. Plain assignment is also safe to re-run.
df['avg_days_between_purchases'] = df['avg_days_between_purchases'].fillna(728)

In [8]:
# Preview the first six rows again to check the filled column.
print(df.iloc[:6])

   household  total_spend  total_transactions  total_units  \
0          1       253.00                  11          187   
1          2     29217.90                  88        13175   
2          3         5.28                   1            2   
3          4         5.70                   2            6   
4          5        29.49                   3           18   
5          6         0.79                   1            1   

   avg_days_between_purchases  couponed_usage_rate  
0                       69.30                 0.00  
1                        8.36                 2.27  
2                      728.00                 0.00  
3                      164.00                 0.00  
4                       97.00                 0.00  
5                      728.00                 0.00  


In [9]:
from sklearn.preprocessing import StandardScaler

# Behavioural columns used as clustering inputs; the household column is not
# part of the feature set.
features = df.loc[:, [
    "total_spend",
    "total_transactions",
    "total_units",
    "avg_days_between_purchases",
    "couponed_usage_rate",
]]

# Standardise every feature so that columns on very different scales
# (e.g. spend vs. usage rate) contribute comparably to the distance metric.
scaler = StandardScaler()
scaled_features = scaler.fit(features).transform(features)

In [11]:
from sklearn.cluster import KMeans

# Fit a 4-cluster K-means model on the standardised features. The fixed
# random_state keeps the centroid initialisation repeatable between runs.
kmeans = KMeans(random_state=42, n_clusters=4)
kmeans.fit(scaled_features)

# labels_ holds the cluster index assigned to each row during fitting.
df['cluster'] = kmeans.labels_

In [12]:
# Per-cluster mean of every numeric column, rounded to two decimals.
# numeric_only=True keeps this cell re-runnable after non-numeric columns
# (such as the 'segment' label added later in the notebook) exist on df;
# without it, pandas >= 2.0 raises a TypeError when asked to average strings.
# NOTE(review): 'household' looks like an identifier, so its mean in this
# table is probably not meaningful -- consider excluding it.
cluster_summary = df.groupby('cluster').mean(numeric_only=True).round(2)
print(cluster_summary)

         household  total_spend  total_transactions  total_units  \
cluster                                                            
0        243290.51       165.45                6.44       113.34   
1        192656.49     14318.17               64.87      9441.95   
2        289825.97         3.16                1.06         2.14   
3        197456.20      2639.63               29.82      1793.61   

         avg_days_between_purchases  couponed_usage_rate  
cluster                                                   
0                             91.53                 1.80  
1                             11.63                 2.87  
2                            715.67                 1.15  
3                             24.30                 2.94  


In [None]:
# Display the per-cluster means of the numeric columns (rounded to 2 dp).
# numeric_only=True prevents a TypeError in pandas >= 2.0 if this cell is run
# when df already carries a non-numeric column such as 'segment'.
df.groupby('cluster').mean(numeric_only=True).round(2)

Cluster | Label                 | Description                 
==============================================================
0       | Occasional Customers  | Infrequent, low spend       
1       | Power Users           | High spenders, high loyalty 
2       | One-Time Customers    | Tried once, disappeared     
3       | Coupon-Regulars       | Frequent, value-sensitive   


In [16]:
# Human-readable segment names, listed in cluster-index order so that
# enumerate() pairs each name with its K-means cluster id.
label_map = dict(enumerate([
    'Occasional Customers',  # cluster 0
    'Power Users',           # cluster 1
    'One-Time Customers',    # cluster 2
    'Coupon-Regulars',       # cluster 3
]))

# Translate each row's numeric cluster id into its segment name.
segment_names = df['cluster'].map(label_map)
df['segment'] = segment_names
del segment_names  # keep the notebook namespace unchanged

In [18]:
# Persist the labelled households (cluster id and segment included) as CSV,
# leaving out the DataFrame index column.
df.to_csv(path_or_buf="data/household_clusters.csv", index=False)