In [1]:
import pandas as pd
import numpy as np
from google.cloud.bigquery import Client, QueryJobConfig
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import plotly.express as px

In [46]:
# Load the feature table from the CSV export into a DataFrame.
data_path = 'Data_files/data_w_entry_prod_price_no_outliers_03_22_23.csv'
df = pd.read_csv(data_path)

In [47]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
df = df.drop(columns=['Unnamed: 0'])

### Sample

In [48]:
# Draw a reproducible random sample of rows from the full frame.
# A dedicated RandomState built from `random_seed` fixes which rows are drawn.
random_seed = 1
random_state = np.random.RandomState(seed=random_seed)

# Number of rows to keep in the sample.
sample_size = 1_000_000
sample_df = df.sample(n=sample_size, random_state=random_state)
sample_df

Unnamed: 0,customer_id,avg_time_btw_orders,recency,months_elapsed,purchase_times,lifetime_spent,lifetime_quantity,per_order_spent,per_order_quantity,distinct_category_count,...,product_removed_count,customizer_started_count,customizer_completed_count,cart_viewed_count,checkout_started_count,order_cancelled_count,email_received,email_open_rate,unsubscribed,max_entry_product_price
651447,ac7c9227-15d8-45ef-8d4b-00aa262984c2,5.080000,10.000000,35.466667,6,620.92,13.0,103.486667,2.166667,4,...,2.0,3.0,2.0,10.0,7.0,0.0,7124,0.598540,0.0,3.00
967078,0cf722ca-64d8-419b-beaa-16b5d31fa126,34.666667,4.066667,38.766667,2,174.95,11.0,87.475000,5.500000,3,...,0.0,0.0,0.0,3.0,0.0,0.0,5522,0.177291,0.0,14.99
58609,61ea5af8-8197-4f8b-bad6-93dc93f403de,0.755556,23.833333,26.133333,4,185.89,20.0,46.472500,5.000000,2,...,11.0,14.0,5.0,9.0,7.0,2.0,6498,0.105263,0.0,3.00
543318,94278e01-aa83-4c8b-8475-e12650c33ae6,54.800000,8.500000,63.300000,2,169.96,6.0,84.980000,3.000000,2,...,2.0,0.0,0.0,14.0,1.0,0.0,411,0.576642,0.0,18.74
1682672,3f7de1e2-f8e4-43b8-8b15-e954cd3a7a21,19.733333,10.133333,29.900000,2,349.98,6.0,174.990000,3.000000,2,...,1.0,0.0,0.0,7.0,6.0,0.0,3204,0.466292,0.0,250.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1328402,0cc246d9-a02c-426c-bece-d80a841c8e0a,6.500000,2.933333,9.433333,2,76.00,4.0,38.000000,2.000000,2,...,0.0,0.0,0.0,2.0,2.0,0.0,0,0.426932,2.0,35.00
990049,ffc00791-fff4-46bb-93e3-d02f4e67bd99,15.722222,17.533333,64.800000,4,659.93,10.0,164.982500,2.500000,3,...,2.0,10.0,7.0,1.0,5.0,0.0,2264,0.106007,0.0,25.00
923753,7f2b007e-4787-4029-a4dc-18d837e59009,10.300000,5.866667,16.166667,2,97.99,3.0,48.995000,1.500000,1,...,0.0,0.0,0.0,3.0,1.0,0.0,0,0.426932,2.0,35.00
1346874,13c5813f-c198-46bd-b0b6-b9238088d00b,9.966667,8.200000,18.200000,2,60.99,4.0,30.495000,2.000000,2,...,0.0,4.0,1.0,3.0,3.0,0.0,60,0.266667,1.0,25.00


In [49]:
# List every column of the sampled frame.
sample_df.columns

Index(['customer_id', 'avg_time_btw_orders', 'recency', 'months_elapsed',
       'purchase_times', 'lifetime_spent', 'lifetime_quantity',
       'per_order_spent', 'per_order_quantity', 'distinct_category_count',
       'discount_frequency_order', 'discount_frequency_product',
       'outdoor_equipment_pct', 'cargo_pct', 'soft_cooler_pct', 'bags_pct',
       'drinkware_pct', 'hard_cooler_pct', 'other_pct', 'black_pct',
       'white_pct', 'navy_pct', 'seaform_pct', 'stainless_pct', 'charcoal_pct',
       'nordic_purple_pct', 'harvest_red_pct', 'alpine_yellow_pct',
       'other_color_pct', 'product_added_count',
       'product_added_to_wishlist_count', 'product_removed_count',
       'customizer_started_count', 'customizer_completed_count',
       'cart_viewed_count', 'checkout_started_count', 'order_cancelled_count',
       'email_received', 'email_open_rate', 'unsubscribed',
       'max_entry_product_price'],
      dtype='object')

In [50]:
# Print the dtype and non-null count of each column in the sampled frame.
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 651447 to 623919
Data columns (total 41 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   customer_id                      1000000 non-null  object 
 1   avg_time_btw_orders              1000000 non-null  float64
 2   recency                          1000000 non-null  float64
 3   months_elapsed                   1000000 non-null  float64
 4   purchase_times                   1000000 non-null  int64  
 5   lifetime_spent                   1000000 non-null  float64
 6   lifetime_quantity                1000000 non-null  float64
 7   per_order_spent                  1000000 non-null  float64
 8   per_order_quantity               1000000 non-null  float64
 9   distinct_category_count          1000000 non-null  int64  
 10  discount_frequency_order         1000000 non-null  float64
 11  discount_frequency_product       1000000 non-n

In [51]:
# Discard rows whose max_entry_product_price is missing.
sample_df = sample_df.dropna(subset=['max_entry_product_price'])

In [52]:
# Feature columns that are scaled and later fed to K-means
# (every column of sample_df except customer_id).
cols_to_standardize = [
    'avg_time_btw_orders', 'recency', 'months_elapsed', 'purchase_times',
    'lifetime_spent', 'lifetime_quantity', 'per_order_spent',
    'per_order_quantity', 'distinct_category_count',
    'discount_frequency_order', 'discount_frequency_product',
    'outdoor_equipment_pct', 'cargo_pct', 'soft_cooler_pct', 'bags_pct',
    'drinkware_pct', 'hard_cooler_pct', 'other_pct',
    'black_pct', 'white_pct', 'navy_pct', 'seaform_pct', 'stainless_pct',
    'charcoal_pct', 'nordic_purple_pct', 'harvest_red_pct',
    'alpine_yellow_pct', 'other_color_pct',
    'product_added_count', 'product_added_to_wishlist_count',
    'product_removed_count', 'customizer_started_count',
    'customizer_completed_count', 'cart_viewed_count',
    'checkout_started_count', 'order_cancelled_count',
    'email_received', 'email_open_rate', 'unsubscribed',
    'max_entry_product_price',
]

# Z-score the features; the scaled values overwrite the raw ones in sample_df.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(sample_df[cols_to_standardize])
sample_df[cols_to_standardize] = scaled_values

# Weight the frequency feature: after scaling, avg_time_btw_orders is
# multiplied by this factor so it contributes more to K-means distances.
frequency_weight = 3
sample_df["avg_time_btw_orders"] = sample_df["avg_time_btw_orders"] * frequency_weight

# Feature matrix handed to the clustering step.
X = sample_df[cols_to_standardize]

In [27]:
# Number of clusters passed to KMeans in the clustering cells.
# NOTE(review): how the value 10 was chosen is not shown in this part of the notebook — confirm.
n_clusters=10

In [28]:
# Fit K-means on the feature matrix X and tag each sampled row with its cluster id.
kmeans = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=0,
)
# Labels of the fitted model, one per row of X.
y_kmeans = kmeans.fit(X).labels_
sample_df['cluster'] = y_kmeans

In [29]:
# Per-cluster mean and standard deviation of every clustering feature.
# Values are in standardized (and weighted) units, because the feature columns
# of sample_df were overwritten by the scaler output earlier.
# Only the numeric feature columns are aggregated: customer_id is a string
# column, and recent pandas versions raise a TypeError when asked for its
# mean/std instead of silently dropping it.
# NOTE: this rebinds `kmeans` from the fitted KMeans estimator to the summary
# DataFrame; the name is kept because the cells below display and slice `kmeans`.
kmeans = sample_df.groupby('cluster')[cols_to_standardize].agg(['mean', 'std'])

In [31]:
# Display the per-cluster mean/std summary table.
# NOTE: at this point `kmeans` is the aggregated DataFrame from the previous cell,
# not the fitted KMeans estimator.
kmeans

Unnamed: 0_level_0,avg_time_btw_orders,avg_time_btw_orders,recency,recency,months_elapsed,months_elapsed,purchase_times,purchase_times,lifetime_spent,lifetime_spent,...,order_cancelled_count,order_cancelled_count,email_received,email_received,email_open_rate,email_open_rate,unsubscribed,unsubscribed,max_entry_product_price,max_entry_product_price
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,-1.112216,1.048591,0.010435,0.936407,-0.269747,0.786705,-0.072986,0.502779,-0.140301,0.326733,...,-0.013794,0.813726,0.01376,0.583755,0.003425,0.997246,-0.004234,1.000442,-0.260886,0.548845
1,-2.024439,0.79726,-0.684676,0.772536,0.484793,0.988482,6.990861,5.899176,5.057053,5.590201,...,2.102592,6.882994,6.592557,7.455554,0.189855,1.001299,-0.42209,0.575675,0.327551,1.337597
2,-1.197026,1.107944,0.209907,1.076088,-0.309105,0.79067,-0.17027,0.460358,-0.252637,0.233183,...,-0.035789,0.780918,-0.251986,0.318387,-0.011644,1.02924,0.063853,1.029795,-0.345024,0.361834
3,-0.829423,1.620797,0.273161,1.15888,-0.231219,0.908617,-0.216926,0.474884,0.297455,0.809822,...,-0.023681,0.851619,-0.287073,0.291531,-0.081006,0.980623,0.104523,1.06651,2.359643,1.515847
4,10.641941,3.063589,-0.04927,0.864184,1.757435,0.720369,-0.437463,0.05855,-0.22527,0.260826,...,-0.101299,0.483234,-0.329381,0.214769,-0.06797,0.954704,0.223585,1.116326,0.018201,1.050323
5,3.266033,1.582648,-0.037683,0.946401,0.789888,0.905751,-0.291263,0.233429,-0.211446,0.263565,...,-0.084232,0.540721,-0.188723,0.347589,-0.101233,0.976638,-0.05886,0.930834,-0.154808,0.786063
6,-1.081902,1.775427,-0.19711,0.903349,-0.2814,0.95054,0.241499,1.306304,5.532747,4.746841,...,0.100596,1.244814,0.268726,1.671433,-0.236772,0.926606,0.372915,1.180366,-0.008028,1.034584
7,-1.212996,1.117078,-0.108029,1.04907,-0.342855,0.830856,-0.023311,0.556442,-0.101266,0.347852,...,-0.006681,0.846686,-0.110606,0.457981,0.08142,1.015059,-0.043328,0.990682,0.008886,0.931772
8,-1.608439,0.6745,-0.569538,0.701364,0.510595,1.02053,2.183779,2.079867,1.186898,1.591113,...,0.46849,2.226726,2.018323,2.033065,0.205089,0.949403,-0.418404,0.571433,0.162373,1.142621
9,-0.877963,1.567666,-0.060595,0.988033,-0.356759,0.867814,-0.16832,0.492191,0.215703,0.739276,...,-0.039284,0.762028,-0.269071,0.310413,0.066209,0.9833,0.191882,1.114,1.110978,1.207232


We should keep the following clusters:
0, 1, 5, 7, 8, 4

In [39]:
# Number of sampled rows assigned to each cluster, largest cluster first.
sample_df['cluster'].value_counts()

0    303532
2    209390
5    157314
7    127349
8     53861
9     52926
3     44670
4     38950
6      8209
1      3237
Name: cluster, dtype: int64

In [40]:
# Summary columns 0-14: avg_time_btw_orders through the per_order_quantity mean.
# Each feature occupies two columns (mean, std), so a 15-column slice splits one feature across views.
kmeans.iloc[:,0:15]

Unnamed: 0_level_0,avg_time_btw_orders,avg_time_btw_orders,recency,recency,months_elapsed,months_elapsed,purchase_times,purchase_times,lifetime_spent,lifetime_spent,lifetime_quantity,lifetime_quantity,per_order_spent,per_order_spent,per_order_quantity
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
0,-1.112216,1.048591,0.010435,0.936407,-0.269747,0.786705,-0.072986,0.502779,-0.140301,0.326733,-0.033017,0.415291,-0.121495,0.441455,0.033073
1,-2.024439,0.79726,-0.684676,0.772536,0.484793,0.988482,6.990861,5.899176,5.057053,5.590201,4.518,4.404706,0.497853,1.167365,0.584607
2,-1.197026,1.107944,0.209907,1.076088,-0.309105,0.79067,-0.17027,0.460358,-0.252637,0.233183,-0.230835,0.180271,-0.257675,0.300564,-0.206339
3,-0.829423,1.620797,0.273161,1.15888,-0.231219,0.908617,-0.216926,0.474884,0.297455,0.809822,-0.243253,0.187831,0.682697,0.932859,-0.215907
4,10.641941,3.063589,-0.04927,0.864184,1.757435,0.720369,-0.437463,0.05855,-0.22527,0.260826,-0.257046,0.185289,-0.029633,0.610971,-0.145068
5,3.266033,1.582648,-0.037683,0.946401,0.789888,0.905751,-0.291263,0.233429,-0.211446,0.263565,-0.194705,0.247667,-0.127464,0.433015,-0.106486
6,-1.081902,1.775427,-0.19711,0.903349,-0.2814,0.95054,0.241499,1.306304,5.532747,4.746841,7.34103,4.730267,7.125978,5.837958,8.258549
7,-1.212996,1.117078,-0.108029,1.04907,-0.342855,0.830856,-0.023311,0.556442,-0.101266,0.347852,-0.065557,0.329661,-0.096642,0.38915,-0.02816
8,-1.608439,0.6745,-0.569538,0.701364,0.510595,1.02053,2.183779,2.079867,1.186898,1.591113,1.02899,1.173214,0.155522,0.602837,0.185879
9,-0.877963,1.567666,-0.060595,0.988033,-0.356759,0.867814,-0.16832,0.492191,0.215703,0.739276,-0.228565,0.186115,0.47924,0.883989,-0.205927


In [41]:
# Summary columns 15-29: the per_order_quantity std through bags_pct.
kmeans.iloc[:,15:30]

Unnamed: 0_level_0,per_order_quantity,distinct_category_count,distinct_category_count,discount_frequency_order,discount_frequency_order,discount_frequency_product,discount_frequency_product,outdoor_equipment_pct,outdoor_equipment_pct,cargo_pct,cargo_pct,soft_cooler_pct,soft_cooler_pct,bags_pct,bags_pct
Unnamed: 0_level_1,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
0,0.550137,0.183051,0.665753,-0.257423,0.632083,-0.325474,0.482478,-0.03143,0.786477,-0.085304,0.429265,-0.192197,0.434612,-0.129674,0.460115
1,1.653426,2.309017,1.847623,0.862909,1.068457,0.880228,1.175242,-0.016186,0.389319,0.126937,0.845426,-0.029521,0.474303,0.074841,0.555984
2,0.191402,-0.803428,0.543403,-0.600431,0.376309,-0.536749,0.37618,-0.149209,0.481121,-0.110167,0.325756,-0.23685,0.392023,-0.17156,0.385366
3,0.179966,0.183579,0.956765,-0.059795,1.007119,0.052788,1.133549,-0.029521,0.880738,-0.009639,0.825888,0.139076,1.08105,0.01138,0.950706
4,0.362331,-0.33121,0.716207,-0.319302,0.743531,-0.259556,0.791934,-0.046379,0.972736,0.016557,1.180444,0.106762,1.221783,0.028416,1.139889
5,0.377719,-0.15755,0.778567,-0.210245,0.809303,-0.196,0.794422,-0.027362,0.920832,-0.0293,0.850787,-0.077632,0.764253,-0.058919,0.777337
6,5.334458,0.198234,0.962178,-0.175094,0.908091,-0.414239,0.603727,-0.162721,0.487499,-0.093423,0.650831,-0.093819,1.116225,-0.072813,1.113772
7,0.409309,0.177527,0.962371,1.77262,0.520165,1.745835,0.78387,-0.021998,0.805771,-0.03519,0.668386,-0.027079,0.686112,-0.074094,0.586921
8,0.694975,1.519137,1.485912,0.517649,1.030033,0.48004,1.070473,0.039827,0.563887,0.073418,0.695185,-0.050743,0.457294,0.043212,0.563558
9,0.194583,0.540779,1.190051,-0.055468,0.995794,0.040289,1.106321,0.94957,2.758719,1.024798,3.290128,2.207251,2.542617,1.708057,3.00768


In [42]:
# Summary columns 30-44: drinkware_pct through the stainless_pct mean.
kmeans.iloc[:,30:45]

Unnamed: 0_level_0,drinkware_pct,drinkware_pct,hard_cooler_pct,hard_cooler_pct,other_pct,other_pct,black_pct,black_pct,white_pct,white_pct,navy_pct,navy_pct,seaform_pct,seaform_pct,stainless_pct
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
0,-0.435699,0.543682,-0.232822,0.355391,0.83544,0.64826,-0.084961,0.762936,-0.144999,0.649743,-0.119304,0.717468,-0.111831,0.699759,-0.176485
1,-0.198542,0.674251,-0.039651,0.621111,0.223342,0.858062,-0.00966,0.569306,0.023557,0.539083,-0.093918,0.536367,-0.101123,0.416633,-0.109577
2,1.130431,0.394665,-0.268686,0.316359,-0.931593,0.359136,0.076564,1.242968,0.123194,1.254622,0.047213,1.147882,0.201055,1.35668,0.06201
3,-1.240864,0.933849,3.28771,1.953538,-0.616143,0.766739,-0.260862,0.769473,0.657077,1.850059,0.486527,1.586294,-0.182405,0.929705,-0.165318
4,0.095812,1.142883,0.277893,1.395428,-0.337904,0.958903,-0.009741,1.134635,-0.017633,1.105132,-0.173583,0.943015,0.040911,1.228595,0.922879
5,0.176286,0.91535,-0.085354,0.719914,-0.08272,0.951472,0.024029,1.046641,-0.036678,0.947242,-0.055283,0.957135,0.052386,1.084685,0.211768
6,-0.472864,0.738993,-0.185489,0.819142,0.826262,0.912654,0.326228,1.321823,0.152183,1.238747,0.316613,1.286898,-0.368798,0.432182,0.041177
7,0.053546,0.841679,-0.106501,0.542278,0.057547,0.968023,0.100005,1.003017,0.032158,0.913655,0.044687,0.902726,0.094536,0.964123,-0.069104
8,-0.24647,0.640015,-0.095734,0.445714,0.332079,0.813596,-0.054903,0.587018,-0.028093,0.566102,-0.113529,0.553437,-0.093184,0.476146,-0.091739
9,-1.313278,0.881569,0.057453,0.829681,-0.709285,0.635439,0.105497,1.220378,-0.162233,0.779549,0.343503,1.482265,-0.255193,0.750388,-0.142324


In [43]:
# Remaining summary columns from index 45: the stainless_pct std through max_entry_product_price.
kmeans.iloc[:,45:]

Unnamed: 0_level_0,stainless_pct,charcoal_pct,charcoal_pct,nordic_purple_pct,nordic_purple_pct,harvest_red_pct,harvest_red_pct,alpine_yellow_pct,alpine_yellow_pct,other_color_pct,...,order_cancelled_count,order_cancelled_count,email_received,email_received,email_open_rate,email_open_rate,unsubscribed,unsubscribed,max_entry_product_price,max_entry_product_price
Unnamed: 0_level_1,std,mean,std,mean,std,mean,std,mean,std,mean,...,mean,std,mean,std,mean,std,mean,std,mean,std
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.514717,-0.201077,0.428491,-0.104261,0.582264,-0.05973,0.682753,-0.070112,0.654345,0.460077,...,-0.013794,0.813726,0.01376,0.583755,0.003425,0.997246,-0.004234,1.000442,-0.260886,0.548845
1,0.564963,-0.01481,0.459856,-0.080279,0.401812,-0.004227,0.447421,0.003862,0.520219,0.174631,...,2.102592,6.882994,6.592557,7.455554,0.189855,1.001299,-0.42209,0.575675,0.327551,1.337597
2,1.194158,-0.179384,0.624395,0.090428,1.281531,0.176526,1.420306,0.107647,1.361623,-0.277797,...,-0.035789,0.780918,-0.251986,0.318387,-0.011644,1.02924,0.063853,1.029795,-0.345024,0.361834
3,0.687664,0.183886,1.297816,0.001344,1.239438,0.252261,1.752452,-0.016924,1.129573,-0.468769,...,-0.023681,0.851619,-0.287073,0.291531,-0.081006,0.980623,0.104523,1.06651,2.359643,1.515847
4,1.981271,0.017257,1.106627,-0.01316,1.043663,-0.08033,0.927283,-0.017998,1.068052,-0.282506,...,-0.101299,0.483234,-0.329381,0.214769,-0.06797,0.954704,0.223585,1.116326,0.018201,1.050323
5,1.261649,-0.071242,0.80461,-0.020007,0.928574,-0.119164,0.704725,-0.04971,0.86757,0.003454,...,-0.084232,0.540721,-0.188723,0.347589,-0.101233,0.976638,-0.05886,0.930834,-0.154808,0.786063
6,1.212598,-0.017463,1.117172,-0.199717,0.399406,-0.060087,0.88114,-0.093871,0.829103,-0.195887,...,0.100596,1.244814,0.268726,1.671433,-0.236772,0.926606,0.372915,1.180366,-0.008028,1.034584
7,0.813874,-0.018581,0.716757,0.149078,1.200182,-0.062009,0.736599,0.018838,0.909983,-0.129958,...,-0.006681,0.846686,-0.110606,0.457981,0.08142,1.015059,-0.043328,0.990682,0.008886,0.931772
8,0.54222,-0.029803,0.494555,-0.069179,0.460885,-0.025135,0.526208,-0.016844,0.519467,0.238492,...,0.46849,2.226726,2.018323,2.033065,0.205089,0.949403,-0.418404,0.571433,0.162373,1.142621
9,0.738003,1.98538,2.450158,0.055778,1.378474,0.028935,1.246164,0.137635,1.564009,-0.856527,...,-0.039284,0.762028,-0.269071,0.310413,0.066209,0.9833,0.191882,1.114,1.110978,1.207232


### Put weights on Recency.

In [53]:
# Build the recency-weighted variant of the clustering input.
# Work on an independent copy: a plain assignment (R_sample_df = sample_df) only
# aliases the same DataFrame, so the weighting below would also rewrite
# sample_df["recency"] and compound (x3, x9, ...) each time this cell is re-run.
R_sample_df = sample_df.copy()

# Multiply the already-standardized recency by 3 so it weighs more in K-means distances.
R_sample_df["recency"] = R_sample_df["recency"] * 3

# NOTE(review): sample_df already carries the x3 weight on avg_time_btw_orders from the
# standardization cell, so both weights are active in this run — confirm that is intended.

# Select the features to use for clustering
X = R_sample_df[cols_to_standardize]

In [54]:
# Re-fit K-means on the recency-weighted feature matrix X and store each row's cluster id.
kmeans = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=0,
)
# Labels of the fitted model, one per row of X.
y_kmeans = kmeans.fit(X).labels_
R_sample_df['cluster'] = y_kmeans

In [55]:
# Per-cluster mean and standard deviation of every clustering feature for the
# recency-weighted run (values are in standardized, weighted units).
# Only the numeric feature columns are aggregated: customer_id is a string
# column, and recent pandas versions raise a TypeError when asked for its
# mean/std instead of silently dropping it.
R_kmeans = R_sample_df.groupby('cluster')[cols_to_standardize].agg(['mean', 'std'])
R_kmeans

Unnamed: 0_level_0,avg_time_btw_orders,avg_time_btw_orders,recency,recency,months_elapsed,months_elapsed,purchase_times,purchase_times,lifetime_spent,lifetime_spent,...,order_cancelled_count,order_cancelled_count,email_received,email_received,email_open_rate,email_open_rate,unsubscribed,unsubscribed,max_entry_product_price,max_entry_product_price
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,-1.380175,1.133573,5.035821,2.210307,0.325996,0.695199,-0.192277,0.499521,-0.17014,0.394232,...,-0.047834,0.752737,-0.142216,0.497894,-0.209352,0.967404,-0.079736,0.898337,-0.047963,0.94917
1,-1.177543,1.072241,0.926443,1.081333,-0.229873,0.667354,-0.140998,0.465824,-0.20061,0.277832,...,0.01714,0.944384,-0.085625,0.498554,-0.033144,1.013153,-0.073343,0.919391,-0.268843,0.548285
2,11.033174,3.015598,-0.477029,2.316602,1.774136,0.725324,-0.438427,0.056698,-0.225377,0.26181,...,-0.103333,0.477897,-0.33296,0.213032,-0.060338,0.955022,0.244573,1.129977,0.014892,1.049571
3,-2.026998,0.760298,-2.085733,2.262275,0.498354,0.992639,6.816909,5.729595,4.880008,5.431582,...,2.004791,6.594133,6.36759,7.119788,0.192236,1.00105,-0.419661,0.576381,0.33374,1.359702
4,-1.157219,0.985879,-2.535041,0.846156,-0.719256,0.713783,-0.063145,0.493143,-0.145737,0.30341,...,-0.04425,0.704842,-0.13569,0.470352,0.167492,1.007733,0.17069,1.139289,-0.219224,0.614346
5,-1.089373,1.769655,-0.635474,2.608366,-0.288554,0.945572,0.249359,1.31248,5.589734,4.755869,...,0.105806,1.255297,0.287925,1.713656,-0.234698,0.925571,0.372723,1.181164,-0.007166,1.035294
6,-1.493511,0.702529,-2.090684,1.529175,0.46982,1.013712,1.905407,1.864274,1.002422,1.410112,...,0.387449,1.982437,1.705302,1.855123,0.201091,0.952954,-0.416987,0.574745,0.171916,1.140707
7,2.880436,1.642487,-2.164061,1.082515,0.466361,0.907326,-0.247537,0.280216,-0.187556,0.278956,...,-0.086549,0.525002,-0.148796,0.392943,-0.068764,0.988485,-0.096331,0.910667,-0.120585,0.833078
8,-0.978267,1.453114,-0.579463,2.075576,-0.471883,0.779197,-0.172403,0.502882,0.337327,0.840737,...,-0.017507,0.874265,-0.284696,0.296832,0.028749,0.983654,0.200054,1.12024,1.953805,1.398527
9,3.587548,1.839633,2.495866,1.736319,1.174871,0.759859,-0.325344,0.198216,-0.215438,0.274623,...,-0.077231,0.577402,-0.235356,0.296135,-0.145861,0.961366,-0.010874,0.952645,-0.083684,0.906207


In [56]:
# Number of rows assigned to each cluster in the recency-weighted run, largest cluster first.
R_sample_df['cluster'].value_counts()

4    259516
1    234449
0    142763
7    112523
9     69268
8     68078
6     66279
2     34703
5      8054
3      3805
Name: cluster, dtype: int64