In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import davies_bouldin_score, silhouette_score
%config InlineBackend.figure_format='retina'
from datetime import datetime
import numpy as np

In [None]:
# Load the master dataset from a CSV path relative to the kernel's working directory.
df_master = pd.read_csv('Data/master_data.csv')

In [None]:
# Print a summary of the raw frame (columns, dtypes, non-null counts, memory usage).
df_master.info()

In [None]:
# Work on a copy so the in-place edits below never touch the raw `df_master` frame.
df = df_master.copy()

In [None]:
# Peek at the column labels in positions 1 through 14.
df.columns[1:15]

In [None]:
# Number of rows per country (missing values are not counted by default).
df['COUNTRY'].value_counts()

In [None]:
# Whole days elapsed between the moment this cell runs and each row's CREATED_AT.
# NOTE(review): the result depends on the execution time, so it changes from run to run.
created_at = pd.to_datetime(df['CREATED_AT'])
elapsed = datetime.now() - created_at
df['Days_last_order'] = elapsed.dt.days

In [None]:
# Columns removed before modelling. Each label is listed once; the original call
# repeated a few of them, which dropped the same set of columns.
columns_to_drop = [
    'Unnamed: 0', 'ID_x', 'FIRST_NAME_x', 'ID', 'IS_DEFAULT', 'NAME',
    'PHONE_y', 'ADDRESS_1', 'ADDRESS_2', 'LAST_NAME_x', 'COMPANY',
    'FIRST_NAME_y', 'LAST_NAME_y', 'ACCEPTS_MARKETING_UPDATED_AT',
    'CREATED_AT', 'EMAIL', 'MARKETING_OPT_IN_LEVEL', 'NOTE', 'PHONE_x',
    'STATE', 'TAX_EXEMPT', 'UPDATED_AT', 'VERIFIED_EMAIL',
    'DEFAULT_ADDRESS_ID', '_FIVETRAN_SYNCED_x', '_FIVETRAN_SYNCED_y',
    'CAN_DELETE', 'MULTIPASS_IDENTIFIER', 'COUNT(USER_ID)',
    'SUM(TOTAL_SPENT)', 'PROVINCE', 'PROVINCE_CODE', 'LATITUDE',
    'LONGITUDE', 'ZIP', 'COUNTRY_CODE', 'CITY', 'LIFETIME_DURATION',
    'CUSTOMER_ID', 'ACCEPTS_MARKETING',
]
# Mutates `df` in place; raises KeyError if any listed column is absent.
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Show the first ten rows of the trimmed frame.
df.head(10)

In [None]:
# Snapshot of the rows with no missing value in any remaining column, taken before
# the imputation below; used later by the "NO NAs" approach.
df_no_na = df.dropna()
# Number of fully populated rows.
len(df_no_na)

# WITH NAs

## First Approach

In [None]:
# Impute missing values column by column, then drop any row that still has a NaN.
#
# The filled Series is assigned back to the frame rather than calling
# `df[col].fillna(..., inplace=True)`: an in-place call on a column selection is
# chained assignment, which pandas 2.x warns about and which no longer modifies
# `df` under Copy-on-Write.
for col in df.columns:
    if col.startswith('Channel') or col.startswith('Event'):
        # Missing channel/event counters are filled with zero.
        df[col] = df[col].fillna(0)
    elif col == 'Sessions_minutes':
        df[col] = df[col].fillna(df[col].mean())
    elif col in ('Sessions_pages', 'Sessions_first_order'):
        # Fill with the column mean over rows whose Sessions_count is exactly 1.
        # The mask is evaluated here, so it sees Sessions_count as it is when this
        # column is reached (already filled or not, depending on column order).
        single_session_mean = df.loc[df['Sessions_count'] == 1, col].mean()
        df[col] = df[col].fillna(single_session_mean)
    elif col == 'Sessions_count':
        df[col] = df[col].fillna(1)

# Rows with a NaN in any column not handled above are discarded.
df.dropna(inplace=True)

In [None]:
# Absolute pairwise correlations, flattened to (column, column) pairs and sorted
# from strongest to weakest.
# Only numeric/bool columns are correlated: `df` still holds the string COUNTRY
# column, and pandas >= 2.0 raises on non-numeric data instead of silently skipping it.
c = df.select_dtypes(include=['number', 'bool']).corr().abs()

s = c.unstack()
so = s.sort_values(kind="quicksort", ascending=False)
so

In [None]:
# Remove the row limit on displayed frames/series. This is a global pandas option
# and stays in effect for every later cell.
pd.set_option('display.max_rows', None)

In [None]:
# Signed correlation matrix rendered as a colour-graded table.
# Restricted to numeric/bool columns because `df` still contains the string COUNTRY
# column, which makes a bare `df.corr()` raise on pandas >= 2.0.
corr = df.select_dtypes(include=['number', 'bool']).corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# List the columns that remain after cleaning and imputation.
df.columns

In [None]:
# Feature frame for the first approach: one-hot encode COUNTRY, then remove the
# two identifier columns.
df_all = (
    pd.get_dummies(df, columns=['COUNTRY'])
    .drop(columns=['ID_y', 'USER_ID'])
)

In [None]:
# Standardize every feature (zero mean, unit variance), then project the scaled
# data onto its first three principal components.
Xstd_all = StandardScaler().fit_transform(df_all)
pca_all = PCA(n_components=3)
components_all = pca_all.fit_transform(Xstd_all)

In [None]:
# Bar chart of the share of total variance explained by each retained component.
features = range(pca_all.n_components_)
plt.bar(features, pca_all.explained_variance_ratio_, color='black')
plt.xlabel('PCA features')
# explained_variance_ratio_ is a fraction in [0, 1], not a percentage.
plt.ylabel('variance %')
plt.xticks(features)

In [None]:
# Wrap the PCA scores in a DataFrame; columns are labelled 0, 1, 2 (one per component).
PCA_components_all = pd.DataFrame(components_all)

In [None]:
# Scatter of the first two principal components; low alpha makes dense regions visible.
plt.scatter(PCA_components_all[0], PCA_components_all[1], alpha=.1, color='black')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')

In [None]:
# Elbow plot: K-Means inertia (within-cluster sum of squares) for k = 1..9 on the
# three principal components.
ks = range(1, 10)
inertias_all = []
for k in ks:
    # Fixed seed and explicit n_init (matching the other approaches in this notebook)
    # so the curve is identical on every run and across scikit-learn versions.
    model_all = KMeans(n_clusters=k, n_init=10, random_state=0)
    model_all.fit(PCA_components_all.iloc[:, :3])
    inertias_all.append(model_all.inertia_)

# After the loop `model_all` is the k=9 model; it is not the model chosen for segmentation.
plt.plot(ks, inertias_all, '-o', color='black')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

In [None]:
# Final first-approach model: 4 clusters on the three principal components.
# Seeded (and with explicit n_init, as in the other approaches) so that cluster
# labels are reproducible between runs.
model_all_chosen = KMeans(n_clusters=4, n_init=10, random_state=0)
model_all_chosen.fit(PCA_components_all.iloc[:, :3])

In [None]:
# Put the cleaned rows and their three PCA scores side by side. The index of `df`
# is reset first so that it lines up with the 0..n-1 index of the PCA frame.
rows_reindexed = df.reset_index(drop=True)
pca_scores = PCA_components_all.iloc[:, :3]
df_all_result = pd.concat([rows_reindexed, pca_scores], axis=1)

In [None]:
# Attach the labels of the chosen 4-cluster model. (`model_all` is the leftover
# k=9 model from the elbow loop and must not be used here.)
df_all_result['segment'] = model_all_chosen.labels_

In [None]:
# Peek at the feature labels in positions 1 through 39 of the first-approach frame.
df_all.columns[1:40]

## Second Approach

In [None]:
# Start the second approach from a copy of the imputed frame. The copy keeps the
# index of `df`, which was not reset after its rows with NaN were dropped.
df_merge = df.copy()

In [None]:
# Collapse individual countries into coarse regions with a single vectorized
# replace. (Looping over every row and re-running a full-column replace each time
# gives the same result in quadratic time.) Countries not listed are left unchanged.
country_regions = {
    'BE_LUX': ['Belgium', 'Luxembourg'],
    'NL_ANG': ['Netherlands', 'Netherlands Antilles'],
    'UK_IR': ['United Kingdom', 'Ireland', 'Guernsey', 'Jersey'],
    'North_EU': ['Finland', 'Denmark', 'Sweden', 'Norway', 'Iceland'],
    'DE_AT': ['Germany', 'Austria'],
    'East_EU': ['Croatia', 'Hungary', 'Slovakia', 'Poland', 'Romania', 'Ukraine',
                'Bulgaria', 'Lithuania', 'Latvia', 'Estonia', 'Bosnia', 'Russia',
                'Czech Republic', 'Slovenia', 'North Macedonia'],
    'South-Med_EU': ['France', 'Spain', 'Malta', 'Portugal', 'Greece', 'Italy',
                     'Reunion', 'Monaco', 'Andorra'],
    'Non_EU': ['Mexico', 'Canada', 'Colombia', 'Singapore', 'Trinidad and Tobago',
               'New Zealand', 'Philippines', "Côte d'Ivoire", 'Oman',
               'Solomon Islands', 'Turkey', 'United Arab Emirates', 'United States',
               'Australia', 'Brunei', 'Cyprus'],
    'CH_LI': ['Switzerland', 'Liechtenstein'],
}
# Invert to a flat country -> region lookup.
country_to_region = {
    country: region
    for region, countries in country_regions.items()
    for country in countries
}
df_merge['COUNTRY'] = df_merge['COUNTRY'].replace(country_to_region)

In [None]:
# Check the row count per region after grouping; any raw country name still
# listed here was not covered by the mapping.
df_merge['COUNTRY'].value_counts()

In [None]:
# Aggregate the raw Event_* counters into six coarser groups. Each new column is
# the element-wise sum of its members, added left to right; new columns are
# appended in the order listed.
event_groups = {
    'Event_Blog_Email': [
        'Event_Blog_Post_Clicked', 'Event_Blog_Post_Viewed', 'Event_Blogpage_Hero_Clicked',
        'Event_Email_Capture_Closed', 'Event_Email_Capture_Submitted', 'Event_Email_Capture_Viewed',
    ],
    'Event_Collection': [
        'Event_Collection_Anchor_Clicked', 'Event_Collection_Clicked',
        'Event_Collection_Gender_Selected', 'Event_Collection_Page_Viewed',
        'Event_Interaction_with_Collection_Gender_Filter',
    ],
    'Event_Nav_Foot_Search': [
        'Event_Footer_Clicked', 'Event_Footer_Viewed', 'Event_Nav_Clicked', 'Event_Nav_Closed',
        'Event_Nav_Opened', 'Event_Search_Clicked', 'Event_Search_Result_Clicked',
    ],
    'Event_Product_Review': [
        'Event_Product_Added', 'Event_Product_Clicked', 'Event_Product_Info_Clicked',
        'Event_Product_List_Viewed', 'Event_Product_Removed', 'Event_Product_Viewed',
        'Event_Product_Zoom_Clicked', 'Event_Homepage_Review_Clicked', 'Event_Review_Page_Viewed',
        'Event_Load_More_Reviews', 'Event_Review_Anchor_Clicked',
    ],
    'Event_Accordion_Carousel': [
        'Event_Accordion_Clicked', 'Event_Interaction_with_Product_Accordion',
        'Event_Carousel_Clicked',
    ],
    'Event_Cart_Checkout': [
        'Event_Cart_Viewed', 'Event_Checkout_Started', 'Event_Checkout_Step_Viewed',
    ],
}
for merged_name, parts in event_groups.items():
    total = df_merge[parts[0]]
    for part in parts[1:]:
        total = total + df_merge[part]
    df_merge[merged_name] = total

In [None]:
# Aggregate the raw Channel_* counters into four acquisition groups (element-wise
# sums, added left to right; new columns appended in the order listed).
channel_groups = {
    'Channel_Organic': [
        'Channel_bing_organic', 'Channel_duckduckgo_organic', 'Channel_instagram_organic',
        'Channel_ecosia_organic', 'Channel_facebook_organic', 'Channel_google_organic',
    ],
    'Channel_Social': ['Channel_inspiration', 'Channel_referral', 'Channel_social'],
    'Channel_Ads': [
        'Channel_facebook_ads', 'Channel_google_ads', 'Channel_other_cpc_ads', 'Channel_bing_ads',
    ],
    # Single-member group: the new column is a plain copy of Channel_email.
    'Channel_Email': ['Channel_email'],
}
for merged_name, parts in channel_groups.items():
    total = df_merge[parts[0]]
    for part in parts[1:]:
        total = total + df_merge[part]
    df_merge[merged_name] = total

In [None]:
# Bin Sessions_first_order into ordinal codes:
#   <= 5 -> 0,  (5, 10] -> 1,  (10, 15] -> 2,  > 15 -> 3   (NaN stays NaN)
# pd.cut bins the original values in one step and assigns the whole column, which
# avoids chained-indexing assignment (`df[col][mask] = v`); that form triggers
# SettingWithCopyWarning and does not modify the frame under Copy-on-Write.
df_merge['Sessions_first_order'] = pd.cut(
    df_merge['Sessions_first_order'],
    bins=[-np.inf, 5, 10, 15, np.inf],  # right-closed intervals
    labels=False,
)

In [None]:
# Bin ORDERS_COUNT into ordinal codes:
#   <= 1 -> 0,  (1, 4] -> 1,  > 4 -> 2
# The previous masks (`<= 4`, then `> 5`) left customers with exactly 5 orders
# unbinned, so they kept the raw value 5 next to the codes 0..2.
# NOTE(review): 5 is placed in the top bucket here; confirm that `> 4` (rather
# than `<= 5` for the middle bucket) is the intended boundary.
# Assumes counts are non-negative — values below 1 map to code 0.
# Assigning the whole column also avoids chained-indexing assignment.
df_merge['ORDERS_COUNT'] = pd.cut(
    df_merge['ORDERS_COUNT'],
    bins=[-np.inf, 1, 4, np.inf],  # right-closed intervals
    labels=False,
)

In [None]:
# Bin Days_last_order into ordinal codes:
#   <= 90 -> 0,  (90, 180] -> 1,  (180, 360] -> 2,  > 360 -> 3
# Done with pd.cut on the original values and a whole-column assignment, instead
# of chained-indexing assignment (`df[col][mask] = v`), which warns and is a
# no-op under Copy-on-Write.
df_merge['Days_last_order'] = pd.cut(
    df_merge['Days_last_order'],
    bins=[-np.inf, 90, 180, 360, np.inf],  # right-closed intervals
    labels=False,
)

In [None]:
# Average pages per session, after which the three raw session columns are removed.
pages_per_session = df_merge['Sessions_pages'].div(df_merge['Sessions_count'])
df_merge['Per_session_pages'] = pages_per_session
df_merge.drop(columns=['Sessions_minutes', 'Sessions_count', 'Sessions_pages'], inplace=True)

In [None]:
# Bin Per_session_pages into ordinal codes:
#   <= 10 -> 0,  (10, 20] -> 1,  (20, 30] -> 2,  > 30 -> 3   (NaN stays NaN)
# Done with pd.cut and a whole-column assignment, instead of chained-indexing
# assignment (`df[col][mask] = v`), which warns and is a no-op under Copy-on-Write.
df_merge['Per_session_pages'] = pd.cut(
    df_merge['Per_session_pages'],
    bins=[-np.inf, 10, 20, 30, np.inf],  # right-closed intervals
    labels=False,
)

In [None]:
# Drop the columns sitting at positions 2..74.
# NOTE(review): this is a positional slice, so it silently removes different
# columns if the column order or count changes upstream. Presumably it targets the
# raw Channel_*/Event_* columns aggregated above — confirm, and prefer dropping by name.
df_merge.drop(df_merge.columns[2:75], axis=1, inplace=True)

In [None]:
# One-hot encode the grouped COUNTRY column (one indicator column per region).
df_merge = pd.get_dummies(df_merge, columns=['COUNTRY'])

In [None]:
# Inspect the final second-approach feature set and its column order.
df_merge.columns

In [None]:
# Rescale every second-approach feature to [0, 1].
# NOTE: once the later cell has added the 'cluster' column to df_merge, re-running
# this cell would scale that column as a feature too.
Xstd_merge = MinMaxScaler().fit_transform(df_merge)

# Elbow plot: seeded K-Means inertia for k = 1..9.
ks = range(1, 10)
inertias_merge = []
for k in ks:
    model_merge = KMeans(
        n_clusters=k, init='k-means++', n_init=10, max_iter=300, random_state=0
    ).fit(Xstd_merge)
    inertias_merge.append(model_merge.inertia_)

plt.plot(ks, inertias_merge, '-o', color='black')
plt.xticks(ks)
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.show()

In [None]:
# Fit the selected 3-cluster model once and reuse its labels for both `y_means`
# and the 'cluster' column (a second fit_predict would refit the same seeded model
# for no benefit).
model_merge_selected = KMeans(n_clusters=3, init='k-means++', random_state=0, n_init=10, max_iter=300)
y_means = model_merge_selected.fit_predict(Xstd_merge)
df_merge['cluster'] = y_means

# Parallel-coordinates view: one line per row across all (unscaled) df_merge
# columns, coloured by assigned cluster.
pd.plotting.parallel_coordinates(df_merge, 'cluster')

In [None]:
# Cluster-profile table: the four unscaled headline columns plus the cluster
# label, next to the min-max-scaled values of the remaining features.
results_merge = df_merge[['ORDERS_COUNT', 'TOTAL_SPENT', 'Sessions_first_order', 'Days_last_order', 'cluster']]
# Xstd_merge is a bare array, so its rows must be given df_merge's index explicitly.
# df_merge inherits a non-contiguous index from the earlier dropna, and a default
# 0..n-1 index would make concat(axis=1) misalign rows and introduce NaN.
# columns[4:-1] skips the first four columns and the trailing 'cluster' column.
scaled_features = pd.DataFrame(
    Xstd_merge[:, 4:],
    columns=df_merge.columns[4:-1],
    index=df_merge.index,
)
results_merge = pd.concat([results_merge, scaled_features], axis=1)
results_merge.head()

In [None]:
# Per-cluster mean of every column in the profile table.
results_merge_one = results_merge.groupby(['cluster']).mean()
results_merge_one

# NO NAs

## Third Approach

In [None]:
# Third approach: start from the rows that had no missing values before imputation,
# with a fresh 0..n-1 index.
df_merge_no_na = df_no_na.reset_index(drop=True).copy()

In [None]:
# Collapse individual countries into coarse regions with a single vectorized
# replace. (Looping over every row and re-running a full-column replace each time
# gives the same result in quadratic time.) Countries not listed are left unchanged.
country_regions = {
    'BE_LUX': ['Belgium', 'Luxembourg'],
    'NL_ANG': ['Netherlands', 'Netherlands Antilles'],
    'UK_IR': ['United Kingdom', 'Ireland', 'Guernsey', 'Jersey'],
    'North_EU': ['Finland', 'Denmark', 'Sweden', 'Norway', 'Iceland'],
    'DE_AT': ['Germany', 'Austria'],
    'East_EU': ['Croatia', 'Hungary', 'Slovakia', 'Poland', 'Romania', 'Ukraine',
                'Bulgaria', 'Lithuania', 'Latvia', 'Estonia', 'Bosnia', 'Russia',
                'Czech Republic', 'Slovenia', 'North Macedonia'],
    'South-Med_EU': ['France', 'Spain', 'Malta', 'Portugal', 'Greece', 'Italy',
                     'Reunion', 'Monaco', 'Andorra'],
    'Non_EU': ['Mexico', 'Canada', 'Colombia', 'Singapore', 'Trinidad and Tobago',
               'New Zealand', 'Philippines', "Côte d'Ivoire", 'Oman',
               'Solomon Islands', 'Turkey', 'United Arab Emirates', 'United States',
               'Australia', 'Brunei', 'Cyprus'],
    'CH_LI': ['Switzerland', 'Liechtenstein'],
}
# Invert to a flat country -> region lookup.
country_to_region = {
    country: region
    for region, countries in country_regions.items()
    for country in countries
}
df_merge_no_na['COUNTRY'] = df_merge_no_na['COUNTRY'].replace(country_to_region)

In [None]:
# Feature engineering for the no-NA approach: aggregate raw counters, bin the
# numeric features into ordinal codes, drop the raw columns, one-hot encode COUNTRY.

# --- 1. Aggregate raw Event_* / Channel_* counters (element-wise sums, left to
#        right; new columns are appended in the order listed) ---
column_groups = {
    'Event_Blog_Email': [
        'Event_Blog_Post_Clicked', 'Event_Blog_Post_Viewed', 'Event_Blogpage_Hero_Clicked',
        'Event_Email_Capture_Closed', 'Event_Email_Capture_Submitted', 'Event_Email_Capture_Viewed',
    ],
    'Event_Collection': [
        'Event_Collection_Anchor_Clicked', 'Event_Collection_Clicked',
        'Event_Collection_Gender_Selected', 'Event_Collection_Page_Viewed',
        'Event_Interaction_with_Collection_Gender_Filter',
    ],
    'Event_Nav_Foot_Search': [
        'Event_Footer_Clicked', 'Event_Footer_Viewed', 'Event_Nav_Clicked', 'Event_Nav_Closed',
        'Event_Nav_Opened', 'Event_Search_Clicked', 'Event_Search_Result_Clicked',
    ],
    'Event_Product_Review': [
        'Event_Product_Added', 'Event_Product_Clicked', 'Event_Product_Info_Clicked',
        'Event_Product_List_Viewed', 'Event_Product_Removed', 'Event_Product_Viewed',
        'Event_Product_Zoom_Clicked', 'Event_Homepage_Review_Clicked', 'Event_Review_Page_Viewed',
        'Event_Load_More_Reviews', 'Event_Review_Anchor_Clicked',
    ],
    'Event_Accordion_Carousel': [
        'Event_Accordion_Clicked', 'Event_Interaction_with_Product_Accordion',
        'Event_Carousel_Clicked',
    ],
    'Event_Cart_Checkout': [
        'Event_Cart_Viewed', 'Event_Checkout_Started', 'Event_Checkout_Step_Viewed',
    ],
    'Channel_Organic': [
        'Channel_bing_organic', 'Channel_duckduckgo_organic', 'Channel_instagram_organic',
        'Channel_ecosia_organic', 'Channel_facebook_organic', 'Channel_google_organic',
    ],
    'Channel_Social': ['Channel_inspiration', 'Channel_referral', 'Channel_social'],
    'Channel_Ads': [
        'Channel_facebook_ads', 'Channel_google_ads', 'Channel_other_cpc_ads', 'Channel_bing_ads',
    ],
    'Channel_Email': ['Channel_email'],
}
for merged_name, parts in column_groups.items():
    total = df_merge_no_na[parts[0]]
    for part in parts[1:]:
        total = total + df_merge_no_na[part]
    df_merge_no_na[merged_name] = total

# --- 2. Ordinal binning ---
# pd.cut bins the original values in one step and assigns the whole column. This
# replaces chained-indexing assignment (`df[col][mask] = v`), which triggers
# SettingWithCopyWarning and does not modify the frame under Copy-on-Write.
# All intervals are right-closed.

# <= 5 -> 0, (5, 10] -> 1, (10, 15] -> 2, > 15 -> 3
df_merge_no_na['Sessions_first_order'] = pd.cut(
    df_merge_no_na['Sessions_first_order'], bins=[-np.inf, 5, 10, 15, np.inf], labels=False
)

# <= 1 -> 0, (1, 4] -> 1, > 4 -> 2
# The previous masks (`<= 4`, then `> 5`) left exactly-5 orders unbinned (raw 5 kept).
# NOTE(review): 5 is placed in the top bucket; confirm the intended boundary.
# Assumes counts are non-negative — values below 1 map to code 0.
df_merge_no_na['ORDERS_COUNT'] = pd.cut(
    df_merge_no_na['ORDERS_COUNT'], bins=[-np.inf, 1, 4, np.inf], labels=False
)

# <= 90 -> 0, (90, 180] -> 1, (180, 360] -> 2, > 360 -> 3
df_merge_no_na['Days_last_order'] = pd.cut(
    df_merge_no_na['Days_last_order'], bins=[-np.inf, 90, 180, 360, np.inf], labels=False
)

# Average pages per session, then remove the raw session columns.
df_merge_no_na['Per_session_pages'] = df_merge_no_na['Sessions_pages'] / df_merge_no_na['Sessions_count']
df_merge_no_na.drop(['Sessions_minutes', 'Sessions_count', 'Sessions_pages'], axis=1, inplace=True)

# <= 10 -> 0, (10, 20] -> 1, (20, 30] -> 2, > 30 -> 3
df_merge_no_na['Per_session_pages'] = pd.cut(
    df_merge_no_na['Per_session_pages'], bins=[-np.inf, 10, 20, 30, np.inf], labels=False
)

# --- 3. Drop raw columns and encode COUNTRY ---
# NOTE(review): positional slice — depends on the exact column order at this point;
# presumably the raw Channel_*/Event_* columns. Confirm, or drop by name.
df_merge_no_na.drop(df_merge_no_na.columns[2:75], axis=1, inplace=True)
df_merge_no_na = pd.get_dummies(df_merge_no_na, columns=['COUNTRY'])
df_merge_no_na.columns

In [None]:
# Rescale every no-NA feature to [0, 1]; the result is a NumPy array in
# df_merge_no_na's column order.
Xstd_merge_no_na = MinMaxScaler().fit_transform(df_merge_no_na)

In [None]:
# Elbow plot for the no-NA features: seeded K-Means inertia for k = 1..9.
ks = range(1, 10)
inertias_merge_no_na = []
for k in ks:
    model_merge_no_na = KMeans(
        n_clusters=k, init='k-means++', n_init=10, max_iter=300, random_state=0
    ).fit(Xstd_merge_no_na)
    inertias_merge_no_na.append(model_merge_no_na.inertia_)

plt.plot(ks, inertias_merge_no_na, '-o', color='black')
plt.xticks(ks)
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.show()

In [None]:
# Fit the selected 4-cluster model (seeded) on the scaled no-NA features and store
# each row's cluster label.
model_merge_selected_no_na = KMeans(n_clusters=4, init='k-means++', random_state=0, n_init=10, max_iter=300)
df_merge_no_na['cluster'] = model_merge_selected_no_na.fit_predict(Xstd_merge_no_na)

# Parallel-coordinates view: one line per row across all (unscaled) columns of
# df_merge_no_na, coloured by assigned cluster.
pd.plotting.parallel_coordinates(df_merge_no_na, 'cluster')

In [None]:
# Cluster-profile table for the no-NA approach: four unscaled headline columns and
# the cluster label, next to the min-max-scaled values of the remaining features.
# columns[4:-1] skips the first four columns and the trailing 'cluster' column.
headline_columns = ['ORDERS_COUNT', 'TOTAL_SPENT', 'Sessions_first_order', 'Days_last_order', 'cluster']
scaled_rest = pd.DataFrame(Xstd_merge_no_na[:, 4:], columns=df_merge_no_na.columns[4:-1])
results_merge_no_na = pd.concat([df_merge_no_na[headline_columns], scaled_rest], axis=1)
results_merge_no_na.head()

In [None]:
# Per-cluster feature means for the no-NA approach, shown in long form.
# (Displays this approach's table, not `results_merge_one` from the second approach.)
results_merge_no_na_one = results_merge_no_na.groupby(['cluster']).mean()
results_merge_no_na_one.unstack()

In [None]:
# Grid search over DBSCAN hyper-parameters on the scaled no-NA features:
# eps in {0.5, 1.0, ..., 5.5} x min_samples in {1, ..., 11}.
# Every combination is scored inside the inner loop. Rows are collected in a list
# and turned into a DataFrame once (DataFrame.append was removed in pandas 2.0),
# and the record keys match the result columns exactly.
result_columns = ['Eps', 'Min_Samples', 'Number of Cluster', 'Silhouette Score', 'Davies Bouldin Score']
dbscan_records = []
n_samples = len(Xstd_merge_no_na)
for i in range(1, 12):
    for j in range(1, 12):
        eps = i * 0.5
        dbscan_cluster = DBSCAN(eps=eps, min_samples=j)
        clusters = dbscan_cluster.fit_predict(Xstd_merge_no_na)
        # Distinct labels; DBSCAN's noise label (-1), if present, is counted as one.
        n_labels = len(np.unique(clusters))
        # Both scores need at least 2 labels; silhouette_score also rejects the
        # degenerate case where every sample has its own label.
        if 2 <= n_labels < n_samples:
            dbscan_records.append({
                'Eps': eps,
                'Min_Samples': j,
                'Number of Cluster': n_labels,
                'Silhouette Score': silhouette_score(Xstd_merge_no_na, clusters),
                'Davies Bouldin Score': davies_bouldin_score(Xstd_merge_no_na, clusters),
            })
results = pd.DataFrame(dbscan_records, columns=result_columns)

In [None]:
# Show the DBSCAN grid-search table.
results