In [1]:
### NOTE: If the code is rerun, due to the stochasticity of the algorithms (despite setting a random state parameter),
# the results might be different. In order to see what is reflected in the report, please see printed output. Thank you!

import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn import metrics
import numpy as np

In [2]:
with open('train_df_std') as f:
    X = pd.read_csv(f)

#Getting non standardized df to compute std and mean of cols to unstandardize clusters
#for interpretability
with open('train_df') as f:
    X_raw = pd.read_csv(f)

In [4]:
# Creating a dataframe to keep the summaries of the clusters we find interesting
cluster_df=pd.DataFrame()

## K-Means Section

In [6]:
std_columns=['Age Group',
 'Num of Bad Mental Health Days',
 'Years Since Last Checkup',
 'Hours of Sleeping',
 'Age Started Smoking',
 'Cigarettes per Day',
 'Days Drinking',
'Income',
 'BMI']
unstandardize_dict={}
for col in std_columns:
    unstandardize_dict[col]=[X_raw[col].mean(axis=0),X_raw[col].std(axis=0)]
unstandardize_dict

{'Age Group': [7.61845370221172, 3.5673440802355807],
 'Num of Bad Mental Health Days': [4.396691238521362, 8.323896862439367],
 'Years Since Last Checkup': [1.3449712518815946, 0.7691084640829157],
 'Hours of Sleeping': [7.022121201546527, 1.4922760097657395],
 'Age Started Smoking': [7.102010698097683, 8.921420510930034],
 'Cigarettes per Day': [6.068767859836509, 10.3211089273867],
 'Days Drinking': [1.191693481353699, 1.8552323422844226],
 'Income': [6.781393771739068, 2.293759987057213],
 'BMI': [28.54164256948259, 6.250201767308392]}

In [3]:
silhouette=[]
DB=[]
CH=[]
cluster_list=[2,3,4,5,6,7,8,9,10,11,12]

for i in cluster_list:
    km = KMeans(n_clusters = i,init = 'k-means++',max_iter=1000, n_init=1,verbose=False,random_state=42)
    km.fit(X)
    silhouette.append(metrics.silhouette_score(X, km.labels_,sample_size = 1000))
    DB.append(metrics.davies_bouldin_score(X,km.labels_))
    CH.append(metrics.calinski_harabasz_score(X,km.labels_))

In [4]:
silhouette,DB,CH

([0.14845809590997555,
  0.14071196401267336,
  0.09136555449786156,
  0.0848647169492368,
  0.10139634994462915,
  0.09600249696319915,
  0.07388673167849331,
  0.08623306727333453,
  0.08662793261613963,
  0.07623884433323888,
  0.07593484868942856],
 [2.5271553419653925,
  2.298009839437643,
  2.42284145474288,
  2.304835532338005,
  2.173453014771115,
  2.1620000280912346,
  2.1268801723174047,
  2.052253298621979,
  2.0064921953597405,
  2.1189821259786075,
  2.278282159225411],
 [50195.42902457939,
  39072.68169559474,
  34460.40655359041,
  32390.940440474067,
  30916.153021324564,
  27979.67204857384,
  26339.789009836237,
  24953.10302644066,
  23337.426406678005,
  22269.82199962306,
  20695.679233375937])

Since 2 clusters is the optimal amount of clusters for a silhouette score and DH score, let's use that clustering and see if we can notice anything interesting about the cancer rates in either of these clusters.

In [25]:
n_clusters=2
km = KMeans(n_clusters = n_clusters,init = 'k-means++',max_iter=1000, n_init=1,verbose=False,random_state=42)
km.fit(X)
#Finding cancer ratio for each cluster
X['labels']=km.labels_
cancer_ratio=[]
for i in range(n_clusters):
    df=X[X.labels==i]
    cancer_ratio.append(df.Cancer.sum()/len(df))
cancer_ratio

[0.14052244968922503, 0.096318814025152]

One cluster does have 14% cancer rate, but that doesn't meet our 1.5 x normal rate threshold. Let's now try a cluster count of 10 as DB suggests

In [7]:
n_clusters=10
km = KMeans(n_clusters = n_clusters,init = 'k-means++',max_iter=1000, n_init=1,verbose=False,random_state=42)
km.fit(X)
#Finding cancer ratio for each cluster
X['labels']=km.labels_
cancer_ratio=[]
for i in range(n_clusters):
    df=X[X.labels==i]
    cancer_ratio.append(df.Cancer.sum()/len(df))
cancer_ratio

[0.1856427149818132,
 0.03151401367842296,
 0.07880763706202289,
 0.11330890462129567,
 0.15846719008859755,
 0.15048873103189386,
 0.20735413290113452,
 0.1713536354363981,
 0.08659855173097906,
 0.014145784793875708]

Cluster with highest percent has about 20% cancer rate, almost twice that of the rate in our dataset, second highest has 18%.

In [8]:
hi_cancer_index=np.argsort(cancer_ratio)[-1]
second_best_index=np.argsort(cancer_ratio)[-2]
third_best_index=np.argsort(cancer_ratio)[-3]
hi_cancer_index,second_best_index,third_best_index

(6, 0, 7)

In [9]:
hi_cancer_cluster=X[X.labels==hi_cancer_index]
print(len(hi_cancer_cluster))

#unstandardizing cluster
for col in unstandardize_dict.keys():
    hi_cancer_cluster[col]=(hi_cancer_cluster[col]*unstandardize_dict[col][1])+unstandardize_dict[col][0]
    
cluster_df['Cluster_A']=hi_cancer_cluster.sum()/len(hi_cancer_cluster)
#note the binary columns will display as percentages, continuous/ordinal variables  will be average values
cluster_df.Cluster_A

9872


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hi_cancer_cluster[col]=(hi_cancer_cluster[col]*unstandardize_dict[col][1])+unstandardize_dict[col][0]


Gender                                           0.661467
Age Group                                        9.945551
Num of Bad Mental Health Days                    3.969330
Could Afford Doctor                              0.072462
Years Since Last Checkup                         1.184986
Exercise in Past 30 Days                         0.604079
Hours of Sleeping                                6.963819
Heart Attack                                     0.173467
Heart Disease                                    0.183634
Stroke                                           0.098590
Asthma                                           0.185533
Depression                                       0.255338
Kidney Disease                                   0.099743
Arthritis                                        0.587183
Diabetes                                         0.314362
Married                                          0.499267
Deaf                                             0.206597
Blind         

In [10]:
second_best_cluster=X[X.labels==second_best_index]
print(len(second_best_cluster))
#unstandardizing cluster
for col in unstandardize_dict.keys():
    second_best_cluster[col]=(second_best_cluster[col]*unstandardize_dict[col][1])+unstandardize_dict[col][0]

cluster_df['Cluster_B']=second_best_cluster.sum()/len(second_best_cluster)
#note the binary columns will display as percentages, continuous/ordinal variables  will be average values
cluster_df.Cluster_B

58284


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  second_best_cluster[col]=(second_best_cluster[col]*unstandardize_dict[col][1])+unstandardize_dict[col][0]


Gender                                           0.317806
Age Group                                       10.967730
Num of Bad Mental Health Days                    1.187493
Could Afford Doctor                              0.035235
Years Since Last Checkup                         1.061886
Exercise in Past 30 Days                         0.718539
Hours of Sleeping                                7.203059
Heart Attack                                     0.077287
Heart Disease                                    0.090660
Stroke                                           0.067133
Asthma                                           0.125070
Depression                                       0.120026
Kidney Disease                                   0.077420
Arthritis                                        0.529734
Diabetes                                         0.225464
Married                                          0.477393
Deaf                                             0.144777
Blind         

In [11]:
third_best_cluster=X[X.labels==third_best_index]
print(len(third_best_cluster))
#unstandardizing cluster
for col in unstandardize_dict.keys():
    third_best_cluster[col]=(third_best_cluster[col]*unstandardize_dict[col][1])+unstandardize_dict[col][0]

cluster_df['Cluster_C']=third_best_cluster.sum()/len(third_best_cluster)
#note the binary columns will display as percentages, continuous/ordinal variables  will be average values
cluster_df.Cluster_C

6863


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  third_best_cluster[col]=(third_best_cluster[col]*unstandardize_dict[col][1])+unstandardize_dict[col][0]


Gender                                           0.410316
Age Group                                        9.247072
Num of Bad Mental Health Days                    8.231239
Could Afford Doctor                              0.116339
Years Since Last Checkup                         1.165598
Exercise in Past 30 Days                         0.475216
Hours of Sleeping                               11.736419
Heart Attack                                     0.148296
Heart Disease                                    0.136319
Stroke                                           0.134104
Asthma                                           0.210521
Depression                                       0.392443
Kidney Disease                                   0.116674
Arthritis                                        0.521122
Diabetes                                         0.293866
Married                                          0.307498
Deaf                                             0.182726
Blind         

## Agglomerative Clustering

In [12]:
#Using test_df_std since the training set was too large for agglomerative clustering.
with open('test_df_std') as f:
    X_test = pd.read_csv(f)

#Getting non standardized df to compute std and mean of cols to unstandardize clusters
#for interpretability
with open('test_df') as f:
    X_test_raw = pd.read_csv(f)

In [13]:
std_columns=['Age Group',
 'Num of Bad Mental Health Days',
 'Years Since Last Checkup',
 'Hours of Sleeping',
 'Age Started Smoking',
 'Cigarettes per Day',
 'Days Drinking',
'Income',
 'BMI']
unstandardize_dict={}
for col in std_columns:
    unstandardize_dict[col]=[X_test_raw[col].mean(axis=0),X_test_raw[col].std(axis=0)]
unstandardize_dict

{'Age Group': [7.638452311890301, 3.5625952863530213],
 'Num of Bad Mental Health Days': [4.340016914039027, 8.230763178131637],
 'Years Since Last Checkup': [1.340917946897508, 0.7639975045505629],
 'Hours of Sleeping': [7.0295587870438085, 1.4861359866832793],
 'Age Started Smoking': [7.144020162681092, 8.966439888729122],
 'Cigarettes per Day': [6.129175824737502, 10.239826310172502],
 'Days Drinking': [1.1970157616380357, 1.8572624874076875],
 'Income': [6.788780921541784, 2.2916063949734133],
 'BMI': [28.50097698549277, 6.208919952944744]}

In [6]:
silhouette=[]
DB=[]
CH=[]
for i in cluster_list:
    ag=AgglomerativeClustering(n_clusters = i)
    ag.fit(X_test)
    silhouette.append(metrics.silhouette_score(X_test, ag.labels_,sample_size = 1000))
    DB.append(metrics.davies_bouldin_score(X_test,ag.labels_))
    CH.append(metrics.calinski_harabasz_score(X_test,ag.labels_))

In [7]:
silhouette,DB,CH

([0.08211701905989412,
  0.08473679005431885,
  0.09076583608028277,
  0.09543171795822326,
  0.05705645295330608,
  0.03717906037408408,
  0.04663522271856349,
  0.0628275243239851,
  0.06920644900880353,
  0.04259119821655351,
  0.05472384813808576],
 [3.4646500455593023,
  2.8794410764682037,
  2.52572513967907,
  2.2679409736121032,
  2.5570631087319744,
  2.5070816556690105,
  2.4733325544094122,
  2.3024955330054437,
  2.2501168708895127,
  2.265086143871992,
  2.3394051283250827],
 [3391.5359910315615,
  3349.4647871267252,
  3255.422898075652,
  3246.540273586324,
  2970.777571451951,
  2724.989226623972,
  2521.058425347685,
  2367.2723846987096,
  2248.621295099417,
  2155.9960873322316,
  2081.5884581923156])

In [8]:
#Silhouette score algo says 5 is the optimal cluster amount
n_clusters=5
ag = AgglomerativeClustering(n_clusters = n_clusters)
ag.fit(X_test)
#Finding cancer ratio for each cluster
X_test['labels']=ag.labels_
cancer_ratio=[]
for i in range(n_clusters):
    df=X_test[X_test.labels==i]
    cancer_ratio.append(df.Cancer.sum()/len(df))
cancer_ratio

[0.10933249633814605,
 0.1442349846558527,
 0.04013556903317874,
 0.12082066869300911,
 0.14284420289855074]

Nothing too interesting here.

In [14]:
#DB algo says 10 is the optimal cluster
n_clusters=10
ag = AgglomerativeClustering(n_clusters = n_clusters)
ag.fit(X_test)
#Finding cancer ratio for each cluster
X_test['labels']=ag.labels_
cancer_ratio=[]
for i in range(n_clusters):
    df=X_test[X_test.labels==i]
    cancer_ratio.append(df.Cancer.sum()/len(df))
cancer_ratio

[0.054637607704004054,
 0.12082066869300911,
 0.04013556903317874,
 0.09076803723816912,
 0.22129186602870812,
 0.1364170913367307,
 0.18310457053912552,
 0.18706467661691542,
 0.1564245810055866,
 0.11050156739811912]

In [15]:
hi_cancer_index=np.argsort(cancer_ratio)[-1]
second_best_index=np.argsort(cancer_ratio)[-2]
third_best_index=np.argsort(cancer_ratio)[-3]
hi_cancer_index,second_best_index,third_best_index

(4, 7, 6)

In [16]:
hi_cancer_cluster=X_test[X_test.labels==hi_cancer_index]
print(len(hi_cancer_cluster))

#unstandardizing cluster
for col in unstandardize_dict.keys():
    hi_cancer_cluster[col]=(hi_cancer_cluster[col]*unstandardize_dict[col][1])+unstandardize_dict[col][0]
    
cluster_df['Cluster_D']=hi_cancer_cluster.sum()/len(hi_cancer_cluster)    
#note the binary columns will display as percentages, continuous/ordinal variables  will be average values
cluster_df.Cluster_D

836


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hi_cancer_cluster[col]=(hi_cancer_cluster[col]*unstandardize_dict[col][1])+unstandardize_dict[col][0]


Gender                                           0.699761
Age Group                                        9.999817
Num of Bad Mental Health Days                    2.080246
Could Afford Doctor                              0.049220
Years Since Last Checkup                         1.099826
Exercise in Past 30 Days                         0.636684
Hours of Sleeping                                7.056922
Heart Attack                                     0.165946
Heart Disease                                    0.160810
Stroke                                           0.091219
Asthma                                           0.186037
Depression                                       0.193082
Kidney Disease                                   0.086914
Arthritis                                        0.531161
Diabetes                                         0.298863
Married                                          0.536401
Deaf                                             0.189100
Blind         

In [17]:
second_best_cluster=X_test[X_test.labels==second_best_index]
print(len(second_best_cluster))

#unstandardizing cluster
for col in unstandardize_dict.keys():
    second_best_cluster[col]=(second_best_cluster[col]*unstandardize_dict[col][1])+unstandardize_dict[col][0]
    
cluster_df['Cluster_E']=second_best_cluster.sum()/len(second_best_cluster)    
#note the binary columns will display as percentages, continuous/ordinal variables  will be average values
cluster_df.Cluster_E

2010


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  second_best_cluster[col]=(second_best_cluster[col]*unstandardize_dict[col][1])+unstandardize_dict[col][0]


Gender                                           0.649254
Age Group                                        9.324855
Num of Bad Mental Health Days                    1.826708
Could Afford Doctor                              0.045350
Years Since Last Checkup                         1.077753
Exercise in Past 30 Days                         0.791169
Hours of Sleeping                                7.148834
Heart Attack                                     0.084324
Heart Disease                                    0.082206
Stroke                                           0.045502
Asthma                                           0.120428
Depression                                       0.159363
Kidney Disease                                   0.042515
Arthritis                                        0.447280
Diabetes                                         0.108099
Married                                          0.589968
Deaf                                             0.147894
Blind         

In [18]:
third_best_cluster=X_test[X_test.labels==third_best_index]
print(len(third_best_cluster))

#unstandardizing cluster
for col in unstandardize_dict.keys():
    third_best_cluster[col]=(third_best_cluster[col]*unstandardize_dict[col][1])+unstandardize_dict[col][0]
    
cluster_df['Cluster_F']=third_best_cluster.sum()/len(third_best_cluster)    
#note the binary columns will display as percentages, continuous/ordinal variables  will be average values
cluster_df.Cluster_F

7067


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  third_best_cluster[col]=(third_best_cluster[col]*unstandardize_dict[col][1])+unstandardize_dict[col][0]


Gender                                           0.354323
Age Group                                       10.599729
Num of Bad Mental Health Days                    0.891132
Could Afford Doctor                              0.038961
Years Since Last Checkup                         1.049624
Exercise in Past 30 Days                         0.739002
Hours of Sleeping                                7.188386
Heart Attack                                     0.076587
Heart Disease                                    0.091180
Stroke                                           0.063761
Asthma                                           0.131497
Depression                                       0.113778
Kidney Disease                                   0.068788
Arthritis                                        0.501031
Diabetes                                         0.216509
Married                                          0.498984
Deaf                                             0.139983
Blind         

In [13]:
#CH algo says 2 is the optimal cluster
n_clusters=2
ag = AgglomerativeClustering(n_clusters = n_clusters)
ag.fit(X_test)
#Finding cancer ratio for each cluster
X_test['labels']=ag.labels_
cancer_ratio=[]
for i in range(n_clusters):
    df=X_test[X_test.labels==i]
    cancer_ratio.append(df.Cancer.sum()/len(df))
cancer_ratio

[0.15233573411660564, 0.06390648334105772]

Nothing that meets our threshold

In [22]:
# Exporting cluster_df to use in cluster visualization notebook
cluster_df.to_csv('cluster_df')