In [30]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn import cluster
import pandas as pd
%matplotlib inline

# read data in, drop index column and rename target variable
data = pd.read_csv('final_df.csv')
data.drop(columns=['Unnamed: 0','idn.lieu'], inplace=True)
data.rename(columns={'Formldehyde':'Formaldehyde'}, inplace=True)

# dropping al chemicals except Formaldehyde
drop_list = 'acetaldehyde acroleine hexaldehyde benzene 1-methoxy-2-propanol trichloroethylene toluene tetrachloroethylene styrene 2-butoxy ethanol 124-trimethylbenzene 4-dichlorobenzene'.split()  
data.drop(columns=drop_list, inplace=True)

# show the data
data.head()

Unnamed: 0,Deodorants.ICOS1,EauDeToilette.ICOS2,ProdSoinCheveux.ICOS3,ProdSoinVisage.ICOS4,VernisOngleDissolv.ICOS5,DesodoAutreEnsens.QPD2b,ACTIVITE,VoitureDansGarage.DGG3n,Fumeurs.FUMEURn,Plantes.QPV,...,Type_Prof,Occupation,Source_ppale_Res,Structure_menage,HSRF,NIACe1,CHEM1,DCA3e1,FC316,Formaldehyde
0,10.5,14.0,0.0,7.0,0.0,0.0,3.15,4,3,1,...,6,1,1,3,12,1914,1,3,1,12.26
1,14.0,14.0,1.0,7.0,0.5,0.0,5.25,4,2,2,...,4,1,1,2,70,1961,1,3,1,10.17
2,7.0,7.0,0.0,7.0,0.0,0.0,2.35,4,1,1,...,5,1,1,1,40,1914,1,3,1,21.55
3,0.0,1.0,0.5,7.0,1.0,0.0,2.1,4,1,1,...,3,1,1,2,75,1967,1,3,1,11.03
4,14.0,10.5,10.5,7.0,0.0,7.0,7.0,4,2,2,...,6,1,1,3,50,1974,1,2,1,16.99


In [31]:
# Converting Formaldehyde to categorical using k-means clustering
n_clusters = 4
np.random.seed(0)

X = data['Formaldehyde'].values
X = X.astype(np.float)
X = X.reshape(-1, 1)
k_means = cluster.KMeans(n_clusters=n_clusters, n_init=10)
k_means.fit(X)
values = k_means.cluster_centers_.squeeze()
# print(values)
labels = k_means.labels_
# print(labels)
data['Formaldehyde_bin'] = labels

# print bin range for each cluster
for i in range(n_clusters):
    # select all rows where G3_bin equals i counter
    df = data.loc[data['Formaldehyde_bin'] == i]
    print('{} to {} |{}|'.format(df['Formaldehyde'].min(), df['Formaldehyde'].max(), df['Formaldehyde'].size))

1.02 to 14.74 |229|
41.6 to 70.75 |20|
24.85 to 39.96 |98|
14.77 to 24.73 |183|


In [32]:
data.loc[data['Formaldehyde_bin'] == 0, 'Formaldehyde_bin'] = 'FADH_c_med'
data.loc[data['Formaldehyde_bin'] == 1, 'Formaldehyde_bin'] = 'FADH_a_zero'
data.loc[data['Formaldehyde_bin'] == 2, 'Formaldehyde_bin'] = 'FADH_d_high'
data.loc[data['Formaldehyde_bin'] == 3, 'Formaldehyde_bin'] = 'FADH_b_low'

data.loc[data['Formaldehyde_bin'] == 'FADH_c_med', 'Formaldehyde_bin'] = 2
data.loc[data['Formaldehyde_bin'] == 'FADH_a_zero', 'Formaldehyde_bin'] = 0
data.loc[data['Formaldehyde_bin'] == 'FADH_d_high', 'Formaldehyde_bin'] = 3
data.loc[data['Formaldehyde_bin'] == 'FADH_b_low', 'Formaldehyde_bin'] = 1

# drop the original Formaldehyde column
data.drop(columns=['Formaldehyde'], inplace=True)

# show the data
data.head()

Unnamed: 0,Deodorants.ICOS1,EauDeToilette.ICOS2,ProdSoinCheveux.ICOS3,ProdSoinVisage.ICOS4,VernisOngleDissolv.ICOS5,DesodoAutreEnsens.QPD2b,ACTIVITE,VoitureDansGarage.DGG3n,Fumeurs.FUMEURn,Plantes.QPV,...,Type_Prof,Occupation,Source_ppale_Res,Structure_menage,HSRF,NIACe1,CHEM1,DCA3e1,FC316,Formaldehyde_bin
0,10.5,14.0,0.0,7.0,0.0,0.0,3.15,4,3,1,...,6,1,1,3,12,1914,1,3,1,2
1,14.0,14.0,1.0,7.0,0.5,0.0,5.25,4,2,2,...,4,1,1,2,70,1961,1,3,1,2
2,7.0,7.0,0.0,7.0,0.0,0.0,2.35,4,1,1,...,5,1,1,1,40,1914,1,3,1,1
3,0.0,1.0,0.5,7.0,1.0,0.0,2.1,4,1,1,...,3,1,1,2,75,1967,1,3,1,2
4,14.0,10.5,10.5,7.0,0.0,7.0,7.0,4,2,2,...,6,1,1,3,50,1974,1,2,1,1


In [33]:
# write out the data
data.to_csv('final_df_kmeans.csv', sep=',')