<h1><center>Numeric Feature Analysis</center></h1>
In this Notebook, I will explain how to clean the numeric (no-binary) feature before applying the classification algorithm.

In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.utils import shuffle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from boruta import boruta_py

import pandas as pd
import matplotlib.pyplot as plt
import copy
from time import time

from caimcaim import CAIMD

from utility import random_forest_classifer
from utility import random_forest_classifer_params

Let us load the data

In [2]:
data_train = pd.read_csv('data/train2.csv',index_col=0)

Let us devide in binary and no binary feature

In [3]:
filtering_binary =data_train.apply(pd.Series.nunique) ==2
data_train_binary_feature=data_train.loc[:,filtering_binary]

filtering_nobinary =data_train.apply(pd.Series.nunique) >2
filtering_nobinary ['TARGET']=True
data_train_nobinary_feature=data_train.loc[:,filtering_nobinary]

Let us compute the F1-accuracy for no binary features to measure the impact of the cleaning procedures.

In [4]:
matrix_features = data_train_nobinary_feature.as_matrix()[:,:-1]
labels = data_train_nobinary_feature['TARGET'].as_matrix()
f1_score_list,confusion_matrix_list = random_forest_classifer(matrix_features,labels)

#The score method used in this case is the F1 score, which is the harmonic mean of precision and recall#The sc 
F1_accuracy_str="F1 accuracy: %0.3f (+/- %0.3f)" % (np.mean(f1_score_list),
                                                    np.std(f1_score_list) * 2)
print(F1_accuracy_str)

F1 accuracy: 0.544 (+/- 0.011)


In [5]:
data_train_nobinary_happy=data_train_nobinary_feature.loc[data_train_nobinary_feature['TARGET']==0]
print('Size happy:',data_train_nobinary_happy.shape)
data_train_nobinary_unhappy=data_train_nobinary_feature.loc[data_train_nobinary_feature['TARGET']==1]
print('Size unhappy:',data_train_nobinary_unhappy.shape)

Size happy: (73012, 231)
Size unhappy: (3008, 231)


From the analysis described in **BinaryAndNumericFeatureAnalysis** Notebook, I decided to apply the following strategy
in cleaning the numeric features:

* Replace  9999 value in var3 with the median value
* Replace  the max value with a median value for the attributes where the value of  the std is greater than 200
* Binning the values of the attributes using the supervised strategy


Let us use a copy of the dataframe to save all the results of the cleaning operations.

In [6]:
data_train_nobinary_feature_clean = data_train_nobinary_feature.copy()

<h2><center> Replace Values </center></h2>

 Replace  9999 value in var3 with the median value

In [7]:
median_value = data_train_nobinary_feature_clean['var3'].median()
print('Median Value for var3\n',median_value)
print('Unique values for var3 before replacing\n',data_train_nobinary_feature_clean['var3'].value_counts().head())
data_train_nobinary_feature_clean['var3'].replace(to_replace=-999999,value=median_value,inplace=True)
print('Unique values for var3 after replacing\n',data_train_nobinary_feature_clean['var3'].value_counts().head())

Median Value for var3
 2.0
Unique values for var3 before replacing
  2         74165
 8           138
-999999      116
 9           110
 3           108
Name: var3, dtype: int64
Unique values for var3 after replacing
 2    74281
8      138
9      110
3      108
1      105
Name: var3, dtype: int64


In [8]:
matrix_features = data_train_nobinary_feature_clean.as_matrix()[:,:-1]
labels = data_train_nobinary_feature_clean['TARGET'].as_matrix()
f1_score_list,confusion_matrix_list = random_forest_classifer(matrix_features,labels)
#The score method used in this case is the F1 score, which is the harmonic mean of precision and recall#The sc 
F1_accuracy_str="F1 accuracy: %0.3f (+/- %0.3f)" % (np.mean(f1_score_list),
                                                    np.std(f1_score_list) * 2)
print(F1_accuracy_str)

F1 accuracy: 0.541 (+/- 0.006)


From the above result, we can deduce that this substitution has not changed the F1-accuracy, this was expected due to the random forest algorithm, which is robust to the outliers and it can handle different ranges of values.

 Replace  the max value with a median value for the attributes where the value of  the std is greater than 200

In [9]:
columns_nobinary = data_train_nobinary_feature.columns.tolist()
columns_nobinary.remove('TARGET')

attributes_std_name = []
attributes_std_value = []


for colum in columns_nobinary:
        data = data_train_nobinary_feature.loc[:,colum]
        std = data.std()
        attributes_std_value.append(std)
        attributes_std_name.append(colum)
       

In [10]:
th_std=200
list_name_attributes_to_check = []
for name_std,value_std in zip(attributes_std_name,attributes_std_value):
    if value_std>=th_std:
        list_name_attributes_to_check.append(name_std)
print('List attributes to check\n')
print('Len %d\n'%len(list_name_attributes_to_check))
print(list_name_attributes_to_check)

List attributes to check

Len 100

['var3', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1', 'imp_op_var39_comer_ult3', 'imp_op_var41_comer_ult1', 'imp_op_var41_comer_ult3', 'imp_op_var41_efect_ult1', 'imp_op_var41_efect_ult3', 'imp_op_var41_ult1', 'imp_op_var39_efect_ult1', 'imp_op_var39_efect_ult3', 'imp_op_var39_ult1', 'imp_sal_var16_ult1', 'saldo_var1', 'saldo_var5', 'saldo_var8', 'saldo_var12', 'saldo_var13_corto', 'saldo_var13_largo', 'saldo_var13', 'saldo_var14', 'saldo_var17', 'saldo_var18', 'saldo_var20', 'saldo_var24', 'saldo_var26', 'saldo_var25', 'saldo_var30', 'saldo_var31', 'saldo_var33', 'saldo_var37', 'saldo_var42', 'saldo_var44', 'delta_imp_aport_var13_1y3', 'delta_imp_aport_var17_1y3', 'delta_imp_aport_var33_1y3', 'delta_imp_compra_var44_1y3', 'delta_imp_reemb_var17_1y3', 'delta_imp_trasp_var17_in_1y3', 'delta_imp_trasp_var33_in_1y3', 'delta_imp_venta_var44_1y3', 'delta_num_aport_var13_1y3', 'delta_num_aport_var17_1y3', 'delta_num_aport_var33_1y3', 'delta_num_compra_v

Let us also add the number of uniques values to the stats dataframe.

In [12]:
data_train_nobinary_feature_std_high = data_train_nobinary_feature.loc[:,list_name_attributes_to_check]
data_train_nobinary_feature_std_high_unique_values = data_train_nobinary_feature_std_high.apply(pd.Series.nunique)
stats_col_std_high = data_train_nobinary_feature_std_high.describe()
stats_col_std_high.loc[stats_col_std_high.shape[0]]=data_train_nobinary_feature_std_high_unique_values
stats_col_std_high.index.name ='stats'
stats_col_std_high=stats_col_std_high.rename({8: 'unique-value'}, axis='index')
stats_col_std_high.to_csv('data/stat_nobinary_attributes_std_high.csv')
stats_col_std_high.head(10)

Unnamed: 0_level_0,var3,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var41_comer_ult1,imp_op_var41_comer_ult3,imp_op_var41_efect_ult1,imp_op_var41_efect_ult3,imp_op_var41_ult1,imp_op_var39_efect_ult1,...,saldo_medio_var17_hace3,saldo_medio_var17_ult1,saldo_medio_var17_ult3,saldo_medio_var33_hace2,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
stats,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,-1523.199277,86.208265,72.363067,119.529632,68.803937,113.056934,68.20514,113.225058,137.242763,68.618087,...,36.46318,131.0316,109.2169,7.935824,12.21558,8.784074,31.505324,76.026165,56.614351,117235.8
std,39033.462364,1614.757313,339.315831,546.266294,319.605516,512.154823,531.897917,950.086398,697.712596,535.47375,...,8612.395,14956.53,13082.16,455.887218,783.207399,538.439211,2013.125393,4040.337842,2852.579397,182664.6
min,-999999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2
75%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3
max,238.0,210000.0,12888.03,21024.81,12888.03,16566.81,45990.0,131100.0,47598.09,45990.0,...,2368559.0,3998687.0,3525777.0,50003.88,138831.63,91778.73,438329.22,681462.9,397884.3,22034740.0
unique-value,208.0,596.0,7551.0,9099.0,7421.0,8961.0,331.0,454.0,8032.0,336.0,...,18.0,119.0,119.0,43.0,48.0,48.0,99.0,141.0,141.0,57736.0


The following attributes are the ones where I replace the max with the median.

In [13]:
s='delta_imp_aport_var13_1y3	delta_imp_aport_var17_1y3	delta_imp_aport_var33_1y3	delta_imp_compra_var44_1y3	delta_imp_reemb_var17_1y3	delta_imp_trasp_var17_in_1y3	delta_imp_trasp_var33_in_1y3	delta_imp_venta_var44_1y3	delta_num_aport_var13_1y3	delta_num_aport_var17_1y3	delta_num_aport_var33_1y3	delta_num_compra_var44_1y3	delta_num_reemb_var17_1y3	delta_num_trasp_var17_in_1y3	delta_num_trasp_var33_in_1y3	delta_num_venta_var44_1y3'
colums_4_replace = s.split('	')
print('Colums where the max value is replaced withe median\n',colums_4_replace)

Colums where the max value is replaced withe median
 ['delta_imp_aport_var13_1y3', 'delta_imp_aport_var17_1y3', 'delta_imp_aport_var33_1y3', 'delta_imp_compra_var44_1y3', 'delta_imp_reemb_var17_1y3', 'delta_imp_trasp_var17_in_1y3', 'delta_imp_trasp_var33_in_1y3', 'delta_imp_venta_var44_1y3', 'delta_num_aport_var13_1y3', 'delta_num_aport_var17_1y3', 'delta_num_aport_var33_1y3', 'delta_num_compra_var44_1y3', 'delta_num_reemb_var17_1y3', 'delta_num_trasp_var17_in_1y3', 'delta_num_trasp_var33_in_1y3', 'delta_num_venta_var44_1y3']


In [14]:
for colum_to_replace in colums_4_replace:
    max_value = data_train_nobinary_feature_clean[colum_to_replace].max()
    median_value = data_train_nobinary_feature_clean[colum_to_replace].median()
    print('Max value for %s is %d replace with %0.3f '%(colum_to_replace,max_value,median_value))
    data_train_nobinary_feature_clean[colum_to_replace].replace(to_replace=max_value,value=median_value,inplace=True)

Max value for delta_imp_aport_var13_1y3 is 9999999999 replace with 0.000 
Max value for delta_imp_aport_var17_1y3 is 9999999999 replace with 0.000 
Max value for delta_imp_aport_var33_1y3 is 9999999999 replace with 0.000 
Max value for delta_imp_compra_var44_1y3 is 9999999999 replace with 0.000 
Max value for delta_imp_reemb_var17_1y3 is 9999999999 replace with 0.000 
Max value for delta_imp_trasp_var17_in_1y3 is 9999999999 replace with 0.000 
Max value for delta_imp_trasp_var33_in_1y3 is 9999999999 replace with 0.000 
Max value for delta_imp_venta_var44_1y3 is 9999999999 replace with 0.000 
Max value for delta_num_aport_var13_1y3 is 9999999999 replace with 0.000 
Max value for delta_num_aport_var17_1y3 is 9999999999 replace with 0.000 
Max value for delta_num_aport_var33_1y3 is 9999999999 replace with 0.000 
Max value for delta_num_compra_var44_1y3 is 9999999999 replace with 0.000 
Max value for delta_num_reemb_var17_1y3 is 9999999999 replace with 0.000 
Max value for delta_num_trasp_

In [15]:
data_train_nobinary_feature_clean_unique_values = data_train_nobinary_feature_clean.apply(pd.Series.nunique)
stats_col_clean = data_train_nobinary_feature_clean.describe()
stats_col_clean.loc[stats_col_clean.shape[0]]=data_train_nobinary_feature_clean_unique_values
stats_col_clean.index.name ='stats'
stats_col_clean=stats_col_clean.rename({8: 'unique-value'}, axis='index')
stats_col_clean.to_csv('data/stat_nobinary_clean_std_values.csv')
stats_col_clean.head(10)

Unnamed: 0_level_0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
stats,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,2.716483,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,3.160715,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,9.447971,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,95.268204,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,8237.82,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0
unique-value,207.0,100.0,596.0,7551.0,9099.0,293.0,346.0,23.0,29.0,224.0,...,43.0,24.0,48.0,48.0,99.0,33.0,141.0,141.0,57736.0,2.0


In [16]:
matrix_features = data_train_nobinary_feature_clean.as_matrix()[:,:-1]
labels = data_train_nobinary_feature_clean['TARGET'].as_matrix()
f1_score_list,confusion_matrix_list = random_forest_classifer(matrix_features,labels)
#The score method used in this case is the F1 score, which is the harmonic mean of precision and recall#The sc 
F1_accuracy_str="F1 accuracy: %0.3f (+/- %0.3f)" % (np.mean(f1_score_list),
                                                    np.std(f1_score_list) * 2)
print(F1_accuracy_str)

F1 accuracy: 0.545 (+/- 0.005)


From the above result, we can deduce that this substitution has not changed the F1-accuracy, this was expected due to the random forest algorithm, which is robust to the outliers and it can handle different ranges of values.

<h2><center> Binning the Values </center></h2>

* There are different strategies for binning feature values (discretization techniques). In 2013, Garcia et al. have published a [*survey*](http://sci2s.ugr.es/sites/default/files/files/Teaching/GraduatesCourses/SIGE/Other/2013-Garcia-IEEETKDE.pdf) of the main methods.

* According to the survey, CAIM discretization algorithm is one of the algorithms that perform better in different experiments, so I decided to use it as discretization technique for this problem.

* These are some relevant links related to the CAIM discretization algorithm

 * [IEEE Paper](https://ieeexplore.ieee.org/document/1269594/)
 * [Paper](http://sci2s.ugr.es/keel/pdf/algorithm/articulo/2008-Tsai-IS.pdf)
 * [Code](https://github.com/lisette-espin/pychimerge)
 * [Python Libary used in my analysis](https://github.com/airysen/caimcaim)

* A test of the library can be found in the TestSnapshot.ipynb


In [18]:
caim = CAIMD()
# I use a balanced dataset to fit the data first
data_train_happy=data_train_nobinary_feature_clean.loc[data_train['TARGET']==0]
data_train_unhappy=data_train_nobinary_feature_clean.loc[data_train['TARGET']==1]
s_perc = 0.5
data_train_unhappy_s = data_train_unhappy.sample(int(s_perc*data_train_unhappy.shape[0]),random_state=21)
data_train_happy_s= data_train_happy.sample(data_train_unhappy_s.shape[0],random_state=32)
data_train_sample_homogenous_nobinary_feature_clean = pd.concat([data_train_happy_s,data_train_unhappy_s])

matrix_for_caim=data_train_sample_homogenous_nobinary_feature_clean.as_matrix()[:,:-1]
labels_form_caim = data_train_sample_homogenous_nobinary_feature_clean.as_matrix()[:,-1]
matrix_nobinary_features_normalized = caim.fit(matrix_for_caim, labels_form_caim)

full_matrix_for_caim = data_train_nobinary_feature_clean.as_matrix()[:,:-1]
full_matrix_for_caim_t = caim.transform(full_matrix_for_caim)

Categorical [19, 20, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 44, 57, 58, 59, 60, 61, 62, 68, 72, 75, 80, 83, 84, 89, 93, 94, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 120, 121, 122, 123, 124, 126, 127, 128, 129, 132, 133, 134, 135, 136, 137, 138, 140, 141, 142, 143, 144, 145, 146, 147, 148, 160, 161, 162, 163, 170, 178, 179, 183, 184, 185, 186, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226]
# 0  GLOBAL CAIM  381.42383292383295
# 1  GLOBAL CAIM  710.7997072579831
# 2  GLOBAL CAIM  384.2217307975822
# 3  GLOBAL CAIM  386.40198856043185
# 4  GLOBAL CAIM  384.1137890759839
# 5  GLOBAL CAIM  378.9474698909887
# 6  GLOBAL CAIM  378.1435251455292
# 7  GLOBAL CAIM  377.87537437603993
# 8  GLOBAL CAIM  378.3339995558517
# 9  GLOBAL CAIM  378.785133618467
# 10  GLOBAL CAIM  385.87097792487845
# 11  GLOBAL CAIM  386.1798225686889
# 12  GLOBAL CAIM  407.13431485654525
# 13  GLOBAL CAIM  410.028

In [19]:
# I save the result because the CAIM algorithm requires a bit of time to produce the results, so 
#I can load the file avoiding to run CAIM each time. 
np.save('data/matrix_nobinary_features_normalized',full_matrix_for_caim_t)

In [20]:
matrix_nobinary_features_normalized = np.load('data/matrix_nobinary_features_normalized.npy')

In [21]:
print(matrix_nobinary_features_normalized.shape)
print(data_train_nobinary_feature.shape)

(76020, 230)
(76020, 231)


In [22]:
labels = data_train_nobinary_feature['TARGET'].as_matrix()
matrix_nobinary_features_normalized_with_labels=np.hstack([matrix_nobinary_features_normalized,labels.reshape(-1,1)])
print(matrix_nobinary_features_normalized_with_labels.shape)

(76020, 231)


In [23]:
data_train_nobinary_clean_normalized=pd.DataFrame(matrix_nobinary_features_normalized_with_labels,columns=data_train_nobinary_feature.columns.values)
print(data_train_nobinary_clean_normalized.shape)

(76020, 231)


In [24]:
matrix_features = data_train_nobinary_clean_normalized.as_matrix()[:,:-1]
labels = data_train_nobinary_clean_normalized['TARGET'].as_matrix()
f1_score_list,confusion_matrix_list = random_forest_classifer(matrix_features,labels)
#The score method used in this case is the F1 score, which is the harmonic mean of precision and recall#The sc 
F1_accuracy_str="F1 accuracy: %0.3f (+/- %0.3f)" % (np.mean(f1_score_list),
                                                    np.std(f1_score_list) * 2)
F1_accuracy = np.mean(f1_score_list)
print(F1_accuracy_str)

F1 accuracy: 0.565 (+/- 0.006)


* From the above result, we can deduce that this substitution has improved a bit the F1-accuracy, this was expected due to the random forest algorithm, which is robust to the outliers and it can handle different ranges of values.

* The binning strategy is more relevant for the normalization steps required by other classification algorithms.



Let us check the range of values after the binning operation 

In [45]:
data_train_nobinary_clean_normalized_unique_values = data_train_nobinary_clean_normalized.apply(pd.Series.nunique)
stats2 = data_train_nobinary_clean_normalized.describe()
stats2.loc[stats2.shape[0]]=data_train_nobinary_clean_normalized_unique_values
stats2.index.name ='stats'
stats2.head(10)

Unnamed: 0_level_0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
stats,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,0.009603,0.585359,15.805637,0.031728,1.151985,0.682305,0.004038,0.087069,0.137451,1.170639,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,50.090754,44.156333,2371.553,0.039569
std,0.866896,0.817429,1371.96287,0.175277,131.206207,64.302427,0.06342,23.937571,27.639643,82.067548,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,3929.170431,2816.61893,156466.3,0.194945
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,238.0,105.0,210000.0,1.0,21024.81,8237.82,1.0,6600.0,6600.0,8237.82,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0
8,3.0,4.0,12.0,2.0,7.0,11.0,2.0,3.0,4.0,19.0,...,43.0,24.0,48.0,48.0,99.0,33.0,22.0,36.0,39.0,2.0


In [46]:
stats2=stats2.rename({8: 'unique-value'}, axis='index')
stats2.to_csv('data/stat_nobinary_attributes_clean_normalized_with_caim.csv')

In [50]:
data_train_nobinary_clean_normalized_scaled = data_train_nobinary_clean_normalized.copy()

for colum_to_replace in data_train_nobinary_clean_normalized_scaled.columns.tolist():
    if colum_to_replace!='TARGET':
        scaler = MinMaxScaler(feature_range=(-1, 1))
        values = data_train_nobinary_clean_normalized_scaled[colum_to_replace].as_matrix().reshape(-1,1)
        values_s = scaler.fit_transform(values)
        data_train_nobinary_clean_normalized_scaled[colum_to_replace]=values_s  

In [51]:
data_train_nobinary_clean_normalized_scaled_unique_values_2 =  data_train_nobinary_clean_normalized_scaled.apply(pd.Series.nunique)
stats3 =  data_train_nobinary_clean_normalized_scaled.describe()
stats3.loc[stats3.shape[0]]= data_train_nobinary_clean_normalized_scaled_unique_values_2
stats3.index.name ='stats'
stats3.head(10)

Unnamed: 0_level_0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
stats,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,-0.999919,-0.98885,-0.999849,-0.936543,-0.99989,-0.999834,-0.991923,-0.999974,-0.999958,-0.999716,...,-0.999683,-0.999866,-0.999824,-0.999809,-0.999856,-0.999849,-0.999853,-0.999778,-0.999785,0.039569
std,0.007285,0.01557,0.013066,0.350555,0.012481,0.015612,0.126841,0.007254,0.008376,0.019925,...,0.018234,0.01118,0.011283,0.011733,0.009185,0.011991,0.011532,0.014158,0.014202,0.194945
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
25%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
50%,-1.0,-0.980952,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
75%,-1.0,-0.980952,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,3.0,4.0,12.0,2.0,7.0,11.0,2.0,3.0,4.0,19.0,...,43.0,24.0,48.0,48.0,99.0,33.0,22.0,36.0,39.0,2.0


In [52]:
stats3=stats3.rename({9: 'unique-value'}, axis='index')
stats3.to_csv('data/stat_nobinary_attributes_clean_normalized_with_caim_min_max.csv')

In [53]:
matrix_features = data_train_nobinary_clean_normalized_scaled.as_matrix()[:,:-1]
labels = data_train_nobinary_clean_normalized_scaled['TARGET'].as_matrix()
f1_score_list,confusion_matrix_list = random_forest_classifer(matrix_features,labels)
#The score method used in this case is the F1 score, which is the harmonic mean of precision and recall#The sc 
F1_accuracy_str="F1 accuracy: %0.3f (+/- %0.3f)" % (np.mean(f1_score_list),
                                                    np.std(f1_score_list) * 2)
F1_accuracy = np.mean(f1_score_list)
print(F1_accuracy_str)

F1 accuracy: 0.561 (+/- 0.016)


In [54]:
print('Extract (randomly) one confusion matrix (Real vs Prediction) from the previous run: ')
shuffle(confusion_matrix_list,random_state=15)
cnf_matrix = confusion_matrix_list[0]
dataframe=pd.DataFrame(cnf_matrix,index=['Real happy',' Real unhappy'],columns=['Predicted happy',' Predicted unhappy'])
pd.set_option('display.float_format', lambda x: '%.4f' % x)
print(dataframe)

Extract (randomly) one confusion matrix (Real vs Prediction) from the previous run: 
               Predicted happy   Predicted unhappy
Real happy              0.8670              0.1330
 Real unhappy           0.5060              0.4940


In [55]:
data_train_nobinary_clean_normalized_scaled.to_csv('data/dataframe_train_4_classification.csv')