# Extracting features 

* Analyzing features
* Compute ROC-AUC single features 
* Statictical tests
* Selecting Features:
    * Discarded features with correlation >= 0.75
    * Discarded features with p-value > 0.05

# 1) Computing ROC_AUC and Z-scores

In [1]:
from sklearn.metrics import roc_auc_score
from scipy.stats import zscore

def feat_rocauc(df, features=None, apply_sort=True):
    rows = []
    if features is None:
        features = [col for col in df.columns if col not in ['target', 'fn']]
    for feat in features:
        roc_auc = roc_auc_score(df['target'], df[feat])
        roc_auc = max(roc_auc, 1 - roc_auc)
        
        d = {
            'feature': feat,
            'roc_auc': roc_auc,
            'mean_difference': df[feat][df['target'] == 1].mean() - df[feat][df['target'] == 0].mean()
        }
        rows.append(d)
    res = pd.DataFrame(rows)
    if apply_sort:
        res = res.sort_values('roc_auc', ascending=False)
    return res

def calc_zscore(df, outpath, features=None):
    if features is None:
        features = [col for col in df.columns if col not in ['target', 'fn']]
    
    dzs=[]
    for feat in features:
        dz = zscore(df[feat])
        dzs.append(dz)
    
    dz = pd.DataFrame(dzs).transpose()
    dz.columns = features
    dz['target'] = df['target']
    dz.to_csv('{}/zscores.csv'.format(outpath), index=False)
    return dz

In [3]:
import pandas as pd

path = '../mldataset'

df_features = pd.read_csv(path+'/allfeatures.csv')
df_features['target'] = df_features['target'].apply(lambda x: 0 if x=='healthy' else 1 if x=='trauma' else None)
rocdata = feat_rocauc(df_features)
zdata = calc_zscore(df_features, path)

In [4]:
df_features

Unnamed: 0,coh_nofilt_fp1_fp2,coh_nofilt_fp1_f7,coh_nofilt_fp1_f3,coh_nofilt_fp1_fz,coh_nofilt_fp1_f4,coh_nofilt_fp1_f8,coh_nofilt_fp1_t3,coh_nofilt_fp1_c3,coh_nofilt_fp1_c4,coh_nofilt_fp1_t4,...,psi_alpha_o2_t3,psi_alpha_o2_c3,psi_alpha_o2_c4,psi_alpha_o2_t4,psi_alpha_o2_t5,psi_alpha_o2_p3,psi_alpha_o2_pz,psi_alpha_o2_p4,psi_alpha_o2_t6,psi_alpha_o2_o1
0,0.849397,0.877339,0.857365,0.864015,0.795604,0.715263,0.611386,0.674974,0.672999,0.609560,...,1.137330,-2.535512,2.851893,-2.413355,-7.886247,-3.430358,-2.940278,4.755336,-2.372887,-6.151342
1,0.682347,0.795053,0.775378,0.657702,0.641092,0.713337,0.647718,0.536908,0.380917,0.541196,...,-0.069698,-0.223584,2.826703,-5.645081,-0.023241,-0.928270,1.025139,2.125412,0.000000,-0.129469
2,0.897231,0.847988,0.861601,0.828406,0.752612,0.744819,0.581250,0.791440,0.517047,0.518814,...,5.719148,4.841813,-4.843460,-2.764540,-1.046704,-2.972760,-6.569496,-2.928576,-0.443500,-0.663475
3,0.755840,0.844175,0.833709,0.886321,0.793390,0.741746,0.808783,0.861057,0.766067,0.809645,...,1.910294,0.064580,1.450907,-1.034041,-0.911907,-1.693191,-1.263543,-3.622760,-2.166221,1.226381
4,0.639640,0.906704,0.887078,0.892046,0.588968,0.631242,0.710337,0.635790,0.448039,0.564691,...,2.343783,-0.019518,-0.481569,1.608233,0.394639,0.247216,0.442689,0.863781,-0.006938,-0.153289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,0.396877,0.656498,0.576064,0.494556,0.420211,0.428643,0.563125,0.661773,0.500596,0.401992,...,-134.552255,-148.020175,-174.643489,-128.884761,-141.926243,-110.660944,-153.216632,-11.502160,-156.376620,-173.192321
181,0.462236,0.467286,0.384045,0.431943,0.400531,0.356827,0.378937,0.367412,0.380954,0.357517,...,-15.981099,-18.816348,-18.183475,-11.601831,-7.541968,-16.935812,-17.919610,-14.430107,-6.426578,-7.388289
182,0.833737,0.924463,0.834045,0.797401,0.716631,0.788221,0.774200,0.666065,0.550072,0.700391,...,5.923263,7.676919,19.279992,37.209661,3.557597,3.770691,4.755026,0.985402,3.500394,0.911847
183,0.611631,0.875651,0.906435,0.882049,0.470444,0.513214,0.730145,0.792010,0.396715,0.473648,...,-26.785091,51.345199,-32.453846,-37.773608,70.624488,-18.838973,-11.220369,-21.244065,-10.816045,-16.059869


In [5]:
rocdata

Unnamed: 0,feature,roc_auc,mean_difference
1253,bands_beta_t3,0.810360,0.008131
1017,env_beta_t3_c4,0.806555,-0.128339
37,coh_nofilt_f7_t3,0.800117,-0.124482
343,coh_beta_f7_t3,0.799532,-0.125516
1170,env_theta_t3_c4,0.784021,-0.154765
...,...,...,...
537,coh_theta_f4_c4,0.500439,0.002242
555,coh_theta_f8_o1,0.500293,-0.003045
1307,psi_alpha_f4_fp2,0.500293,43.552703
209,coh_alpha_f3_p3,0.500293,0.001285


In [6]:
zdata

Unnamed: 0,coh_nofilt_fp1_fp2,coh_nofilt_fp1_f7,coh_nofilt_fp1_f3,coh_nofilt_fp1_fz,coh_nofilt_fp1_f4,coh_nofilt_fp1_f8,coh_nofilt_fp1_t3,coh_nofilt_fp1_c3,coh_nofilt_fp1_c4,coh_nofilt_fp1_t4,...,psi_alpha_o2_c3,psi_alpha_o2_c4,psi_alpha_o2_t4,psi_alpha_o2_t5,psi_alpha_o2_p3,psi_alpha_o2_pz,psi_alpha_o2_p4,psi_alpha_o2_t6,psi_alpha_o2_o1,target
0,0.987489,0.211688,0.100064,0.443818,1.594748,0.466620,-0.746230,0.027259,2.449322,0.104547,...,0.059372,0.090258,0.054598,-0.196693,0.011475,0.024303,1.057359,-0.099844,-0.090393,1
1,-0.441101,-0.625986,-0.615480,-1.616411,0.103185,0.448982,-0.463462,-0.924556,-1.262475,-0.497315,...,0.067530,0.089964,0.038321,0.044965,0.059036,0.098705,0.529726,0.039642,0.073762,1
2,1.396558,-0.087108,0.137040,0.088233,1.179735,0.737234,-0.980773,0.830163,0.467471,-0.694360,...,0.085402,0.000318,0.052829,0.013510,0.020173,-0.043791,-0.484239,0.013572,0.059205,1
3,0.187400,-0.125921,-0.106394,0.666568,1.573383,0.709097,0.790078,1.310097,3.632033,1.866053,...,0.068546,0.073884,0.061545,0.017653,0.044496,0.055763,-0.623511,-0.087696,0.110722,1
4,-0.806318,0.510624,0.359390,0.723732,-0.399993,-0.302688,0.023888,-0.242873,-0.409493,-0.290471,...,0.068250,0.051298,0.074854,0.057808,0.081380,0.087776,0.276609,0.039234,0.073113,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,-2.882393,-2.036478,-2.355005,-3.245579,-2.029074,-2.157704,-1.121838,-0.063750,0.258411,-1.722835,...,-0.453940,-1.984240,-0.582405,-4.316207,-2.026831,-2.795284,-2.204330,-9.152708,-4.643877,0
181,-2.323450,-3.962662,-4.030864,-3.870831,-2.219054,-2.815252,-2.555350,-2.093037,-1.262012,-2.114385,...,0.001929,-0.155595,0.008318,-0.186112,-0.245246,-0.256749,-2.791755,-0.338134,-0.124111,0
182,0.853562,0.691410,-0.103458,-0.221383,0.832397,1.134626,0.520924,-0.034158,0.887150,0.904200,...,0.095405,0.282264,0.254169,0.155016,0.148357,0.168687,0.301010,0.245407,0.102148,0
183,-1.045851,0.194501,0.528330,0.623907,-1.544150,-1.383359,0.178055,0.834091,-1.061720,-1.091997,...,0.249479,-0.322382,-0.123502,2.216214,-0.281422,-0.131054,-4.158818,-0.596162,-0.360496,0


In [8]:
# From ROC-AUC > 0.7

col_lst = rocdata.feature[rocdata.roc_auc > 0.7]

data_roc = df_features[col_lst]
data_roc['fn'] = df_features['fn'].values
data_roc['target'] = df_features.target.values
data_roc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,bands_beta_t3,env_beta_t3_c4,coh_nofilt_f7_t3,coh_beta_f7_t3,env_theta_t3_c4,coh_nofilt_fp1_f3,env_theta_c3_t4,env_nofilt_t3_c4,env_beta_f7_t3,env_beta_t3_t4,...,env_theta_p4_o1,env_alpha_c3_t6,bands_beta_f7,env_beta_fp1_t3,env_nofilt_t3_t5,env_beta_fp2_t4,env_beta_f8_c4,env_alpha_t3_t6,fn,target
0,0.006494,0.209763,0.735782,0.667533,0.276645,0.857365,0.277616,0.178191,0.312612,0.183950,...,0.508074,0.230874,0.005105,0.243268,0.297026,0.437217,0.568807,0.211406,00b2d6e257e2f615.csv,1
1,0.011056,0.041008,0.740455,0.765772,0.207531,0.775378,0.259119,0.264171,0.585330,0.307613,...,0.342553,0.604461,0.012776,0.394411,0.617678,0.458256,0.241849,0.910290,09769097749fb286.csv,1
2,0.012101,0.148892,0.684884,0.727593,0.104273,0.861601,0.082459,0.143100,0.475297,0.248138,...,0.593786,0.311667,0.009095,0.337379,0.525999,0.336872,0.257847,0.482144,0b84dd748e7d5edd.csv,1
3,0.012688,0.167710,0.824584,0.630405,0.134421,0.833709,0.128373,0.154306,0.284954,0.322907,...,0.642076,0.111782,0.024153,0.492537,0.578808,0.070639,0.172966,0.129185,158ce5e17a662599.csv,1
4,0.012842,0.125339,0.785085,0.812996,0.286608,0.887078,0.394275,0.273084,0.700757,0.574683,...,0.551569,0.281395,0.011563,0.558177,0.285899,0.441406,0.247363,0.331207,17df70855fa4922a.csv,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,0.052195,0.266896,0.654506,0.698181,0.273795,0.576064,0.541565,0.200652,0.614004,0.620714,...,0.218715,0.653576,0.026020,0.353724,0.784301,0.628807,0.417615,0.561565,b5abacb75ebc40b8.csv,0
181,0.002670,0.528726,0.809818,0.863759,0.477249,0.384045,0.458986,0.410420,0.805238,0.669253,...,0.688086,0.440851,0.000920,0.209650,0.907689,0.247190,0.770076,0.630820,bb47addde80c51c0.csv,0
182,0.005858,0.537452,0.892363,0.910603,0.725392,0.834045,0.647903,0.562738,0.829301,0.830257,...,0.675698,0.606590,0.007597,0.660518,0.638949,0.546191,0.571439,0.744485,bc9ecd77d29ef5fb.csv,0
183,0.011235,0.184923,0.887011,0.883980,0.370937,0.906435,0.513153,0.141832,0.780307,0.565672,...,0.575972,0.477781,0.013584,0.546525,0.730775,0.207750,0.223607,0.630716,bcfaa165d982034b.csv,0


In [9]:
data_roc.to_csv('../mldataset/featuresROC.csv', index=False)

# 2) Statistical Analysis

* Paired sampled t-test 
  * Dependent sample t-test: univariate test that tests for a significant difference between 2 related variables. 

**H0 :- means difference between Trauma and Healthy is 0**

**H1:- mean difference between Trauma and Healthy is not 0**

* Z-test
  * Sample size > 30. 
  * Data points should be independent from each other (one data point doesn’t affect another one).
  * Data should be normally distributed. 
  * Your data should be randomly selected from a population, where each item has an equal chance of being selected.



### Paired sampled t-test

In [10]:
from scipy import stats

pstat1 = []
pstat2 = []
pstat3 = []
for i in df_features.drop(['fn','target'], axis=1).columns:
    ttest,pval = stats.ttest_rel(df_features[i].loc[:50], df_features[i].loc[51:101])
    if pval<0.05:
        pstat1.append('R')
    else:
        pstat1.append('A')

    ttest,pval = stats.ttest_rel(df_features[i].loc[:50], df_features[i].loc[101:151])
    if pval<0.05:
        pstat2.append('R')
    else:
        pstat2.append('A')

    ttest,pval = stats.ttest_rel(df_features[i].loc[:50], df_features[i].loc[134::])
    if pval<0.05:
        pstat3.append('R')
    else:
        pstat3.append('A')

paired = pd.DataFrame(zip(df_features.drop(['fn','target'], axis=1).columns, pstat1,pstat2,pstat3), columns=['features', 'Hyp-test-gr1','Hyp-test-gr2','Hyp-test-gr3'])

In [11]:
print('Rejected Features: ')
col_R = []
for i in paired.columns:
    try:
        print(paired['features'][paired[i] == 'R'].value_counts().sum())
        col_R.append(paired['features'][paired[i] == 'R'])
    except:
        continue

col_lst = list(set(col_R[1]) & set(col_R[2]) & set(col_R[3]))

Rejected Features: 
0
193
791
664


In [12]:
data_paired = df_features[col_lst]
data_paired['fn'] = df_features['fn'].values
data_paired['target'] = df_features.target.values
data_paired

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,env_beta_f3_fz,coh_beta_f3_t3,env_nofilt_c4_t5,env_theta_c3_o2,env_beta_t3_t4,env_theta_c4_p3,coh_beta_t3_t4,env_theta_p4_o1,env_beta_fp2_t3,coh_theta_t3_c4,...,env_beta_f8_t4,env_theta_t5_p4,env_theta_t3_p4,coh_theta_c4_t5,env_nofilt_p3_o1,env_beta_f7_t3,coh_beta_p4_o2,coh_theta_t5_p4,fn,target
0,0.847019,0.632954,0.324699,0.235976,0.183950,0.485218,0.581635,0.508074,0.208650,0.460960,...,0.632933,0.324051,0.275391,0.543355,0.569075,0.312612,0.883526,0.589208,00b2d6e257e2f615.csv,1
1,0.290819,0.610631,0.202487,0.537827,0.307613,0.150419,0.568414,0.342553,0.390412,0.400259,...,0.643856,0.284388,0.153838,0.386432,0.574524,0.585330,0.501182,0.492679,09769097749fb286.csv,1
2,0.628980,0.666819,0.163031,0.254988,0.248138,-0.062439,0.653731,0.593786,0.255404,0.629363,...,0.559096,0.432062,0.180939,0.595620,0.579706,0.475297,0.837673,0.794213,0b84dd748e7d5edd.csv,1
3,0.804738,0.754586,0.164764,0.256973,0.322907,0.220381,0.636662,0.642076,-0.032699,0.559110,...,0.132934,0.450122,0.285619,0.634620,0.691403,0.284954,0.622675,0.674047,158ce5e17a662599.csv,1
4,0.822643,0.631569,0.149241,0.373794,0.574683,0.125058,0.728432,0.551569,0.375855,0.525159,...,0.683366,0.477082,0.344213,0.476117,0.750143,0.700757,0.777689,0.600768,17df70855fa4922a.csv,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,0.575328,0.716489,0.238589,0.831327,0.620714,0.375121,0.461698,0.218715,0.564727,0.413854,...,0.515482,0.898029,0.723763,0.420014,0.276754,0.614004,0.663974,0.634926,b5abacb75ebc40b8.csv,0
181,0.472764,0.786597,0.340552,0.464775,0.669253,0.471414,0.731402,0.688086,0.204538,0.660907,...,0.955976,0.637146,0.635745,0.600799,0.644872,0.805238,0.819646,0.775096,bb47addde80c51c0.csv,0
182,0.923448,0.711283,0.489888,0.469352,0.830257,0.611209,0.913746,0.675698,0.541073,0.807364,...,0.816476,0.732724,0.611598,0.775935,0.782429,0.829301,0.889545,0.818091,bc9ecd77d29ef5fb.csv,0
183,0.914002,0.747107,0.158245,0.356712,0.565672,0.396136,0.719970,0.575972,0.163712,0.458871,...,0.502791,0.519837,0.408262,0.447566,0.806341,0.780307,0.849695,0.616185,bcfaa165d982034b.csv,0


In [13]:
data_paired.to_csv('../mldataset/featuresPaired.csv', index=False)

### Z-test

In [15]:
!pip install statsmodels

Defaulting to user installation because normal site-packages is not writeable
Collecting statsmodels
  Downloading statsmodels-0.12.0-cp36-cp36m-manylinux1_x86_64.whl (9.5 MB)
[K     |████████████████████████████████| 9.5 MB 2.4 MB/s eta 0:00:01
Collecting patsy>=0.5
  Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
[K     |████████████████████████████████| 231 kB 1.9 MB/s eta 0:00:01
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.12.0
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [16]:
from statsmodels.stats import weightstats as stests

pstat1 = []
pstat2 = []
pstat3 = []
for i in df_features.drop(['fn','target'], axis=1).columns:
    ztest ,pval = stests.ztest(df_features[i].loc[:50], df_features[i].loc[51:101])
    if pval<0.05:
        pstat1.append('R')
    else:
        pstat1.append('A')

    ztest ,pval = stests.ztest(df_features[i].loc[:50], df_features[i].loc[101:151])
    if pval<0.05:
        pstat2.append('R')
    else:
        pstat2.append('A')

    ztest ,pval = stests.ztest(df_features[i].loc[:50], df_features[i].loc[134::])
    if pval<0.05:
        pstat3.append('R')
    else:
        pstat3.append('A')

ztest = pd.DataFrame(zip(df_features.drop(['fn','target'], axis=1).columns, pstat1,pstat2,pstat3), columns=['features', 'Hyp-test-gr1','Hyp-test-gr2','Hyp-test-gr3'])

In [17]:
print('Rejected Features: ')
col_R = []
for i in ztest.columns:
    try:
        print(ztest['features'][ztest[i] == 'R'].value_counts().sum())
        col_R.append(ztest['features'][ztest[i] == 'R'])
    except:
        continue

col_lst = list(set(col_R[1]) & set(col_R[2]) & set(col_R[3]))

Rejected Features: 
0
181
793
687


In [18]:
data_ztest = df_features[col_lst]
data_ztest['fn'] = df_features['fn'].values
data_ztest['target'] = df_features.target.values
data_ztest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,env_beta_f3_fz,coh_beta_f3_t3,env_nofilt_c4_t5,env_theta_c3_o2,env_beta_t3_t4,env_theta_c4_p3,coh_beta_t3_t4,env_theta_p4_o1,env_beta_fp2_t3,coh_theta_c4_t6,...,env_beta_f8_t4,env_theta_t5_p4,env_theta_t3_p4,coh_theta_c4_t5,env_nofilt_p3_o1,env_beta_f7_t3,coh_beta_p4_o2,coh_theta_t5_p4,fn,target
0,0.847019,0.632954,0.324699,0.235976,0.183950,0.485218,0.581635,0.508074,0.208650,0.721391,...,0.632933,0.324051,0.275391,0.543355,0.569075,0.312612,0.883526,0.589208,00b2d6e257e2f615.csv,1
1,0.290819,0.610631,0.202487,0.537827,0.307613,0.150419,0.568414,0.342553,0.390412,0.427821,...,0.643856,0.284388,0.153838,0.386432,0.574524,0.585330,0.501182,0.492679,09769097749fb286.csv,1
2,0.628980,0.666819,0.163031,0.254988,0.248138,-0.062439,0.653731,0.593786,0.255404,0.662582,...,0.559096,0.432062,0.180939,0.595620,0.579706,0.475297,0.837673,0.794213,0b84dd748e7d5edd.csv,1
3,0.804738,0.754586,0.164764,0.256973,0.322907,0.220381,0.636662,0.642076,-0.032699,0.672534,...,0.132934,0.450122,0.285619,0.634620,0.691403,0.284954,0.622675,0.674047,158ce5e17a662599.csv,1
4,0.822643,0.631569,0.149241,0.373794,0.574683,0.125058,0.728432,0.551569,0.375855,0.619894,...,0.683366,0.477082,0.344213,0.476117,0.750143,0.700757,0.777689,0.600768,17df70855fa4922a.csv,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,0.575328,0.716489,0.238589,0.831327,0.620714,0.375121,0.461698,0.218715,0.564727,0.429550,...,0.515482,0.898029,0.723763,0.420014,0.276754,0.614004,0.663974,0.634926,b5abacb75ebc40b8.csv,0
181,0.472764,0.786597,0.340552,0.464775,0.669253,0.471414,0.731402,0.688086,0.204538,0.750011,...,0.955976,0.637146,0.635745,0.600799,0.644872,0.805238,0.819646,0.775096,bb47addde80c51c0.csv,0
182,0.923448,0.711283,0.489888,0.469352,0.830257,0.611209,0.913746,0.675698,0.541073,0.809107,...,0.816476,0.732724,0.611598,0.775935,0.782429,0.829301,0.889545,0.818091,bc9ecd77d29ef5fb.csv,0
183,0.914002,0.747107,0.158245,0.356712,0.565672,0.396136,0.719970,0.575972,0.163712,0.604931,...,0.502791,0.519837,0.408262,0.447566,0.806341,0.780307,0.849695,0.616185,bcfaa165d982034b.csv,0


In [19]:
data_ztest.to_csv('../mldataset/featuresZTEST.csv', index=False)

# 3) Correlation Features extraction

> Drop Features with correlation > 0.75

In [20]:
correlated_features = set()
correlation_matrix = df_features.corr()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) >= 0.75:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

print('%i features in train dataset have correlation >= 0.75' % (len(correlated_features)))
#print(correlated_features)

1344 features in train dataset have correlation >= 0.75


In [21]:
## Create a new dataset with those columns with correlation less than 0.75
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

corr = df_features.drop(['fn'], axis=1).corr()

columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >= 0.75:
            if columns[j]:
                columns[j] = False

selected_columns = df_features.drop(['fn'], axis=1).columns[columns]
data_corr = df_features[selected_columns]

In [22]:
data_corr['fn'] = df_features['fn']
data_corr.to_csv('../mldataset/featuresCORR.csv', index=False)
data_corr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,coh_nofilt_fp1_fp2,coh_nofilt_fp1_f7,coh_nofilt_fp1_f4,coh_nofilt_fp1_f8,coh_nofilt_fp1_c4,coh_nofilt_fp1_t4,coh_nofilt_fp1_t5,coh_nofilt_fp1_p4,coh_nofilt_fp1_t6,coh_nofilt_fp2_f4,...,psi_alpha_f7_fp1,psi_alpha_f3_fp1,psi_alpha_c3_fp2,psi_alpha_t5_t4,psi_alpha_p3_fz,psi_alpha_p3_t5,psi_alpha_pz_f3,psi_alpha_p4_fz,psi_alpha_o2_p4,fn
0,0.849397,0.877339,0.795604,0.715263,0.672999,0.609560,0.553537,0.605240,0.521541,0.809565,...,0.216391,3.112093,1.101696,5.187863,-0.239708,-2.076287,-6.989389,-8.970168,4.755336,00b2d6e257e2f615.csv
1,0.682347,0.795053,0.641092,0.713337,0.380917,0.541196,0.506529,0.426447,0.614262,0.606051,...,3.030619,-0.462982,-0.866468,-3.297603,1.469035,0.477261,2.056682,0.132953,2.125412,09769097749fb286.csv
2,0.897231,0.847988,0.752612,0.744819,0.517047,0.518814,0.560733,0.585119,0.529997,0.825504,...,-0.756763,-7.066790,6.460609,-1.795647,6.680764,-0.196549,8.369347,4.905203,-2.928576,0b84dd748e7d5edd.csv
3,0.755840,0.844175,0.793390,0.741746,0.766067,0.809645,0.836142,0.695082,0.724137,0.832163,...,-1.948080,-0.167501,-2.489858,-2.491709,-3.089818,0.241671,-1.394407,-1.104091,-3.622760,158ce5e17a662599.csv
4,0.639640,0.906704,0.588968,0.631242,0.448039,0.564691,0.605225,0.386214,0.484784,0.815439,...,-0.056151,1.982460,2.610120,1.689208,4.518065,-0.475950,2.085679,1.028894,0.863781,17df70855fa4922a.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,0.396877,0.656498,0.420211,0.428643,0.500596,0.401992,0.597901,0.426036,0.396817,0.372904,...,12.279593,-12.066417,14.962540,21.071078,-2.732024,-10.408949,1.002641,-126.399870,-11.502160,b5abacb75ebc40b8.csv
181,0.462236,0.467286,0.400531,0.356827,0.380954,0.357517,0.388330,0.362472,0.358965,0.398898,...,-1.016832,1.426892,1.880279,-3.556074,-0.778761,6.662844,-0.208786,-4.767681,-14.430107,bb47addde80c51c0.csv
182,0.833737,0.924463,0.716631,0.788221,0.550072,0.700391,0.615598,0.490563,0.643444,0.799767,...,-0.194475,-1.247829,-4.247688,32.584740,-1.576256,0.087189,4.365065,0.870656,0.985402,bc9ecd77d29ef5fb.csv
183,0.611631,0.875651,0.470444,0.513214,0.396715,0.473648,0.639057,0.370264,0.503734,0.724264,...,29.455904,26.989620,2.031076,-98.289081,77.557467,82.928221,35.931270,104.813118,-21.244065,bcfaa165d982034b.csv


# 4) Selecting Features based on p-value


* Selecting the columns based on how they affect the p-value. 

* Null hypothesis: “The selected combination of dependent variables do not have any effect on the independent variable”. 
    * Applying small regression model and calculate the p values. 
    * If the p values is higher than the threshold, we discard that combination of features.

In [31]:
# Done assuming the Correlation feature extraction!!

# Removing 'Target' column because it is the column we are trying to predict
selec_columns = df_features.drop(['fn'], axis=1).columns[columns]

selected_cols = []
for i in selec_columns:
    if i != 'target':
        selected_cols.append(i)

In [32]:
selected_columns = selected_cols

#import statsmodels.formula.api as sm
import statsmodels.regression.linear_model as sm

def backwardElimination(x, Y, sl, columns):
    numVars = len(x[0])

    for i in range(0, numVars):
        regressor_OLS = sm.OLS(Y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
                    columns = np.delete(columns, j)
                    
    regressor_OLS.summary()
    return x, columns

SL = 0.05
data_modeled, selected_columns = backwardElimination(data_corr.drop(['fn','target'],axis=1).values, data_corr['target'].values, SL, selected_columns)

In [33]:
result = pd.DataFrame()
result['target'] = data_corr['target']

data_pval = pd.DataFrame(data_modeled, columns=selected_columns)
data_pval

Unnamed: 0,coh_nofilt_fp1_c4,coh_nofilt_fp2_c3,coh_nofilt_f4_f8,coh_nofilt_f8_t4,coh_nofilt_t4_p4,coh_nofilt_t4_t6,coh_nofilt_t5_p3,coh_nofilt_t5_o1,coh_nofilt_p4_t6,coh_alpha_fp2_pz,...,coh_theta_f3_t3,env_nofilt_fp1_t4,env_nofilt_f4_c4,env_nofilt_f8_c4,env_nofilt_f8_t4,env_beta_fp1_c4,env_beta_fp2_t5,bands_gamma_fp1,bands_beta_t3,psi_alpha_fp2_fp1
0,0.672999,0.549203,0.851620,0.772885,0.724641,0.783557,0.900922,0.836930,0.868253,0.526640,...,0.557030,0.302625,0.630256,0.475648,0.654537,0.406085,0.188201,0.000176,0.006494,0.321819
1,0.380917,0.396774,0.812375,0.735646,0.533897,0.641441,0.805095,0.889171,0.485035,0.495919,...,0.610566,0.622319,0.409803,0.413256,0.795929,0.153925,0.239554,0.000265,0.011056,3.356577
2,0.517047,0.713608,0.790806,0.722790,0.652574,0.823771,0.732795,0.908678,0.746679,0.672865,...,0.756044,0.239469,0.373261,0.291227,0.630412,0.186785,0.396535,0.000048,0.012101,-1.559049
3,0.766067,0.749919,0.809885,0.781487,0.784892,0.831993,0.961486,0.954441,0.953717,0.566018,...,0.670649,0.404723,0.566994,0.259906,0.368198,0.310849,0.056654,0.000345,0.012688,0.190865
4,0.448039,0.371057,0.831079,0.741596,0.488869,0.633956,0.783253,0.842702,0.768984,0.557156,...,0.583260,0.526258,0.340932,0.320422,0.729032,0.156791,0.238352,0.001411,0.012842,-0.252957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,0.500596,0.392811,0.542254,0.423986,0.452564,0.530629,0.850935,0.524886,0.463810,0.341981,...,0.662200,0.275681,0.271957,0.329791,0.446227,0.673633,0.615834,0.007562,0.052195,-20.573506
181,0.380954,0.379634,0.763106,0.950383,0.892393,0.940121,0.885386,0.862981,0.848179,0.385140,...,0.807472,0.126474,0.727015,0.682744,0.948118,0.226216,0.214389,0.000061,0.002670,0.478146
182,0.550072,0.465025,0.859951,0.872154,0.681088,0.772996,0.948238,0.955194,0.766539,0.345600,...,0.707616,0.564721,0.479159,0.562449,0.820959,0.393832,0.277069,0.001029,0.005858,-1.519827
183,0.396715,0.363774,0.784897,0.669324,0.572603,0.676711,0.913620,0.919220,0.820510,0.396008,...,0.757037,0.144990,0.163735,0.146113,0.350062,0.127400,0.097446,0.001164,0.011235,19.136037


In [36]:
data_pval['fn'] = df_features['fn']
data_pval.to_csv('../mldataset/featuresPVAL.csv', index=False)