# After feature sorting (code 17), see the correlation between top features.

Then eliminate the highly correlated ones.

In [31]:
import numpy as np
import mne
from scipy import signal
from scipy.interpolate import RectBivariateSpline
from mne.filter import resample, filter_data
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from lspopt import spectrogram_lspopt
from matplotlib.colors import Normalize, ListedColormap

import logging
# Lookup from level name to the matching `logging` level constant.
LOGGING_TYPES = dict(DEBUG=logging.DEBUG, INFO=logging.INFO, WARNING=logging.WARNING,
                     ERROR=logging.ERROR, CRITICAL=logging.CRITICAL)
# Handle to the logger registered under the name 'yasa'.
logger = logging.getLogger('yasa')

# IPython magic selecting the "qt" matplotlib backend; presumably figures open in
# separate Qt windows instead of inline (needs a Qt binding installed) — TODO confirm.
%matplotlib qt


In [32]:
# Load the reference table from CSV, using its "name" column as the index.
reference_df = pd.read_csv("reference_df.csv", index_col="name")
# Preview the first three rows.
reference_df.head(3)

Unnamed: 0_level_0,hypno,df_feat,eeg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P18_N3 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N3 L.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...
P18_N2 R,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N2 R.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...
P17_N2 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P17_N2 L.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...


In [33]:
# Load the feature-ranking table from CSV, using its "method_name" column as the
# index (one row per ranking method, one column per feature).
rankings_df = pd.read_csv("rankings_df aug.csv", index_col="method_name")
# Preview the first three rows.
rankings_df.head(3)

Unnamed: 0_level_0,ab,sb,ag,sg,lziv,iqr,bs,ta_b,gs,alpha,...,median,mean_psd,E,WEn,ds,mean_distance,diffEnt,renyi,skew,mean
method_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f_classif,2.0,1.0,3.0,4.0,6.0,9.0,7.0,5.0,15.0,8.0,...,61.0,64.0,66.0,62.0,71.0,63.0,67.0,69.0,72.0,73.0
chiSqr,1.0,2.0,4.0,8.0,7.0,6.0,9.0,12.0,3.0,10.0,...,69.0,66.0,65.0,70.0,62.0,71.0,68.0,67.0,72.0,73.0


In [34]:
# Show the feature names, i.e. the column labels of the rankings table.
rankings_df.columns

Index(['ab', 'sb', 'ag', 'sg', 'lziv', 'iqr', 'bs', 'ta_b', 'gs', 'alpha',
       'tb', 'ga', 'gb', 'ad', 'ba', 'sigma', 'bubbleEnt1', 'at',
       'spec_entropy', 'bubbleEnt2', 'higuchi', 'sd', 'iqr_psd', 'tg', 'std',
       'beta', 'bg', 'nzc', 'petrosian', 'gamma', 'asi', 'gt', 'perm_entropy',
       'sa', 'sample_entropy', 'theta', 'hcomp_psd', 'bt', 'st', 'td', 'gb_da',
       'gd', 'ta_ab', 'delta', 'bd', 'svd_entropy', 'hmob_psd', 'hmob',
       'app_entropy', 'skew_psd', 'kurt_psd', 'ta', 'hcomp', 'dfa', 'db', 'dg',
       'central_tendency_measure', 'std_psd', 'ts', 'katz', 'kurt', 'da', 'dt',
       'median', 'mean_psd', 'E', 'WEn', 'ds', 'mean_distance', 'diffEnt',
       'renyi', 'skew', 'mean'],
      dtype='object')

In [35]:
# load features

### to load features for augmented eeg:
# Take the feature-file path of the first row of `reference_df` and swap its
# extension for the " aug.csv" suffix.
base_feat_loc = reference_df.iloc[0].df_feat
# Strip only the final extension: `split(".")[0]` would cut the path at its FIRST
# dot and break on paths such as "./feature/x.csv" or names containing dots.
df_feat_loc = base_feat_loc.rsplit(".", 1)[0] + " aug.csv"
df_feat = pd.read_csv(df_feat_loc, index_col=False)

# Correlation Analysis of Top features

In [36]:
def pearson_corr(df, output_var="hmob", thresh=0.8):
    """Report the columns of ``df`` strongly Pearson-correlated with one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose pairwise column correlations are computed via ``df.corr()``.
    output_var : str
        Name of the column every other column is compared against.
    thresh : float
        Minimum absolute correlation (inclusive) for a column to be reported.

    Returns
    -------
    pandas.Series
        Absolute correlations with ``output_var`` for the columns that reach
        ``thresh``, indexed by column name. The same series is printed.
    """
    # Magnitude of the correlation of every column with the target column
    target_abs_corr = df.corr().loc[:, output_var].abs()

    # Keep only the entries at or above the threshold
    passes_thresh = target_abs_corr >= thresh
    relevant_features = target_abs_corr.loc[passes_thresh]

    # Header line, the series itself, then one blank separator line
    print(f"RELEVANT ONES to {output_var}:", relevant_features, "", sep="\n")

    return relevant_features


def pearson_correlation_plot(df, thresh=0.95):
    """Draw a heatmap of the absolute Pearson correlations of ``df`` that reach ``thresh``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are correlated pairwise via ``df.corr()``.
    thresh : float
        Cells whose absolute correlation is below this value are drawn as 0.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned.
    """
    # Only magnitudes are plotted, so drop the sign up front.
    abs_corr = df.corr().abs()
    # Zero every cell below the threshold. NaN cells stay NaN because
    # `NaN < thresh` is False, matching the previous two-step masking.
    abs_corr = abs_corr.mask(abs_corr < thresh, 0)

    # Report the real number of plotted features instead of a hard-coded "30".
    n_features = abs_corr.shape[0]

    plt.figure(figsize=(12, 10))
    plt.suptitle(
        f"Pearson Correlation Plot for Top {n_features} Features (thresh = {thresh})"
    )
    g = sns.heatmap(
        abs_corr, cmap=plt.cm.Reds, xticklabels=True, yticklabels=True
    )
    # Small tick labels so every feature name fits on the axes
    g.set_xticklabels(g.get_xmajorticklabels(), fontsize=7)
    g.set_yticklabels(g.get_ymajorticklabels(), fontsize=7)
    plt.tight_layout()
    # plt.savefig(f"fs_pearson_correlation_top_{n_features}_thresh_{thresh}.png", format="png")
    # plt.savefig(f"fs_pearson_correlation_top_{n_features}_thresh_{thresh}.svg", format="svg")
    plt.show()


# pearson_correlation_plot(df_feat.loc[:, top_features], thresh=0.7)


In [37]:
# Greedy redundancy filter: walk the features in ranking-table column order
# (presumably best-ranked first — confirm against how rankings_df was built) and,
# for each feature still kept, drop every later feature whose absolute Pearson
# correlation with it reaches the threshold.
#
# `df_feat` is deliberately NOT overwritten here, so this cell can be re-run
# without a KeyError from requesting columns that a previous run had dropped.
CORR_THRESH = 0.90

top_features = rankings_df.columns.values.tolist()
position = 0
while position < len(top_features):
    feature = top_features[position]
    # Only the current feature and those ranked after it are candidates
    candidates = top_features[position:]
    highly_corr_feats = pearson_corr(
        df_feat.loc[:, candidates], output_var=feature, thresh=CORR_THRESH
    )
    # Exclude the feature itself by name rather than by position
    redundant = set(highly_corr_feats.index) - {feature}
    # Build a new list instead of removing items from the one being walked
    top_features = top_features[: position + 1] + [
        name for name in top_features[position + 1:] if name not in redundant
    ]
    position += 1
    

RELEVANT ONES to ab:
ab            1.000000
ag            0.965444
bubbleEnt1    0.901810
bubbleEnt2    0.911902
Name: ab, dtype: float64

RELEVANT ONES to sb:
sb    1.00000
sg    0.95578
Name: sb, dtype: float64

RELEVANT ONES to lziv:
lziv    1.0
Name: lziv, dtype: float64

RELEVANT ONES to iqr:
iqr    1.0
Name: iqr, dtype: float64

RELEVANT ONES to bs:
bs    1.0
Name: bs, dtype: float64

RELEVANT ONES to ta_b:
ta_b    1.000000
tb      0.973815
tg      0.964516
Name: ta_b, dtype: float64

RELEVANT ONES to gs:
gs    1.0
Name: gs, dtype: float64

RELEVANT ONES to alpha:
alpha    1.000000
ad       0.986048
Name: alpha, dtype: float64

RELEVANT ONES to ga:
ga    1.000000
ba    0.928849
Name: ga, dtype: float64

RELEVANT ONES to gb:
gb    1.0
Name: gb, dtype: float64

RELEVANT ONES to sigma:
sigma    1.000000
sd       0.982264
Name: sigma, dtype: float64

RELEVANT ONES to at:
at    1.0
Name: at, dtype: float64

RELEVANT ONES to spec_entropy:
spec_entropy    1.0
Name: spec_entropy, dtype: 

In [38]:
# Heatmap of the absolute correlations among the features left in `top_features`;
# cells below 0.7 are drawn as zero.
pearson_correlation_plot(df_feat.loc[:, top_features], thresh=0.7)

In [39]:
# Full ranked feature list versus the list that survived correlation filtering
top_feat_initial = rankings_df.columns.values.tolist()
top_feat_updated = top_features

# One line per original feature: check mark if it was kept, cross if it was dropped
for i, feat in enumerate(top_feat_initial):
    status = "✅" if feat in top_feat_updated else "❌"
    print(f"{i}- {feat}: {status}")


0- ab: ✅
1- sb: ✅
2- ag: ❌
3- sg: ❌
4- lziv: ✅
5- iqr: ✅
6- bs: ✅
7- ta_b: ✅
8- gs: ✅
9- alpha: ✅
10- tb: ❌
11- ga: ✅
12- gb: ✅
13- ad: ❌
14- ba: ❌
15- sigma: ✅
16- bubbleEnt1: ❌
17- at: ✅
18- spec_entropy: ✅
19- bubbleEnt2: ❌
20- higuchi: ✅
21- sd: ❌
22- iqr_psd: ✅
23- tg: ❌
24- std: ✅
25- beta: ✅
26- bg: ✅
27- nzc: ✅
28- petrosian: ❌
29- gamma: ✅
30- asi: ✅
31- gt: ✅
32- perm_entropy: ❌
33- sa: ✅
34- sample_entropy: ✅
35- theta: ✅
36- hcomp_psd: ✅
37- bt: ❌
38- st: ✅
39- td: ❌
40- gb_da: ❌
41- gd: ❌
42- ta_ab: ✅
43- delta: ✅
44- bd: ❌
45- svd_entropy: ✅
46- hmob_psd: ✅
47- hmob: ❌
48- app_entropy: ❌
49- skew_psd: ✅
50- kurt_psd: ❌
51- ta: ❌
52- hcomp: ✅
53- dfa: ❌
54- db: ✅
55- dg: ❌
56- central_tendency_measure: ✅
57- std_psd: ✅
58- ts: ❌
59- katz: ❌
60- kurt: ✅
61- da: ✅
62- dt: ✅
63- median: ✅
64- mean_psd: ❌
65- E: ❌
66- WEn: ✅
67- ds: ❌
68- mean_distance: ✅
69- diffEnt: ✅
70- renyi: ✅
71- skew: ✅
72- mean: ✅


In [46]:
# write top features into a file
# Persist the surviving feature names, one name per line
with open('top_features_correlation_90.txt', 'w') as out_file:
    out_file.writelines(name + '\n' for name in top_feat_updated)

# Read the names back, stripping the trailing newline from each line
with open('top_features_correlation_90.txt', 'r') as in_file:
    top_feat = []
    for raw_line in in_file:
        top_feat.append(raw_line.rstrip('\n'))


In [47]:
# Number of feature names read back from the file
len(top_feat)

46