In [21]:
import pandas as pd
import numpy as np
import src.functions as src
from scipy import stats
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit

features = "dataset2_features.csv"
features = pd.read_csv(features, sep=',', header=0, index_col=None)
Y = features[['labels']]

X_all = features.drop('labels', axis=1)
all_pre= ['3_','4_','6_','7_','10_','11_','12_','15_','16_','18_','25_','26_','27_','28_','29_','30_','33_','34_']
X_all_pre=X_all[[col for col in X_all.columns if any(col.startswith(prefix) for prefix in all_pre)]]
all_post=  ['1_','2_','5_','7_','8_','9_','12_','13_','14_','17_','19_','20_','21_','22_','23_','24_','31_','32_','35_','36_']
X_all_post=X_all[[col for col in X_all.columns if any(col.startswith(prefix) for prefix in all_post)]]
print('All Features:')
print('Total: ' + str(len(X_all.columns))) 
print('Pre: ' + str(len(X_all_pre.columns)))
print('Post: ' + str(len(X_all_post.columns)))

theta_prefixes= ['1_','2_','3_','4_','5_','6_','7_','8_','9_','10_','11_','12_','13_','14_','15_','16_','17_','18_']
X_theta= X_all[[col for col in X_all.columns if any(col.startswith(prefix) for prefix in theta_prefixes)]]
theta_pre= ['3_','4_','6_','7_','10_','11_','12_','15_','16_','18_']
X_theta_pre= X_theta[[col for col in X_theta.columns if any(col.startswith(prefix) for prefix in theta_pre)]]
theta_post= ['1_','2_','5_','7_','8_','9_','12_','13_','14_','17_']
X_theta_post= X_theta[[col for col in X_theta.columns if any(col.startswith(prefix) for prefix in theta_post)]]
print('Theta Features:')
print('Total: ' + str(len(X_theta.columns)))
print('Pre: ' + str(len(X_theta_pre.columns)))
print('Post: ' + str(len(X_theta_post.columns)))

frequency_prefixes= ['1_','2_','3_','4_','5_','6_','7_','8_','9_','10_','11_','12_','13_','14_','15_','16_','17_','18_','19_','20_','21_','22_','23_','24_','25_','26_','27_','28_','29_','30_']
X_frequency= X_all[[col for col in X_all.columns if any(col.startswith(prefix) for prefix in frequency_prefixes)]]
frequency_pre=['3_','4_','6_','7_','10_','11_','12_','15_','16_','18_','25_','26_','27_','28_','29_','30_']
X_frequency_pre= X_frequency[[col for col in X_frequency.columns if any(col.startswith(prefix) for prefix in frequency_pre)]]
frequency_post=['1_','2_','5_','7_','8_','9_','12_','13_','14_','17_','19_','20_','21_','22_','23_','24_']
X_frequency_post= X_frequency[[col for col in X_frequency.columns if any(col.startswith(prefix) for prefix in frequency_post)]]
print('Frequency Features:')
print('Total: ' + str(len(X_frequency.columns)))
print('Pre: ' + str(len(X_frequency_pre.columns)))
print('Post: ' + str(len(X_frequency_post.columns)))

temporal_prefixes= ['31_','32_','33_','34_','35_','36_']
X_temporal= X_all[[col for col in X_all.columns if any(col.startswith(prefix) for prefix in temporal_prefixes)]]
temporal_pre= ['33_','34_']
X_temporal_pre= X_temporal[[col for col in X_temporal.columns if any(col.startswith(prefix) for prefix in temporal_pre)]] 
temporal_post= ['31_','32_','35_','36_']
X_temporal_post= X_temporal[[col for col in X_temporal.columns if any(col.startswith(prefix) for prefix in temporal_post)]]
print('Temporal Features:')
print('Total: ' + str(len(X_temporal.columns)))
print('Pre: ' + str(len(X_temporal_pre.columns)))
print('Post: ' + str(len(X_temporal_post.columns)))


All Features:
Total: 1549
Pre: 804
Post: 806
Theta Features:
Total: 587
Pre: 324
Post: 324
Frequency Features:
Total: 1307
Pre: 684
Post: 684
Temporal Features:
Total: 242
Pre: 120
Post: 122


In [2]:
# Remove features with coefficient of variation < 0.2
variance = X_all.std()/X_all.mean()
low_variance = [i for i in variance.index if variance[i] < 0.2]
X1 = X_all.drop(low_variance, axis=1)
print(len(X1.columns))

y0 = Y[Y['labels'] == 0]
feat_corr = np.zeros(len(X1.columns))
feat_ttest = np.zeros(len(X1.columns))

Y = np.ravel(Y)
i = 0
splits = 100
sss = StratifiedShuffleSplit(n_splits=splits, test_size=0.3, random_state=0)
for train_index, test_index in sss.split(X1, Y):
    X_train, X_test = X1.iloc[train_index, :], X1.iloc[test_index, :]
    Y_train, Y_test = Y[train_index], Y[test_index]

    ind_y0 = [j for j in range(0, len(X_train)) if X_train.index[j] in y0.index]
    ind_y1 = [j for j in range(0, len(X_train)) if X_train.index[j] not in y0.index]
    X_ind_y0 = X_train.iloc[ind_y0, :]
    X_ind_y1 = X_train.iloc[ind_y1, :]

    corr = []

    # Remove correlated features
    for col1 in range(0, len(X1.columns)-1):
        for col2 in range(col1+1, len(X1.columns)):
            if abs(np.corrcoef(X_train[X1.columns[col1]], X_train[X1.columns[col2]])[0, 1]) > 0.9:
                if stats.ttest_ind(X_ind_y0[X1.columns[col1]], X_ind_y1[X1.columns[col1]]).pvalue < \
                        stats.ttest_ind(X_ind_y0[X1.columns[col2]], X_ind_y1[X1.columns[col2]]).pvalue:
                    if col2 not in corr:
                        feat_corr[col2] += 1
                        corr.append(col2)
                    elif col1 not in corr:
                        feat_corr[col1] += 1
                        corr.append(col1)

    # Relevance: t-test between independent variables and output
    ttest = [stats.ttest_ind(X_ind_y0[col], X_ind_y1[col])[1] for col in X1.columns]
    ttest_order = np.argsort(ttest)
    feat_ttest += np.argsort(ttest_order)  # min(p-value) -> +0, max(p-value) -> +len(X.columns)-1

    i += 1

print(len(list(X1.columns[feat_corr > splits/2])))
X1 = X1.drop(list(X1.columns[feat_corr > splits/2]), axis=1)  # Remove correlated features
feat_ttest = feat_ttest[feat_corr <= splits/2]

X1 = X1.drop(X1.columns[np.argsort(np.argsort(feat_ttest)) >= 40], axis=1)
feat_ttest = feat_ttest[np.argsort(np.argsort(feat_ttest)) < 40]
Xcol_sort = [x for _, x in sorted(zip(feat_ttest, X1.columns))]
print(Xcol_sort)

1484
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
29
['1_0', '7_0', '8_F1', '12_F1', '8_F3', '8_FZ', '12_FZ', '12_AF3', '8_FC1', '8_F2', '8_AF3', '12_F3', '21_F1', '17_FT7', '33_CZ', '21_FC1', '8_FC2', '21_FCZ', '21_F3', '8_F4', '21_FZ', '21_CZ', '12_FC1', '12_F4', '12_F2', '35_CZ', '8_FC3', '8_POZ', '35_CPZ', '35_C1', '17_F1', '20_PO4', '8_TP8', '18_FT7', '8_AF4', '21_F2', '12_FC3', '8_OZ', '10_P4', '27_P4']


In [5]:
f1score = []
precision = []
recall = []
specificity = []
npv = []
bal_acc = []

splits = 100
sss = StratifiedShuffleSplit(n_splits=splits, test_size=0.3, random_state=0)
svm = SVC(class_weight='balanced')
scaler = preprocessing.StandardScaler()
X_col = X_all.columns
Y=np.array(Y)

for train_index, test_index in sss.split(X_all, Y):
    X_train, X_test = X_all.iloc[train_index, :], X_all.iloc[test_index, :]
    Y_train, Y_test = Y[train_index], Y[test_index]
    scal = scaler.fit(X_train)
    X_train = scal.transform(X_train)  # Variables standardization
    X_test = scal.transform(X_test)  # Variables standardization
    X_train = pd.DataFrame(X_train, columns=X_col)
    X_test = pd.DataFrame(X_test, columns=X_col)
    clf = svm.fit(X_train, Y_train)
    Y_predicted = clf.predict(X_test)
    f1score.append(f1_score(Y_test, Y_predicted))
    precision.append(precision_score(Y_test, Y_predicted))  # Precision = Positive predictive value
    npv.append(precision_score(Y_test, Y_predicted, pos_label=0))  # Negative predictive value
    recall.append(recall_score(Y_test, Y_predicted))  # Recall = Sensitivity
    specificity.append(recall_score(Y_test, Y_predicted, pos_label=0))
    bal_acc.append(balanced_accuracy_score(Y_test, Y_predicted))

# print(np.mean(f1score))
# print(np.mean(precision))
print(np.mean(recall))
print(np.std(recall))
print(np.mean(specificity))
print(np.std(specificity))
# print(np.mean(npv))
print(np.mean(bal_acc))
print(np.std(bal_acc))

0.5740753424657534
0.027829025046090826
0.7290807017543861
0.014642915377759783
0.6515780221100697
0.013140676809983404


In [4]:
from importlib import reload
reload(src)

<module 'src.functions' from 'c:\\Users\\User\\Documents\\GitHub\\Tese\\src\\functions.py'>

In [24]:
from mrmr import mrmr_classif

selected_features = mrmr_classif(X=X_all, y=Y, K=30)
combined_df=pd.DataFrame()
for feature in selected_features:
    combined_df=pd.concat([combined_df, X_all[feature]], axis=1)
X=combined_df
src.classification(30,X,Y)
print("Done")

100%|██████████| 30/30 [00:13<00:00,  2.27it/s]


In [12]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
import scipy.signal as signal
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score, balanced_accuracy_score, confusion_matrix
import pandas as pd
from mrmr import mrmr_classif

selected_features = mrmr_classif(X=X_all, y=Y, K=30)
combined_df=pd.DataFrame()
for feature in selected_features:
    combined_df=pd.concat([combined_df, X_all[feature]], axis=1)
X=combined_df


param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'poly']
}
grid = GridSearchCV(SVC(class_weight='balanced'), param_grid, refit=True, verbose=2, n_jobs=-1)
grid.fit(X, Y)

# Print the best parameters found
print(f"Best parameters found: {grid.best_params_}")

# Now use the best parameters for Stratified Shuffle Split
best_params = grid.best_params_
svm = SVC(class_weight='balanced', **best_params)

f1score = []
precision = []
recall = []
specificity = []
npv = []
bal_acc = []

splits = 100
sss = StratifiedShuffleSplit(n_splits=splits, test_size=0.3, random_state=0)
scaler = StandardScaler()
X_col = X.columns
Y = np.array(Y)

for train_index, test_index in sss.split(X, Y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    Y_train, Y_test = Y[train_index], Y[test_index]
    scal = scaler.fit(X_train)
    X_train = scal.transform(X_train)  # Variables standardization
    X_test = scal.transform(X_test)  # Variables standardization
    X_train = pd.DataFrame(X_train, columns=X_col)
    X_test = pd.DataFrame(X_test, columns=X_col)
    clf = svm.fit(X_train, Y_train)
    Y_predicted = clf.predict(X_test)
    f1score.append(f1_score(Y_test, Y_predicted))
    precision.append(precision_score(Y_test, Y_predicted))  # Precision = Positive predictive value
    npv.append(precision_score(Y_test, Y_predicted, pos_label=0))  # Negative predictive value
    recall.append(recall_score(Y_test, Y_predicted))  # Recall = Sensitivity
    specificity.append(recall_score(Y_test, Y_predicted, pos_label=0))
    bal_acc.append(balanced_accuracy_score(Y_test, Y_predicted))

print("Number of features used: "+ str(X.shape[1]))
print("Mean Sensitivity: {:.4f}".format(np.mean(recall)))
print("Std Sensitivity: {:.4f}".format(np.std(recall)))
print("Mean Specificity: {:.4f}".format(np.mean(specificity)))
print("Std Specificity: {:.4f}".format(np.std(specificity)))
print("Mean Balanced Accuracy: {:.4f}".format(np.mean(bal_acc)))
print("Std Balanced Accuracy: {:.4f}".format(np.std(bal_acc)))

100%|██████████| 30/30 [00:13<00:00,  2.23it/s]


Fitting 5 folds for each of 30 candidates, totalling 150 fits


KeyboardInterrupt: 

In [None]:
bal_acc #teste de proporções, testar quao acima do chance level está
#ROC curve
#R^2
#
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'poly']
}