<p style = "font-size: 120%">The purpose of this notebook is firstly to split the data into training, validation and testing sets. Secondly, we make sure that the augmented samples are distributed into the same sets. Then, we perform 3 feature selection methods. The features that have been selected from these methods are used from murmur_classification.ipynb.</p>

In [None]:
import pandas as pd
import numpy as np
import feature_extraction_ML as fe
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif

1) Load murmur_dataset.csv (contains all features that have been extracted).
2) Get the augmented samples to a different dataframe.
3) Drop 'raw' audio
4) There are very few nan and +/- inf values, replace them with 0. 

In [2]:
data = pd.read_csv('./murmor_dataset.csv') 
print(f"Classes' distribution:")
print(data.groupby('MURMUR').count()['Patient_ID'])
duplicates = data[data.duplicated(['Patient_ID'],keep=False)]
duplicates = duplicates.sort_values(by=['Patient_ID'])
data.drop_duplicates(subset=['Patient_ID'],keep=False, inplace=True)
print(f'dataframe without duplicate samples shape: {data.shape}')
print(f'augmented positive samples dataframe shape: {duplicates.shape}')
y = data.MURMUR
y = y.replace({'Present':1,'Absent':0})
#data = data.drop(columns=['Patient_ID', 'AV', 'MV', 'PV', 'TV','MURMUR'])
data = data.fillna(0)
data.replace([np.inf, -np.inf], 0, inplace=True)

Classes' distribution:
MURMUR
Absent     457
Present    301
Name: Patient_ID, dtype: int64
dataframe without duplicate samples shape: (482, 134)
augmented positive samples dataframe shape: (276, 134)


# Data splitting

<p style = "font-size: 120%">Split non-augmented data to 3 datasets. Training, validation and test sets.
We will use the training set in order to fit our classifiers.
We will use the validation set for hyperparameter tuning and overfitting avoidance.
The test set will be used for the final evaluation of our hypothesis. 
We won't make any choice about our classifier or the features that will be used using this set. </p>

In [3]:
X_train, X_test, y_train, y_test  = train_test_split(data, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val   = train_test_split(X_train, y_train, test_size=0.25, random_state=1) 

In [9]:
X_train[X_train.MURMUR == 'Present']

Unnamed: 0,Patient_ID,AV,MV,PV,TV,MURMUR,mean_ae_AV,mean_ae_MV,mean_ae_PV,mean_ae_TV,...,TV_mfcc_4,TV_mfcc_5,TV_mfcc_6,TV_mfcc_7,TV_mfcc_8,TV_mfcc_9,TV_mfcc_10,TV_mfcc_11,TV_mfcc_12,TV_mfcc_13
381,84853,[-0.32071152 -0.3690945 -0.4061405 ... 0.03...,[0.02689058 0.02877489 0.02915622 ... 0.020653...,[ 2.5454190e-01 2.8530437e-01 3.0490834e-01 ...,[-0.04431367 -0.04773 -0.04849122 ... 0.00...,Present,0.114988,0.059698,0.081672,0.099381,...,-45.099998,-6.469932,43.002464,26.521521,-10.089214,-6.099591,18.528557,15.840605,-6.780711,-10.43191
279,68740,[-0.00107041 0.00030819 0.0022726 ... -0.00...,[0.00575381 0.00347267 0.00015037 ... 0.002418...,[ 0.07834814 0.09137795 0.10214932 ... -0.00...,[-0.04281918 -0.04845003 -0.05216073 ... -0.13...,Present,0.108956,0.104725,0.120759,0.127204,...,-11.87351,1.969592,30.473291,19.694462,-4.756458,-1.894067,17.539341,18.654253,1.439033,-6.187788
264,68560,[-0.00690438 -0.00765683 -0.00772769 ... 0.00...,[-0.03061199 -0.03425114 -0.03647234 ... 0.00...,[-0.0007251 0.00046321 0.00184034 ... -0.16...,[-0.08235708 -0.09972838 -0.11564512 ... -0.03...,Present,0.124864,0.048924,0.115491,0.069486,...,-20.867231,-3.683221,30.017776,19.649952,-6.227643,-2.699888,17.6665,17.352932,-1.547783,-8.01897
305,69147,[0.87454104 0.9344029 0.93962663 ... 0.042954...,[0.02934257 0.02862482 0.02526757 ... 0.003145...,[-0.08232868 -0.09072251 -0.09473116 ... -0.08...,[ 0.04842271 0.04895305 0.04575125 ... -0.02...,Present,0.332488,0.06024,0.179405,0.085859,...,-12.862217,3.795122,32.966862,20.411263,-4.799927,-0.453283,19.924604,19.930161,2.12527,-3.390017
455,85037,[-0.05865807 -0.0609215 -0.05823147 ... -0.00...,[-0.04976786 -0.05097215 -0.04967351 ... -0.00...,[-0.13420086 -0.14631353 -0.15121481 ... -0.00...,[-0.04196195 -0.04097916 -0.03588885 ... 0.00...,Present,0.079657,0.079738,0.27851,0.047787,...,-19.111893,4.49892,38.960548,25.528564,-3.260988,-0.52184,19.892933,18.953281,-0.644779,-6.775635
313,73497,[ 0.00017026 0.00021305 0.00023874 ... -0.00...,[ 0.06155743 0.06246849 0.05889456 ... -0.05...,[-0.01367015 -0.01503068 -0.01580867 ... -0.02...,[-0.01996365 -0.02282994 -0.02499411 ... -0.00...,Present,0.093869,0.0913,0.096608,0.121561,...,-64.67968,-14.967181,44.527027,26.076262,-14.748684,-8.036978,21.482121,19.030283,-5.476946,-8.406257
476,85108,[-0.01744048 -0.023086 -0.02767903 ... 0.00...,[-0.00168675 -0.00141836 -0.00193858 ... -0.00...,[-0.03482354 -0.03961224 -0.04267656 ... -0.00...,[-0.17923222 -0.20948201 -0.2333751 ... 0.00...,Present,0.092911,0.13736,0.081934,0.21313,...,-9.735985,2.917276,28.876076,24.232521,5.527267,4.577996,16.363583,15.984452,2.506456,-3.658571
494,85133,[-0.01909152 -0.01901845 -0.017233 ... 0.00...,[-0.02606167 -0.03656613 -0.04664699 ... 0.01...,[0.05460768 0.05433484 0.04779535 ... 0.001005...,[0.013046 0.01394327 0.0140164 ... 0.005751...,Present,0.102855,0.251927,0.130682,0.078561,...,-38.359058,0.5387,47.293655,30.685221,-5.388805,-2.971128,19.522072,16.628063,-4.965974,-8.16896
470,85087,[-0.0496546 -0.0540709 -0.05482747 ... -0.00...,[-0.5183226 -0.4917284 -0.41388342 ... 0.06...,[0.02650809 0.02906571 0.0299975 ... 0.011182...,[-0.04289301 -0.0567697 -0.0707407 ... 0.01...,Present,0.112458,0.278309,0.157376,0.594548,...,-6.002911,3.282756,28.628666,19.701778,-1.92553,-1.687784,10.781678,8.918666,-3.892886,-6.473047
297,69096,[-0.3428956 -0.38172892 -0.4039996 ... -0.00...,[0.03114605 0.03512637 0.03768527 ... 0.009606...,[0.04595495 0.04875618 0.04901313 ... 0.000965...,[0.0048005 0.00676966 0.00905633 ... 0.018801...,Present,0.112199,0.16319,0.102265,0.203634,...,-51.546177,-15.826929,35.350727,22.853411,-11.308782,-7.21162,17.79497,17.36075,-3.381328,-8.284492


In [4]:
print(f'Training dataset size = {X_train.shape} , {round(X_train.shape[0]/data.shape[0], 2)} %')
print(f'Validation dataset size = {X_val.shape}  , {round(X_val.shape[0]/data.shape[0] , 2)} %')
print(f'Test dataset size = {X_test.shape} , {round(X_test.shape[0]/data.shape[0], 2)} %')

Training dataset size = (288, 128) , 0.6 %
Validation dataset size = (97, 128)  , 0.2 %
Test dataset size = (97, 128) , 0.2 %


<p style = "font-size: 120%">Now, we need to distribute the augmented samples to the 3 datasets. We are doing so, in order to make sure that samples coming from the same patients have been assigned ton the same dataset. By using the split_augmented_samples() method we define also the percentage of positive samples we would like to add to each dataset.</p>

In [5]:
X_to_train,y_to_train, X_to_val, y_to_val, X_to_test, y_to_test = fe.split_augmented_samples(duplicates,train_percentage=.8, validation_percentage=.1, test_percentage=.1)

92.0 unique patients' samples have been duplicated, by a factor of 3
74 unique patients' samples to add to training dataset
9 unique patients' samples to add to validation dataset
9 unique patients' samples to add to testing dataset


In [6]:
X_train = X_train.append(X_to_train)
y_train = y_train.append(y_to_train)

X_val = X_val.append(X_to_val)
y_val = y_val.append(y_to_val)

X_test = X_test.append(X_to_test)
y_test = y_test.append(y_to_test)

In [7]:
total = X_train.shape[0] + X_val.shape[0] + X_test.shape[0]
print(f'total samples : {total}')
print(f'final training set size : {X_train.shape[0]} samples, {round(X_train.shape[0]/total,4) * 100} %')
print(f'final validation set size : {X_val.shape[0]} samples, {round(X_val.shape[0]/total,4) * 100} %')
print(f'final testing set size : {X_test.shape[0]} samples, {round(X_test.shape[0]/total,4) * 100} %')

total samples : 758
final training set size : 510 samples, 67.28 %
final validation set size : 124 samples, 16.36 %
final testing set size : 124 samples, 16.36 %


In [8]:
print(f'positive sample rate in training set : {round(y_train.sum()/y_train.shape[0],2)}')
print(f'positive sample rate in validation set : {round(y_val.sum()/y_val.shape[0],2)}')
print(f'positive sample rate in testing set : {round(y_test.sum()/y_test.shape[0], 2)}')

positive sample rate in training set : 0.47
positive sample rate in validation set : 0.27
positive sample rate in testing set : 0.24


In [9]:
X_train.to_csv("../train_val_test_datasets/X_train.csv", index=False)
X_val.to_csv("../train_val_test_datasets/X_val.csv", index=False)
X_test.to_csv("../train_val_test_datasets/X_test.csv", index=False)

y_train.to_csv("../train_val_test_datasets/y_train.csv", index=False)
y_val.to_csv("../train_val_test_datasets/y_val.csv", index=False)
y_test.to_csv("../train_val_test_datasets/y_test.csv", index=False)

# Feature Selection using Lasso Logistic Regression

<p style = "font-size: 120%">We combine training and validation sets and use Cross Validation technique in the grid searches.</p>

In [10]:
X_train = X_train.append(X_val)
y_train = y_train.append(y_val)

In [16]:
scaler = StandardScaler()
logistic = LogisticRegression(max_iter=10000, penalty= 'l1', solver='liblinear')
pipe = Pipeline(steps=[("scaler", scaler), ("logistic", logistic)])
param_grid = {
    "logistic__C": [0.1,0.2,0.6,1,1.6,2,4,10,20],
}

grid_search = GridSearchCV(pipe,  
                           param_grid,
                           scoring = 'f1',
                           cv = 3,
                           verbose=0)


grid_search.fit(X_train, y_train)   
print(grid_search.best_params_)
coefficients = grid_search.best_estimator_.named_steps['logistic'].coef_[0]
importance = np.abs(coefficients)
print(f'lasso regression performed one kind of feature elimination (=0 value on the coefficients) on {np.array(data.columns)[importance == 0].shape[0]} features')
feat_importances = pd.Series(importance)
feat_importances = feat_importances[feat_importances >0]

{'logistic__C': 0.2}
lasso regression performed one kind of feature elimination (=0 value on the coefficients) on 81 features


save the selected features to a .txt file

In [None]:
feat_importances

In [None]:
imp_features = feat_importances.to_dict()
columns = data.columns
result = []
new_columns = []
for col, value in imp_features.items():
    result.append((columns[col],value))
    new_columns.append(columns[col])
print(f'{feat_importances.shape[0]} features have been selected')
with open(r'../important_features/logistic_regression_lasso.txt', 'w') as fp:
    for item in new_columns:
        fp.write("%s\n" % item)

#  Feature Selection using ANOVA F-value

In [13]:
selector = SelectKBest(f_classif, k=94)
selector.fit(X_train,y_train)
anova_result = selector.get_feature_names_out(columns)
with open(r'../important_features/anova.txt', 'w') as fp:
    for item in anova_result:
        fp.write("%s\n" % item)

# Recursive Feature Elimination

In [14]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

min_features_to_select = 20  # Minimum number of features to consider
clf = LogisticRegression()
cv = StratifiedKFold(5)

rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)
rfecv.fit(X_train, y_train)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 84


In [15]:
rfe = rfecv.get_feature_names_out(columns)
with open(r'../important_features/rfe.txt', 'w') as fp:
    for item in rfe:
        fp.write("%s\n" % item)