In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import feature_extraction_ML as fe

warnings.filterwarnings('ignore')

1) Load murmur_dataset.csv (contains all the features that have been extracted).
2) Move the augmented samples into a separate dataframe.
3) Drop the 'raw' audio.
4) There are very few NaN and +/- inf values; replace them with 0.

In [2]:
# Load the extracted-feature table, set aside every row whose Patient_ID occurs
# more than once, build the binary target, and clean the feature matrix.
data = pd.read_csv('../murmor_dataset.csv')

print(f"Classes' distribution:")
print(data.groupby('MURMUR').count()['Patient_ID'])

# Boolean mask marking every row that shares its Patient_ID with another row.
repeated_patient = data.duplicated(['Patient_ID'], keep=False)

# Rows of repeated patients go to their own frame, ordered by patient;
# `data` keeps only the patients that appear exactly once.
duplicates = data[repeated_patient].sort_values(by=['Patient_ID'])
data = data[~repeated_patient]

print(f'dataframe without duplicate samples shape: {data.shape}')
print(f'augmented positive samples dataframe shape: {duplicates.shape}')

# Binary target: 'Present' -> 1, 'Absent' -> 0.
y = data['MURMUR'].replace({'Present':1,'Absent':0})

# Remove the identifier, the four AV/MV/PV/TV columns and the label, then
# replace missing and infinite values with 0.
data = (
    data
    .drop(columns=['Patient_ID', 'AV', 'MV', 'PV', 'TV','MURMUR'])
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

Classes' distribution:
MURMUR
Absent     457
Present    203
Name: Patient_ID, dtype: int64
dataframe without duplicate samples shape: (488, 134)
augmented positive samples dataframe shape: (172, 134)


# Data splitting

Split the non-augmented data into 3 datasets: training, validation and test sets.
We will use the training set to fit our classifiers.
We will use the validation set for hyperparameter tuning.
The test set will be used for the final evaluation of our hypothesis.
We won't use this set to make any choice about our classifier or about which features will be used.

In [3]:
# Step 1: hold out 20% of the non-augmented samples as the test set.
X_remaining, X_test, y_remaining, y_test = train_test_split(
    data, y, test_size=0.2, random_state=1
)
# Step 2: 25% of the remaining 80% (i.e. 20% of all samples) becomes the
# validation set; the other 60% is the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_remaining, y_remaining, test_size=0.25, random_state=1
)

In [4]:
# Report each split's shape and its share of the non-augmented data.
# The share is printed as a real percentage: the raw fraction used to be
# printed next to the '%' sign (e.g. "0.6 %" for 60 %).
print(f'Training dataset size = {X_train.shape} , {100 * X_train.shape[0] / data.shape[0]:.0f} %')
print(f'Validation dataset size = {X_val.shape}  , {100 * X_val.shape[0] / data.shape[0]:.0f} %')
print(f'Test dataset size = {X_test.shape} , {100 * X_test.shape[0] / data.shape[0]:.0f} %')

Training dataset size = (292, 128) , 0.6 %
Validation dataset size = (98, 128)  , 0.2 %
Test dataset size = (98, 128) , 0.2 %


Now we need to distribute the augmented samples across the 3 datasets. We do so in order to make sure that samples coming from the same patient are assigned to the same dataset. A full explanation can be found in the report. The split_augmented_samples() method also lets us define the percentage of positive samples we would like to add to each dataset.

In [5]:
# Distribute the repeated-patient samples over the three splits
# (80% / 10% / 10%) using the project helper.
(
    X_to_train, y_to_train,
    X_to_val, y_to_val,
    X_to_test, y_to_test,
) = fe.split_augmented_samples(
    duplicates,
    train_percentage=0.8,
    validation_percentage=0.1,
    test_percentage=0.1,
)

86.0 unique patients' samples have been duplicated, by a factor of 2
70 unique patients' samples to add to training dataset
8 unique patients' samples to add to validation dataset
8 unique patients' samples to add to testing dataset


In [6]:
# Add the augmented samples to their assigned splits.
# pd.concat replaces DataFrame.append / Series.append, which were deprecated
# in pandas 1.4 and removed in pandas 2.0; the row order (existing rows first,
# then the added ones) and the original index labels are unchanged.
# NOTE(review): assumes the X_to_* objects are DataFrames and the y_to_*
# objects are Series, as returned by fe.split_augmented_samples — confirm.
X_train = pd.concat([X_train, X_to_train])
y_train = pd.concat([y_train, y_to_train])

X_val = pd.concat([X_val, X_to_val])
y_val = pd.concat([y_val, y_to_val])

X_test = pd.concat([X_test, X_to_test])
y_test = pd.concat([y_test, y_to_test])

In [7]:
# Row counts of the final (augmented) splits and their share of all samples.
n_train, n_val, n_test = (split.shape[0] for split in (X_train, X_val, X_test))
total = n_train + n_val + n_test

train_share = round(n_train / total, 4) * 100
val_share = round(n_val / total, 4) * 100
test_share = round(n_test / total, 4) * 100

print(f'total samples : {total}')
print(f'final training set size : {n_train} samples, {train_share} %')
print(f'final validation set size : {n_val} samples, {val_share} %')
print(f'final testing set size : {n_test} samples, {test_share} %')

total samples : 660
final training set size : 432 samples, 65.45 %
final validation set size : 114 samples, 17.27 %
final testing set size : 114 samples, 17.27 %


In [8]:
# Fraction of positive (label 1) samples in each split, rounded to 2 decimals.
train_positive_rate = round(y_train.sum() / y_train.shape[0], 2)
val_positive_rate = round(y_val.sum() / y_val.shape[0], 2)
test_positive_rate = round(y_test.sum() / y_test.shape[0], 2)

print(f'positive sample rate in training set : {train_positive_rate}')
print(f'positive sample rate in validation set : {val_positive_rate}')
print(f'positive sample rate in testing set : {test_positive_rate}')

positive sample rate in training set : 0.36
positive sample rate in validation set : 0.18
positive sample rate in testing set : 0.23


In [9]:
# Persist every split (features first, then labels) without the index column.
split_destinations = (
    (X_train, "../train_val_test_datasets/X_train.csv"),
    (X_val, "../train_val_test_datasets/X_val.csv"),
    (X_test, "../train_val_test_datasets/X_test.csv"),
    (y_train, "../train_val_test_datasets/y_train.csv"),
    (y_val, "../train_val_test_datasets/y_val.csv"),
    (y_test, "../train_val_test_datasets/y_test.csv"),
)
for split, destination in split_destinations:
    split.to_csv(destination, index=False)

# Feature Selection using Lasso Logistic Regression

We combine the training and validation sets and use cross-validation in the grid searches.

In [10]:
# Fold the validation split into the training split for the cross-validated
# searches that follow.
# pd.concat replaces DataFrame.append / Series.append, which were deprecated
# in pandas 1.4 and removed in pandas 2.0.
# Run this cell once per kernel session: it reassigns X_train / y_train, so
# re-running it would add the validation rows a second time.
X_train = pd.concat([X_train, X_val])
y_train = pd.concat([y_train, y_val])

In [11]:
# L1-penalised ("lasso") logistic regression on standardised features.
# The grid search picks C by 4-fold cross-validated F1.
scaler = StandardScaler()
logistic = LogisticRegression(penalty='l1', solver='liblinear', max_iter=100000)
pipe = Pipeline([("scaler", scaler), ("logistic", logistic)])

# Candidate values for the inverse regularisation strength C.
# NOTE(review): the recorded output of this cell reports C = 3, which is not
# in this grid — the saved output presumably predates the current grid; re-run
# to refresh it.
param_grid = {"logistic__C": [0.1, 0.2, 0.6, 1, 1.6, 2, 4, 10, 20, 100]}

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',
    cv=4,
    verbose=0,
)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# Absolute coefficient of the best model = importance of each feature;
# a coefficient of exactly 0 means the L1 penalty eliminated that feature.
best_logistic = grid_search.best_estimator_.named_steps['logistic']
coefficients = best_logistic.coef_[0]
importance = np.abs(coefficients)

eliminated_count = np.array(data.columns)[importance == 0].shape[0]
print(f'lasso regression performed one kind of feature elimination (=0 value on the coefficients) on {eliminated_count} features')

# Keep only the surviving features; the Series index is the column position.
feat_importances = pd.Series(importance)
feat_importances = feat_importances[feat_importances > 0]

{'logistic__C': 3}
lasso regression performed one kind of feature elimination (=0 value on the coefficients) on 34 features


Save the selected features to a .txt file.

In [12]:
# Translate the positional indices of the surviving coefficients back to
# column names and write one name per line.
columns = data.columns
imp_features = feat_importances.to_dict()

# (column name, importance) pairs, in the order of the importance Series.
result = [(columns[position], weight) for position, weight in imp_features.items()]
new_columns = [column_name for column_name, _ in result]

print(f'{feat_importances.shape[0]} features have been selected')
with open(r'../important_features/logistic_regression_lasso.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in new_columns)

94 features have been selected


#  Feature Selection using ANOVA F-value

In [33]:
# Keep the 94 features with the highest ANOVA F-value and write their names,
# one per line.
# NOTE(review): 94 equals the number of features the lasso step reported as
# selected — presumably chosen to match; confirm if the lasso result changes.
selector = SelectKBest(score_func=f_classif, k=94)
selector.fit(X_train, y_train)

anova_result = selector.get_feature_names_out(columns)
with open(r'../important_features/anova.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in anova_result)

# Recursive Feature Elimination

In [36]:
# Recursive feature elimination with 5-fold stratified cross-validation:
# one feature is removed per step (never going below 20) and the feature count
# with the best cross-validated accuracy is kept.
# NOTE(review): unlike the lasso step, the estimator here is fitted on
# unscaled features with default settings, and warnings are silenced globally,
# so any convergence warning would be hidden — consider standardising first.
min_features_to_select = 20  # Minimum number of features to consider
clf = LogisticRegression()
cv = StratifiedKFold(5)

rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)
rfecv.fit(X_train, y_train)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 66


In [39]:
# Names of the features retained by RFECV, written one per line.
rfe = rfecv.get_feature_names_out(columns)
with open(r'../important_features/rfe.txt', 'w') as fp:
    fp.writelines("%s\n" % item for item in rfe)