In [1]:
import warnings
# Suppress UserWarning and FutureWarning messages for the rest of the session.
for warn in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=warn)

import os
import time
import torch
import torchaudio

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, recall_score, precision_score, balanced_accuracy_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import scipy

from tqdm import tqdm
import librosa

from flaml import AutoML

In [2]:
# --- Audio / feature constants -------------------------------------------
SR = 8_000                      # sample rate used when loading audio and building the MFCC transform
SEQUENCE_LENGTH = 300 * SR      # fixed waveform length in samples (300 seconds at SR)
MFCC = 64                       # number of MFCC coefficients requested from the transform

# --- Reproducibility -----------------------------------------------------
SEED = 1984

np.random.seed(SEED)            # seeds NumPy's global random state
torch.manual_seed(SEED)         # seeds torch's default generator

# Stand-alone torch generator seeded with the same value.
gen = torch.Generator().manual_seed(SEED)

In [3]:
# Directory layout, all relative to the notebook's parent directory:
#   ../data/Voices_wav/Aphasia  and  ../data/Voices_wav/Norm
DATA_DIR = os.path.join('..', 'data')
VOICES_DIR = os.path.join(DATA_DIR, 'Voices_wav')
APHASIA_DIR, NORM_DIR = (
    os.path.join(VOICES_DIR, class_folder) for class_folder in ('Aphasia', 'Norm')
)

In [4]:
# Read the three split manifests from DATA_DIR, in train / val / test order.
train_data, val_data, test_data = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('train_filenames.csv', 'val_filenames.csv', 'test_filenames.csv')
)

In [5]:
def _resolve_audio_path(row):
    """Join a row's file name onto APHASIA_DIR when its label is 1, otherwise onto NORM_DIR."""
    class_dir = APHASIA_DIR if row['label'] == 1 else NORM_DIR
    return os.path.join(class_dir, row['file_name'])


# Overwrite the bare file names with full paths in every split.
# NOTE: running this cell twice would prefix the directory a second time.
for split_frame in (train_data, val_data, test_data):
    split_frame["file_name"] = split_frame.apply(_resolve_audio_path, axis=1)

In [6]:
# MFCC transform shared by every call to the preprocessing function.
# With SR = 8_000 the window is 10_000 samples (1.25 s) and the hop is 5_000 samples (0.625 s).
mfcc_class = torchaudio.transforms.MFCC(
    sample_rate=SR,
    n_mfcc=MFCC,
    log_mels=True,
    melkwargs={
        "n_fft": 20_000,
        "win_length": 10_000,
        "hop_length": 5_000,
        "n_mels": 200,
    },
)

In [7]:
def preprocess_function_mfcc(path):
    """Load one audio file and return its MFCC features as a flat 1-D array.

    The waveform is loaded with ``librosa.load`` at sample rate ``SR``, cut to at
    most ``SEQUENCE_LENGTH`` samples and zero-padded on the right up to exactly
    ``SEQUENCE_LENGTH`` samples before being passed through ``mfcc_class``.

    Args:
        path: Path of the audio file to load.

    Returns:
        numpy.ndarray: The output of ``mfcc_class`` flattened to one dimension.
    """
    # The sample rate returned by librosa is not needed: it was requested explicitly.
    waveform, _ = librosa.load(path, sr=SR)

    # Truncate first so the pad width computed below can never be negative.
    # assumes librosa returns a 1-D (mono) signal, its default — TODO confirm for all inputs
    waveform = waveform[..., :SEQUENCE_LENGTH]
    waveform = np.pad(waveform, (0, SEQUENCE_LENGTH - waveform.shape[0]), mode='constant')

    features = mfcc_class(torch.Tensor(waveform))
    # flatten() already yields a 1-D array, so the former trailing squeeze() was a no-op.
    return features.numpy().flatten()

In [8]:
# Compute the flattened MFCC vector for every file and store it in a new "mfcc" column.
for split_frame in (train_data, val_data, test_data):
    split_frame["mfcc"] = split_frame["file_name"].apply(preprocess_function_mfcc)

In [9]:
# Shuffle each split with an explicit seed. Without random_state, sklearn's shuffle
# draws from the global NumPy RNG, so the row order depends on how much of that
# stream earlier cells consumed and changes whenever this cell is re-run.
train_data = shuffle(train_data, random_state=SEED).reset_index(drop=True)
val_data = shuffle(val_data, random_state=SEED).reset_index(drop=True)
test_data = shuffle(test_data, random_state=SEED).reset_index(drop=True)

In [10]:
def custom_balanced_accuracy(
    X_val,
    y_val,
    estimator,
    labels,
    X_train,
    y_train,
    weight_val=None,
    weight_train=None,
    *args,
):
    """Custom metric callback that scores an estimator by balanced accuracy.

    Only ``X_val``, ``y_val``, ``estimator`` and ``weight_val`` are used; the
    remaining parameters are accepted but ignored.

    The predicted class for each row is the index of the largest value along the
    last axis of ``estimator.predict_proba(X_val)``.
    NOTE(review): this presumes ``y_val`` holds class indices 0..n-1 — confirm.

    Returns:
        tuple: ``(1 - balanced_accuracy, info)`` where ``info`` is a dict with
        ``"val_acc"`` (the balanced accuracy) and ``"pred_time"`` (wall-clock
        seconds spent in ``predict_proba`` divided by ``len(X_val)``).
    """
    t0 = time.time()
    probabilities = estimator.predict_proba(X_val)
    # Average prediction time per validation row.
    per_row_seconds = (time.time() - t0) / len(X_val)

    predicted_idx = np.argmax(probabilities, axis=-1)
    balanced_acc = balanced_accuracy_score(y_val, predicted_idx, sample_weight=weight_val)

    info = {
        "val_acc": balanced_acc,
        "pred_time": per_row_seconds,
    }
    return 1 - balanced_acc, info

In [11]:
# Candidate learners for both search stages (a fresh list is built for each call).
ESTIMATOR_NAMES = ('lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost')

# Stage 1: short search (150 s budget) whose per-estimator best configs seed stage 2.
pre_automl = AutoML()
pre_automl.fit(
    np.vstack(train_data["mfcc"]),
    train_data["label"],
    task="classification",
    time_budget=150,
    X_val=np.vstack(val_data["mfcc"]),
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=list(ESTIMATOR_NAMES),
    verbose=False,
)

# Stage 2: longer search (800 s budget) started from the stage-1 configurations.
automl = AutoML()
automl.fit(
    np.vstack(train_data["mfcc"]),
    train_data["label"],
    task="classification",
    time_budget=800,
    X_val=np.vstack(val_data["mfcc"]),
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=list(ESTIMATOR_NAMES),
    starting_points=pre_automl.best_config_per_estimator,
    verbose=False,
)

In [12]:
# Predict on the test split with the stage-2 AutoML model and report summary metrics.
preds = automl.predict(np.vstack(test_data["mfcc"]))

# Bind the label column to a name: reusing double quotes inside a double-quoted
# f-string (test_data["label"]) is a SyntaxError before Python 3.12 (PEP 701).
y_test = test_data["label"]

print(f"Accuracy: {accuracy_score(y_test, preds):.3f}")
print(f"Precision: {precision_score(y_test, preds):.3f}")
print(f"Recall: {recall_score(y_test, preds):.3f}")
print(f"F1 Score: {f1_score(y_test, preds):.3f}")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, preds):.3f}")

Accuracy: 0.860
Precision: 0.890
Recall: 0.931
F1 Score: 0.910
Balanced Accuracy: 0.787


In [13]:
# Per-class precision / recall / F1 and support for the current `preds` on the test split.
print(classification_report(test_data["label"], preds))

              precision    recall  f1-score   support

           0       0.75      0.64      0.69        42
           1       0.89      0.93      0.91       130

    accuracy                           0.86       172
   macro avg       0.82      0.79      0.80       172
weighted avg       0.86      0.86      0.86       172



In [14]:
# Display the raw array of test-set predictions held in `preds`.
preds

array([0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1])

In [15]:
# Name of the estimator that `automl` reports as best.
print(automl.best_estimator)

lgbm


In [16]:
# Best hyperparameter configuration reported by `automl`.
print(automl.best_config)

{'n_estimators': 27, 'num_leaves': 9, 'min_child_samples': 65, 'learning_rate': 1.0, 'log_max_bin': 8, 'colsample_bytree': 0.9095005754420106, 'reg_alpha': 0.002067913126494385, 'reg_lambda': 0.023795353137305457}


In [17]:
# Show the estimator object behind the fitted `automl` model.
automl.model.estimator

In [18]:
# Keep only the winning estimator's configuration, keyed by its name, so it can be
# passed as `starting_points` to a follow-up search.
best_name = automl.best_estimator
best_estimator_checkpoint = {best_name: automl.best_config_per_estimator[best_name]}
best_estimator_checkpoint

{'lgbm': {'n_estimators': 27,
  'num_leaves': 9,
  'min_child_samples': 65,
  'learning_rate': 1.0,
  'log_max_bin': 8,
  'colsample_bytree': 0.9095005754420106,
  'reg_alpha': 0.002067913126494385,
  'reg_lambda': 0.023795353137305457}}

Подбираем параметры для лучшей модели

In [19]:
# Follow-up search (300 s budget) restricted to the winning estimator and
# started from its best configuration found so far.
post_automl = AutoML()
post_automl.fit(
    np.vstack(train_data["mfcc"]),
    train_data["label"],
    task="classification",
    time_budget=300,
    X_val=np.vstack(val_data["mfcc"]),
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=[automl.best_estimator],
    starting_points=best_estimator_checkpoint,
    verbose=False,
)

In [20]:
# Predict on the test split with the follow-up (single-estimator) model and report metrics.
preds = post_automl.predict(np.vstack(test_data["mfcc"]))

# Bind the label column to a name: reusing double quotes inside a double-quoted
# f-string (test_data["label"]) is a SyntaxError before Python 3.12 (PEP 701).
y_test = test_data["label"]

print(f"Accuracy: {accuracy_score(y_test, preds):.3f}")
print(f"Precision: {precision_score(y_test, preds):.3f}")
print(f"Recall: {recall_score(y_test, preds):.3f}")
print(f"F1 Score: {f1_score(y_test, preds):.3f}")
print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, preds):.3f}")

Accuracy: 0.860
Precision: 0.890
Recall: 0.931
F1 Score: 0.910
Balanced Accuracy: 0.787
