In [1]:
import warnings

for warn in [UserWarning, FutureWarning]: warnings.filterwarnings("ignore", category = warn)

import os
import time
import torch
import torchaudio

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, recall_score, precision_score, balanced_accuracy_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import scipy

from tqdm import tqdm
import librosa

from src.utils import *

from flaml import AutoML

from transformers import AutoFeatureExtractor

from disvoice.prosody.prosody import Prosody

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

2025-03-15 13:24:06.896380: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-15 13:24:06.905462: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742034246.917670   10831 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E000

In [2]:
# --- Reproducibility ---------------------------------------------------------
SEED = 1984

# Seed the global PyTorch and NumPy random generators.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Stand-alone torch generator carrying the same seed
# (Generator.manual_seed returns the generator itself).
gen = torch.Generator().manual_seed(SEED)

# --- Audio / feature constants -----------------------------------------------
SR = 8_000                    # sampling rate requested from librosa.load
SEQUENCE_LENGTH = SR * 300    # number of samples every waveform is cut / padded to
MFCC = 64                     # n_mfcc given to the torchaudio MFCC transform

In [3]:
# Dataset layout: <DATA_DIR>/Voices_wav/{Aphasia,Norm}
DATA_DIR = os.path.join('..', 'data')
VOICES_DIR = os.path.join(DATA_DIR, 'Voices_wav')
APHASIA_DIR, NORM_DIR = (
    os.path.join(VOICES_DIR, group) for group in ('Aphasia', 'Norm')
)

In [4]:
# Load the train / validation / test manifests (each row: a file name and its label).
train_data, val_data, test_data = (
    pd.read_csv(os.path.join(DATA_DIR, manifest))
    for manifest in ('train_filenames.csv', 'val_filenames.csv', 'test_filenames.csv')
)

In [5]:
def _resolve_audio_path(row):
    """Return the on-disk path of the recording described by one manifest row.

    Rows with ``label == 1`` are looked up in ``APHASIA_DIR``; every other row in
    ``NORM_DIR``. A name that already starts with its target directory is
    returned unchanged, so re-running this cell does not prefix the directory a
    second time (the previous version produced broken paths on a second run).
    """
    target_dir = APHASIA_DIR if row['label'] == 1 else NORM_DIR
    name = row['file_name']
    if name.startswith(target_dir + os.sep):
        return name
    return os.path.join(target_dir, name)


# Same rewrite for all three splits; the manifests are updated in place.
for _split in (train_data, val_data, test_data):
    _split["file_name"] = _split.apply(_resolve_audio_path, axis=1)

In [6]:
# MFCC transform shared by every call of preprocess_function_mfcc.
mfcc_class = torchaudio.transforms.MFCC(
    sample_rate=SR,
    n_mfcc=MFCC,
    log_mels=True,
    melkwargs=dict(n_fft=20_000, win_length=10_000, hop_length=5_000, n_mels=200),
)

In [7]:
# STFT framing handed to the librosa spectral descriptors.
spectral_stft_kwargs = dict(n_fft=20_000, win_length=10_000, hop_length=5_000)
# Chroma uses the same framing plus the number of chroma bins.
chroma_stft_kwargs = dict(spectral_stft_kwargs, n_chroma=12)

In [8]:
# DisVoice prosody extractor; one instance is reused for every file by
# preprocess_function_prosody.
prosody = Prosody()

In [9]:
def preprocess_function_mfcc(path):
    """Load one recording and return its MFCC matrix as a flat 1-D vector.

    The waveform is loaded at ``SR``, cut to at most ``SEQUENCE_LENGTH`` samples
    and right-padded with zeros up to that length before ``mfcc_class`` is
    applied.
    """
    waveform, _ = librosa.load(path, sr=SR)
    waveform = waveform[..., :SEQUENCE_LENGTH]
    missing = SEQUENCE_LENGTH - waveform.shape[0]
    waveform = np.pad(waveform, (0, missing), mode='constant')

    coefficients = mfcc_class(torch.Tensor(waveform)).numpy()
    return coefficients.flatten().squeeze()

def preprocess_function_chroma(path):
    """Load one recording and return its chromagram as a flat 1-D vector.

    The waveform is loaded at ``SR``, cut to at most ``SEQUENCE_LENGTH`` samples
    and right-padded with zeros up to that length; the chromagram is computed
    with the framing in ``chroma_stft_kwargs``.
    """
    waveform, _ = librosa.load(path, sr=SR)
    waveform = waveform[..., :SEQUENCE_LENGTH]
    missing = SEQUENCE_LENGTH - waveform.shape[0]
    waveform = np.pad(waveform, (0, missing), mode='constant')

    chromagram = librosa.feature.chroma_stft(y=waveform, sr=SR, **chroma_stft_kwargs)
    return chromagram.flatten().squeeze()

def preprocess_function_spectral(path):
    """Load one recording and return its spectral descriptors as one row.

    The waveform is loaded at ``SR``, cut to at most ``SEQUENCE_LENGTH`` samples
    and right-padded with zeros up to that length. Centroid, bandwidth,
    contrast, flatness and roll-off are computed with the framing in
    ``spectral_stft_kwargs`` and concatenated along the last axis; the contrast
    matrix is flattened into a single row first.

    The sampling rate is now passed to ``spectral_rolloff`` as well. Without it
    librosa falls back to its 22050 Hz default, so the roll-off frequencies were
    reported on a different scale from the other Hz-valued descriptors.
    Feature files cached by ``get_features`` before this change still contain
    the old values and must be deleted to be regenerated.
    """
    y, sr = librosa.load(path, sr=SR)
    y = y[..., :SEQUENCE_LENGTH]
    y = np.pad(y, (0, SEQUENCE_LENGTH - y.shape[0]), mode='constant')

    centroid = librosa.feature.spectral_centroid(y=y, sr=sr, **spectral_stft_kwargs)
    bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr, p=2, **spectral_stft_kwargs)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr, fmin=50, **spectral_stft_kwargs)
    # Flatness is a ratio and takes no sampling-rate argument.
    flatness = librosa.feature.spectral_flatness(y=y, **spectral_stft_kwargs)
    # Roll-off is a frequency: it needs the real sampling rate of `y`.
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, **spectral_stft_kwargs)

    # Every descriptor is a (1, frames) row except contrast, which is
    # flattened band by band into one row so that hstack can join them.
    return np.hstack((centroid, bandwidth, contrast.reshape(1, -1), flatness, rolloff))

def preprocess_function_zcr(path):
    """Load one recording and return its zero-crossing-rate array.

    The waveform is loaded at ``SR``, cut to at most ``SEQUENCE_LENGTH`` samples
    and right-padded with zeros up to that length. The array produced by
    ``librosa.feature.zero_crossing_rate`` is returned as is.
    """
    waveform, _ = librosa.load(path, sr=SR)
    waveform = waveform[..., :SEQUENCE_LENGTH]
    missing = SEQUENCE_LENGTH - waveform.shape[0]
    waveform = np.pad(waveform, (0, missing), mode='constant')

    return librosa.feature.zero_crossing_rate(waveform, frame_length=10_000, hop_length=5_000)
    
def preprocess_function_simple(path):
    """Load one recording and return seven speech / silence statistics.

    The waveform is loaded at ``SR``, cut to at most ``SEQUENCE_LENGTH`` samples
    and right-padded with zeros up to that length, then passed to
    ``get_speech_and_silence_timestamps``. Six of the eight values it returns
    are kept (positions 0, 1, 3, 4, 5 and 7); the seventh output is the ratio of
    value 0 to value 4, with 1e-6 added to the denominator to avoid division by
    zero.
    """
    waveform, _ = librosa.load(path, sr=SR)
    waveform = waveform[..., :SEQUENCE_LENGTH]
    missing = SEQUENCE_LENGTH - waveform.shape[0]
    waveform = np.pad(waveform, (0, missing), mode='constant')

    stats = get_speech_and_silence_timestamps(torch.Tensor(waveform), sr=SR)
    (speech_total, speech_segments, _, speech_mean,
     silence_total, silence_segments, _, silence_mean) = stats

    speech_to_silence = speech_total / (silence_total + 1e-6)
    return np.array([
        speech_total,
        speech_segments,
        speech_mean,
        silence_total,
        silence_segments,
        silence_mean,
        speech_to_silence,
    ])

def preprocess_function_prosody(path):
    """Return the static prosody features of the file at ``path`` (plots disabled)."""
    return prosody.prosody_static(path, plots=False)
    

In [10]:
def get_features(data_name, prep_function=None):
    """Return ``(train, val, test)`` feature matrices for one feature family.

    Features are cached in ``DATA_DIR`` as ``{split}_data_{data_name}.npy``.

    * If all three cache files exist they are loaded and returned.
    * Otherwise ``prep_function`` is applied to every ``file_name`` of
      ``train_data``, ``val_data`` and ``test_data``; the per-file results are
      stacked row-wise, saved to the cache and returned.

    Previously only the train file was checked, so a missing val/test cache
    crashed inside ``np.load``; now an incomplete cache is rebuilt.

    Args:
        data_name: Suffix identifying the feature family in the cache file names.
        prep_function: Callable mapping a file path to a feature array. May be
            ``None`` only when the cache is complete.

    Raises:
        FileNotFoundError: If the cache is incomplete and no ``prep_function``
            was given.
    """
    cache_paths = [
        os.path.join(DATA_DIR, f"{split}_data_{data_name}.npy")
        for split in ("train", "val", "test")
    ]

    if all(os.path.exists(cache_path) for cache_path in cache_paths):
        return tuple(np.load(cache_path) for cache_path in cache_paths)

    if prep_function is None:
        missing = [cache_path for cache_path in cache_paths if not os.path.exists(cache_path)]
        raise FileNotFoundError(
            f"Cached features {missing} not found and no prep_function was given to compute them"
        )

    # Compute all three splits before writing anything, so a failure midway
    # does not leave a partially refreshed cache behind.
    features = tuple(
        np.vstack(split["file_name"].apply(prep_function).to_numpy())
        for split in (train_data, val_data, test_data)
    )
    for cache_path, matrix in zip(cache_paths, features):
        np.save(cache_path, matrix)

    return features

In [11]:
# Prosody features (computed on first use, then served from the .npy cache).
train_data_prosody, val_data_prosody, test_data_prosody = get_features(
    data_name="prosody",
    prep_function=preprocess_function_prosody,
)

In [12]:
# MFCC features (computed on first use, then served from the .npy cache).
train_data_mfcc, val_data_mfcc, test_data_mfcc = get_features(
    data_name="mfcc",
    prep_function=preprocess_function_mfcc,
)

In [13]:
# Chroma features (computed on first use, then served from the .npy cache).
train_data_chroma, val_data_chroma, test_data_chroma = get_features(
    data_name="chroma",
    prep_function=preprocess_function_chroma,
)

In [14]:
# Spectral descriptors (computed on first use, then served from the .npy cache).
train_data_spectral, val_data_spectral, test_data_spectral = get_features(
    data_name="spectral",
    prep_function=preprocess_function_spectral,
)

In [15]:
# Zero-crossing-rate features; "zrc" is the key used in the cache file names.
train_data_zrc, val_data_zrc, test_data_zrc = get_features(
    data_name="zrc",
    prep_function=preprocess_function_zcr,
)

In [16]:
# Speech / silence statistics (computed on first use, then served from the .npy cache).
train_data_simple, val_data_simple, test_data_simple = get_features(
    data_name="simple",
    prep_function=preprocess_function_simple,
)

In [17]:
# Column-wise concatenation of every feature family, in the same order for each
# split: MFCC, chroma, spectral, simple, ZCR, prosody.
train_all_features = np.concatenate(
    (train_data_mfcc, train_data_chroma, train_data_spectral, train_data_simple, train_data_zrc, train_data_prosody),
    axis=1,
)
val_all_features = np.concatenate(
    (val_data_mfcc, val_data_chroma, val_data_spectral, val_data_simple, val_data_zrc, val_data_prosody),
    axis=1,
)
test_all_features = np.concatenate(
    (test_data_mfcc, test_data_chroma, test_data_spectral, test_data_simple, test_data_zrc, test_data_prosody),
    axis=1,
)

In [18]:
# Append the ZCR columns to the MFCC matrices.
# NOTE: the names are rebound to "themselves + ZCR", so every execution of this
# cell appends another copy of the ZCR columns - run it once per kernel.
train_data_mfcc = np.concatenate((train_data_mfcc, train_data_zrc), axis=1)
val_data_mfcc = np.concatenate((val_data_mfcc, val_data_zrc), axis=1)
test_data_mfcc = np.concatenate((test_data_mfcc, test_data_zrc), axis=1)

In [19]:
# Append the ZCR columns to the chroma matrices.
# NOTE: the names are rebound to "themselves + ZCR", so every execution of this
# cell appends another copy of the ZCR columns - run it once per kernel.
train_data_chroma = np.concatenate((train_data_chroma, train_data_zrc), axis=1)
val_data_chroma = np.concatenate((val_data_chroma, val_data_zrc), axis=1)
test_data_chroma = np.concatenate((test_data_chroma, test_data_zrc), axis=1)

In [20]:
# Append the ZCR columns to the spectral matrices.
# NOTE: the names are rebound to "themselves + ZCR", so every execution of this
# cell appends another copy of the ZCR columns - run it once per kernel.
train_data_spectral = np.concatenate((train_data_spectral, train_data_zrc), axis=1)
val_data_spectral = np.concatenate((val_data_spectral, val_data_zrc), axis=1)
test_data_spectral = np.concatenate((test_data_spectral, test_data_zrc), axis=1)

In [21]:
# Append the ZCR columns to the speech / silence statistics.
# NOTE: the names are rebound to "themselves + ZCR", so every execution of this
# cell appends another copy of the ZCR columns - run it once per kernel.
train_data_simple = np.concatenate((train_data_simple, train_data_zrc), axis=1)
val_data_simple = np.concatenate((val_data_simple, val_data_zrc), axis=1)
test_data_simple = np.concatenate((test_data_simple, test_data_zrc), axis=1)

In [22]:
# Append the ZCR columns to the prosody matrices.
# NOTE: the names are rebound to "themselves + ZCR", so every execution of this
# cell appends another copy of the ZCR columns - run it once per kernel.
train_data_prosody = np.concatenate((train_data_prosody, train_data_zrc), axis=1)
val_data_prosody = np.concatenate((val_data_prosody, val_data_zrc), axis=1)
test_data_prosody = np.concatenate((test_data_prosody, test_data_zrc), axis=1)

In [23]:
def custom_balanced_accuracy(
    X_val,
    y_val,
    estimator,
    labels,
    X_train,
    y_train,
    weight_val=None,
    weight_train=None,
    *args,
):
    """Custom metric passed to ``AutoML.fit``: ``1 - balanced accuracy`` on validation.

    The class with the highest predicted probability is taken as the prediction
    for each validation row. Only ``X_val``, ``y_val``, ``estimator`` and
    ``weight_val`` are used; the remaining parameters are accepted and ignored.

    Returns:
        A ``(loss, info)`` pair where ``loss`` is ``1 - val_acc`` and ``info``
        holds ``"val_acc"`` (the balanced accuracy) and ``"pred_time"`` (wall
        time of ``predict_proba`` divided by the number of validation rows).
    """
    tic = time.time()
    probabilities = estimator.predict_proba(X_val)
    elapsed = time.time() - tic

    predicted = np.argmax(probabilities, axis=-1)
    val_acc = balanced_accuracy_score(y_val, predicted, sample_weight=weight_val)

    info = {
        "val_acc": val_acc,
        "pred_time": elapsed / len(X_val),
    }
    return 1 - val_acc, info

## Prosody features

In [24]:
# Stage 1: short warm-up search (150 s budget) on the prosody features.
pre_automl = AutoML()
pre_automl.fit(
    X_train=train_data_prosody,
    y_train=train_data["label"],
    task="classification",
    time_budget=150,
    X_val=val_data_prosody,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    verbose=False,
)

# Stage 2: main search (800 s budget), warm-started from the best
# per-estimator configurations found in stage 1.
automl_prosody = AutoML()
automl_prosody.fit(
    X_train=train_data_prosody,
    y_train=train_data["label"],
    task="classification",
    time_budget=800,
    X_val=val_data_prosody,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    starting_points=pre_automl.best_config_per_estimator,
    verbose=False,
)

In [25]:
# Per-recording evaluation of the prosody model on the test split.
preds = automl_prosody.predict(test_data_prosody)

print(classification_report(y_true=test_data["label"], y_pred=preds))

              precision    recall  f1-score   support

           0       0.83      0.60      0.69        42
           1       0.88      0.96      0.92       130

    accuracy                           0.87       172
   macro avg       0.86      0.78      0.81       172
weighted avg       0.87      0.87      0.86       172



## MFCC features

In [26]:
# Stage 1: short warm-up search (150 s budget) on the MFCC features.
pre_automl = AutoML()
pre_automl.fit(
    X_train=train_data_mfcc,
    y_train=train_data["label"],
    task="classification",
    time_budget=150,
    X_val=val_data_mfcc,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    verbose=False,
)

# Stage 2: main search (800 s budget), warm-started from the best
# per-estimator configurations found in stage 1.
automl_mfcc = AutoML()
automl_mfcc.fit(
    X_train=train_data_mfcc,
    y_train=train_data["label"],
    task="classification",
    time_budget=800,
    X_val=val_data_mfcc,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    starting_points=pre_automl.best_config_per_estimator,
    verbose=False,
)

In [27]:
# Per-recording evaluation of the MFCC model on the test split.
preds = automl_mfcc.predict(test_data_mfcc)

print(classification_report(y_true=test_data["label"], y_pred=preds))

              precision    recall  f1-score   support

           0       0.58      0.52      0.55        42
           1       0.85      0.88      0.86       130

    accuracy                           0.79       172
   macro avg       0.71      0.70      0.71       172
weighted avg       0.78      0.79      0.79       172



## Chroma features

In [28]:
# Stage 1: short warm-up search (150 s budget) on the chroma features.
pre_automl = AutoML()
pre_automl.fit(
    X_train=train_data_chroma,
    y_train=train_data["label"],
    task="classification",
    time_budget=150,
    X_val=val_data_chroma,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    verbose=False,
)

# Stage 2: main search (800 s budget), warm-started from the best
# per-estimator configurations found in stage 1.
automl_chroma = AutoML()
automl_chroma.fit(
    X_train=train_data_chroma,
    y_train=train_data["label"],
    task="classification",
    time_budget=800,
    X_val=val_data_chroma,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    starting_points=pre_automl.best_config_per_estimator,
    verbose=False,
)

In [29]:
# Per-recording evaluation of the chroma model on the test split.
preds = automl_chroma.predict(test_data_chroma)

print(classification_report(y_true=test_data["label"], y_pred=preds))

              precision    recall  f1-score   support

           0       0.66      0.60      0.62        42
           1       0.87      0.90      0.89       130

    accuracy                           0.83       172
   macro avg       0.77      0.75      0.76       172
weighted avg       0.82      0.83      0.82       172



## Spectral features

In [30]:
# Stage 1: short warm-up search (150 s budget) on the spectral features.
pre_automl = AutoML()
pre_automl.fit(
    X_train=train_data_spectral,
    y_train=train_data["label"],
    task="classification",
    time_budget=150,
    X_val=val_data_spectral,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    verbose=False,
)

# Stage 2: main search (800 s budget), warm-started from the best
# per-estimator configurations found in stage 1.
automl_spectral = AutoML()
automl_spectral.fit(
    X_train=train_data_spectral,
    y_train=train_data["label"],
    task="classification",
    time_budget=800,
    X_val=val_data_spectral,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    starting_points=pre_automl.best_config_per_estimator,
    verbose=False,
)

In [31]:
# Per-recording evaluation of the spectral model on the test split.
preds = automl_spectral.predict(test_data_spectral)

print(classification_report(y_true=test_data["label"], y_pred=preds))

              precision    recall  f1-score   support

           0       0.63      0.52      0.57        42
           1       0.85      0.90      0.88       130

    accuracy                           0.81       172
   macro avg       0.74      0.71      0.72       172
weighted avg       0.80      0.81      0.80       172



## Simple features

In [32]:
pre_automl = AutoML()
pre_automl.fit(train_data_simple, train_data["label"], task="classification", time_budget=150, X_val=val_data_simple, y_val=val_data["label"], metric=custom_balanced_accuracy, seed=SEED, estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'], verbose=False)

automl_spectral = AutoML()
automl_spectral.fit(train_data_simple, train_data["label"], task="classification", time_budget=800, X_val=val_data_simple, y_val=val_data["label"], metric=custom_balanced_accuracy, seed=SEED, estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'], starting_points=pre_automl.best_config_per_estimator, verbose=False)

In [33]:
preds = automl_spectral.predict(test_data_simple) 

print(classification_report(test_data["label"], preds))

              precision    recall  f1-score   support

           0       0.58      0.62      0.60        42
           1       0.87      0.85      0.86       130

    accuracy                           0.80       172
   macro avg       0.73      0.74      0.73       172
weighted avg       0.80      0.80      0.80       172



## All features

In [34]:
# Stage 1: short warm-up search (150 s budget) on all feature families combined.
pre_automl = AutoML()
pre_automl.fit(
    X_train=train_all_features,
    y_train=train_data["label"],
    task="classification",
    time_budget=150,
    X_val=val_all_features,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    verbose=False,
)

# Stage 2: main search (800 s budget), warm-started from the best
# per-estimator configurations found in stage 1.
automl_all_features = AutoML()
automl_all_features.fit(
    X_train=train_all_features,
    y_train=train_data["label"],
    task="classification",
    time_budget=800,
    X_val=val_all_features,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    starting_points=pre_automl.best_config_per_estimator,
    verbose=False,
)

In [35]:
# Per-recording evaluation of the all-features model on the test split.
preds = automl_all_features.predict(test_all_features)

print(classification_report(y_true=test_data["label"], y_pred=preds))

              precision    recall  f1-score   support

           0       0.83      0.60      0.69        42
           1       0.88      0.96      0.92       130

    accuracy                           0.87       172
   macro avg       0.86      0.78      0.81       172
weighted avg       0.87      0.87      0.86       172



## Simple + MFCC

In [36]:
# MFCC matrix without its last column, followed by the simple-feature matrix.
# The *_all_features names from the "All features" section are reused here.
# NOTE(review): both inputs already had the ZCR columns appended in earlier
# cells, and `[:, :-1]` removes only the final column. If the intent was to
# strip the whole ZCR block from the MFCC matrix, the slice may be too narrow -
# confirm the ZCR width.
train_all_features = np.concatenate((train_data_mfcc[:, :-1], train_data_simple), axis=1)
val_all_features = np.concatenate((val_data_mfcc[:, :-1], val_data_simple), axis=1)
test_all_features = np.concatenate((test_data_mfcc[:, :-1], test_data_simple), axis=1)

In [37]:
# Stage 1: short warm-up search (150 s budget) on the simple + MFCC features.
pre_automl = AutoML()
pre_automl.fit(
    X_train=train_all_features,
    y_train=train_data["label"],
    task="classification",
    time_budget=150,
    X_val=val_all_features,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    verbose=False,
)

# Stage 2: main search (800 s budget), warm-started from the best
# per-estimator configurations found in stage 1.
automl_all_features = AutoML()
automl_all_features.fit(
    X_train=train_all_features,
    y_train=train_data["label"],
    task="classification",
    time_budget=800,
    X_val=val_all_features,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    starting_points=pre_automl.best_config_per_estimator,
    verbose=False,
)

In [38]:
# Per-recording evaluation of the simple + MFCC model on the test split.
preds = automl_all_features.predict(test_all_features)

print(classification_report(y_true=test_data["label"], y_pred=preds))

              precision    recall  f1-score   support

           0       0.76      0.62      0.68        42
           1       0.88      0.94      0.91       130

    accuracy                           0.86       172
   macro avg       0.82      0.78      0.80       172
weighted avg       0.85      0.86      0.86       172



## Experiments with wav2vec

In [39]:
# Feature extractor of the pretrained wav2vec2-base checkpoint, with its
# sampling rate overridden to the SR used throughout this notebook.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "facebook/wav2vec2-base",
    sampling_rate=SR,
)

In [40]:
def preprocess_function_wav2vec(path):
    """Load one recording and return the feature extractor's ``input_values``.

    The waveform is loaded at ``SR``, cut to at most ``SEQUENCE_LENGTH`` samples
    and right-padded with zeros up to that length. It is then passed through
    ``feature_extractor`` with truncation / padding to ``max_length=4_000``.
    """
    waveform, _ = librosa.load(path, sr=SR)
    waveform = waveform[..., :SEQUENCE_LENGTH]
    missing = SEQUENCE_LENGTH - waveform.shape[0]
    waveform = np.pad(waveform, (0, missing), mode='constant')

    encoded = feature_extractor(
        torch.tensor(waveform),
        sampling_rate=feature_extractor.sampling_rate,
        max_length=4_000,
        truncation=True,
        padding="max_length",
    )
    return encoded['input_values']

In [41]:
# One row of extractor output per recording, for each split (not cached on disk).
train_data_wav2vec, val_data_wav2vec, test_data_wav2vec = (
    np.vstack(split["file_name"].apply(preprocess_function_wav2vec))
    for split in (train_data, val_data, test_data)
)

In [42]:
# Stage 1: short warm-up search (150 s budget) on the wav2vec extractor output.
pre_automl = AutoML()
pre_automl.fit(
    X_train=train_data_wav2vec,
    y_train=train_data["label"],
    task="classification",
    time_budget=150,
    X_val=val_data_wav2vec,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    verbose=False,
)

# Stage 2: main search (800 s budget), warm-started from the best
# per-estimator configurations found in stage 1.
automl_wav2vec = AutoML()
automl_wav2vec.fit(
    X_train=train_data_wav2vec,
    y_train=train_data["label"],
    task="classification",
    time_budget=800,
    X_val=val_data_wav2vec,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    starting_points=pre_automl.best_config_per_estimator,
    verbose=False,
)

In [43]:
# Per-recording evaluation of the wav2vec model on the test split.
preds = automl_wav2vec.predict(test_data_wav2vec)
print(classification_report(y_true=test_data["label"], y_pred=preds))

              precision    recall  f1-score   support

           0       0.67      0.52      0.59        42
           1       0.86      0.92      0.88       130

    accuracy                           0.82       172
   macro avg       0.76      0.72      0.74       172
weighted avg       0.81      0.82      0.81       172



## Wav2vec with PCA (for some reason....)

In [44]:
# Shape of the transposed training matrix: (columns, rows).
train_data_wav2vec.shape[::-1]

(4000, 472)

In [45]:
from sklearn.decomposition import PCA

# Reduce the wav2vec inputs to 250 principal components.
# random_state is fixed because PCA's automatic solver choice may pick the
# randomized SVD for a matrix of this size; without a seed the components (and
# every downstream score) could change between runs.
pca = PCA(n_components=250, random_state=SEED)

# Fit on the training split only, then project validation and test with the
# same components.
train_data_wav2vec_pca = pca.fit_transform(train_data_wav2vec)
val_data_wav2vec_pca = pca.transform(val_data_wav2vec)
test_data_wav2vec_pca = pca.transform(test_data_wav2vec)

In [46]:
# Stage 1: short warm-up search (150 s budget) on the PCA-reduced wav2vec inputs.
pre_automl = AutoML()
pre_automl.fit(
    X_train=train_data_wav2vec_pca,
    y_train=train_data["label"],
    task="classification",
    time_budget=150,
    X_val=val_data_wav2vec_pca,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    verbose=False,
)

# Stage 2: main search (800 s budget), warm-started from the best
# per-estimator configurations found in stage 1.
automl_wav2vec_pca = AutoML()
automl_wav2vec_pca.fit(
    X_train=train_data_wav2vec_pca,
    y_train=train_data["label"],
    task="classification",
    time_budget=800,
    X_val=val_data_wav2vec_pca,
    y_val=val_data["label"],
    metric=custom_balanced_accuracy,
    seed=SEED,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'catboost'],
    starting_points=pre_automl.best_config_per_estimator,
    verbose=False,
)

In [47]:
# Per-recording evaluation of the PCA-reduced wav2vec model on the test split.
preds = automl_wav2vec_pca.predict(test_data_wav2vec_pca)
print(classification_report(y_true=test_data["label"], y_pred=preds))

              precision    recall  f1-score   support

           0       0.67      0.43      0.52        42
           1       0.83      0.93      0.88       130

    accuracy                           0.81       172
   macro avg       0.75      0.68      0.70       172
weighted avg       0.79      0.81      0.79       172



## Meta classification

In [48]:
# Put the test manifest columns and the MFCC(+ZCR) feature columns side by side.
# NOTE: this rebinds test_data_mfcc from a feature matrix to a DataFrame, so the
# cell must be executed exactly once per kernel.
test_data_mfcc = pd.concat((test_data, pd.DataFrame(test_data_mfcc)), axis="columns")
test_data_mfcc.head()

Unnamed: 0,file_name,label,0,1,2,3,4,5,6,7,...,31735,31736,31737,31738,31739,31740,31741,31742,31743,31744
0,../data/Voices_wav/Norm/N-0988-RAT-1-robb.wav,0,-133.22081,-86.438217,-50.23315,2.749326,-67.72876,-75.160988,-42.209667,-38.456062,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,../data/Voices_wav/Norm/N-0988-RAT-1-bike.wav,0,-127.859444,-57.90229,4.641314,19.603796,38.635742,42.423389,42.932121,39.175602,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,../data/Voices_wav/Norm/N-0982-RAT-1-robb.wav,0,-133.322495,-91.622047,-31.57976,-27.227444,26.310484,67.676003,48.572411,-63.792099,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,../data/Voices_wav/Norm/N-0982-RAT-1-bike.wav,0,-133.801498,-88.44072,-59.631699,-19.610685,-25.806852,48.195946,70.444359,26.359131,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,../data/Voices_wav/Norm/N-0922-RAT-1-bike.wav,0,-132.111694,-78.014793,-81.933823,-88.991348,-93.623466,-76.771866,-21.169638,-0.50726,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
def _participant_id(path):
    """Join the first two "-"-separated pieces of ``path`` into a participant key.

    NOTE(review): the split is applied to the whole path, not just the file
    name, so a "-" in any directory component would change the key - confirm
    the data directory never contains one.
    """
    pieces = str(path).split("-")
    return pieces[0] + pieces[1]


test_data_mfcc["ID"] = test_data_mfcc["file_name"].apply(_participant_id)
test_data_mfcc.head()

Unnamed: 0,file_name,label,0,1,2,3,4,5,6,7,...,31736,31737,31738,31739,31740,31741,31742,31743,31744,ID
0,../data/Voices_wav/Norm/N-0988-RAT-1-robb.wav,0,-133.22081,-86.438217,-50.23315,2.749326,-67.72876,-75.160988,-42.209667,-38.456062,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/Voices_wav/Norm/N0988
1,../data/Voices_wav/Norm/N-0988-RAT-1-bike.wav,0,-127.859444,-57.90229,4.641314,19.603796,38.635742,42.423389,42.932121,39.175602,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/Voices_wav/Norm/N0988
2,../data/Voices_wav/Norm/N-0982-RAT-1-robb.wav,0,-133.322495,-91.622047,-31.57976,-27.227444,26.310484,67.676003,48.572411,-63.792099,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/Voices_wav/Norm/N0982
3,../data/Voices_wav/Norm/N-0982-RAT-1-bike.wav,0,-133.801498,-88.44072,-59.631699,-19.610685,-25.806852,48.195946,70.444359,26.359131,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/Voices_wav/Norm/N0982
4,../data/Voices_wav/Norm/N-0922-RAT-1-bike.wav,0,-132.111694,-78.014793,-81.933823,-88.991348,-93.623466,-76.771866,-21.169638,-0.50726,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/Voices_wav/Norm/N0922


Here we only changed the metric. Since most patients have several audio recordings, we decided to predict the class of the patient rather than the class of each individual recording. (We obtain predictions for all of a patient's recordings and take their mode, i.e. a majority vote.)

In [50]:
# Patient-level evaluation of the MFCC model: one prediction per participant,
# obtained as the mode (majority vote) of the predictions for that
# participant's recordings.
IDs = test_data_mfcc["ID"].unique()

all_preds = []
all_labels = []

for participant_id in tqdm(IDs):
    participant_samples = test_data_mfcc[test_data_mfcc["ID"] == participant_id]
    labels = participant_samples["label"]
    # Feature columns are everything after the first two columns and before
    # the trailing "ID" column (assumes the manifest has exactly two columns).
    features = participant_samples.iloc[:, 2:-1]

    pred = scipy.stats.mode(automl_mfcc.predict(features.values))

    # Depending on the SciPy version, `pred.mode` is either a length-1 array or
    # a scalar. np.ravel(...)[0] yields a scalar in both cases, so `all_preds`
    # is always a flat 1-D array instead of sometimes an (n, 1) column.
    all_preds.append(np.ravel(pred.mode)[0])

    # The first recording's label is used as the participant's label.
    all_labels.append(labels.values[0])

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

print(classification_report(all_labels, all_preds))

100%|██████████| 72/72 [00:00<00:00, 248.67it/s]

              precision    recall  f1-score   support

           0       0.67      0.76      0.71        21
           1       0.90      0.84      0.87        51

    accuracy                           0.82        72
   macro avg       0.78      0.80      0.79        72
weighted avg       0.83      0.82      0.82        72






In [51]:
# Put the test manifest columns and the prosody(+ZCR) feature columns side by
# side, then derive a participant key from the first two "-"-separated pieces
# of each path.
# NOTE: this rebinds test_data_prosody from a feature matrix to a DataFrame, so
# the cell must be executed exactly once per kernel.
# NOTE(review): the split is applied to the whole path, so a "-" in any
# directory component would change the key - confirm the data directory never
# contains one.
test_data_prosody = pd.concat((test_data, pd.DataFrame(test_data_prosody)), axis="columns")
test_data_prosody["ID"] = test_data_prosody["file_name"].apply(
    lambda x: str(x).split("-")[0] + str(x).split("-")[1]
)

In [52]:
# Patient-level evaluation of the prosody model: one prediction per
# participant, obtained as the mode (majority vote) of the predictions for that
# participant's recordings.
IDs = test_data_prosody["ID"].unique()

all_preds = []
all_labels = []

for participant_id in tqdm(IDs):
    participant_samples = test_data_prosody[test_data_prosody["ID"] == participant_id]
    labels = participant_samples["label"]
    # Feature columns are everything after the first two columns and before
    # the trailing "ID" column (assumes the manifest has exactly two columns).
    features = participant_samples.iloc[:, 2:-1]

    pred = scipy.stats.mode(automl_prosody.predict(features.values))

    # Depending on the SciPy version, `pred.mode` is either a length-1 array or
    # a scalar. np.ravel(...)[0] yields a scalar in both cases, so `all_preds`
    # is always a flat 1-D array instead of sometimes an (n, 1) column.
    all_preds.append(np.ravel(pred.mode)[0])

    # The first recording's label is used as the participant's label.
    all_labels.append(labels.values[0])

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

print(classification_report(all_labels, all_preds))

100%|██████████| 72/72 [00:00<00:00, 246.54it/s]


              precision    recall  f1-score   support

           0       0.88      0.71      0.79        21
           1       0.89      0.96      0.92        51

    accuracy                           0.89        72
   macro avg       0.89      0.84      0.86        72
weighted avg       0.89      0.89      0.89        72

