In [1]:
import warnings

from catboost import CatBoostClassifier

for warn in [UserWarning, FutureWarning]: warnings.filterwarnings("ignore", category = warn)

import os
import time
import torch
import torchaudio

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, recall_score, precision_score, balanced_accuracy_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import scipy

from tqdm import tqdm
import librosa

from src.utils import *

from flaml import AutoML

from transformers import AutoFeatureExtractor

from disvoice.prosody.prosody import Prosody

from catboost import CatBoostClassifier
import optuna

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

2025-03-15 17:01:06.950473: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-15 17:01:06.959583: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742047266.971511 1119188 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E000

In [2]:
# --- Reproducibility -----------------------------------------------------
# One seed is shared by NumPy, the global torch RNG and a dedicated torch Generator.
SEED = 1984
np.random.seed(SEED)
torch.manual_seed(SEED)
gen = torch.Generator()
gen.manual_seed(SEED)

# --- Audio / feature constants -------------------------------------------
SR = 8_000
SEQUENCE_LENGTH = 300 * SR
MFCC = 64

# --- Data locations --------------------------------------------------------
DATA_DIR = os.path.join('..', 'data')
VOICES_DIR = os.path.join(DATA_DIR, 'Voices_wav')
APHASIA_DIR = os.path.join(VOICES_DIR, 'Aphasia')
NORM_DIR = os.path.join(VOICES_DIR, 'Norm')

# --- Split manifests -------------------------------------------------------
train_data = pd.read_csv(os.path.join(DATA_DIR, 'train_filenames.csv'))
val_data = pd.read_csv(os.path.join(DATA_DIR, 'val_filenames.csv'))
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test_filenames.csv'))

# Prefix every file name with its class directory: rows with label == 1 live in
# APHASIA_DIR, every other row lives in NORM_DIR.
for _split in (train_data, val_data, test_data):
    _split["file_name"] = _split.apply(
        lambda row: os.path.join(APHASIA_DIR if row['label'] == 1 else NORM_DIR, row['file_name']),
        axis=1)

In [3]:
def get_features(data_name, prep_function=None):
    """Return the (train, val, test) feature arrays for one feature family.

    Features are cached on disk as ``{split}_data_{data_name}.npy`` inside
    ``DATA_DIR``. When all three cache files exist they are loaded. Otherwise
    the features are computed by applying ``prep_function`` to the
    ``file_name`` column of ``train_data`` / ``val_data`` / ``test_data``,
    stacked with ``np.vstack`` and written to the cache.

    Parameters
    ----------
    data_name : str
        Suffix identifying the feature family in the cache file names.
    prep_function : callable, optional
        Called once per file path; its results are stacked row-wise. May be
        omitted only when the cache is already complete.

    Returns
    -------
    tuple of np.ndarray
        ``(train, val, test)`` feature arrays.

    Raises
    ------
    FileNotFoundError
        If at least one cache file is missing and no ``prep_function`` was
        given, so the features can neither be loaded nor computed.
    """
    splits = ("train", "val", "test")
    cache_paths = {
        split: os.path.join(DATA_DIR, f"{split}_data_{data_name}.npy")
        for split in splits
    }
    # Check every split, not just train: a partially written cache must not
    # be mistaken for a complete one.
    missing = [path for path in cache_paths.values() if not os.path.exists(path)]

    if not missing:
        return tuple(np.load(cache_paths[split]) for split in splits)

    if prep_function is None:
        raise FileNotFoundError(
            f"Cached features for '{data_name}' are missing ({', '.join(missing)}) "
            "and no prep_function was supplied to compute them."
        )

    # The manifests are only touched on this branch, so loading from a complete
    # cache keeps working even when they are not defined.
    frames = {"train": train_data, "val": val_data, "test": test_data}
    # Compute everything before saving anything, so a failure half-way through
    # does not leave a partial cache behind.
    computed = {
        split: np.vstack(frames[split]["file_name"].apply(prep_function).to_numpy())
        for split in splits
    }
    for split in splits:
        np.save(cache_paths[split], computed[split])

    return tuple(computed[split] for split in splits)

In [4]:
# Load the six feature families for the train / val / test splits.
# No prep_function is passed, so get_features cannot compute anything here:
# each call reads the cached "<split>_data_<name>.npy" files from DATA_DIR,
# which therefore have to exist before this cell is run.
train_data_prosody, val_data_prosody, test_data_prosody = get_features("prosody")
train_data_mfcc, val_data_mfcc, test_data_mfcc = get_features("mfcc")
train_data_chroma, val_data_chroma, test_data_chroma = get_features("chroma")
train_data_spectral, val_data_spectral, test_data_spectral = get_features("spectral")
train_data_zrc, val_data_zrc, test_data_zrc = get_features("zrc")
train_data_simple, val_data_simple, test_data_simple = get_features("simple")

In [5]:
def objective_catboost(trial, X_train, X_val, y_train, y_val):
    """Optuna objective: fit a CatBoost classifier and score it on held-out data.

    Hyperparameters are drawn from ``trial``, a ``CatBoostClassifier`` is fitted
    on ``(X_train, y_train)`` and the balanced accuracy of its predictions on
    ``(X_val, y_val)`` is returned.
    """
    # No fixed "iterations" value is set; the number of trees is searched
    # through "n_estimators" instead (the only log-scaled dimension).
    search_space = dict(
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 0.1, log=False),
        depth=trial.suggest_int("depth", 2, 10, log=False),
        subsample=trial.suggest_float("subsample", 0.05, 1.0, log=False),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.05, 1.0, log=False),
        min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 1, 100, log=False),
        n_estimators=trial.suggest_int("n_estimators", 50, 150, log=True),
    )

    model = CatBoostClassifier(verbose=False, **search_space)
    model.fit(X_train, y_train)

    return balanced_accuracy_score(y_val, model.predict(X_val))

In [6]:
# X_tune_train_mfcc, X_tune_val_mfcc, y_tune_train_mfcc, y_tune_val_mfcc = train_test_split(val_data_mfcc, val_data["label"], random_state=SEED, stratify=val_data["label"], train_size=0.7) 

In [7]:
# study = optuna.create_study(direction="maximize")
# optuna.logging.disable_default_handler()
# study.optimize(lambda trial: objective_catboost(trial, X_train=X_tune_train_mfcc, X_val=X_tune_val_mfcc, y_train=y_tune_train_mfcc, y_val=y_tune_val_mfcc), n_trials=100, gc_after_trial=True)
# mfcc_catboost_params = study.best_params

### CatBoost and MFCC data

In [8]:
# CatBoost hyperparameters for the MFCC model, hard-coded so the notebook does
# not have to re-run a hyperparameter search.
mfcc_catboost_params = dict(
    learning_rate=0.06604054650459307,
    depth=3,
    subsample=0.8079542483536528,
    colsample_bylevel=0.5638899261036381,
    min_data_in_leaf=91,
    n_estimators=52,
)

In [9]:
# Train the MFCC-only CatBoost model on the training split.
cb_mfcc = CatBoostClassifier(verbose=False, **mfcc_catboost_params)
cb_mfcc.fit(X=train_data_mfcc, y=train_data["label"])

<catboost.core.CatBoostClassifier at 0x79e0e7e04440>

In [10]:
# File-level evaluation: one prediction per row of the test feature matrix.
preds = cb_mfcc.predict(test_data_mfcc)
print(classification_report(y_true=test_data["label"], y_pred=preds))

              precision    recall  f1-score   support

           0       0.76      0.45      0.57        42
           1       0.84      0.95      0.90       130

    accuracy                           0.83       172
   macro avg       0.80      0.70      0.73       172
weighted avg       0.82      0.83      0.82       172



In [11]:
# Participant-level evaluation of the MFCC model: every recording is classified
# once, then the recordings that share a participant ID vote and the modal
# class becomes that participant's prediction.

# Metadata rows and feature rows are paired by position, so they must line up.
assert len(test_data) == len(test_data_mfcc), "test_data and test_data_mfcc must have one row per recording"

test_data_mfcc_ids = pd.concat([test_data, pd.DataFrame(test_data_mfcc)], axis=1)
# Participant ID = first two "-"-separated fields of the file path, concatenated.
test_data_mfcc_ids["ID"] = test_data_mfcc_ids["file_name"].apply(lambda x: str(x).split("-")[0] + str(x).split("-")[1])
IDs = test_data_mfcc_ids["ID"].unique()

# Predict straight from the feature matrix in a single call instead of slicing
# features back out of the merged frame by column position for each participant.
recording_votes = pd.DataFrame({
    "ID": test_data_mfcc_ids["ID"].to_numpy(),
    "label": test_data_mfcc_ids["label"].to_numpy(),
    "pred": np.asarray(cb_mfcc.predict(test_data_mfcc)).ravel(),
})
# sort=False keeps participants in order of first appearance, matching `IDs`.
votes_by_participant = recording_votes.groupby("ID", sort=False)

# scipy.stats.mode returns a scalar or a length-1 array depending on the SciPy
# version; np.ravel(...)[0] normalises both to a scalar.
all_preds = votes_by_participant["pred"].agg(
    lambda votes: np.ravel(scipy.stats.mode(votes.to_numpy()).mode)[0]
).to_numpy()
# The participant's label is taken from their first recording.
all_labels = votes_by_participant["label"].first().to_numpy()

print(classification_report(all_labels, all_preds))

100%|██████████| 72/72 [00:05<00:00, 12.57it/s]

              precision    recall  f1-score   support

           0       0.87      0.62      0.72        21
           1       0.86      0.96      0.91        51

    accuracy                           0.86        72
   macro avg       0.86      0.79      0.81        72
weighted avg       0.86      0.86      0.85        72






### CatBoost and Prosody data

In [12]:
# X_tune_train_prosody, X_tune_val_prosody, y_tune_train_prosody, y_tune_val_prosody = train_test_split(val_data_prosody,
#                                                                                           val_data["label"],
#                                                                                           random_state=SEED,
#                                                                                           stratify=val_data["label"],
#                                                                                           train_size=0.7)
# study = optuna.create_study(direction="maximize")
# optuna.logging.disable_default_handler()
# study.optimize(lambda trial: objective_catboost(trial, X_train=X_tune_train_prosody, X_val=X_tune_val_prosody, y_train=y_tune_train_prosody, y_val=y_tune_val_prosody), n_trials=100, gc_after_trial=True)
# prosody_catboost_params = study.best_params
# CatBoost hyperparameters for the prosody model, hard-coded so the notebook
# does not have to re-run a hyperparameter search.
prosody_catboost_params = dict(
    learning_rate=0.05111140841167912,
    depth=5,
    subsample=0.5829725032194245,
    colsample_bylevel=0.5560076535650318,
    min_data_in_leaf=3,
    n_estimators=80,
)

In [13]:
# Train the prosody-only CatBoost model on the training split ...
cb_prosody = CatBoostClassifier(verbose=False, **prosody_catboost_params)
cb_prosody.fit(X=train_data_prosody, y=train_data["label"])

# ... and evaluate it at file level: one prediction per row of the test matrix.
preds = cb_prosody.predict(test_data_prosody)
print(classification_report(y_true=test_data["label"], y_pred=preds))

              precision    recall  f1-score   support

           0       0.88      0.52      0.66        42
           1       0.86      0.98      0.92       130

    accuracy                           0.87       172
   macro avg       0.87      0.75      0.79       172
weighted avg       0.87      0.87      0.85       172



In [14]:
# Participant-level evaluation of the prosody model: every recording is
# classified once, then the recordings that share a participant ID vote and the
# modal class becomes that participant's prediction.

# Metadata rows and feature rows are paired by position, so they must line up.
assert len(test_data) == len(test_data_prosody), "test_data and test_data_prosody must have one row per recording"

test_data_prosody_ids = pd.concat([test_data, pd.DataFrame(test_data_prosody)], axis=1)
# Participant ID = first two "-"-separated fields of the file path, concatenated.
test_data_prosody_ids["ID"] = test_data_prosody_ids["file_name"].apply(lambda x: str(x).split("-")[0] + str(x).split("-")[1])
IDs = test_data_prosody_ids["ID"].unique()

# Predict straight from the feature matrix in a single call instead of slicing
# features back out of the merged frame by column position for each participant.
recording_votes = pd.DataFrame({
    "ID": test_data_prosody_ids["ID"].to_numpy(),
    "label": test_data_prosody_ids["label"].to_numpy(),
    "pred": np.asarray(cb_prosody.predict(test_data_prosody)).ravel(),
})
# sort=False keeps participants in order of first appearance, matching `IDs`.
votes_by_participant = recording_votes.groupby("ID", sort=False)

# scipy.stats.mode returns a scalar or a length-1 array depending on the SciPy
# version; np.ravel(...)[0] normalises both to a scalar.
all_preds = votes_by_participant["pred"].agg(
    lambda votes: np.ravel(scipy.stats.mode(votes.to_numpy()).mode)[0]
).to_numpy()
# The participant's label is taken from their first recording.
all_labels = votes_by_participant["label"].first().to_numpy()

print(classification_report(all_labels, all_preds))

100%|██████████| 72/72 [00:00<00:00, 763.38it/s]

              precision    recall  f1-score   support

           0       0.93      0.62      0.74        21
           1       0.86      0.98      0.92        51

    accuracy                           0.88        72
   macro avg       0.90      0.80      0.83        72
weighted avg       0.88      0.88      0.87        72






### CatBoost and all features

In [15]:
# Stack the six feature families horizontally (np.hstack) into one matrix per
# split. The family order is the same for train, val and test.
train_data_all_features = np.hstack((
    train_data_mfcc,
    train_data_prosody,
    train_data_chroma,
    train_data_spectral,
    train_data_zrc,
    train_data_simple,
))
val_data_all_features = np.hstack((
    val_data_mfcc,
    val_data_prosody,
    val_data_chroma,
    val_data_spectral,
    val_data_zrc,
    val_data_simple,
))
test_data_all_features = np.hstack((
    test_data_mfcc,
    test_data_prosody,
    test_data_chroma,
    test_data_spectral,
    test_data_zrc,
    test_data_simple,
))

In [16]:
# del train_data_mfcc, train_data_prosody, train_data_chroma, train_data_spectral, train_data_zrc, train_data_simple
# del val_data_mfcc, val_data_prosody, val_data_chroma, val_data_spectral, val_data_zrc, val_data_simple
# del test_data_mfcc, test_data_prosody, test_data_chroma, test_data_spectral, test_data_zrc, test_data_simple

In [17]:
# Train a CatBoost model on the combined feature matrix. No hyperparameters
# are passed apart from silencing the training log.
cb_all = CatBoostClassifier(verbose=False)
cb_all.fit(X=train_data_all_features, y=train_data["label"])

<catboost.core.CatBoostClassifier at 0x79e0e7db50a0>

In [18]:
# File-level evaluation: one prediction per row of the combined test matrix.
preds = cb_all.predict(test_data_all_features)
print(classification_report(y_true=test_data["label"], y_pred=preds))

              precision    recall  f1-score   support

           0       0.80      0.57      0.67        42
           1       0.87      0.95      0.91       130

    accuracy                           0.86       172
   macro avg       0.84      0.76      0.79       172
weighted avg       0.86      0.86      0.85       172



In [19]:
# Participant-level evaluation of the all-features model: every recording is
# classified once, then the recordings that share a participant ID vote and the
# modal class becomes that participant's prediction.

# Metadata rows and feature rows are paired by position, so they must line up.
assert len(test_data) == len(test_data_all_features), "test_data and test_data_all_features must have one row per recording"

test_data_all_features_ids = pd.concat([test_data, pd.DataFrame(test_data_all_features)], axis=1)
# Participant ID = first two "-"-separated fields of the file path, concatenated.
test_data_all_features_ids["ID"] = test_data_all_features_ids["file_name"].apply(lambda x: str(x).split("-")[0] + str(x).split("-")[1])
IDs = test_data_all_features_ids["ID"].unique()

# Predict straight from the feature matrix in a single call instead of slicing
# features back out of the merged frame by column position for each participant.
recording_votes = pd.DataFrame({
    "ID": test_data_all_features_ids["ID"].to_numpy(),
    "label": test_data_all_features_ids["label"].to_numpy(),
    "pred": np.asarray(cb_all.predict(test_data_all_features)).ravel(),
})
# sort=False keeps participants in order of first appearance, matching `IDs`.
votes_by_participant = recording_votes.groupby("ID", sort=False)

# scipy.stats.mode returns a scalar or a length-1 array depending on the SciPy
# version; np.ravel(...)[0] normalises both to a scalar.
all_preds = votes_by_participant["pred"].agg(
    lambda votes: np.ravel(scipy.stats.mode(votes.to_numpy()).mode)[0]
).to_numpy()
# The participant's label is taken from their first recording.
all_labels = votes_by_participant["label"].first().to_numpy()

print(classification_report(all_labels, all_preds))

100%|██████████| 72/72 [00:08<00:00,  8.55it/s]

              precision    recall  f1-score   support

           0       0.83      0.71      0.77        21
           1       0.89      0.94      0.91        51

    accuracy                           0.88        72
   macro avg       0.86      0.83      0.84        72
weighted avg       0.87      0.88      0.87        72




