In [1]:
import warnings
for warn in [UserWarning, FutureWarning]: warnings.filterwarnings("ignore", category = warn)

import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import hub
from torch.utils.data import Dataset, DataLoader
import torchaudio
from torchaudio.transforms import Resample

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, recall_score, precision_score, balanced_accuracy_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import scipy

from src.utils import *

from tqdm import tqdm

from datasets import load_dataset, Dataset, Audio
import librosa

from models.cnn import MobileNet
from models.CarefulWhisper import CarefulWhisper

from flaml import AutoML

  process_text = re.sub("[^\w\s]+", " ", process_text)
2025-08-07 16:21:21.062852: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-07 16:21:21.149247: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754572881.189939   11906 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754572881.205302   11906 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-07 16:21:21.296972: I tensorflow/core/platform/cpu_feature_guard.cc:210] This Te

In [2]:
# Single seed shared by every random number generator used in this notebook.
SEED = 1984

# Seed the global torch and NumPy generators (the two calls are independent).
torch.manual_seed(SEED)
np.random.seed(SEED)

# Dedicated, explicitly seeded torch generator, separate from torch's global one.
gen = torch.Generator()
gen.manual_seed(SEED)

<torch._C.Generator at 0x7c79a0539a50>

In [3]:
# Directory layout (relative to the working directory):
#   ../data/Voices_wav/Aphasia  and  ../data/Voices_wav/Norm
DATA_DIR = os.path.join("..", "data")
VOICES_DIR = os.path.join(DATA_DIR, "Voices_wav")
APHASIA_DIR, NORM_DIR = (
    os.path.join(VOICES_DIR, class_dir) for class_dir in ("Aphasia", "Norm")
)

# dataset_train = AphasiaDatasetWaveform(os.path.join(DATA_DIR, "train_filenames_mc.csv"), VOICES_DIR, target_sample_rate=16_000, file_format="wav")
# dataset_val = AphasiaDatasetWaveform(os.path.join(DATA_DIR, "val_filenames_mc.csv"), VOICES_DIR, target_sample_rate=16_000, file_format="wav")
# dataset_test = AphasiaDatasetWaveform(os.path.join(DATA_DIR, "test_filenames_mc.csv"), VOICES_DIR, target_sample_rate=16_000, file_format="wav")

In [4]:
# Build the project's CarefulWhisper model that the extraction loops below call on
# each waveform. NOTE(review): "cuda" is presumably the target device — confirm
# against models/CarefulWhisper.
model = CarefulWhisper("cuda")

Device set to use cuda
Device set to use cuda:0


In [5]:
# Training split manifest; the extraction loop reads its "file_name" and "label" columns.
dataset_train = pd.read_csv(os.path.join(DATA_DIR, 'train_filenames_mc.csv'))

In [6]:
# Run every training recording through `model` and collect (features, label) pairs.
train_data = []

# Pure feature extraction: disable autograd so the outputs kept in `train_data`
# do not hold computation graphs alive for the whole (hours-long) loop.
with torch.no_grad():
    # iterrows() has no len(); pass total= so tqdm can show percentage and ETA.
    for ind, x in tqdm(dataset_train.iterrows(), total=len(dataset_train)):
        # Label 0 files are stored under Norm/, every other label under Aphasia/.
        if x["label"] == 0:
            file_path = os.path.join(NORM_DIR, x["file_name"])
        else:
            file_path = os.path.join(APHASIA_DIR, x["file_name"])
        waveform, sample_rate = torchaudio.load(file_path)

        # Bring everything to 16 kHz before it reaches the model.
        if sample_rate != 16_000:
            resampler = Resample(sample_rate, 16_000)
            waveform = resampler(waveform)
        # Keep at most the first 20 seconds (16_000 samples/s * 20 s).
        waveform = waveform[..., :16_000 * 20]
        output = model(waveform).squeeze()
        train_data.append((output, x["label"]))

0it [00:00, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
10it [04:04, 23.67s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
281it [1:50:28, 23.01s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (1157 > 1024). Running this sequence through the model will result in indexing errors
485it [3:09:50, 23.48s/it]


In [10]:
# Build the training matrix: one row per recording, the feature vector followed by
# its label as the last element.
# Torch tensors are converted explicitly (detach -> CPU -> NumPy): the implicit
# conversion inside np.concatenate raises for CUDA tensors and for tensors that
# require grad. Anything that is not a tensor goes through unchanged.
train_data_np = np.array([
    np.concatenate((
        arr.detach().cpu().numpy() if torch.is_tensor(arr) else arr,
        [num],
    ))
    for arr, num in train_data
])

In [11]:
# Persist the training matrix as .npy; the relative path resolves against the
# current working directory.
np.save('train_data_careful_whisper_mc.npy', train_data_np)

In [12]:
# Validation split manifest; the loop reads its "file_name" and "label" columns.
dataset_val = pd.read_csv(os.path.join(DATA_DIR, 'val_filenames_mc.csv'))
# Run every validation recording through `model` and collect (features, label) pairs.
val_data = []

# Pure feature extraction: disable autograd so the outputs kept in `val_data`
# do not hold computation graphs alive for the whole loop.
with torch.no_grad():
    # iterrows() has no len(); pass total= so tqdm can show percentage and ETA.
    for ind, x in tqdm(dataset_val.iterrows(), total=len(dataset_val)):
        # Label 0 files are stored under Norm/, every other label under Aphasia/.
        if x["label"] == 0:
            file_path = os.path.join(NORM_DIR, x["file_name"])
        else:
            file_path = os.path.join(APHASIA_DIR, x["file_name"])
        waveform, sample_rate = torchaudio.load(file_path)

        # Bring everything to 16 kHz before it reaches the model.
        if sample_rate != 16_000:
            resampler = Resample(sample_rate, 16_000)
            waveform = resampler(waveform)
        # Keep at most the first 20 seconds (16_000 samples/s * 20 s).
        waveform = waveform[..., :16_000 * 20]
        output = model(waveform).squeeze()
        val_data.append((output, x["label"]))

149it [57:58, 23.35s/it]


In [13]:
# Build the validation matrix: one row per recording, the feature vector followed by
# its label as the last element.
# Torch tensors are converted explicitly (detach -> CPU -> NumPy): the implicit
# conversion inside np.concatenate raises for CUDA tensors and for tensors that
# require grad. Anything that is not a tensor goes through unchanged.
val_data_np = np.array([
    np.concatenate((
        arr.detach().cpu().numpy() if torch.is_tensor(arr) else arr,
        [num],
    ))
    for arr, num in val_data
])

In [14]:
# Persist the validation matrix as .npy; the relative path resolves against the
# current working directory.
np.save('val_data_careful_whisper_mc.npy', val_data_np)

In [15]:
# Test split manifest; the loop reads its "file_name" and "label" columns.
dataset_test = pd.read_csv(os.path.join(DATA_DIR, 'test_filenames_mc.csv'))
# Run every test recording through `model` and collect (features, label) pairs.
test_data = []

# Pure feature extraction: disable autograd so the outputs kept in `test_data`
# do not hold computation graphs alive for the whole loop.
with torch.no_grad():
    # iterrows() has no len(); pass total= so tqdm can show percentage and ETA.
    for ind, x in tqdm(dataset_test.iterrows(), total=len(dataset_test)):
        # Label 0 files are stored under Norm/, every other label under Aphasia/.
        if x["label"] == 0:
            file_path = os.path.join(NORM_DIR, x["file_name"])
        else:
            file_path = os.path.join(APHASIA_DIR, x["file_name"])
        waveform, sample_rate = torchaudio.load(file_path)

        # Bring everything to 16 kHz before it reaches the model.
        if sample_rate != 16_000:
            resampler = Resample(sample_rate, 16_000)
            waveform = resampler(waveform)
        # Keep at most the first 20 seconds (16_000 samples/s * 20 s).
        waveform = waveform[..., :16_000 * 20]
        output = model(waveform).squeeze()
        test_data.append((output, x["label"]))

140it [54:29, 23.35s/it]


In [16]:
# Build the test matrix: one row per recording, the feature vector followed by
# its label as the last element.
# Torch tensors are converted explicitly (detach -> CPU -> NumPy): the implicit
# conversion inside np.concatenate raises for CUDA tensors and for tensors that
# require grad. Anything that is not a tensor goes through unchanged.
test_data_np = np.array([
    np.concatenate((
        arr.detach().cpu().numpy() if torch.is_tensor(arr) else arr,
        [num],
    ))
    for arr, num in test_data
])

In [17]:
# Persist the test matrix as .npy; the relative path resolves against the
# current working directory.
np.save('test_data_careful_whisper_mc.npy', test_data_np)