In [1]:
import warnings
for warn in [UserWarning, FutureWarning]: warnings.filterwarnings("ignore", category = warn)

import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import hub
from torch.utils.data import Dataset, DataLoader
import torchaudio
from torchaudio.transforms import Resample

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, recall_score, precision_score, balanced_accuracy_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import scipy

from src.utils import *

from tqdm import tqdm

from datasets import load_dataset, Dataset, Audio
import librosa

from models.cnn import MobileNet
from models.CarefulWhisper import CarefulWhisper, WhisperW2V

from flaml import AutoML

2025-08-30 13:11:39.243092: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-30 13:11:39.358997: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756548699.411335   36262 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756548699.424148   36262 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756548699.532917   36262 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
SEED = 1984

# Seed the global NumPy and PyTorch random number generators with the same value.
for _seed_fn in (np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# A standalone torch generator seeded with the same value.
gen = torch.Generator()
gen.manual_seed(SEED)

<torch._C.Generator at 0x735039631eb0>

In [3]:
# Directory layout: ../data/Voices_wav/{Aphasia,Norm}
DATA_DIR = os.path.join('..', 'data')
VOICES_DIR = os.path.join(DATA_DIR, 'Voices_wav')
APHASIA_DIR, NORM_DIR = (
    os.path.join(VOICES_DIR, class_dir) for class_dir in ('Aphasia', 'Norm')
)

# dataset_train = AphasiaDatasetWaveform(os.path.join(DATA_DIR, "train_filenames_mc.csv"), VOICES_DIR, target_sample_rate=16_000, file_format="wav")
# dataset_val = AphasiaDatasetWaveform(os.path.join(DATA_DIR, "val_filenames_mc.csv"), VOICES_DIR, target_sample_rate=16_000, file_format="wav")
# dataset_test = AphasiaDatasetWaveform(os.path.join(DATA_DIR, "test_filenames_mc.csv"), VOICES_DIR, target_sample_rate=16_000, file_format="wav")

In [4]:
# Device string handed to the model wrapper; this notebook runs it on the GPU.
DEVICE = "cuda"
model = WhisperW2V(DEVICE)

Device set to use cuda


In [5]:
def preprocess_audio(dataset, target_sample_rate=16_000, max_seconds=20):
    """Run the global ``model`` over every audio file listed in ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide ``file_name`` and ``label`` columns. Rows with
        ``label == 0`` are read from ``NORM_DIR``; every other label is read
        from ``APHASIA_DIR``.
    target_sample_rate : int, default 16_000
        Sample rate (Hz) the waveform is resampled to when the file differs.
    max_seconds : int or float, default 20
        Audio is truncated to this many seconds (after resampling).

    Returns
    -------
    list of tuple
        One ``(clean, acoustic, label)`` tuple per row, where ``clean`` and
        ``acoustic`` are the two values returned by ``model(waveform)``
        (presumably two transcripts of the recording -- confirm against
        ``WhisperW2V``).
    """
    max_samples = int(target_sample_rate * max_seconds)
    # One Resample transform per distinct source rate instead of one per file.
    resamplers = {}
    data = []

    # Pure inference: no autograd bookkeeping is needed.
    with torch.no_grad():
        # `total` lets tqdm render an actual progress bar / ETA for the generator.
        for _, x in tqdm(dataset.iterrows(), total=len(dataset)):
            class_dir = NORM_DIR if x["label"] == 0 else APHASIA_DIR
            file_path = os.path.join(class_dir, x["file_name"])
            waveform, sample_rate = torchaudio.load(file_path)

            if sample_rate != target_sample_rate:
                if sample_rate not in resamplers:
                    resamplers[sample_rate] = Resample(sample_rate, target_sample_rate)
                waveform = resamplers[sample_rate](waveform)

            # Keep only the leading `max_seconds` of audio along the last axis.
            waveform = waveform[..., :max_samples]
            clean, acoustic = model(waveform)
            data.append((clean, acoustic, x["label"]))
    return data

In [6]:
# NOTE(review): the variable is named `dataset_train`, but the file read here is
# 'test_filenames.csv' -- confirm which split is intended.
filenames_csv = os.path.join(DATA_DIR, 'test_filenames.csv')
dataset_train = pd.read_csv(filenames_csv)

In [7]:
# Long-running cell: the recorded run took about 1h07m for 172 files.
train_texts = preprocess_audio(dataset=dataset_train)

0it [00:00, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
10it [04:10, 24.88s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
172it [1:06:56, 23.35s/it]


In [8]:
# Turn the list of (clean, acoustic, label) tuples into a table with named columns.
text_columns = ['clean', 'acoustic', 'target']
df = pd.DataFrame(train_texts, columns=text_columns)

In [9]:
# Persist the model outputs to disk, without the index column.
text_data_csv = os.path.join(DATA_DIR, 'test_text_data.csv')
df.to_csv(text_data_csv, index=False)

In [10]:
# Load the saved table back from disk; `df` now holds the CSV round-tripped copy.
text_data_csv = os.path.join(DATA_DIR, 'test_text_data.csv')
df = pd.read_csv(text_data_csv)

In [11]:
# Build one record per row of `df`:
#   query  -> the row's "clean" text
#   pos    -> an "acoustic" text from a target != 0 row
#   neg    -> an "acoustic" text from a target == 0 row
#   prompt -> the row's own "acoustic" text
# The row's own acoustic text fills the slot matching its class; the other slot
# is filled with a randomly sampled acoustic text from the opposite class.

# Class pools are loop-invariant, so filter them once instead of once per row.
# `.sample(1)` is kept so the draws still come from NumPy's global RNG.
is_norm = df["target"] == 0
norm_acoustic = df.loc[is_norm, "acoustic"]
aphasia_acoustic = df.loc[~is_norm, "acoustic"]

train_data_json = []

for _, row in df.iterrows():
    if row["target"] == 0:
        positive_text = aphasia_acoustic.sample(1).values[0]
        negative_text = row["acoustic"]
    else:
        # Every non-zero label is treated as the positive class, matching the
        # `target != 0` pool above, so no record is left without pos/neg keys.
        positive_text = row["acoustic"]
        negative_text = norm_acoustic.sample(1).values[0]

    train_data_json.append({
        "query": row["clean"],
        "pos": [positive_text],
        "neg": [negative_text],
        "pos_scores": [1.0],
        "neg_scores": [0.0],
        "prompt": row["acoustic"],
    })

In [12]:
# Display the first generated record as a quick sanity check.
train_data_json[0]

{'query': 'Было прекрасное осеннее утро, ну, или августовское, можно сказать, даже, теплое такое, хорошее.',
 'pos': ['ать историю рассказать беди двушка шел с качина звиной звершино проехала и моне изяскила это собери себе вот'],
 'neg': ['было прекрасное синее утро нуля августовское можно скоять дляжен теплое такое хорошее и'],
 'pos_scores': [1.0],
 'neg_scores': [0.0],
 'prompt': 'было прекрасное синее утро нуля августовское можно скоять дляжен теплое такое хорошее и'}

In [13]:
import json

# NOTE(review): the records written here were built from 'test_text_data.csv',
# yet the output file is named 'train.json' -- confirm the intended split.
train_json_path = os.path.join(DATA_DIR, 'train.json')

# Use a context manager so the file is flushed and closed deterministically.
with open(train_json_path, 'w', encoding='utf-8') as out_file:
    json.dump(train_data_json, out_file)