In [1]:
from src.utils import *

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from tqdm import tqdm

import torch

import scipy
import librosa

from sklearn.model_selection import train_test_split

# from pyannote.audio import Pipeline
# from pyannote.audio.pipelines.utils.hook import ProgressHook

In [2]:
DATA_DIR = os.path.join('..', 'data')
VOICES_DIR = os.path.join(DATA_DIR, 'Voices_wav')
APHASIA_DIR = os.path.join(VOICES_DIR, 'Aphasia')
NORM_DIR = os.path.join(VOICES_DIR, 'Norm')

In [3]:
meta_data_aphasia = pd.read_excel(os.path.join(DATA_DIR, "Demo RAT discourse production.xlsx"), sheet_name='пациенты')
meta_data_norm = pd.read_excel(os.path.join(DATA_DIR, "Demo RAT discourse production.xlsx"), sheet_name='норма')

meta_data_norm.drop(meta_data_norm.index[-1], axis=0, inplace=True)
meta_data_aphasia["Subj_ID"] = meta_data_aphasia["Subj_ID"].astype(str)
meta_data_norm.rename(columns={"№": "Subj_ID"}, inplace=True)
meta_data_norm["Subj_ID"] = meta_data_norm["Subj_ID"].astype(str)

In [4]:
meta_data_aphasia.rename(columns={'Stroked hemisphere (L/R/LR)': 'Stroked hemisphere',
                          'Aphasia/Norm (A/N), A includes all patients, even with just dysarthria': 'Aphasia/Norm', 
                          'Aphasia_Severity \n0 - no aphasia\n1 - very mild\n2 - mild\n3 - mild-moderate\n4 - moderate\n5 - moderate-severe\n6 - severe\n7 - very severe': 'Aphasia_Severity',
                          'Aphasia_Types\n1 - efferent motor\n2 - afferent motor\n3 - complex motor\n4 - dynamic\n5 - acoustic-mnestic\n6 - sensory\n7 - semantic\n8 - other': 'Aphasia_Types',
                          'Dominant_Aphasia\n1 - efferent motor\n2 - afferent motor\n3 - complex motor\n4 - dynamic\n5 - acoustic-mnestic\n6 - sensory\n7 - semantic\n8 - other': 'Dominant_Aphasia',
                          'Dysarthria (1/0)': 'Dysarthria',
                          'Dysarthria_Severity \n0 - no dysarthria\n1 - very mild\n2 - mild\n3 - mild-moderate\n4 - moderate\n5 - moderate-severe\n6 - severe\n7 - very severe': 'Dysarthria_Severity',
                          }, inplace=True)

In [5]:
sf_aphasia = []
sf_norm = []

noise_aphasia = {}
noise_norm = {}

empty_files = []

In [6]:
for audio_file in tqdm(os.listdir(APHASIA_DIR)):
    y, sr = librosa.load(os.path.join(APHASIA_DIR, audio_file), sr=None)
    duration = librosa.get_duration(y=y, sr=sr)
    
    spectral_flatness = np.mean(librosa.feature.spectral_flatness(y=y))
    roll_off = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    
    if spectral_flatness > 0.5:
        empty_files.append(audio_file)
        continue
        
    sf_aphasia.append([audio_file, spectral_flatness, roll_off])
    ind = audio_file.split('-')[1]
    if ind not in noise_aphasia:
        noise_aphasia[str(ind)] = False
    if roll_off < 1_000:
        noise_aphasia[str(ind)] = True

100%|██████████| 608/608 [00:32<00:00, 18.98it/s]


In [7]:
for audio_file in tqdm(os.listdir(NORM_DIR)):
    y, sr = librosa.load(os.path.join(NORM_DIR, audio_file), sr=None)
    duration = librosa.get_duration(y=y, sr=sr)
    
    spectral_flatness = np.mean(librosa.feature.spectral_flatness(y=y))
    roll_off = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    
    if spectral_flatness > 0.5:
        empty_files.append(audio_file)
        continue
    
    sf_norm.append([audio_file, spectral_flatness, roll_off])
    ind = audio_file.split('-')[1]
    if ind[0] == '0':
        ind = ind[1:]
    ind += '-НРАТ'
    
    if ind not in noise_norm:
        noise_norm[str(ind)] = False
    if roll_off < 1_000:
        noise_norm[str(ind)] = True

100%|██████████| 202/202 [00:03<00:00, 51.04it/s]


In [8]:
[x for x in noise_aphasia.keys() if x not in meta_data_aphasia["Subj_ID"].tolist()]

['409', '426', 'L45']

In [9]:
[x for x in noise_norm.keys() if x not in meta_data_norm["Subj_ID"].tolist()]

[]

In [10]:
meta_data_aphasia.shape

(353, 34)

In [11]:
meta_data_aphasia = meta_data_aphasia[meta_data_aphasia["Subj_ID"].isin(noise_aphasia.keys())]
meta_data_aphasia.shape

(253, 34)

In [12]:
meta_data_norm.shape

(101, 17)

In [13]:
meta_data_norm = meta_data_norm[meta_data_norm["Subj_ID"].isin(noise_norm.keys())]
meta_data_norm.shape

(101, 17)

In [14]:
meta_data_aphasia["has_noise"] = meta_data_aphasia["Subj_ID"].map(noise_aphasia)
meta_data_norm["has_noise"] = meta_data_norm["Subj_ID"].map(noise_norm)

In [15]:
meta_data_aphasia.head()

Unnamed: 0.1,Unnamed: 0,Subj_ID,Date_of_birth,Age,Sex (m/f),Education in years,Education level,Stroke_date,Stroked hemisphere,N of strokes,...,Time2.2_file_name,Time3.1_file_name,Time3.2_file_name,Time4.1_file_name,Time4.2_file_name,Time5.1_file_name,Time5.2_file_name,Time6.1_file_name,Time6.2_file_name,has_noise
0,13-РАТ,1,1957-12-04 00:00:00,58.0,f,,высшее,2005-03-27 00:00:00,L,1.0,...,,,,,,,,,,False
1,22-РАТ,7,1964-03-21 00:00:00,52.0,m,,высшее,2006-06-20 00:00:00,L,1.0,...,,,,,,,,,,False
2,55-РАТ,13,1962-12-06 00:00:00,53.0,m,,сред спец,2008-09-19 00:00:00,L,1.0,...,,A-13-RAT-3-robb,,A-13-RAT-4-bike,,,,,,True
3,Афанасенко СС,35,1963-10-21 00:00:00,53.0,m,,сред спец,2011-09-27 00:00:00,L,1.0,...,,A-35-RAT-3-bike,A-35-RAT-3-robb,,,,,,,True
4,Муксеева ЕА,51,1971-09-22 00:00:00,45.0,f,,высшее,2012-06-16 00:00:00,L,1.0,...,,,,,,,,,,False


In [16]:
meta_data_norm.head()

Unnamed: 0,Subj_ID,Порядок предъявления проб,Дата проведения РАТа,Возрастная группа,Дата рождения,Пол,Образование,Профессия,Место жительства,Рукость,Биллингвизм,Неврологические/психиатрические расстройства,Медицинские препараты,Проблемы со слухом/зрением,Комментарии,bike_file_name,robbery_file_name,has_noise
0,901-НРАТ,2,28.08.2016,2,19.05.1972,м,ученая степень,военнослужащий,Москва,правша,нет,нет,нет,нет,в порядке предъявления 2 (для нечетных) сначал...,N-0901-RAT-1-bike,N-0901-RAT-1-robb,False
1,902-НРАТ,1,28.08.2016,3,1965,ж,среднее специальное,медцина,Москва,правша,нет,нет,нет,нет,,N-0902-RAT-1-bike,N-0902-RAT-1-robb,False
2,903-НРАТ,2,30.08.2016,1,07.07.1995,ж,неоконченное высшее,студент,Санкт-Петербург,правша,нет,нет,нет,нет,,N-0903-RAT-1-bike,N-0903-RAT-1-robb,False
3,904-НРАТ,1,06.09.2016,1,09.01.1994,ж,неоконченное высшее,лингвистика,Москва,правша,да(чувашский),нет,нет,нет,,N-0904-RAT-1-bike,N-0904-RAT-1-robb,False
4,905-НРАТ,2,06.09.2016,1,03.10.1995,ж,неоконченное высшее,лингвистика,Химки,правша,нет,нет,нет,нет,,N-0905-RAT-1-bike,N-0905-RAT-1-robb,False


In [17]:
train_aphasia, test_aphasia = train_test_split(meta_data_aphasia, test_size=0.4, random_state=42, stratify=meta_data_aphasia["has_noise"])

val_aphasia, test_aphasia = train_test_split(test_aphasia, test_size=0.5, random_state=42, stratify=test_aphasia["has_noise"])

In [18]:
train_norm, test_norm = train_test_split(meta_data_norm, test_size=0.4, random_state=42, stratify=meta_data_norm["has_noise"])

val_norm, test_norm = train_test_split(test_norm, test_size=0.5, random_state=42, stratify=test_norm["has_noise"])

In [19]:
train_aphasia.shape, val_aphasia.shape, test_aphasia.shape

((151, 35), (51, 35), (51, 35))

In [20]:
train_norm.shape, val_norm.shape, test_norm.shape

((60, 18), (20, 18), (21, 18))

In [21]:
aphasia_files_list = os.listdir(APHASIA_DIR)
norm_files_list = os.listdir(NORM_DIR)

In [22]:
aphasia_files_dict = {}
norm_files_dict = {}

for audio_file in tqdm(aphasia_files_list):
    participant_id = audio_file.split('-')[1]
    if participant_id not in aphasia_files_dict:
        aphasia_files_dict[participant_id] = [audio_file]
    else:
        aphasia_files_dict[participant_id].append(audio_file)

for audio_file in tqdm(norm_files_list):
    participant_id = audio_file.split('-')[1]
    if participant_id not in norm_files_dict:
        norm_files_dict[participant_id] = [audio_file]
    else:
        norm_files_dict[participant_id].append(audio_file)

100%|██████████| 608/608 [00:00<00:00, 1349993.03it/s]
100%|██████████| 202/202 [00:00<00:00, 1007430.93it/s]


In [23]:
norm_files_dict

{'0941': ['N-0941-RAT-1-bike.wav', 'N-0941-RAT-1-robb.wav'],
 '0995': ['N-0995-RAT-1-bike.wav', 'N-0995-RAT-1-robb.wav'],
 '1011': ['N-1011-RAT-1-robb.wav', 'N-1011-RAT-1-bike.wav'],
 '0982': ['N-0982-RAT-1-robb.wav', 'N-0982-RAT-1-bike.wav'],
 '1006': ['N-1006-RAT-1-bike.wav', 'N-1006-RAT-1-robb.wav'],
 '0993': ['N-0993-RAT-1-robb.wav', 'N-0993-RAT-1-bike.wav'],
 '0933': ['N-0933-RAT-1-bike.wav', 'N-0933-RAT-1-robb.wav'],
 '1013': ['N-1013-RAT-1-bike.wav', 'N-1013-RAT-1-robb.wav'],
 '0969': ['N-0969-RAT-1-robb.wav', 'N-0969-RAT-1-bike.wav'],
 '0988': ['N-0988-RAT-1-robb.wav', 'N-0988-RAT-1-bike.wav'],
 '0970': ['N-0970-RAT-1-bike.wav', 'N-0970-RAT-1-robb.wav'],
 '0986': ['N-0986-RAT-1-robb.wav', 'N-0986-RAT-1-bike.wav'],
 '1018': ['N-1018-RAT-1-robb.wav', 'N-1018-RAT-1-bike.wav'],
 '1017': ['N-1017-RAT-1-bike.wav', 'N-1017-RAT-1-robb.wav'],
 '0965': ['N-0965-RAT-1-robb.wav', 'N-0965-RAT-1-bike.wav'],
 '0997': ['N-0997-RAT-1-bike.wav', 'N-0997-RAT-1-robb.wav'],
 '0925': ['N-0925-RAT-1-

In [24]:
def get_filename(df: pd.DataFrame, label: int):
    output = []
    file_columns = [x for x in df.columns if "file_name" in x]
    for i, row in df.iterrows():
        participant_id = row['Subj_ID'].split('-')[0]
        # print(participant_id)
        if label:
            file_names = [(x, label) for x in  aphasia_files_dict[participant_id] if x not in empty_files]
        else:
            # print(row)
            if len(participant_id) == 3:
                participant_id = '0' + participant_id
            file_names = [(x, label) for x in  norm_files_dict[participant_id] if x not in empty_files]
        output.extend(file_names)
    return output

In [25]:
train_filenames = get_filename(train_norm, 0) + get_filename(train_aphasia, 1)
val_filenames = get_filename(val_norm, 0) + get_filename(val_aphasia, 1)
test_filenames = get_filename(test_norm, 0) + get_filename(test_aphasia, 1)

In [26]:
len(train_filenames), len(val_filenames), len(test_filenames)

(472, 159, 172)

In [27]:
train_filenames = pd.DataFrame(train_filenames, columns=['file_name', 'label'])
val_filenames = pd.DataFrame(val_filenames, columns=['file_name', 'label'])
test_filenames = pd.DataFrame(test_filenames, columns=['file_name', 'label'])

In [28]:
train_filenames.to_csv(os.path.join(DATA_DIR, "train_filenames.csv"), index=False)
val_filenames.to_csv(os.path.join(DATA_DIR, "val_filenames.csv"), index=False)
test_filenames.to_csv(os.path.join(DATA_DIR, "test_filenames.csv"), index=False)

In [29]:
train_aphasia.to_csv(os.path.join(DATA_DIR, "train_aphasia.csv"), index=False)
val_aphasia.to_csv(os.path.join(DATA_DIR, "val_aphasia.csv"), index=False)
test_aphasia.to_csv(os.path.join(DATA_DIR, "test_aphasia.csv"), index=False)

train_norm.to_csv(os.path.join(DATA_DIR, "train_norm.csv"), index=False)
val_norm.to_csv(os.path.join(DATA_DIR, "val_norm.csv"), index=False)
test_norm.to_csv(os.path.join(DATA_DIR, "test_norm.csv"), index=False)

In [30]:
empty_files

['A-928-RAT-1-robb.wav']