In [3]:
# !mkdir data_mp3
# !unzip -nq data/first/train.zip -d data_mp3
# !unzip -nq data/first/test.zip -d data_mp3

# !unzip -nq data/semifinal/train.zip -d data_mp3
# !unzip -nq data/semifinal/test.zip -d data_mp3
# !unzip -nq data/semifinal/final_test.zip -d data_mp3

# !unzip -nq data/final/train.zip -d data_mp3
# !unzip -nq data/final/test.zip -d data_mp3

In [4]:
import pandas as pd
import numpy as np
import time
import os
import pickle
import re

import soundfile as sf

from tqdm import tqdm
from scipy.signal import resample
from scipy.io import wavfile
from sklearn.model_selection import train_test_split

In [5]:
disorders_letters = dict()
disorders_letters[0] = []
disorders_letters[1] = ["р"]
disorders_letters[2] = ["г"]
disorders_letters[3] = []

target_letters = []
for letters in disorders_letters.values():
    target_letters.extend(letters)
target_letters

['р', 'г']

In [6]:
with open("data_processed/target_letters.pkl", "wb") as f:
    pickle.dump(target_letters, f)

with open("data_processed/disorders_letters.pkl", "wb") as f:
    pickle.dump(disorders_letters, f)

In [7]:
def mp3_to_wav(input_file, output_file, target_samplerate):
    data, samplerate = sf.read(input_file)
    data = resample(data, len(data) * (target_samplerate / samplerate))
    sf.write(output_file, data, target_samplerate, format='WAV')

In [8]:
def get_path_to_file(filename, subset, audio_format='wav'):
    path = f"data_{audio_format}/{subset}/{filename}.{audio_format}"
    return path

In [9]:
!mkdir data_wav

def convert_folder_to_wav(name="train", target_name=None, target_samplerate=16000):
    if target_name is None:
        target_name = name
    !mkdir "data_wav/{target_name}"


    for file in tqdm(os.listdir(f'data_mp3/{name}'), desc=name + '-->' + target_name):
        filename = file.rsplit('.', 1)[0]
        mp3_to_wav(get_path_to_file(filename, name, 'mp3'),
                       get_path_to_file(filename, target_name, 'wav'),
                      target_samplerate=target_samplerate)

A subdirectory or file data_wav already exists.


In [10]:
train = pd.read_csv("data/final/train.csv", header=None)
train[1].value_counts(normalize=True)

1
0    0.594820
1    0.342406
2    0.046971
3    0.015803
Name: proportion, dtype: float64

In [11]:
target_samplerate = 16000

# convert_folder_to_wav('train', target_samplerate=target_samplerate)
# convert_folder_to_wav('test', target_name='test', target_samplerate=target_samplerate)
# convert_folder_to_wav('final_test', target_name='test', target_samplerate=target_samplerate) # для теста с полуфинала

In [12]:
from IPython.display import Audio
data, _ = sf.read('data_wav/train/f304ef39-cd7b-4727-8fd4-06f0b23c659c.wav')

Audio(data, rate=16000)

# Words

In [14]:
def process_y(y):
    y = y.rename(columns={0: 'audio_name', 1: 'target'})
    y = y.set_index('audio_name')
    return y

In [15]:
def get_word_array(path, start, end):
    _, data = wavfile.read(path)
    start += 0.1
    end += 0.25

    data = data[int(target_samplerate*start):int(target_samplerate*end)]
    return data

In [16]:
threshold = 0.5 # Порог для confidence виспера

In [17]:
letters = set() # Какие буквы оставить (удаляем знаки препинания/цифры/английские символы из транскрибации)
for i in range(33):
    letters.add(chr(ord('а') + i))
print(letters)

{'х', 'ч', 'и', 'ц', 'с', 'у', 'в', 'ш', 'ж', 'б', 'щ', 'ъ', 'э', 'м', 'о', 'й', 'г', 'д', 'т', 'ю', 'п', 'ф', 'н', 'ь', 'з', 'я', 'л', 'р', 'а', 'ы', 'к', 'ѐ', 'е'}


In [18]:
def get_subset(data_type):
    if data_type.endswith('train'):
        return 'train'
    elif data_type.endswith('test'):
        return 'test'
    else:
        raise ValueError()

In [19]:
def process_whisper_res(whisper_res, y, target_letters):
    np.random.seed(42)

    data = []

    scale_factor = 2**15 # На что делим при переводе аудио из int во float

    for filename in tqdm(whisper_res.keys()):
        label, data_type = y.loc[filename, ['target', 'data_type']].values
        subset = get_subset(data_type)

        for segment in whisper_res[filename]['segments']:
            words = segment['words']
            for word in words:
                try:
                    text = word['text'].lower()
                except:
                    text = word['word'].lower()
                text = ''.join(letter for letter in text if letter in letters)
                if len(text) <= 2: # Слишком короткий текст
                    continue

                start = word['start']
                end = word['end']
                try:
                    confidence = word['confidence']
                except:
                    confidence = word['probability']
                if confidence < threshold : continue # Низкий confidence виспера

                d = {'file': filename, 'text':text, 'start':start, 'end': end, 
                     'confidence':confidence, 'label':label,
                    'data_type': data_type, 'subset': subset}

                for letter in target_letters:

                    count = text.count(letter)
                    d[f"{letter}_count"] = count
                
                
                data.append(d)

    data = pd.DataFrame(data)
    data['path'] = data[['file', 'subset']].apply(lambda x: f'data_wav/{x['subset']}/' + x['file'][:-4] + '.wav', axis=1)

    arrays = []
    
    drop_ids = []

    for idx, (path, start, end) in tqdm(data[['path', 'start', 'end']].iterrows(), total=len(data)):
        array = get_word_array(path, start, end) / scale_factor
        if len(array) == 0: # Пустой отрезок вырезали
            drop_ids.append(idx)
            continue
            
        arrays.append(array)

        
    data = data.drop(index=drop_ids)
    
    return data, arrays

### Train

In [21]:
import pickle

whisper_res = {}

for root, dirs, files in os.walk('data_whisper'):
    for file in files:
        path = os.path.join(root, file)

        with open(path, 'rb') as f:
            whisper_res_one_part = pickle.load(f)
    
        for k,v in whisper_res_one_part.items():
            v['data_type'] = 'first_stage'
            whisper_res[k] = v

In [22]:
train1 = pd.read_csv("data/first/train.csv", header=None)
train1['data_type'] = 'first_stage_train'

test1 = pd.read_csv("data/first/test.csv", header=None)
test1['data_type'] = 'first_stage_test'

train2 = pd.read_csv("data/semifinal/train.csv", header=None)
train2['data_type'] = 'semifinal_train'

test2_private = pd.read_csv("data/semifinal/final_test.csv", header=None)
test2_private['data_type'] = 'semifinal_test'

train3 = pd.read_csv("data/final/train.csv", header=None)
train3['data_type'] = 'final_train'

y = pd.concat([train1, test1, train2, test2_private, train3], axis=0, ignore_index=True)

y = process_y(y)
y

Unnamed: 0_level_0,target,data_type
audio_name,Unnamed: 1_level_1,Unnamed: 2_level_1
611b27e7-0019-4fc6-9622-21d9647c45f0.mp3,1.0,first_stage_train
67465147-b88c-4acd-bb91-a78340a9bde7.mp3,0.0,first_stage_train
257002c3-13ce-4408-853a-a5686a051d1c.mp3,0.0,first_stage_train
16c70c64-e167-40e5-a4be-cf861e84c497.mp3,0.0,first_stage_train
6ed94dcd-d1e5-4d98-b6ee-d86766d7bf50.mp3,1.0,first_stage_train
...,...,...
8cd48344-3644-4664-ac32-258abc77b1fa.mp3,0.0,final_train
43acbfff-9b19-49c8-95d4-e83bbd1f08eb.mp3,0.0,final_train
5b47c4f5-ac91-4930-837d-01de88e9a97c.mp3,0.0,final_train
246ccceb-d47a-49f9-9ae8-9a5307903ffc.mp3,1.0,final_train


In [23]:
_, val_files = train_test_split(list(y[y['data_type']== 'final_train'].index),
                                          test_size=0.2, random_state=42,
                                         stratify=y.loc[y['data_type']=='final_train', 'target'])

train_files = [file for file in y.index if file not in val_files]
len(train_files), len(val_files)

(17203, 456)

In [24]:
!mkdir "data_processed/words"

with open('data_processed/words/val_files.pkl', 'wb') as f:
    pickle.dump(val_files, f)

A subdirectory or file data_processed/words already exists.


In [25]:
y_train, y_val = y.loc[train_files], y.loc[val_files]

In [26]:
y_val['target'].value_counts(), y_val['target'].value_counts(normalize=True)

(target
 0.0    271
 1.0    156
 2.0     22
 3.0      7
 Name: count, dtype: int64,
 target
 0.0    0.594298
 1.0    0.342105
 2.0    0.048246
 3.0    0.015351
 Name: proportion, dtype: float64)

In [27]:
y_train.shape, y_val.shape

((17203, 2), (456, 2))

In [28]:
whisper_res_train = {file: whisper_res[file] for file in train_files}
whisper_res_val = {file: whisper_res[file] for file in val_files}

In [29]:
y.loc['3ba8871a-56d0-4612-8874-e16c04ec4ff6.mp3']

target                    NaN
data_type    first_stage_test
Name: 3ba8871a-56d0-4612-8874-e16c04ec4ff6.mp3, dtype: object

In [30]:
train, train_arrays = process_whisper_res(whisper_res_train, y, target_letters=target_letters)
assert len(train) == len(train_arrays)

100%|██████████████████████████████████████████████████████████████████████████| 17203/17203 [00:02<00:00, 6186.00it/s]
100%|████████████████████████████████████████████████████████████████████████| 111410/111410 [01:45<00:00, 1057.69it/s]


In [31]:
%%time
train.to_parquet('data_processed/words/train.parquet')

CPU times: total: 219 ms
Wall time: 245 ms


In [32]:
%%time

with open('data_processed/words/train_arrays.pkl', 'wb') as f:
    pickle.dump(train_arrays, f)

CPU times: total: 1min 5s
Wall time: 1min 5s


In [33]:
del train_arrays

In [34]:
val, val_arrays = process_whisper_res(whisper_res_val, y, target_letters=target_letters)

100%|██████████████████████████████████████████████████████████████████████████████| 456/456 [00:00<00:00, 6491.74it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2878/2878 [00:02<00:00, 1339.49it/s]


In [35]:
val.to_parquet('data_processed/words/val.parquet')

In [36]:
val.to_parquet('data_processed/words/val.parquet')

with open('data_processed/words/val_arrays.pkl', 'wb') as f:
    pickle.dump(val_arrays, f)    

In [37]:
del val_arrays

### Test

In [39]:
import pickle
    
with open("data_whisper/final.pickle", "rb") as f:
    whisper_res = pickle.load(f)

In [40]:
y = pd.read_csv('data/final/test.csv', header=None)
y = process_y(y)
y['data_type'] = 'final_test'
y

Unnamed: 0_level_0,target,data_type
audio_name,Unnamed: 1_level_1,Unnamed: 2_level_1
0291d295-aae8-4aee-98c6-51899e638b50.mp3,,final_test
bebe7afb-0081-4975-81cc-2b62d92376fd.mp3,,final_test
a8273eba-cbee-41ae-8918-28196e693dc7.mp3,,final_test
f6ad5636-d1cd-40f2-95aa-4a2dba5cb2c4.mp3,,final_test
a98eff13-e333-412f-a5df-66d6f77aa4d0.mp3,,final_test
...,...,...
54c991cc-2505-480c-8be2-adb69d896dfc.mp3,,final_test
dceb22e0-5587-40cb-a4ae-d326e6ae17e4.mp3,,final_test
4cdf0a12-bca6-493b-aea4-c120271f8479.mp3,,final_test
3527d1f1-eeb9-4783-b493-27f1dec831a0.mp3,,final_test


In [41]:
whisper_res = {audio : whisper_res[audio] for audio in y.index}

In [42]:
test, test_arrays = process_whisper_res(whisper_res, y, target_letters=target_letters)

100%|██████████████████████████████████████████████████████████████████████████████| 718/718 [00:00<00:00, 6276.40it/s]
100%|████████████████████████████████████████████████████████████████████████████| 4612/4612 [00:03<00:00, 1274.20it/s]


In [43]:
test.to_parquet('data_processed/words/test.parquet')

In [44]:
%%time
test.to_parquet('data_processed/words/test.parquet')

with open('data_processed/words/test_arrays.pkl', 'wb') as f:
    pickle.dump(test_arrays, f)

CPU times: total: 344 ms
Wall time: 342 ms


In [45]:
train

Unnamed: 0,file,text,start,end,confidence,label,data_type,subset,р_count,г_count,path
0,611b27e7-0019-4fc6-9622-21d9647c45f0.mp3,нет,0.00,0.60,0.504,1.0,first_stage_train,train,0,0,data_wav/train/611b27e7-0019-4fc6-9622-21d9647...
1,611b27e7-0019-4fc6-9622-21d9647c45f0.mp3,такого,0.80,1.04,0.990,1.0,first_stage_train,train,0,1,data_wav/train/611b27e7-0019-4fc6-9622-21d9647...
2,611b27e7-0019-4fc6-9622-21d9647c45f0.mp3,планируется,1.32,1.88,0.971,1.0,first_stage_train,train,1,0,data_wav/train/611b27e7-0019-4fc6-9622-21d9647...
3,67465147-b88c-4acd-bb91-a78340a9bde7.mp3,посмотреть,0.00,1.06,0.728,0.0,first_stage_train,train,1,0,data_wav/train/67465147-b88c-4acd-bb91-a78340a...
4,67465147-b88c-4acd-bb91-a78340a9bde7.mp3,пока,1.06,1.24,0.644,0.0,first_stage_train,train,0,0,data_wav/train/67465147-b88c-4acd-bb91-a78340a...
...,...,...,...,...,...,...,...,...,...,...,...
111405,67ee2e50-8c8a-49d8-b8f8-75cdda336930.mp3,потом,11.12,12.40,0.994,1.0,final_train,train,0,0,data_wav/train/67ee2e50-8c8a-49d8-b8f8-75cdda3...
111406,67ee2e50-8c8a-49d8-b8f8-75cdda336930.mp3,оплату,12.40,13.18,0.944,1.0,final_train,train,0,0,data_wav/train/67ee2e50-8c8a-49d8-b8f8-75cdda3...
111407,67ee2e50-8c8a-49d8-b8f8-75cdda336930.mp3,выгодную,13.32,13.84,0.677,1.0,final_train,train,0,1,data_wav/train/67ee2e50-8c8a-49d8-b8f8-75cdda3...
111408,67ee2e50-8c8a-49d8-b8f8-75cdda336930.mp3,работу,13.84,14.08,0.999,1.0,final_train,train,1,0,data_wav/train/67ee2e50-8c8a-49d8-b8f8-75cdda3...
