In [1]:
import pandas as pd
import glob
import os
from tqdm import tqdm
import numpy as np
import jiwer
from pathlib import Path
from random import sample

import torch
import torch.utils.data

import io
import codecs
from torchtext import data

import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.data import Field, BucketIterator
from torchtext.datasets import TranslationDataset

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time

# Turn off pandas' chained-assignment check, so SettingWithCopyWarning is not reported in this notebook.
pd.options.mode.chained_assignment = None
# Seed NumPy's global random number generator.
np.random.seed(42)

In [2]:
# One seed shared by every random number generator used in this notebook.
SEED = 42

# Seed Python's `random`, NumPy, and PyTorch (CPU, then CUDA), in that order.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)

# Request deterministic cuDNN algorithms.
torch.backends.cudnn.deterministic = True

### Read ASR Data

In [3]:
# Load the prepared target sentences that were used for TTS and rename the
# text column 'sentence' to 'tts_sentence'.
df_mtsamples = (
    pd.read_csv('normalized_mtsamples.csv', index_col=False)
    .rename(columns={'sentence': 'tts_sentence'})
)

# Folder that holds the ASR output text files (passed to fill_df).
texts_directory = "/SSD-2T/medical_domain_adaptation_dataset/texts/"

In [4]:
def fill_df(df, texts_directory):
    """Fill dataframe with ASR predictions.

    Every ``<id>.txt`` file found directly inside ``texts_directory`` is read;
    the first line, with all whitespace runs collapsed to single spaces and
    the ends trimmed, is stored in ``df['predicted']`` at the row whose index
    label equals ``<id>``. The dataframe is modified in place.

    Rows without a matching file get NaN, an empty file yields an empty
    string, and files whose id is not an index label of ``df`` are ignored.

       inputs: df - dataframe with target sentences,
               texts_directory - path to the folder with ASR outputs
                                 (a trailing path separator is optional)
       raises: ValueError if a file name does not start with an integer id.
    """
    text_paths = glob.glob(os.path.join(texts_directory, "*.txt"))

    predictions = {}
    for path in tqdm(text_paths):
        text_id = int(os.path.basename(path).split('.')[0])

        # the context manager closes the handle even when reading fails;
        # readline() returns '' for an empty file instead of raising
        with open(path, "r") as file:
            first_line = file.readline()

        # split()/join collapses every run of whitespace and strips both ends
        predictions[text_id] = ' '.join(first_line.split())

    # One aligned assignment instead of per-row chained indexing; the object
    # dtype lets strings and the NaN placeholder share the column.
    df['predicted'] = pd.Series(predictions, dtype=object).reindex(df.index)
            
fill_df(df_mtsamples, texts_directory)

100%|██████████| 94128/94128 [00:29<00:00, 3159.09it/s]


In [5]:
def fill_targets(df):
    """Fill dataframe with targets for Transformer.

    Each sentence in ``df['tts_sentence']`` has the characters
    ``. , : ; ! ? ( ) - _ * "`` removed, every run of whitespace collapsed to
    a single space and both ends trimmed. The results are stored, in row
    order, in the new column ``df['targets']`` (in-place modification).

       inputs: df - dataframe with target sentences that were used for TTS
    """
    # one translation table instead of twelve chained str.replace calls
    drop_punctuation = str.maketrans('', '', '.,:;!?()-_*"')

    targets = []
    # iterate over the values themselves so any dataframe index works
    for text in tqdm(df['tts_sentence'].tolist()):
        text = text.translate(drop_punctuation)
        # removing punctuation can leave several spaces in a row
        # ("a - - b" -> "a   b"); split()/join collapses all of them
        targets.append(' '.join(text.split()))

    df['targets'] = targets
    
fill_targets(df_mtsamples)

100%|██████████| 94128/94128 [00:00<00:00, 204671.47it/s]


In [6]:
# save dataframe
df_mtsamples.to_csv('data/medical_domain_adaptation_dataset.csv',index = False)

### Calculate metrics

In [7]:
def get_wers(df):
    """Calculate WER for each target sentence and ASR output.

    Rows are paired by position, so the dataframe may carry any index.

       inputs: df - dataframe with target sentences ('targets' column)
                    and ASR outputs ('predicted' column).
       returns: wers - list of per-sentence WER values, in row order.
    """
    ground_truths = df['targets'].tolist()
    hypotheses = df['predicted'].tolist()

    wers = []
    # zip() has no length, so pass the total explicitly to keep the progress bar
    for ground_truth, hypothesis in tqdm(zip(ground_truths, hypotheses),
                                         total=len(ground_truths)):
        wers.append(jiwer.wer(ground_truth, hypothesis))
    return wers

wers=get_wers(df_mtsamples)
# NOTE: mean / std are taken over the per-sentence WER values, so every
# sentence counts equally regardless of its length.
print('Mean WER: ' + str(np.mean(wers)))
print('std WER: ' + str(np.std(wers)))

100%|██████████| 94128/94128 [00:02<00:00, 34441.27it/s]

Mean WER: 0.19598329766842254
std WER: 0.2544473036273024





In [8]:
def apply_transformations(targets, predictions):
    """Run the jiwer normalization pipeline over paired sentences.

       inputs: targets - list of ground truth sentences,
               predictions - list of predicted sentences.
       returns: references_corpus - one entry per sentence; each entry is a
                                    one-element list wrapping the transformed target,
                candidate_corpus - one entry per sentence holding the
                                   transformed prediction.
    """
    # Steps, in order: ToLowerCase, RemoveMultipleSpaces, Strip,
    # RemoveEmptyStrings, SentencesToListOfWords (delimiter: one space).
    normalize = jiwer.Compose([
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.RemoveEmptyStrings(),
        jiwer.SentencesToListOfWords(word_delimiter=" "),
    ])

    references_corpus, candidate_corpus = [], []

    for idx in tqdm(range(len(targets))):
        target_words = normalize(targets[idx])
        prediction_words = normalize(predictions[idx])
        # the reference is wrapped in its own list; the candidate is not
        references_corpus.append([target_words])
        candidate_corpus.append(prediction_words)

    return references_corpus, candidate_corpus
        

references_corpus, candidate_corpus = apply_transformations(df_mtsamples['targets'].tolist(), 
                                                            df_mtsamples['predicted'].tolist())

100%|██████████| 94128/94128 [00:00<00:00, 121124.36it/s]


In [9]:
from torchtext.data.metrics import bleu_score

# calculate BLEU score
# The result gets its own name: rebinding `bleu_score` would replace the
# imported function with a float and make this cell fail when it is run again.
corpus_bleu = bleu_score(candidate_corpus, references_corpus)
print('BLEU Score: '+str(corpus_bleu))

BLEU Score: 0.710877001285553


### Word statistics

In [10]:
# Calculate words statistics for dataset
# Words per sentence: each reference entry wraps its word list in a
# one-element list, while each candidate entry is the word list itself.
num_words_references_corpus = [len(reference_entry[0]) for reference_entry in references_corpus]
num_words_candidate_corpus = [len(candidate_words) for candidate_words in candidate_corpus]

# Print mean / min / max / std of the sentence lengths for both corpora.
for headline, word_counts in (
        ('Num of words in sentence for references corpus: mean - ', num_words_references_corpus),
        ('Num of words in sentence for candidate corpus: mean - ', num_words_candidate_corpus)):
    summary = ''.join([
        headline,
        str(np.mean(word_counts)),
        ', min - ', str(np.min(word_counts)),
        ', max - ', str(np.max(word_counts)),
        ', std - ', str(np.std(word_counts)),
    ])
    print(summary)

Num of words in sentence for references corpus: mean - 11.185757691653919, min - 1, max - 42, std - 5.924197336921742
Num of words in sentence for candidate corpus: mean - 11.602137514873364, min - 1, max - 41, std - 5.971627533062848


### Make TorchText Dataset

#### Save to separate files

In [4]:
# create directory for dataset
Path('data preparation/data/MDA94k').mkdir(parents=True, exist_ok=True)

In [8]:
# split randomly on training and validation data
# validation_length rows are held out; all remaining rows are used for training
validation_length = 3128
train_length = len(df_mtsamples)-validation_length

# drawing len(df) items from range(len(df)) without replacement gives a
# random permutation of all row positions
indexes = sample(range(len(df_mtsamples)), len(df_mtsamples))
train_indexes = indexes[:train_length]
validation_indexes = indexes[train_length:]

In [9]:
def _select_rows(column, rows):
    """Return the values obtained with column[row] for every row, in order."""
    return [column[row] for row in rows]


targets_column = df_mtsamples['targets']
predicted_column = df_mtsamples['predicted']

# references are the cleaned TTS sentences
train_references = _select_rows(targets_column, train_indexes)
validation_references = _select_rows(targets_column, validation_indexes)

# candidates are the ASR outputs for the same rows
train_candidates = _select_rows(predicted_column, train_indexes)
validation_candidates = _select_rows(predicted_column, validation_indexes)

In [10]:
def list2txt(data, filepath):
    """Save list of strings to txt file, one string per line.

    The file is created or overwritten, so calling this again for the same
    path never appends a second copy of the data. Strings are separated by a
    newline; no newline is written after the last one.

       inputs: data - list of strings,
               filepath - path with filename of txt file.
       returns: None.
    """
    # "w" (not "a+") keeps re-runs idempotent; the context manager closes the
    # handle even if the write fails
    with open(filepath, "w", encoding="utf-8") as file:
        file.write('\n'.join(data))

# save files
list2txt(train_references, filepath='data preparation/data/MDA94k/train.references')  
list2txt(validation_references, filepath='data preparation/data/MDA94k/val.references')
list2txt(train_candidates, filepath='data preparation/data/MDA94k/train.candidates')  
list2txt(validation_candidates, filepath='data preparation/data/MDA94k/val.candidates')    

#### Load into the TranslationDataset class

In [5]:
#!python -m spacy download en_core_web_sm
import en_core_web_sm

In [6]:
# load spacy tokenizers
spacy_en = en_core_web_sm.load()

def tokenize_en(text):
    """
    Split an English string into a list of token strings with the spaCy tokenizer
    """
    tokens = spacy_en.tokenizer(text)
    return [token.text for token in tokens]

# Source-side field (paired with the '.candidates' files when the dataset is
# loaded): tokenized with tokenize_en, lowercased, wrapped in <sos>/<eos>,
# batch dimension first.
SRC = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

# Target-side field (paired with the '.references' files); same settings as SRC.
TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)



In [7]:
# making TranslationDataset class for our dataset in order to make data in torchtext format
class MDA94k(TranslationDataset):
    """Small custom parallel dataset for Medical Domain Adaptation."""

    # folder, relative to `root`, that holds the split files
    name = 'data preparation/data/MDA94k'
    dirname = ''

    @classmethod
    def splits(cls, exts, fields, root='',
               train='train', validation='val', test=None, **kwargs):
        """Build the dataset objects for the MDA94k splits.

        Arguments:
            exts: A tuple containing the extension to path for each language.
            fields: A tuple containing the fields that will be used for data
                in each language.
            root: Root dataset storage directory. Default: ''.
            train: The prefix of the train data. Default: 'train'.
            validation: The prefix of the validation data. Default: 'val'.
            test: The prefix of the test data. Default: None.
            Remaining keyword arguments: Passed to the splits method of
                the parent class. A 'path' keyword, when given, is used as
                the dataset folder instead of os.path.join(root, cls.name).
        """
        # TODO: workaround related to #208 - 'path' may arrive as a keyword
        # argument; otherwise it is derived from root and cls.name, and is
        # None when that folder does not exist.
        if 'path' in kwargs:
            path = kwargs.pop('path')
        else:
            default_folder = os.path.join(root, cls.name)
            path = default_folder if os.path.exists(default_folder) else None

        return super().splits(
            exts, fields, path, root, train, validation, test, **kwargs)

In [8]:
# load our data as torchtext Example
# '.candidates' files are read into the SRC field and '.references' files into TRG
train_data, valid_data = MDA94k.splits(exts = ('.candidates', '.references'), 
                                       fields = (SRC, TRG))



In [9]:
# print example
print(vars(train_data[3]))

{'src': ['computer', 'typography', 'head', 'technique'], 'trg': ['computed', 'tomography', 'head', 'technique']}


In [10]:
# print example
print(vars(valid_data[0]))

{'src': ['as', 'well', 'as', 'right', 'index', 'finger', 'soreness', 'at', 'the', 'peak', 'it', "'s", 'territory', 'pressure', 'joint'], 'trg': ['as', 'well', 'as', 'right', 'index', 'finger', 'soreness', 'at', 'the', 'peak', 'inspiratory', 'pressure', 'joint']}


In [8]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")

Number of training examples: 91000
Number of validation examples: 3128


In [11]:
# make vocabularies
# both vocabularies are built from the training split only; min_freq = 2 is
# torchtext's minimum token frequency for inclusion in the vocabulary
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [10]:
print(f"Unique tokens in source (ASR output) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (TTS input) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (ASR output) vocabulary: 14674
Unique tokens in target (TTS input) vocabulary: 13697


In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
# making data iterators
# one iterator per split, each yielding batches of BATCH_SIZE examples on `device`

BATCH_SIZE = 128

train_iterator, valid_iterator = BucketIterator.splits((train_data, valid_data), 
                                                        batch_size = BATCH_SIZE,
                                                        device = device)



### Define Model

In [14]:
from transformer import Encoder, Decoder, Seq2Seq

# the vocabulary sizes are passed as the first Encoder / Decoder argument
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# hyperparameters handed to the Encoder / Decoder constructors below
# (exact meaning is defined in the local `transformer` module)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

# encoder over the source (ASR output) vocabulary
enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

# decoder over the target vocabulary
dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [21]:
# vocabulary indices of the padding token on each side
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# full sequence-to-sequence model, moved to the selected device
model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [22]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    trainable_total = 0
    for parameter in model.parameters():
        # parameters with requires_grad switched off are not counted
        if parameter.requires_grad:
            trainable_total += parameter.numel()
    return trainable_total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 14,787,969 trainable parameters


In [23]:
def initialize_weights(m):
    """Xavier-uniform initialise the weight of a module, in place.

    Meant to be used as ``model.apply(initialize_weights)``. Only weights
    with more than one dimension are re-initialised; modules without a
    ``weight`` attribute, or whose ``weight`` is ``None``, are left untouched.
    """
    weight = getattr(m, 'weight', None)
    # some modules register `weight` as None (for example normalisation layers
    # created without affine parameters); calling .dim() on that would raise
    if weight is not None and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)

In [24]:
model.apply(initialize_weights);

### Training

In [25]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [26]:
# target positions equal to the padding index are ignored by the loss
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [27]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training pass over `iterator` and return the mean batch loss.

    For every batch the model receives the source and the target without its
    last token; the output is scored against the target without its first
    token. Gradients are clipped to a norm of `clip` before each optimizer step.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()

        # decoder input: every target token except the last one
        # logits = [batch size, trg len - 1, output dim]
        logits, _ = model(source, target[:, :-1])

        vocab_size = logits.shape[-1]

        # flatten so that each row is one token position
        # flat_logits = [batch size * trg len - 1, output dim]
        flat_logits = logits.contiguous().view(-1, vocab_size)
        # expected output: every target token except the first one
        # flat_target = [batch size * trg len - 1]
        flat_target = target[:, 1:].contiguous().view(-1)

        batch_loss = criterion(flat_logits, flat_target)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [28]:
def evaluate(model, iterator, criterion):
    """Return the mean batch loss over `iterator` without updating the model.

    The model is switched to eval mode and gradient tracking is disabled.
    Inputs and targets are shifted exactly as in training: the model sees the
    target without its last token and is scored against the target without
    its first token.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            # logits = [batch size, trg len - 1, output dim]
            logits, _ = model(source, target[:, :-1])

            vocab_size = logits.shape[-1]

            # flat_logits = [batch size * trg len - 1, output dim]
            flat_logits = logits.contiguous().view(-1, vocab_size)
            # flat_target = [batch size * trg len - 1]
            flat_target = target[:, 1:].contiguous().view(-1)

            running_loss += criterion(flat_logits, flat_target).item()

    return running_loss / len(iterator)

In [36]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into (minutes, seconds).

    Both parts are truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [30]:
N_EPOCHS = 10
# maximum gradient norm, forwarded to clip_grad_norm_ inside train()
CLIP = 1

# lowest validation loss seen so far; starts at +inf so the first epoch always saves
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # checkpoint only on improvement, so the file always holds the weights of
    # the epoch with the lowest validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'mda94k-model.pt')
    
    # perplexity is reported as exp(loss)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')



Epoch: 01 | Time: 0m 35s
	Train Loss: 3.865 | Train PPL:  47.693
	 Val. Loss: 1.873 |  Val. PPL:   6.510
Epoch: 02 | Time: 0m 35s
	Train Loss: 1.534 | Train PPL:   4.636
	 Val. Loss: 0.977 |  Val. PPL:   2.656
Epoch: 03 | Time: 0m 34s
	Train Loss: 0.854 | Train PPL:   2.349
	 Val. Loss: 0.709 |  Val. PPL:   2.031
Epoch: 04 | Time: 0m 34s
	Train Loss: 0.568 | Train PPL:   1.765
	 Val. Loss: 0.609 |  Val. PPL:   1.839
Epoch: 05 | Time: 0m 35s
	Train Loss: 0.418 | Train PPL:   1.519
	 Val. Loss: 0.574 |  Val. PPL:   1.776
Epoch: 06 | Time: 0m 34s
	Train Loss: 0.323 | Train PPL:   1.381
	 Val. Loss: 0.558 |  Val. PPL:   1.747
Epoch: 07 | Time: 0m 34s
	Train Loss: 0.257 | Train PPL:   1.293
	 Val. Loss: 0.558 |  Val. PPL:   1.748
Epoch: 08 | Time: 0m 33s
	Train Loss: 0.209 | Train PPL:   1.232
	 Val. Loss: 0.544 |  Val. PPL:   1.723
Epoch: 09 | Time: 0m 34s
	Train Loss: 0.174 | Train PPL:   1.190
	 Val. Loss: 0.564 |  Val. PPL:   1.759
Epoch: 10 | Time: 0m 33s
	Train Loss: 0.148 | Train PPL

In [29]:
# restore the weights saved by the training loop (epoch with the lowest validation loss)
model.load_state_dict(torch.load('mda94k-model.pt'))

# NOTE: despite its name, test_loss is computed on the validation iterator
test_loss = evaluate(model, valid_iterator, criterion)

print(f'| Valid Loss: {test_loss:.3f} | Valid PPL: {math.exp(test_loss):7.3f} |')



| Valid Loss: 0.544 | Valid PPL:   1.723 |
