In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as data
import torchaudio
from torch import Tensor
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

import csv
from typing import Tuple

In [2]:
###data load (pickle file)

load_path='/home/skgudwn34/Accented_speech/speech_recognition/input_data/'

train_set=load_path+'train_set'
train_df=pd.read_pickle(train_set)

val_set=load_path+'val_set'
val_df=pd.read_pickle(val_set)

test_set=load_path+'test_set'
test_df=pd.read_pickle(test_set)

In [3]:
###commonvoice dataset

class Common_voice(Dataset):
    def __init__(self,dataframe):
        self.dataframe=dataframe
        self.len=len(self.dataframe)

    def __getitem__(self,idx) -> Tuple[str,str,str,int,Tensor]:
        return (self.dataframe['File'].iloc[idx],self.dataframe['Accent'].iloc[idx],self.dataframe['Sentence'].iloc[idx],
        self.dataframe['Sample_rate'].iloc[idx],self.dataframe['Waveform'].iloc[idx])
    
    def __len__(self) -> int:
        return self.len


In [4]:
'''
train_dataset=Common_voice(train_df)
val_dataset=Common_voice(val_df)
test_dataset=Common_voice(test_df)
'''

'\ntrain_dataset=Common_voice(train_df)\nval_dataset=Common_voice(val_df)\ntest_dataset=Common_voice(test_df)\n'

In [5]:
#for idx, data in enumerate(test_dataset): 
    #print(data)

#print(len(test_dataset))

In [6]:
###edit distance
'''
Levenshtein distance는 두 시퀀스 간의 차이를 측정하기위한 문자열 메트릭
Levenshtein distance는 한 단어를 다른 단어로 변경하는 데 필요한 최소 한 문자 편집 (대체, 삽입 또는 삭제) 수로 정의
'''

def levenshtein_distance(ref,hyp):
    
    m=len(ref) #reference
    n=len(hyp) #hypothesis

    #special case
    if ref==hyp:
        return 0
    if m==0:
        return n
    if n==0:
        return m

    if m<n:
        ref,hyp=hyp,ref
        m,n=n,m

    #use 0 (min(m,n)) space
    distance=np.zeors((2,n+1),dtype=np.int32)

    #initialize distance matrix
    for j in range(0,n+1):
        distance[0][j]=j

    #calculate levenshtein distance
    for i in range(1,m+1):

        prev_row_idx=(i-1)%2
        cur_row_idx=i%2

        distance[cur_row_idx][0]=i

        for j in range(1,n+1):
            if ref[i-1]==hyp[j-1]:
                distance[cur_row_idx][j]=distance[prev_row_idx][j-1]
            else:
                s_num=distance[prev_row_idx][j-1]+1
                i_num=distance[cur_row_idx][j-1]+1
                d_num=distance[prev_cur_idx][j]+1

                distance[cur_row_idx][j]=min(s_num,i_num,d_num)

    return distance[m%2][n]

In [7]:
def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
    if ignore_case==True:
        reference=reference.upper()
        hypothesis=hypothesis.upper()
    
    ref_words=reference.split(delimiter)
    hyp_words=hypothesis.split(delimiter)

    edit_distance=levenshtein_distance(ref_words,hyp_words)

    return float(edit_distance), len(ref_words)

In [8]:
def char_errors(reference, hypothesis, ignore_case=False, delimiter=' ', remove_space=False):
    if ignore_case==True:
        reference=reference.upper()
        hypothesis=hypothesis.upper()

    join_char=' '

    if remove_space==True:
        join_char=''

    reference=join_char.join(filter(None,reference.split(delimiter)))
    hypothesis=join_char.join(filter(None,hypothesis.split(delimiter)))

    edit_distance=levenshtein_distance(reference,hypothesis)

    return float(edit_distance),len(reference)


In [9]:
###word error rate
'''
WER = (Sw + Dw + Iw) / Nw

Sw는 대체 된 단어의 수
Dw는 삭제 된 단어의 수
Iw는 삽입 된 단어의 수
Nw는 참조의 단어 수
'''

def WER(reference, hypothesis, ignore_case=False, delimiter=' '):

    edit_distance,ref_len=word_errors(reference,hypothesis,ignore_case,delimiter)

    if ref_len==0:
        raise ValueError("Reference's word number should be greater than 0.")

    wer=float(edit_distance)/ref_len

    return wer


In [10]:
###character error rate
'''
CER = (Sc + Dc + Ic) / Nc

Sc는 대체 된 문자의 수
Dc는 삭제 된 문자의 수
Ic는 삽입 된 문자의 수
Nc는 참조의 문자 수
'''

def CER(reference, hypothesis, ignore_case=False, delimiter=' ', remove_space=False):

    edit_distance,ref_len=char_errors(reference,hypothesis,ignore_case,delimiter,remove_space)

    if ref_len==0:
        raise ValueError("Length of reference should be greater than 0.")

    cer=float(edit_distance)/ref_len

    return cer



In [11]:
'''
The outputs of the network are the graphemes of each language. 
At each output time-step t, the RNN makes a prediction over characters.
In English we have `t ∈ {a, b, c, . . . , z,space, apostrophe, blank}
'''

###maps characters to integers and vice versa

class TextTransform:
    def __init__(self):
        char_map_str="""
        ' 0
        <SPACE> 1
        A 2
        B 3
        C 4
        D 5
        E 6
        F 7
        G 8
        H 9
        I 10
        J 11
        K 12
        L 13
        M 14
        N 15
        O 16
        P 17
        Q 18
        R 19
        S 20
        T 21
        U 22
        V 23
        W 24
        X 25
        Y 26
        Z 27
        """

        self.char_map={}
        self.index_map={}

        for line in char_map_str.strip().split('\n'):
            ch,index=line.split()
            self.char_map[ch]=int(index)
            self.index_map[int(index)]=ch
        
        self.index_map[1]=' '

    #Use a character map and convert text to an integer sequence
    def text_to_int(self,text):

        int_sequence=[]

        for c in text:
            if c==' ':
                ch=self.char_map['<SPACE>']
            else:
                ch=self.char_map[c]
            
            int_sequence.append(ch)
        
        return int_sequence

    #Use a character map and convert integer labels to an text sequence
    def int_to_text(self,labels):

        string=[]

        for i in labels:
            string.append(self.index_map[i])

        return ''.join(string).replace('<SPACE>',' ')

text_transform = TextTransform()


In [12]:
###spec augmentation

#for trainset
train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
    torchaudio.transforms.TimeMasking(time_mask_param=100)
)

#for testset
test_audio_transforms = torchaudio.transforms.MelSpectrogram()

In [13]:
###data processing

def data_processing(data,data_type):

    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []

    for (_, _, sentence, _, waveform) in data:
        if data_type=='train':
            spec=train_audio_transforms(waveform).squeeze(0).transpose(0,1)
        elif data_type=='test':
            spec=test_audio_transforms(waveform).squeeze(0).transpose(0,1)
        else:
            raise Exception('Data_type should be train or test')

        spectrograms.append(spec)

        label=torch.Tensor(text_transform.text_to_int(sentence.upper()))
        labels.append(label)

        input_lengths.append(spec.shape[0]//2)
        label_lengths.append(len(label))

    spectrograms=nn.utils.rnn.pad_sequence(spectrograms,batch_first=True).unsqueeze(1).transpose(2,3)
    labels=nn.utils.rnn.pad_sequence(labels,batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths


In [14]:
###Layer normalization built for cnns input

class CNNLayerNorm(nn.Module):
    def __init__(self,n_feats):
        super(CNNLayerNorm,self).__init__()
        self.layer_norm=nn.LayerNorm(n_feats)

    
    def forward(self,x):
        # x (batch, channel, feature, time)
        x=x.transpose(2,3).contiguous() # (batch, channel, time, feature)
        x=self.layer_norm(x)

        return x.transpose(2,3).contiguous() # (batch, channel, feature, time) 



In [15]:
###Resnet

class ResidualCNN(nn.Module):
    def __init__(self,in_channels,out_channels,kernel,stride,dropout,n_feats):
        super(ResidualCNN,self).__init__()

        self.cnn1=nn.Conv2d(in_channels,out_channels,kernel,stride,padding=kernel//2)
        self.cnn2=nn.Conv2d(out_channels,out_channels,kernel,stride,padding=kernel//2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self,x):
        residual=x # (batch, channel, feature, time)
        
        x = self.layer_norm1(x)
        x = F.gelu(x)
        x = self.dropout1(x)
        x = self.cnn1(x)
        
        x = self.layer_norm2(x)
        x = F.gelu(x)
        x = self.dropout2(x)
        x = self.cnn2(x)
        
        x += residual

        return x # (batch, channel, feature, time)

In [16]:
###BiGRU

class BidirectionalGRU(nn.Module):
    def __init__(self,rnn_dim,hidden_size,dropout,batch_first):
        super(BidirectionalGRU, self).__init__()

        self.BiGRU=nn.GRU(input_size=rnn_dim,hidden_size=hidden_size,num_layers=1,batch_first=batch_first,bidirectional=True)
        self.layer_norm=nn.LayerNorm(rnn_dim)
        self.dropout=nn.Dropout(dropout)

    def forward(self,x):
        x=self.layer_norm(x)
        x=F.gelu(x)
        x,_=self.BiGRU(x)
        x=self.dropout(x)
        
        return x


In [17]:
###ASR model

class SpeechRecognitionModel(nn.Module):
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel,self).__init__()
        
        n_feats=n_feats//2

        self.cnn=nn.Conv2d(1,32,3,stride=stride,padding=1)

        self.rescnn_layers=nn.Sequential(*[
            ResidualCNN(32,32,kernel=3,stride=1,dropout=dropout,n_feats=n_feats) for _ in range(n_cnn_layers)
        ])

        self.fully_connected=nn.Linear(n_feats*32,rnn_dim)

        self.birnn_layers=nn.Sequential(*[
            BidirectionalGRU(rnn_dim=rnn_dim if i==0 else rnn_dim*2,hidden_size=rnn_dim,dropout=dropout,batch_first=i==0)
            for i in range(n_rnn_layers)
        ])

        self.classifier=nn.Sequential(
            nn.Linear(rnn_dim*2,rnn_dim),nn.GELU(),nn.Dropout(dropout),nn.Linear(rnn_dim,n_class)
        )

    def forward(self,x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)

        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2) # (batch, time, feature)

        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)

        return x


In [18]:
###decoder

def GreedyDecoder(output,labels,label_lengths,blank_label=28,collapse_repeated=True):

    arg_maxes=torch.argmax(output,dim=2)

    decodes=[]
    targets=[]

    for i,args in enumerate(arg_maxes):
        decode=[]
        targets.append(text_transform.int_to_text(labels[i][:label_lengths[i]].tolist()))
        for j, index in enumerate(args):
            if index != blank_label:
                if collapse_repeated and j!=0 and index ==args[j-1]:
                    continue
                decode.append(index.item())
            decodes.append(text_transform.int_to_text(decode))

    return decodes, targets
    

In [19]:
###keeps track of total iterations

class IterMeter(object):
    def __init__(self):
        self.val=0

    def step(self):
        self.val+=1

    def get(self):
        return self.val


In [20]:
###training

def train(model,device,train_loader,criterion,optimizer,epoch,iter_meter):
    print("Train start")
    model.train()

    data_len=len(train_loader.dataset)

    for batch_idx,data in enumerate(train_loader):
        spectrograms,labels,input_lengths,label_lengths=data
        spectrograms,labels=spectrograms.to(device),labels.to(device)

        optimizer.zero_grad()

        output=model(spectrograms)  # (batch, time, n_class)
        output=F.log_softmax(output, dim=2)
        output=output.transpose(0, 1) # (time, batch, n_class)

        loss=criterion(output,labels,input_lengths,label_lengths)
        loss.backward()

        optimizer.step()
        iter_meter.step()

        if batch_idx%100==0 or batch_idx==data_len:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(spectrograms), data_len,100. * batch_idx / len(train_loader), loss.item()))
        

In [21]:
def test(model,device,test_loader,criterion,epoch,iter_meter):
    print("Test start")
    model.eval()
    
    test_loss=0
    test_cer,test_wer=[],[]

    with torch.no_grad():
        for i, data in enumerate(test_loader):
            spectrograms,labels,input_lengths,label_lengths=data 
            spectrograms,labels=spectrograms.to(device),labels.to(device)

            output=model(spectrograms) # (batch, time, n_class)
            output=F.log_softmax(output, dim=2)
            output=output.transpose(0, 1) # (time, batch, n_class)

            loss=criterion(output,labels,input_lengths,label_lengths)
            test_loss+=loss.item()/len(test_loader)

            decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)

            for j in range(len(decoded_preds)):
                test_cer.append(CER(decoded_targets[j], decoded_preds[j]))
                test_wer.append(WER(decoded_targets[j], decoded_preds[j]))

    avg_cer=sum(test_cer)/len(test_cer)
    avg_wer=sum(test_wer)/len(test_wer)

    print('Test set: Average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n'.format(test_loss, avg_cer, avg_wer))


In [22]:
###Experiment

def main(learning_rate, batch_size, epoch):

    hparams = {
        "n_cnn_layers": 3,
        "n_rnn_layers": 5,
        "rnn_dim": 512,
        "n_class": 29,
        "n_feats": 128,
        "stride":2,
        "dropout": 0.1,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    use_cuda=torch.cuda.is_available()
    torch.manual_seed(7)
    device=torch.device("cuda" if use_cuda else "cpu")

    train_dataset=Common_voice(train_df)
    val_dataset=Common_voice(val_df)
    test_dataset=Common_voice(test_df)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    train_loader = data.DataLoader(dataset=train_dataset,
                                batch_size=hparams['batch_size'],
                                shuffle=True,
                                collate_fn=lambda x: data_processing(x, 'train'),
                                **kwargs)

    test_loader = data.DataLoader(dataset=test_dataset,
                                batch_size=hparams['batch_size'],
                                shuffle=False,
                                collate_fn=lambda x: data_processing(x, 'test'),
                                **kwargs)

    model = SpeechRecognitionModel(
        hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
        hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
        ).to(device)

    print('Num Model Parameters', sum([param.nelement() for param in model.parameters()]))

    optimizer=optim.Adam(model.parameters(),hparams['learning_rate'])
    criterion=nn.CTCLoss(blank=28).to(device)

    iter_meter=IterMeter()

    for epoch in range(1,epoch+1):
        train(model,device,train_loader,criterion,optimizer,epoch,iter_meter)
        test(model,device,test_loader,criterion,epoch,iter_meter)


In [23]:
###start experiment
learning_rate = 5e-4
batch_size = 10
epochs = 10
main(learning_rate, batch_size, epochs)

Num Model Parameters 23705373
Train start


KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/skgudwn34/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 185, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/skgudwn34/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "<ipython-input-22-dc3776c1896c>", line 31, in <lambda>
    collate_fn=lambda x: data_processing(x, 'train'),
  File "<ipython-input-13-aa93d92b6b8a>", line 20, in data_processing
    label=torch.Tensor(text_transform.text_to_int(sentence.upper()))
  File "<ipython-input-11-ccb786efec6d>", line 61, in text_to_int
    ch=self.char_map[c]
KeyError: 'É'
