In [1]:
from google.colab import drive
drive.mount('/content/Drive')

Mounted at /content/Drive


In [2]:
!git clone --recursive https://github.com/parlance/ctcdecode.git
!pip install wget
%cd ctcdecode
!pip install .
%cd ..

Cloning into 'ctcdecode'...
remote: Enumerating objects: 1063, done.[K
remote: Total 1063 (delta 0), reused 0 (delta 0), pack-reused 1063[K
Receiving objects: 100% (1063/1063), 759.71 KiB | 2.56 MiB/s, done.
Resolving deltas: 100% (513/513), done.
Submodule 'third_party/ThreadPool' (https://github.com/progschj/ThreadPool.git) registered for path 'third_party/ThreadPool'
Submodule 'third_party/kenlm' (https://github.com/kpu/kenlm.git) registered for path 'third_party/kenlm'
Cloning into '/content/ctcdecode/third_party/ThreadPool'...
remote: Enumerating objects: 82, done.        
remote: Total 82 (delta 0), reused 0 (delta 0), pack-reused 82        
Cloning into '/content/ctcdecode/third_party/kenlm'...
remote: Enumerating objects: 13687, done.        
remote: Total 13687 (delta 0), reused 0 (delta 0), pack-reused 13687        
Receiving objects: 100% (13687/13687), 5.46 MiB | 11.41 MiB/s, done.
Resolving deltas: 100% (7880/7880), done.
Submodule path 'third_party/ThreadPool': checked 

In [3]:
cd /content/Drive/MyDrive

/content/Drive/MyDrive


In [4]:
# from ctcdecode import CTCBeamDecoder

import numpy as np
import torch
import sys
import torch.nn as nn
import torch.optim as optim
import os
import pandas as pd
import time
from torch.utils.data import DataLoader, Dataset, TensorDataset
from ctcdecode import CTCBeamDecoder
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import pdb

In [5]:
print("Subtitle Lookup Preview:")
subtitle = pd.read_table("/content/Drive/MyDrive/knnw/knnw_en_sub.csv", sep = ";", header=0)
subtitle

Subtitle Lookup Preview:


Unnamed: 0,Number,Start time in milliseconds,End time in milliseconds,Text
0,1,1650,10800,TOHO CORPORATION
1,2,53940,58090,"Some mornings, I wake up crying without knowin..."
2,3,58700,61440,That's when everything happens now and again.
3,4,62060,66540,"Whatever that dream was I had, I can never rem..."
4,5,66540,69550,- But... - But...
...,...,...,...,...
1388,1389,6363570,6367130,you refused but I saw them pouring Down your f...
1389,1390,6367130,6368820,"Crying even when I'm happy,"
1390,1391,6368820,6371440,smiling even when I'm feeling lonely!
1391,1392,6371440,6373430,It's because the heart of mine


In [None]:
print("Audio Shape:")
audio = np.load("/content/Drive/MyDrive/knnw/knnw_en.log_spectrogram.npy").shape
audio

Audio Shape:


(129, 1370582)

In [6]:
class KnnwAudioDataset(torch.utils.data.Dataset):
    
    def __init__(self, 
                 audio_path="/content/Drive/MyDrive/knnw/knnw_en.log_spectrogram.npy",
                 subtitle_lookup_path="/content/Drive/MyDrive/knnw/knnw_en_sub.csv",
                 total_frames=1370582, 
                 total_duration=6396010):
        
        self.duration_per_frame = total_duration / total_frames
        
        self.audio = np.load(audio_path)
        
        self.subtitle_lookup = pd.read_table(subtitle_lookup_path, 
                                                 sep = ";", header=0)
        
        self.length = len(self.subtitle_lookup)
        
    def __len__(self):
        
        return self.length
    
    def __getitem__(self, i):
        
        start_time = self.subtitle_lookup.iloc[i, 1]
        stop_time = self.subtitle_lookup.iloc[i, 2]
        
        audio_range = self.get_range(start_time, stop_time)
        
        audio_item = self.audio[:,audio_range].T
        
        subtitle_item = self.subtitle_lookup.iloc[i, 3]
        subtitle_item = self.get_tokenization(subtitle_item)
        
        return torch.from_numpy(audio_item).float(), torch.from_numpy(subtitle_item).float()
        
    def get_index(self, time, start_flag):
      
            if start_flag == True:
                return np.floor(time/self.duration_per_frame)
            else:
                return np.ceil(time/self.duration_per_frame)
        
    def get_range(self, start_time, end_time):
        
        start_index = self.get_index(start_time, start_flag=True)
        stop_index  = self.get_index(end_time, start_flag=False)
        
        return range(int(start_index), int(stop_index))
    
    def get_tokenization(self, subtitle_item):
        d=[]
        for c in subtitle_item:
          index = WORD_MAP.index(c)
          d.append(index)
        subtitle_item = np.stack(d)
        return subtitle_item

In [10]:
dataset = KnnwAudioDataset()

In [None]:
# dataset[0][0].shape
len(WORD_MAP)

81

In [42]:
dataset[0][0].shape

torch.Size([1962, 129])

In [17]:
WORD_MAP = [
    " ",
    ".", #SIL
    "!", #SPN
    "-",
    "?",
    ":",
    "'",
    '',
    ",",
    "0",
    "1",
    "2",
    "3",
    "4",
    "5",
    "6",
    "7",
    "8",
    "9",
    "[",
    "]",
    '/',
    '=',
    'é',
    "(",
    ")",
    '“',
    '"',
    '”',
    "a", #AA
    "A", #AE
    "b", #AH
    "B", #AO
    "c", #AW
    "C", #AY
    "d", #B
    "D", #CH
    "e", #D
    "E", #DH
    "f", #EH
    "F", #ER
    "g", #EY
    "G", #F
    "h", #G
    "H", #H
    "i", #IH 
    "I", #IY
    "j", #JH
    "J", #K
    "k", #L
    "K", #M
    "l", #N
    "L", #NG
    "m", #OW
    "M", #OY
    "n", #P 
    "N", #R
    "o", #S
    "O", #SH
    "p", #T
    "P", #TH
    "q", #UH
    "Q", #UW
    "r", #V
    "R", #W
    "s", #Y
    "S", #Z
    "t", #ZH
    "T", #M
    "u", #N
    "U", #NG
    "v", #OW
    "V", #OY
    "w", #P 
    "W", #R
    "x", #S
    "X", #SH
    "y", #T
    "Y", #TH
    "z", #UH
    "Z", #UW
]
#

In [7]:
class Model(nn.Module):
  def __init__(self, output_size,  hidden_size, embed_size=129):
    super(Model, self).__init__()
    self.output_size = output_size
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.cnn = torch.nn.Sequential(
        nn.Conv1d(self.embed_size, self.hidden_size, 3,padding=1, bias=True),
        nn.BatchNorm1d(self.hidden_size),
        nn.ReLU(inplace=True)
    )
    self.rnn = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size,
                       num_layers=3,batch_first=True,dropout=0.4,bidirectional=True)
    self.output = nn.Sequential(
    nn.Linear(hidden_size*2,hidden_size),
    nn.Linear(hidden_size, output_size)
    )
  def forward(self, X, lengths):
    x_cnn_input = X.permute(0,2,1) #(B, C=129, T)
    x_post_cnn = self.cnn(x_cnn_input) #(B, C_out=hidden_size ,T_out)
    x_rnn_input = x_post_cnn.permute(2,0,1) #(T_out,B, C_out)
    x_packed  = pack_padded_sequence(x_rnn_input, lengths.detach().cpu(), enforce_sorted=False)
    output_packed, hidden = self.rnn(x_packed)
    output_padded, out_lens = pad_packed_sequence(output_packed, batch_first=True)
    out = self.output(output_padded).log_softmax(2) # (B, T_out, output_size)
    out = out.permute(1,0,2) #(T, B, output_size)
    return out, out_lens


In [47]:
def train_model(train_loader, model,criterion,optimizer):
  model.train()
  loss_count=0
  for batch_index,(inputs, targets, inputs_lens, targets_lens) in enumerate(train_loader):
    inputs, targets,inputs_lens,targets_lens = inputs.to(device), targets.to(device),inputs_lens.to(device),targets_lens.to(device)
    optimizer.zero_grad()
    out, out_lens = model(inputs,inputs_lens)
    loss = criterion(out,targets,out_lens,targets_lens)
    loss_count += loss.item()
    loss.backward()
    optimizer.step()
  print('loss for training are',loss_count/len(train_loader))
  # torch.save({
  #     'model_state_dict':model.state_dict(),
  #     'optimizer_state_dict':optimizer.state_dict(),
  #     'scheduler_state_dict':scheduler.state_dict(),
  #     }, "/content/drive/MyDrive/HW3P2/"+"Model2")

def evaluate_model(val_loader,model):
  model.eval()
  loss_count = 0
  for batch_index,(inputs,targets,inputs_lens,targets_lens) in enumerate(val_loader):
    inputs, targets,inputs_lens,targets_lens = inputs.to(device), targets.to(device),inputs_lens.to(device),targets_lens.to(device)
    optimizer.zero_grad()
    out, out_lens = model(inputs,inputs_lens)
    loss = criterion(out,targets,out_lens,targets_lens)
    loss_count += loss.item()
  print('loss are',loss_count/len(val_loader))

def decode_word(probs,probs_lens):
  decoder = CTCBeamDecoder(labels=WORD_MAP,beam_width=30,
                           num_processes=os.cpu_count(),log_probs_input=True)
  
  probs = torch.transpose(probs,0,1)
  out,_,_,out_lens = decoder.decode(probs,probs_lens) # out [N,B,T] out_lens [N,B]
  #print(out_lens.)
  PhonemeMap = []
  for i in range(probs.size(0)):
    if out_lens[i][0] !=0:
      currPhonemeMap = "".join([WORD_MAP[a] for a in out[i,0,:out_lens[i][0]]])
    PhonemeMap.append(currPhonemeMap)
  pdb.set_trace()
  return PhonemeMap

def make_prediction(test_loader,model):
  model.eval()
  Final_PhonemeMap=[]
  for batch_index,(inputs,_,inputs_lens,_) in enumerate(test_loader):
    inputs, inputs_lens = inputs.to(device), inputs_lens.to(device)
    optimizer.zero_grad()
    probs,probs_lens = model(inputs,inputs_lens)
    PhonemeMap = decode_word(probs,probs_lens)
    Final_PhonemeMap = np.concatenate((Final_PhonemeMap,PhonemeMap),axis=0)

  #np.save("submission.csv",Final_PhonemeList)
  idxs = np.array(list(range(len(Final_PhonemeMap))))
  df = pd.DataFrame({"id":idxs,"label":Final_PhonemeMap})
  df.to_csv("submission.csv",index=False)
  return Final_PhonemeMap

In [34]:
split_size = 0.8
train_size = int(split_size * len(dataset))
dev_size = len(dataset) - train_size
train_dataset, dev_dataset = torch.utils.data.random_split(dataset, [train_size, dev_size])

In [35]:
def pad_collate(batch):
    # reference from tutorial: https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html
    # sortedBatch = batch # sorted(batch, key=lambda x: x[0].shape[0], reverse=True)
    inputs = [x[0] for x in batch]
    targets = [x[1] for x in batch]
    inputs_pad = pad_sequence(inputs, batch_first=True) # dim (B, T, C) since batch_first is true, (T, B, C) if false
    targets_pad = pad_sequence(targets, batch_first=True)
    inputs_lens = torch.LongTensor([len(x) for x in inputs])
    targets_lens = torch.LongTensor([len(x) for x in targets])
    return inputs_pad, targets_pad, inputs_lens, targets_lens

In [36]:
def GetLoaders(train_data, dev_data, batch_size):
  trainLoader = DataLoader(train_data, shuffle=True, batch_size=batch_size, num_workers=2, collate_fn=pad_collate, pin_memory=True)

  devLoader = DataLoader(dev_data, shuffle=True, batch_size=batch_size, num_workers=2, collate_fn=pad_collate, pin_memory=True)

  testLoader = DataLoader(dataset, shuffle=False, batch_size=batch_size, num_workers=2, collate_fn=pad_collate, pin_memory=True,drop_last=False)

  return trainLoader, devLoader, testLoader

In [14]:
print("*** Load raw data ***")
train_loader, val_loader, testLoader = GetLoaders(train_dataset, dev_dataset, 64)


*** Load raw data ***


In [28]:
model = Model(output_size=81, hidden_size=512)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = nn.CTCLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=3e-5)
#optimizer = torch.optim.SGD(model.parameters(), lr=0.001, weight_decay=5e-4, momentum=0.9)
#scheduler = torch.optim.lr_sc heduler.StepLR(optimizer, step_size=10,gamma=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,factor=0.5,patience=5)
epochs=10
print(model)

Model(
  (cnn): Sequential(
    (0): Conv1d(129, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (rnn): LSTM(512, 512, num_layers=3, batch_first=True, dropout=0.4, bidirectional=True)
  (output): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): Linear(in_features=512, out_features=81, bias=True)
  )
)


In [30]:
for i in range(epochs):
  train_model(train_loader,model,criterion,optimizer)
  evaluate_model(val_loader,model)

loss for training are 13.00140016608768
loss are 2.809915542602539
loss for training are 2.61750590801239
loss are 2.4816630840301515
loss for training are 2.456548915969001
loss are 2.3881879329681395
loss for training are 2.4181155363718667
loss are 2.3660500049591064
loss for training are 2.411627252896627
loss are 2.3502902030944823
loss for training are 2.3703985611597695
loss are 2.3264334201812744
loss for training are 2.3471198744244046
loss are 2.336795473098755
loss for training are 2.3281907637914023
loss are 2.2956642627716066
loss for training are 2.325795200135973
loss are 2.280344581604004
loss for training are 2.310097747378879
loss are 2.290620040893555


In [48]:
FP = make_prediction(testLoader,model)


> <ipython-input-47-f4ad0ea95f3d>(43)decode_word()
-> return PhonemeMap
(Pdb) Ph
*** NameError: name 'Ph' is not defined
(Pdb) PhonemeMap
['T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'To', 'T', 'To', 'To', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'To', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T']
(Pdb) q


BdbQuit: ignored

In [45]:
FP[0]

'T'