In [2]:
import pandas
import os
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import progressbar
import torch
import torchvision
import torchvision.transforms as transforms
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pth = "/home/zedroman/Documents/Sonia_Data/eng_phonetics/"

In [3]:
with open(pth+"train.txt") as f:
    data = f.readlines()
    
sos = 1
eos = 2
words = []
phonetics = []
phoems_to_id = {}
chars_to_id = {}
id_to_chars = {}
id_to_phoems = {}
ph = 3
ch = 3
phoems_to_id[''] = 0
chars_to_id[''] = 0
max_word_len = 0
max_phonetic_len = 0
for x in data:
    x = x[:len(x) - 1]
    word = list(x.split()[0])
    phonetic = x.split()[1].split('_')
    if (len(word) > max_word_len):
        max_word_len = len(word)
    if (len(phonetic) > max_phonetic_len):
        max_phonetic_len = len(phonetic)
    for y in phonetic:
        if(y not in phoems_to_id):
            id_to_phoems[ph] = y
            phoems_to_id[y] = ph    
            ph+=1
    for y in word:
        if(y not in chars_to_id):
            id_to_chars[ch] = y
            chars_to_id[y] = ch
            ch+=1
    words.append(word)
    phonetics.append(phonetic)
print("Number of words:\n", len(words))
print("Chars to id:\n", chars_to_id)
print("Phoems to id:\n", phoems_to_id)
print("Max word len:", max_word_len)
print("Max phonetic_len:", max_phonetic_len)

Number of words:
 83194
Chars to id:
 {'': 0, 'L': 3, 'E': 4, 'M': 5, 'I': 6, 'U': 7, 'X': 8, 'N': 9, 'D': 10, 'G': 11, 'S': 12, 'T': 13, 'R': 14, 'P': 15, 'K': 16, 'C': 17, 'O': 18, 'F': 19, 'A': 20, 'B': 21, 'H': 22, 'V': 23, 'Y': 24, 'W': 25, 'J': 26, "'": 27, 'Q': 28, 'Z': 29, '-': 30}
Phoems to id:
 {'': 0, 'L': 3, 'AH': 4, 'M': 5, 'Y': 6, 'UW': 7, 'AY': 8, 'N': 9, 'D': 10, 'IH': 11, 'NG': 12, 'S': 13, 'T': 14, 'R': 15, 'P': 16, 'K': 17, 'EH': 18, 'AA': 19, 'F': 20, 'ER': 21, 'EY': 22, 'AE': 23, 'Z': 24, 'G': 25, 'B': 26, 'SH': 27, 'V': 28, 'OW': 29, 'AO': 30, 'IY': 31, 'W': 32, 'HH': 33, 'JH': 34, 'CH': 35, 'TH': 36, 'AW': 37, 'OY': 38, 'UH': 39, 'ZH': 40, 'DH': 41}
Max word len: 34
Max phonetic_len: 32


In [4]:
# def char_to_vec(c):
#     ans = np.zeros(len(chars_to_id))
#     ans[chars_to_id[c]] = 1
#     return ans


# def phoem_to_vec(c):
#     ans = np.zeros(len(phoems_to_id))
#     ans[phoems_to_id[c]] = 1
#     return ans


# def word_to_vec(w):
#     ans = np.zeros((len(chars_to_id), max_word_len))
#     for i in range(len(w)):
#         ans[:,i] = char_to_vec(w[i])
#     return ans


# def phoems_to_vec(p):
#     ans = np.zeros((len(phoems_to_id), max_phonetic_len))
#     for i in range(len(p)):
#         ans[:,i] = phoem_to_vec(p[i])
#     return ans


# def create_data(words):
#     ans = np.empty((len(words), len(chars_to_id), max_word_len))
#     for i in range(len(words)):
#         ans[i, :, :] = word_to_vec(words[i])
#     return ans


# def create_labels(words):
#     ans = np.empty((len(words), len(phoems_to_id), max_phonetic_len))
#     for i in range(len(words)):
#         ans[i, :, :] = phoems_to_vec(words[i])
#     return ans


# data = create_data(words)
# labels = create_labels(phonetics)

In [5]:
def word_to_vec(w):
    ans = np.zeros((max_word_len + 2))
    for i in range(len(w)):
        ans[i + 1] = chars_to_id[w[i]]
    ans[0] = 1
    ans[i] = 2
    return ans


def phoems_to_vec(p):
    ans = np.zeros((max_phonetic_len + 1))
    for i in range(len(p)):
        ans[i + 1] = phoems_to_id[p[i]]
    ans[0] = 1
    return ans


def create_data(words):
    ans = np.empty((len(words), max_word_len + 2))
    for i in range(len(words)):
        ans[i, :] = word_to_vec(words[i])
    return ans


def create_labels(words):
    ans = np.empty((len(words), max_phonetic_len + 1))
    for i in range(len(words)):
        ans[i, :] = phoems_to_vec(words[i])
    return ans


d = torch.from_numpy(create_data(words)).long()
l = torch.from_numpy(create_labels(phonetics)).long()
print(d.shape)

trainset = torch.utils.data.TensorDataset(d[:75000,:], l[:75000,:])
traindata = torch.utils.data.DataLoader(trainset, batch_size=5,
                                          shuffle=True, num_workers=2)
testset = torch.utils.data.TensorDataset(d[75000:,:], l[75000:,:])
testdata = torch.utils.data.DataLoader(trainset, batch_size=5,
                                          shuffle=True, num_workers=2)

torch.Size([83194, 36])


In [6]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, batch_first = True, dropout=dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [sent len, batch size]
#         print(src.shape)
        embedded = self.dropout(self.embedding(src))
#         print(embedded.shape)
        #embedded = [sent len, batch size, emb dim]
        
        outputs, (hidden, cell) = self.rnn(embedded)
#         print(outputs.shape)
        #outputs = [sent len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden, cell

In [7]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        
        self.out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        #embedded = [1, batch size, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [sent len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #sent len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.out(output.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

In [8]:
INPUT_DIM = len(chars_to_id) + 2
OUTPUT_DIM = len(phoems_to_id) + 2
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

In [9]:
outputs = torch.zeros(max_phonetic_len + 1, 5, len(phoems_to_id) + 2)
teacher_forcing_ratio = 0.5
for x,y in traindata:
    hidden, cell = enc(x)
    input = y[:,0]
    for t in range(1, max_phonetic_len + 1):
        output, hidden, cell = dec(input, hidden, cell)
        outputs[t - 1] = output
        teacher_force = random.random() < teacher_forcing_ratio
        top1 = output.max(1)[1]
        input = (y[:, t] if teacher_force else top1)
    break

In [10]:
print(outputs[0,0,:])

tensor([ 0.0281,  0.0092, -0.0859,  0.0150,  0.0134, -0.0275, -0.0343, -0.0782,
         0.0288,  0.0647, -0.0214, -0.0226, -0.0309, -0.0663,  0.0184, -0.0037,
         0.0329, -0.0658, -0.0647, -0.0062,  0.0238,  0.0475,  0.0625,  0.0106,
         0.0291, -0.0282, -0.0841,  0.0088,  0.0098,  0.0299,  0.0260, -0.0374,
        -0.0107, -0.0092, -0.0296,  0.0113,  0.0432, -0.0103,  0.0371, -0.0007,
        -0.0433,  0.0113], grad_fn=<SliceBackward>)


In [19]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        
        assert encoder.hid_dim == decoder.hid_dim, "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        
        #src = [sent len, batch size]
        #trg = [sent len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[0]
        max_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size)
        
        hidden, cell = enc(x)
        input = y[:,0]
        for t in range(1, max_len):      
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            input = (trg[:,t] if teacher_force else top1)
            
        return outputs
model = Seq2Seq(enc, dec)
for x, y in traindata:
    output = model(x,y);
    print(output[1:].view(-1, output.shape[2]).shape)
    print(y.shape)
    break

torch.Size([160, 42])
torch.Size([5, 33])


In [40]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch[0]
            trg = batch[1]
            print(src.shape)
            print(trg.shape)
            output = model(src, trg, 0) #turn off teacher forcing
            print(output.shape)
            trg = torch.t(trg)
            print(trg.shape)
            loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1))

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [41]:
for x, y in traindata:
    loss = evaluate(model, testdata, criterion)
    print(loss)
    break

torch.Size([5, 36])
torch.Size([5, 33])
torch.Size([33, 5, 42])
torch.Size([33, 5])


RuntimeError: invalid argument 2: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Call .contiguous() before .view(). at /opt/conda/conda-bld/pytorch_1544174967633/work/aten/src/TH/generic/THTensor.cpp:213

In [22]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [None]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
#         print(input_tensor[ei])
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[0]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [None]:
def trainIters(encoder, decoder, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
#     print(training_pairs)
    for (input_tensor, target_tensor) in traindata:
        print(input_tensor.shape)
        input_tensor = input_tensor.view((-1))
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
trainIters(encoder, decoder, print_every=5000)