In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torchvision.models as models
import string
import random

%matplotlib inline

In [2]:
with open('train.txt', encoding='utf-8') as f:
    train = f.readlines()
train = np.array([np.array([k for k in c.split()]) for c in train])
new_train = []
new = []
for i, t in enumerate(train):
    if not i % 1000:
        new = np.append(new, new_train)
        new_train = []
    word = t[0]
    for i in range(1, t.size):
        arr = np.append(word, t[i])
        new_train = np.append(new_train, arr)
new_train = new
new_train = new_train.reshape(new_train.size//2, 2)
train = new_train
#print(train[:10], train.shape)

In [3]:
all_letters = []
for i, t in enumerate(train):
    for c in t[0]:
        all_letters = np.append(all_letters, c)
    all_letters = np.unique(all_letters)
end_letter = 'Z&'
all_letters = np.append(all_letters, end_letter)
all_letters = np.sort(all_letters)
print(all_letters, all_letters.size)

["'" '-' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P'
 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z' 'Z&'] 29


In [4]:
n_letters = len(all_letters)

def letterToIndex(letter):
    return all_letters.searchsorted(letter)

def lineToIndices(line):
    tensor = torch.zeros(len(line) + 1).type(torch.LongTensor)
    for i, l in enumerate(line):
        tensor[i] = int(letterToIndex(l))
    tensor[-1] = int(letterToIndex(end_letter))
    return tensor

def indicesToLine(indices):
    line = ''
    for i in range(indices.size()[0] - 1): 
        line += all_letters[indices[i]]
    return line

sample_ = train[0][0]
print(sample_)
print(lineToIndices(sample_), lineToIndices(sample_).size())
print(indicesToLine(lineToIndices(sample_)))

LEMIEUX
tensor([13,  6, 14, 10,  6, 22, 25, 28]) torch.Size([8])
LEMIEUX


In [5]:
categories = []
for t in train:
    categories = np.append(categories, t[1].split('_'))
    categories = np.unique(categories)
start_token = '###'
end_token = 'ZZ$'
categories = np.append(categories, (start_token, end_token))
categories = np.sort(categories)
print(categories, categories.size)

['###' 'AA' 'AE' 'AH' 'AO' 'AW' 'AY' 'B' 'CH' 'D' 'DH' 'EH' 'ER' 'EY' 'F'
 'G' 'HH' 'IH' 'IY' 'JH' 'K' 'L' 'M' 'N' 'NG' 'OW' 'OY' 'P' 'R' 'S' 'SH'
 'T' 'TH' 'UH' 'UW' 'V' 'W' 'Y' 'Z' 'ZH' 'ZZ$'] 41


In [6]:
n_categories = len(categories)

def categoryToIndex(cat):
    return categories.searchsorted(cat)

def categoriesToIndices(line):
    arr = line.split('_')
    tensor = torch.zeros(len(arr) + 2)
    tensor[0] = torch.tensor(categoryToIndex(start_token))
    for li, cat in enumerate(arr):
        tensor[li+1] = torch.tensor(categoryToIndex(cat))
    tensor[-1] = torch.tensor(categoryToIndex(end_token))
    return tensor.type(torch.LongTensor)

def indicesToCategories(indeces):
    line = ''
    for i in range(1, indeces.size()[0] - 1): 
        line += categories[indeces[i]]
        line += '_'
    return line[:-1]

def startToken():
    return torch.tensor([categoryToIndex(start_token)]).type(torch.LongTensor)

def endToken():
    return torch.tensor([categoryToIndex(end_token)]).type(torch.LongTensor)

    
sample_ = train[0][1]
print(sample_)
print(categoriesToIndices(sample_), categoriesToIndices(sample_).size())
print(indicesToCategories(categoriesToIndices(sample_)))
print(startToken())

L_AH_M_Y_UW
tensor([ 0, 21,  3, 22, 37, 34, 40]) torch.Size([7])
L_AH_M_Y_UW
tensor([0])


In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [50]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
n_hidden = 128
en = EncoderRNN(n_letters, n_hidden)
en.to(device)

EncoderRNN(
  (embedding): Embedding(29, 128)
  (gru): GRU(128, 128)
)

In [51]:
MAX_LENGTH = 34

In [52]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
    
n_hidden = 128
de = AttnDecoderRNN(n_hidden, n_categories)
de.to(device)

AttnDecoderRNN(
  (embedding): Embedding(41, 128)
  (attn): Linear(in_features=256, out_features=34, bias=True)
  (attn_combine): Linear(in_features=256, out_features=128, bias=True)
  (dropout): Dropout(p=0.1)
  (gru): GRU(128, 128)
  (out): Linear(in_features=128, out_features=41, bias=True)
)

In [24]:
teacher_forcing_ratio = 0.5
endTensor = endToken().to(device)

def Train(input_tensor, target_tensor):
    en_hidden = en.initHidden()

    en_optimizer.zero_grad()
    de_optimizer.zero_grad()
    
    input_tensor, target_tensor = input_tensor.to(device), target_tensor.to(device)
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    en_outputs = torch.zeros(MAX_LENGTH, en.hidden_size, device=device)
    
    loss = 0
    
    for ei in range(input_length):
        en_output, en_hidden = en(input_tensor[ei], en_hidden)
        en_outputs[ei] = en_output[0, 0]

    de_input = startToken().to(device)

    de_hidden = en_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
       
        for di in range(target_length):
            de_output, de_hidden, de_attention = de(de_input, de_hidden, en_outputs)
            loss += criterion(de_output, target_tensor[di])
            de_input = target_tensor[di]  # Teacher forcing

    else:
        for di in range(target_length):
            de_output, de_hidden, de_attention = de(de_input, de_hidden, en_outputs)
            topv, topi = de_output.topk(1)
            de_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(de_output, target_tensor[di])
            if de_input.item() == endTensor.item():
                break

    loss.backward()

    en_optimizer.step()
    de_optimizer.step()

    return loss.item() / target_length

In [12]:
def Transcript(input_tensor):
    with torch.no_grad():
        input_tensor = input_tensor.to(device)
        
        input_length = input_tensor.size()[0]
        en_hidden = en.initHidden()

        en_outputs = torch.zeros(MAX_LENGTH, en.hidden_size, device=device)

        for ei in range(input_length):
            en_output, en_hidden = en(input_tensor[ei], en_hidden)
            en_outputs[ei] += en_output[0, 0]

        de_input = startToken().to(device)

        de_hidden = en_hidden

        decoded_categories = []

        for di in range(MAX_LENGTH):
            de_output, de_hidden, de_attention = de(de_input, de_hidden, en_outputs)
            topv, topi = de_output.data.topk(1)
            decoded_categories.append(topi.item())
          #  print(topi.item())
            if topi.item() == endTensor.item():
                break
            decoder_input = topi.squeeze().detach()

        return decoded_categories

In [13]:
def Test(input_tensor, target_tensor):
    output = np.array(Transcript(input_tensor))
    target = target_tensor.data.numpy()
  #  print(output, target)
    return int(np.array_equal(output, target))

In [17]:
sum = 0
for i in  range(70000, 70000 + 100):
    input_tensor, target_tensor = lineToIndices(train[i][0]), categoriesToIndices(train[i][1])
    sum += Test(input_tensor, target_tensor)
print(sum/100)

0.0


In [53]:
#criterion = nn.CrossEntropyLoss()
criterion = nn.NLLLoss()
epochs_num = 6
results = []

In [54]:
en_optimizer = torch.optim.SGD(en.parameters(), lr=0.1)
de_optimizer = torch.optim.SGD(de.parameters(), lr=0.1)
samples_num = 15000
step = samples_num // 100
for i in range(samples_num):
    input_tensor, target_tensor = lineToIndices(train[i][0]), categoriesToIndices(train[i][1]).view(-1,1)
    loss = Train(input_tensor, target_tensor)
print('evaluation...')
sum = 0
num = 1000
for i in  range(samples_num, samples_num + num):
    input_tensor, target_tensor = lineToIndices(train[i][0]), categoriesToIndices(train[i][1])
    sum += Test(input_tensor, target_tensor)
print(sum/num)

evaluation...
0.001


In [56]:
print(i)

36199


In [58]:
for epoch in range(epochs_num):
    print('epoch:', epoch + 1)
    rates = [1e-2, 1e-3, 1e-3, 1e-4, 1e-4]
    learning_rate = rates[epoch]
    en_optimizer = torch.optim.SGD(en.parameters(), lr=learning_rate)
    de_optimizer = torch.optim.SGD(de.parameters(), lr=learning_rate)
    samples_num = 50000
    step = samples_num // 100
    for i in range(s, samples_num):
        if (i == 36199):
            continue
        input_tensor, target_tensor = lineToIndices(train[i][0]), categoriesToIndices(train[i][1]).view(-1,1)
        loss = Train(input_tensor, target_tensor)
        if not i % step:
            print(i)
        if not i % (step * 10):
            print(i, loss, Transcript(lineToIndices(train[i][0])),categoriesToIndices(train[i][1]))
    print('evaluation...')
    sum = 0
    num = 1000
    print(Transcript(lineToIndices(train[i][0])), categoriesToIndices(train[i][1]))
    for i in  range(samples_num, samples_num + num):
        input_tensor, target_tensor = lineToIndices(train[i][0]), categoriesToIndices(train[i][1])
        sum += Test(input_tensor, target_tensor)
    print(sum/num)
    print(Transcript(lineToIndices(train[i][0])), categoriesToIndices(train[i][1]))
    results = np.append(results, sum/num)

epoch: 1
36000
36500
37000
37500
38000
38500
39000
39500
40000
40000 0.6920232772827148 [0, 27, 21, 5, 9, 40] tensor([ 0, 27, 21,  5,  9, 40])
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000
45000 0.6361077853611538 [0, 35, 3, 31, 17, 24, 24, 40] tensor([ 0, 35, 25, 31, 17, 24, 40])
45500
46000
46500
47000
47500
48000
48500
49000
49500
evaluation...
[0, 22, 12, 28, 40] tensor([ 0, 22, 12, 14, 40])
0.09
[0, 9, 17, 28, 17, 20, 31, 31, 31, 40] tensor([ 0,  9,  6, 28, 11, 20, 31, 17, 35, 38, 40])
epoch: 2
0
0 1.664879662649972 [0, 21, 11, 22, 17, 34, 20, 20, 40] tensor([ 0, 21,  3, 22, 37, 34, 40])
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5000 0.3553908348083496 [0, 36, 18, 29, 40] tensor([ 0, 36, 18, 38, 40])
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10000 1.651793638865153 [0, 22, 17, 29, 3, 23, 23, 23, 40] tensor([ 0, 22, 17, 29, 31, 13, 31, 22,  3, 23, 31, 40])
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15000 0.503299331665039 [0, 11, 27, 2

KeyboardInterrupt: 

0.0


In [None]:
for i in range(results.size):
    print('epoch:', i+2, 'results:', results[i])

In [17]:
for i in range(results.size):
    print('epoch:', i+1, 'results:', results[i])

epoch: 1 results: 0.2372
epoch: 2 results: 0.2915
epoch: 3 results: 0.39245


In [25]:
def Sep(word):
    transcription = Transcript(word)
    print(transcription)
    for i in range(1, len(word)):
        word1, word2 = word[:i], word[i:]
        print(word1, word2)
        transcription1 = Transcript(word1)
        print(transcription1)
        transcription2 = Transcript(word2)
        print(transcription2)
        
input = 'defeat'   
Sep(input)

d efeat
de feat
def eat
defe at
defea t


In [None]:
#'defeat' [0,0,1,0,0,0]
class SeparatorRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, 2)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(output, hidden)
        output = self.out(output).view(1,-1)
        #output = F.log_softmax(output, dim=1)
        output = F.softmax(output, dim=1)
        return output, hidden
    
n_hidden = 128
se = SeparatorRNN(n_letters, n_hidden)
se.to(device)

In [None]:
def sepTrain(input_tensor, splits):
    en_hidden = en.initHidden()

    en_optimizer.zero_grad()
    se_optimizer.zero_grad()
    
    input_tensor = input_tensor.to(device)
    
    input = torch.copy(input_tensor)
    
    input_length = input_tensor.size(0)
    
    loss = 0
    
    for ei in range(input_length):
        en_output, en_hidden = en(input_tensor[ei], en_hidden)  
        
    se_hidden = en_hidden
    
    for inp in input:
        se_output, se_hidden = se(input, se_hidden)
        loss += criterion(se_output, splits[i])
        print(se_output, split[i])
    
    loss.backward()
    
    se_optimizer.step()
    
    return loss.item() / input_length

In [161]:
#torch.save(de, 'decoder_34')
#torch.save(en, 'encoder_34')

In [12]:
max_LENGth = 0
for t in test:
    max_LENGth = max(max_LENGth, len(t))
print(max_LENGth)

20


In [9]:
test = pd.read_csv('test.csv').values[:,1]
print(test, test.size)

['PITCHED' 'DISSOLVERS' 'SCRAWNY' ... 'SCOGIN' 'HESSION' 'TARNOWSKI'] 41597


In [198]:
words = []
for i, t in enumerate(test):
    if not i % 1000:
        print(i)
    with torch.no_grad():
        input = lineToTensor(t)
        input = input.to(device)
        hidden = en(input)
        output, hidden = de(startTensor, hidden)
        outputs = []
        for i in range(max_length + 1):
            output, hidden = de(output, hidden)
            outputs = np.append(outputs, output.argmax().item())
            if (output.argmax().item() == endTensor.item()):
                break
        words = np.append(words, tensorToCategories(torch.from_numpy(outputs).type(torch.LongTensor)))

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000


In [200]:
print(words.shape)

(41597,)


In [203]:
_predictions = words.reshape(words.size,)
with open('predictions.csv', 'w') as out:
    print('id,word', file=out)
    for i, p in enumerate(_predictions):
        print(i+1, p, sep=',', file=out)

In [204]:
preds = pd.read_csv('predictions.csv').values[:,1]

In [206]:
print(preds.size)

41597
