In [1]:
import torch.nn as nn
import torch.optim as optim
import torch

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
from torchtext import data
import spacy
import numpy as np

import random
import math
import time

In [2]:
import torch
torch.rand(1).cuda()

tensor([0.7003], device='cuda:0')

In [3]:
SEED = 1234


random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [5]:
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [6]:
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>',lower=True)

TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>',lower=True)



In [7]:
from spacy.lang.en.stop_words import STOP_WORDS
# STOP_WORDS

In [8]:

text = data.Field(tokenize=tokenize_en,stop_words=STOP_WORDS, init_token='<sos>', eos_token='<eos>',lower=True)
response = data.Field(tokenize=tokenize_en,stop_words=STOP_WORDS, init_token='<sos>', eos_token='<eos>',lower=True)


In [9]:


# create tuples representing the columns
fields = [
  ('text', text),
  ('response', response)
]

In [10]:
# import pandas as pd
# x = pd.read_csv('twitter_data/cleaned_tweets_valid.csv')
# x = x.loc[~pd.isna(x.response)]
# x.shape

In [11]:
# x.to_csv('twitter_data/cleaned_tweets_valid.csv',index=False)

In [12]:
train_data, valid_data, test_data = data.TabularDataset.splits(
   path = 'cornell',
   train = 'train.csv',
   validation = 'valid.csv',
   test = 'test.csv',
   format = 'csv',
   fields = fields,
   skip_header = True
)



In [17]:
print(vars(train_data.examples[5]))

{'text': ['"', 'real', '"', '.'], 'response': ['like', 'fear', 'wearing', 'pastels', '?']}


In [14]:
train_data.examples[1000].__dict__.keys()

dict_keys(['text', 'response'])

#### Cuild Vocab, tokens must have frequency of atleast 2

In [22]:
text.build_vocab(train_data,min_freq=10)
response.build_vocab(train_data,min_freq=10)

In [23]:
len(text.vocab),len(response.vocab)

(6113, 6105)

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [25]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, valid_data, test_data),
            batch_size=BATCH_SIZE,sort=False,
            device=device)



In [26]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        #TO DO: add 5 more asserts
#         assert encoder.hid_dim == decoder.hid_dim,
    
    def forward(self,src, trg,teacher_force_ratio=0.5):
        context = self.encoder(src)
        hidden = context # only first GRU
        batch_size = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        input = trg[0,:]
        trg_len = trg.shape[0]
        #decoder needs to run one by one
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device) # assign to device
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, context)
            outputs[t] = output
            
            teacher_force = random.random() < teacher_force_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs
        

In [27]:
class Encoder(nn.Module):
    def __init__(self,input_dim,emb_dim, hid_dim, dropout):
        
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim

        self.embedding = nn.Embedding(self.input_dim, self.emb_dim)
        self.rnn = nn.GRU(self.emb_dim, self.hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self,src):
        
        embedded = self.dropout(self.embedding(src))
        output,hidden = self.rnn(embedded)
        
        return hidden
        
        

In [28]:
class Decoder(nn.Module):
    def __init__(self,output_dim,emb_dim, hid_dim, dropout):
        
        super().__init__()
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        
        self.embedding = nn.Embedding(output_dim, emb_dim, hid_dim, dropout)
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)
        self.fc_out = nn.Linear(emb_dim+hid_dim*2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    
    def forward(self,input, hidden, context):
        
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        emb_con = torch.cat((embedded, context),dim=2)
        output, hidden = self.rnn(emb_con, hidden)
        output_concatenated = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), dim=1)
        prediction = self.fc_out(output_concatenated)
        
        return prediction, hidden
        
        
        

In [29]:
INPUT_DIM = len(text.vocab)
OUTPUT_DIM = len(response.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)


In [30]:
INPUT_DIM,OUTPUT_DIM

(6113, 6105)

In [31]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(6113, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(6105, 256, padding_idx=512, max_norm=0.5)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=6105, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [32]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'the model has {count_parameters(model)} trainable parameters')

the model has 14100185 trainable parameters


In [33]:
optimizer = optim.Adam(model.parameters())

In [34]:
TRG_PAD_IDX = response.vocab.stoi[response.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [35]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.text
        trg = batch.response
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [36]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.text
            trg = batch.response

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [37]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time/60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [39]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    print('end_time:', end_time)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

end_time: 1609932549.1359391
Epoch: 01 | Time: 5m 1s
	Train Loss: 4.748 | Train PPL: 115.362
	 Val. Loss: 5.012 |  Val. PPL: 150.131
end_time: 1609932847.3880217
Epoch: 02 | Time: 4m 58s
	Train Loss: 4.703 | Train PPL: 110.288
	 Val. Loss: 4.992 |  Val. PPL: 147.272
end_time: 1609933143.757708
Epoch: 03 | Time: 4m 56s
	Train Loss: 4.656 | Train PPL: 105.211
	 Val. Loss: 5.012 |  Val. PPL: 150.257
end_time: 1609933431.2262228
Epoch: 04 | Time: 4m 47s
	Train Loss: 4.614 | Train PPL: 100.929
	 Val. Loss: 5.016 |  Val. PPL: 150.865
end_time: 1609933717.2909431
Epoch: 05 | Time: 4m 46s
	Train Loss: 4.566 | Train PPL:  96.161
	 Val. Loss: 5.042 |  Val. PPL: 154.814
end_time: 1609934006.034572
Epoch: 06 | Time: 4m 48s
	Train Loss: 4.514 | Train PPL:  91.330
	 Val. Loss: 5.051 |  Val. PPL: 156.153
end_time: 1609934294.348881
Epoch: 07 | Time: 4m 48s
	Train Loss: 4.457 | Train PPL:  86.224
	 Val. Loss: 5.090 |  Val. PPL: 162.445
end_time: 1609934582.7804306
Epoch: 08 | Time: 4m 48s
	Train Loss:

In [None]:
model.load_state_dict(torch.load('tut2-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

In [43]:
for i, batch in enumerate(train_iterator): 
    src = batch.text
    trg = batch.response

    optimizer.zero_grad()

    output = model(src, trg)
    output_dim = output.shape[-1]
            
    output = output[1:].view(-1, output_dim)
    print(len(output))
    print(np.argmax(output.cpu().detach().numpy(),axis=0))
    break

1856
[108  22  22 ... 197 547  57]


In [None]:
trg.shape

In [None]:
train_data.examples[0].__dict__['src'],train_data.examples[0].__dict__['trg']

In [None]:
TRG.vocab.stoi