In [1]:
import pandas as pd
import string
import nltk
import torch
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt') # Download the tokenizer model
nltk.download('wordnet') # Download the wordnet corpora

# CNN, RNN, Forests, SVR, FFNN, linreg

data_dir = "../data/"

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Michael\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Michael\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Michael\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
f_train_en = open(data_dir + "train.ende.src", "r",encoding="utf8")
train_en = f_train_en.readlines()

f_train_de = open(data_dir + "train.ende.mt", "r",encoding="utf8")
train_de = f_train_de.readlines()

f_train_labels = open(data_dir + "train.ende.scores", "r",encoding="utf8")
train_l = f_train_labels.readlines()

In [3]:
def tokenize(strings):
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
    return [tokenizer.tokenize(l) for l in strings]


def remove_stopwords(list_strings, additional_stopwords=(), language='english'):
    words = stopwords.words('english')
    for word in additional_stopwords:
        if word not in words: words.append(word)

    ret = []
    for sentence in list_strings:
        ret.append(list(filter(lambda word: word not in words, sentence)))
    return ret

def remove_capitalisation(list_strings):
    ret = []
    for sentence in list_strings:
        ret.append([w.lower() for w in sentence])
    return ret

def remove_punctuation(list_strings):
    ret = []
    for sentence in list_strings:
        ret.append(list(filter(lambda word: word not in string.punctuation, sentence)))
    return ret


def build_vocab(list_string):
    vocab_set = set()
    for sentence in list_string:
        vocab_set.update(sentence)
        
    return vocab_set

def build_w2i_i2w(vocabulary):
    word2idx = {w: idx+1 for (idx, w) in enumerate(vocabulary)}
    word2idx['<pad>'] = 0
    idx2word = {idx+1: w for (idx, w) in enumerate(vocabulary)}
    word2idx[0] = '<pad>'
    return word2idx, idx2word

def lemmatise():
    pass

def stem():
    pass

def extract_context(list_string, word2idx, window_size=2):
    idx_pairs = []
    for sentence in list_string:
        indices = [word2idx[word] for word in sentence]

        for center_word_pos in range(len(indices)):

            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w

                if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                    continue

                context_word_idx = indices[context_word_pos]
                idx_pairs.append((indices[center_word_pos], context_word_idx))

    return np.array(idx_pairs) # it will be useful to have this as numpy array


def get_one_hot(vocabulary, word_idx):
    x = torch.zeros(len(vocabulary)).float()
    x[word_idx] = 1.0
    return x

def preprocessing(data_list):
    train_tokenized = tokenize(data_list)
    train_nostop = remove_stopwords(train_tokenized)
    train_nopunct = remove_punctuation(train_nostop)
    train_nocap = remove_capitalisation(train_nopunct)
    return train_nocap

In [4]:
train_tokenised_en = preprocessing(train_en)
train_tokenised_de = preprocessing(train_de)

In [5]:
voc_en = build_vocab(train_tokenised_en)
word2idx_en, idx2word_en = build_w2i_i2w(voc_en)

voc_de = build_vocab(train_tokenised_de)
word2idx_de, idx2word_de = build_w2i_i2w(voc_de)

In [6]:
# contexts = extract_context(train_nocap, word2idx, window_size=3)

In [7]:
# len(contexts)

In [8]:
import random

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [9]:
def get_model_inputs(tokenized_corpus, word2idx):
    # we index our sentences
    vectorized_sents = [[word2idx[tok] for tok in sent if tok in word2idx] for sent in tokenized_corpus]

    # Sentence lengths
    sent_lengths = [len(sent) for sent in vectorized_sents]

    # Get maximum length
    max_len = max(sent_lengths)

    # we create a tensor of a fixed size filled with zeroes for padding
    sent_tensor = torch.zeros((len(vectorized_sents), max_len)).long()

    # we fill it with our vectorized sentences 
    for idx, (sent, sentlen) in enumerate(zip(vectorized_sents, sent_lengths)):
        sent_tensor[idx, :sentlen] = torch.LongTensor(sent)

    return sent_tensor

train_sent_tensor_en = get_model_inputs(train_tokenised_en, word2idx_en)
train_sent_tensor_de = get_model_inputs(train_tokenised_de, word2idx_de)
labels_tensor = torch.FloatTensor([float(t.strip()) for t in train_l])

train_tensor = torch.cat((train_sent_tensor_en, train_sent_tensor_de), 1)


val_l = open(data_dir + "dev.ende.scores", "r",encoding="utf8")
val_l = val_l.readlines()
val_l = torch.FloatTensor([float(t.strip()) for t in val_l])

val_en = open(data_dir + "dev.ende.src", "r",encoding="utf8")
val_en = val_en.readlines()
val_en = preprocessing(val_en)

val_de = open(data_dir + "dev.ende.mt", "r",encoding="utf8")
val_de = val_de.readlines()
val_de = preprocessing(val_de)

val_tensor_en = get_model_inputs(val_en, word2idx_en)
val_tensor_de = get_model_inputs(val_de, word2idx_de)

val_tensor = torch.cat((val_tensor_en, val_tensor_de), 1)

In [10]:
class FFNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_classes):  
        super(FFNN, self).__init__()
        
        # embedding (lookup layer) layer
        # padding_idx argument makes sure that the 0-th token in the vocabulary
        # is used for padding purposes i.e. its embedding will be a 0-vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # hidden layer
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        # output layer
        self.fc3 = nn.Linear(hidden_dim, num_classes)  
    
    def forward(self, x):
        # x has shape (batch_size, max_sent_len)

        embedded = self.embedding(x)
        # `embedding` has shape (batch size, max_sent_len, embedding dim)

        ########################################################################
        # Q: Compute the average embeddings of shape (batch_size, embedding_dim)
        ########################################################################
        # Implement averaging that ignores padding (average using actual sentence lengths).
        # How this effect the result?
#         avg = torch.zeros((embedded.size(0), embedded.size(2)))
#         for i, row in enumerate(embedded):
#             avg[i] = torch.sum(row,0) / (row[row != 0].size(0) / 100)
#             if i%1000 == 0 and i != 0: print(i)
        averaged = torch.sum(embedded, 1) / torch.sum(embedded != 0,1)

        out = F.leaky_relu(self.fc1(averaged))
        out = F.leaky_relu(self.fc2(out))
        out = self.fc3(out)
        return out
    
class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self,yhat,y):
        return torch.sqrt(self.mse(yhat,y))

In [12]:
EPOCHS = 200
LRATE = 0.001
EMBEDDING_DIM = 50
HIDDEN_DIM = 100
OUTPUT_DIM = 1

model = FFNN(EMBEDDING_DIM, HIDDEN_DIM, len(word2idx_en) + len(word2idx_de), OUTPUT_DIM)
optimizer = optim.Adam(model.parameters(), lr=LRATE)


loss_fn = RMSELoss()

# Input and label tensors
feature = train_tensor
target = labels_tensor
validation = val_tensor
val_labels = val_l

################
# Start training
################
print(f'Will train for {EPOCHS} epochs')
for epoch in range(1, EPOCHS + 1):
    # to ensure the dropout (explained later) is "turned on" while training
    # good practice to include even if do not use here
    model.train()
  
    # we zero the gradients as they are not removed automatically
    optimizer.zero_grad()
  
    # squeeze is needed as the predictions will have the shape (batch size, 1)
    # and we need to remove the dimension of size 1
    predictions = model(feature).squeeze(1)

    # Compute the loss
    loss = loss_fn(predictions, target)
    train_loss = loss.item()

    # calculate the gradient of each parameter
    loss.backward()

    # update the parameters using the gradients and optimizer algorithm 
    optimizer.step()
  
    if epoch % 10 == 0 and epoch != 0:
        with torch.no_grad():
            v_pred = model(validation).squeeze(1)
            v_loss = loss_fn(v_pred, val_labels)
            val_loss = v_loss.item()
            print(f'| Epoch: {epoch:02d} | Train Loss: {train_loss:.3f} | Validation Loss: {val_loss:.3f}')

Will train for 200 epochs
| Epoch: 10 | Train Loss: 0.822 | Validation Loss: 0.862
| Epoch: 20 | Train Loss: 0.815 | Validation Loss: 0.862
| Epoch: 30 | Train Loss: 0.797 | Validation Loss: 0.869
| Epoch: 40 | Train Loss: 0.765 | Validation Loss: 0.894
| Epoch: 50 | Train Loss: 0.711 | Validation Loss: 0.921
| Epoch: 60 | Train Loss: 0.628 | Validation Loss: 0.968
| Epoch: 70 | Train Loss: 0.515 | Validation Loss: 1.027
| Epoch: 80 | Train Loss: 0.389 | Validation Loss: 1.085
| Epoch: 90 | Train Loss: 0.276 | Validation Loss: 1.124


KeyboardInterrupt: 