<a href="https://colab.research.google.com/github/ansonmiu0214/C490CW/blob/master/MinibatchQEV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Coursework: Quality Estimation Vectors


In [0]:
# Imports
import os
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import sklearn
import tqdm
import matplotlib.pyplot as plt
from tqdm import tqdm

%matplotlib inline

device = torch.device('cuda' if torch.cuda.is_available() else 'gpu')
print(f'DEVICE={device}')
torch.manual_seed(0)
torch.cuda.manual_seed(0)
np.random.seed(0)
# PyTorch version
print(torch.__version__)

# Disable warnings :)
import warnings
warnings.filterwarnings('ignore')

DEVICE=cuda
1.4.0


In [0]:
# # Google Drive authorisation
# from google.colab import drive
# drive.mount('/content/gdrive')

# def in_gdrive(path):
#     return f'/content/gdrive/My Drive/Colab Notebooks/{path}'

# # !ls /content/gdrive/My\ Drive

## Importing Data

In [0]:
import os

if not os.path.exists('enzh_data.zip'):
    !wget -O enzh_data.zip https://competitions.codalab.org/my/datasets/download/03e23bd7-8084-4542-997b-6a1ca6dd8a5f
    !unzip enzh_data.zip

TRAIN_EN = 'train.enzh.src'
TRAIN_ZH = 'train.enzh.mt'
TRAIN_SCORES = 'train.enzh.scores'
VAL_EN = 'dev.enzh.src'
VAL_ZH = 'dev.enzh.mt'
VAL_SCORES = 'dev.enzh.scores'
TEST_EN = 'test.enzh.src'
TEST_ZH = 'test.enzh.mt'

--2020-02-17 14:14:47--  https://competitions.codalab.org/my/datasets/download/03e23bd7-8084-4542-997b-6a1ca6dd8a5f
Resolving competitions.codalab.org (competitions.codalab.org)... 129.175.22.230
Connecting to competitions.codalab.org (competitions.codalab.org)|129.175.22.230|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://newcodalab.lri.fr/prod-private/dataset_data_file/None/630ec/en-zh.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=cfcf912b83cd9acc16105af5382275abf6711ee3d2801294e2a1d6cf8f9e2143&X-Amz-Date=20200217T141447Z&X-Amz-Credential=AZIAIOSAODNN7EX123LE%2F20200217%2Fnewcodalab%2Fs3%2Faws4_request [following]
--2020-02-17 14:14:47--  https://newcodalab.lri.fr/prod-private/dataset_data_file/None/630ec/en-zh.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=cfcf912b83cd9acc16105af5382275abf6711ee3d2801294e2a1d6cf8f9e2143&X-Amz-Date=20200217T141

## Preprocessing

### English

1. Tokenise with spaCy language model
2. Remove stop words and punctuation
3. Normalise - lemmas

In [0]:
# Downloading spacy models for English

!spacy download en_core_web_md
!spacy link en_core_web_md en300 --force

Collecting en_core_web_md==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4MB)
[K     |████████████████████████████████| 95.4MB 2.3MB/s 
[?25hBuilding wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.1.0-cp36-none-any.whl size=97126236 sha256=48afd473c19a5d85b306666b5bab0491f7beafd62a343245808958898ac5aa7c
  Stored in directory: /tmp/pip-ephem-wheel-cache-peipkewp/wheels/c1/2c/5f/fd7f3ec336bf97b0809c86264d2831c5dfb00fc2e239d1bb01
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_md -->
/usr/local/lib/py

In [0]:
# Downloading stop words for English

from nltk import download
from nltk.corpus import stopwords

download('stopwords')
stop_words_en = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# Get tokenizer

import spacy

nlp_en = spacy.load('en300')

In [0]:
def preprocess_en(sentence=None, *, keep_stopwords=False):
    def wrapper(sentence):
        text = sentence.lower()
        processed = [token.lemma_ for token in nlp_en.tokenizer(text)]
        processed = [token for token in processed if token.isalpha()]
        if not keep_stopwords:
            processed = [token for token in processed if token not in stop_words_en]
        return processed

    return wrapper if sentence is None else wrapper(sentence)

### Chinese

1. Tokenise with jieba
2. Remove stop words and punctuation

In [0]:
# Download stop words
FILE_STOP_WORDS_ZH = './chinese_stop_words.txt'

if not os.path.exists(FILE_STOP_WORDS_ZH):
    !wget -c https://github.com/Tony607/Chinese_sentiment_analysis/blob/master/data/chinese_stop_words.txt

with open(FILE_STOP_WORDS_ZH, 'r', encoding='utf-8') as f:
    stop_words_zh = [line.rstrip() for line in f]

--2020-02-17 14:15:39--  https://github.com/Tony607/Chinese_sentiment_analysis/blob/master/data/chinese_stop_words.txt
Resolving github.com (github.com)... 140.82.118.4
Connecting to github.com (github.com)|140.82.118.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘chinese_stop_words.txt’

chinese_stop_words.     [<=>                 ]       0  --.-KB/s               chinese_stop_words.     [ <=>                ] 419.14K  --.-KB/s    in 0.02s   

2020-02-17 14:15:40 (21.0 MB/s) - ‘chinese_stop_words.txt’ saved [429200]



In [0]:
import jieba

def preprocess_zh(sentence=None, *, keep_stopwords=False):
    def wrapper(sentence):
        tokens = jieba.cut(sentence, cut_all=True)
        processed = [token for token in tokens if token.isalnum()]
        if not keep_stopwords:
            processed = [token for token in processed if token not in stop_words_zh]
        return processed

    return wrapper if sentence is None else wrapper(sentence)

## Language Vocabulary

In [0]:
class Language(object):

    PAD_TOKEN = '<PAD>'
    SOS_TOKEN = '<SOS>'
    EOS_TOKEN = '<EOS>'
    UNK_TOKEN = '<UNK>'

    def __init__(self, name):
        self.name = name
        self.word2idx = {}
        self.word2count = {}
        self.idx2word = {0: self.PAD_TOKEN,
                         1: self.SOS_TOKEN,
                         2: self.EOS_TOKEN,
                         3: self.UNK_TOKEN}
    
    def __len__(self):
        return len(self.idx2word)

    def add_sentence(self, sentence):
        for token in sentence:
            self.add_word(token)

    def add_word(self, word):
        if word not in self.word2idx:
            idx = len(self)
            self.word2idx[word] = idx
            self.idx2word[idx] = word
        
        count = self.word2count.get(word, 0)
        self.word2count[word] = count + 1

    def sent_to_idxs(self, sent):
        return [self.word2idx.get(word, 2) for word in sent]
    
    def __repr__(self):
        return f'Language(name={self.name}) with {len(self)} words'

## Loading Data

In [0]:
# Read from file

with open(TRAIN_EN) as f:
    train_en = f.readlines()
with open(TRAIN_ZH) as f:
    train_zh = f.readlines()
with open(TRAIN_SCORES) as f:
    train_scores = f.readlines()
with open(VAL_EN) as f:
    val_en = f.readlines()
with open(VAL_ZH) as f:
    val_zh = f.readlines()
with open(VAL_SCORES) as f:
    val_scores = f.readlines()
with open(TEST_EN) as f:
    test_en = f.readlines()
with open(TEST_ZH) as f:
    test_zh = f.readlines()

In [0]:
def pad_sentences(vocab, sent):
    sent_lengths = [len(sentence) for sentence in sent]

    # create an empty matrix with padding tokens
    pad_token = 0
    longest_sent = max(sent_lengths)
    batch_size = len(sent)
    padded_sent = np.ones((batch_size, longest_sent)) * pad_token

    # copy over the actual sequences
    for i, sent_len in enumerate(sent_lengths):
      sequence = sent[i]
      padded_sent[i, 0:sent_len] = sequence[:sent_len]
    return padded_sent, sent_lengths

In [0]:
# English data

preprocess_english = preprocess_en(keep_stopwords=True)
train_en_sents = [preprocess_english(sent) for sent in train_en]
val_en_sents = [preprocess_english(sent) for sent in val_en]
test_en_sents = [preprocess_english(sent) for sent in test_en]

EN = Language('EN')
for sent in train_en_sents:
    EN.add_sentence(sent)

print(EN)

print()
print('Sample sentence')
sample_sent_en = train_en_sents[42]
print(sample_sent_en)
print(EN.sent_to_idxs(sample_sent_en))

train_en_idxs = [EN.sent_to_idxs(sent) for sent in train_en_sents]
train_en_idxs, en_sent_lengths = pad_sentences(EN, train_en_idxs)
val_en_idxs = [EN.sent_to_idxs(sent) for sent in val_en_sents]
val_en_idxs, en_sent_lengths = pad_sentences(EN, val_en_idxs)
test_en_idxs = [EN.sent_to_idxs(sent) for sent in test_en_sents]

Language(name=EN) with 19249 words

Sample sentence
['all', 'of', 'the', 'artilleryman', 'record', 'a', 'wound', 'die']
[341, 32, 4, 342, 343, 60, 344, 345]


In [0]:

# Chinese data

preprocess_chinese = preprocess_zh(keep_stopwords=False)

train_zh_sents = [preprocess_chinese(sent) for sent in train_zh]
val_zh_sents = [preprocess_chinese(sent) for sent in val_zh]
test_zh_sents = [preprocess_chinese(sent) for sent in test_zh]

ZH = Language('ZH')
for sent in train_zh_sents:
    ZH.add_sentence(sent)

print(ZH)

print()
print('Sample sentence')
sample_sent_zh = train_zh_sents[42]
print(sample_sent_zh)
print(ZH.sent_to_idxs(sample_sent_zh))

train_zh_idxs = [ZH.sent_to_idxs(sent) for sent in train_zh_sents]
train_zh_idxs, zh_sent_lengths = pad_sentences(ZH, train_zh_idxs)
val_zh_idxs = [ZH.sent_to_idxs(sent) for sent in val_zh_sents]
val_zh_idxs, zh_sent_lengths = pad_sentences(ZH, val_zh_idxs)
test_zh_idxs = [ZH.sent_to_idxs(sent) for sent in test_zh_sents]

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.790 seconds.
Prefix dict has been built successfully.


Language(name=ZH) with 23852 words

Sample sentence
['据', '记录', '所有', '6', '名', '炮兵', '都', '受伤', '了']
[484, 485, 486, 268, 487, 488, 489, 490, 18]


In [0]:
# Process scores

def prepare_score(score):
    return float(score)

train_scores = [prepare_score(score) for score in train_scores]
val_scores = [prepare_score(score) for score in val_scores]

In [0]:
# Custom Dataset Wrapper
class SentenceData(Dataset):
    def __init__(self, en_tensor, zh_tensor, score_tensor):
        self.en = en_tensor
        self.zh = zh_tensor
        self.score = score_tensor
        
    def __getitem__(self, index):
        return (self.en[index], self.zh[index], self.score[index])

    def __len__(self):
        return len(self.score)

In [0]:
# Datasets

train_en_tensors = [torch.LongTensor(sent_idxs) for sent_idxs in train_en_idxs]
train_zh_tensors = [torch.LongTensor(sent_idxs) for sent_idxs in train_zh_idxs]

# train_pairs = list(zip(train_en_tensors, train_zh_tensors))
# train_set = list(zip(train_pairs, train_scores))
train_set = SentenceData(train_en_tensors, train_zh_tensors, train_scores)

# train_set = [dat for dat in train_set if dat[2] < 1 and dat[2] > -1]

print(train_set[0])

val_en_tensors = [torch.LongTensor(sent_idxs) for sent_idxs in val_en_idxs]
val_zh_tensors = [torch.LongTensor(sent_idxs) for sent_idxs in val_zh_idxs]

# val_pairs = list(zip(val_en_tensors, val_zh_tensors))
# val_set = list(zip(val_pairs, val_scores))
val_set = SentenceData(val_en_tensors, val_zh_tensors, val_scores)

# val_set = [dat for dat in val_set if dat[2] < 1 and dat[2] > -1]

print(val_set[0])
# val_pairs = list(zip(val_en_idxs, val_zh_idxs))
# test_pairs = list(zip(test_en_idxs, test_zh_idxs))

(tensor([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), tensor([ 4,  5,  6,  7,  8,  9, 10,  5, 11, 12, 13, 14,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), -1.5284005772625449)
(tensor([11805, 10544,    11,  9460,   284,     4,     2,  2611,    32,  2085,
          358,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0]), tensor([16174, 16175, 16176, 12583,    35,  2987,   510,   511,     5,     2,
         3757,   397, 13875,    10,     5,  9582,  2524,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]), -1.0228233810174194)


## Models

In [0]:
# Utilities

from scipy.stats.stats import pearsonr

def unzip(args):
    return zip(*args)

def RMSELoss(pred, target):
    return torch.sqrt(torch.mean((pred.to('cpu') - target.to('cpu')) ** 2))

### Bi-Direction NN

In [0]:
def get_context_vec(hidden_outputs, zh_emb):
    def activation(zh_emb, hidden):
        # print("hidden shape:", hidden.shape)
        # print("zh shape:", zh_emb.shape)

        zh = zh_emb.detach().cpu().numpy().reshape(1, -1)
        hid = hidden.detach().cpu().numpy().reshape(1, -1)
        res = zh.dot(hid.transpose())
        # print(res.shape)
        return res

    e_s = torch.Tensor([
        activation(zh_emb, hid)
        for hid in hidden_outputs
    ])

    a_s = F.softmax(e_s)

    ctx = torch.zeros(zh_emb.shape)
    for a, h in zip(a_s, hidden_outputs):
        ctx += a * h.to('cpu')
    return ctx.to(device)
    # tan = torch.nn.Tanh
    # return tan(ctx)

In [0]:
from collections import deque

class BiRNN(nn.Module):
    def __init__(self, *, vocab_size, emb_dim=100, num_layers=1):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(self.vocab_size, self.emb_dim)

        self.forward_gru = torch.nn.GRU(
            input_size=self.emb_dim, hidden_size=self.emb_dim, num_layers=self.num_layers,batch_first=False, bidirectional=True)
        self.backward_gru = torch.nn.GRU(
            input_size=self.emb_dim, hidden_size=self.emb_dim, num_layers=self.num_layers, batch_first=False, bidirectional=False)
        
        self.backward_gru.weight_ih_l0 = self.forward_gru.weight_ih_l0_reverse
        self.backward_gru.weight_hh_l0 = self.forward_gru.weight_hh_l0_reverse
        self.backward_gru.bias_ih_l0 = self.forward_gru.bias_ih_l0_reverse
        self.backward_gru.bias_hh_l0 = self.forward_gru.bias_hh_l0_reverse

    def forward(self, tensor):
        """Return hidden states of backward RNN."""
        num_sent = tensor.shape[0]
        sent_len = tensor.shape[1]

        # print("sent_len:", sent_len)
        emb = self.embedding(tensor).view(sent_len, num_sent, -1)
        # print("tensor shape:", tensor.shape)
        # print("emb shape:", emb.shape)
        self.forward_gru.flatten_parameters()
        bi_output, bi_hidden = self.forward_gru(emb)
        rev_emb = emb[np.arange(sent_len - 1, -1, -1), :, :]

        self.backward_gru.flatten_parameters()
        rev_output, rev_hidden = self.backward_gru(rev_emb)
        return rev_output

class BiRNNAttention(nn.Module):
    def __init__(self, *, vocab_size, emb_dim=100, num_layers=1):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(self.vocab_size, self.emb_dim)

        self.forward_gru = torch.nn.GRU(
            input_size=self.emb_dim, hidden_size=self.emb_dim, num_layers=self.num_layers,batch_first=False, bidirectional=True)
        # self.backward_gru = torch.nn.GRU(
            # input_size=self.emb_dim, hidden_size=self.emb_dim, num_layers=self.num_layers, batch_first=False, bidirectional=False)
        
        # self.backward_gru.weight_ih_l0 = self.forward_gru.weight_ih_l0_reverse
        # self.backward_gru.weight_hh_l0 = self.forward_gru.weight_hh_l0_reverse
        # self.backward_gru.bias_ih_l0 = self.forward_gru.bias_ih_l0_reverse
        # self.backward_gru.bias_hh_l0 = self.forward_gru.bias_hh_l0_reverse

    def forward(self, tensor, prev_hiddens):
        """Return hidden states of backward RNN."""
        # sent_len = len(tensor)
        # embs = self.embedding(tensor).view(sent_len, 1, -1)
        num_sent = tensor.shape[0]
        sent_len = tensor.shape[1]

        # print("sent_len:", sent_len)
        embs = self.embedding(tensor).view(sent_len, num_sent, -1)
        ctxs = torch.stack([get_context_vec(prev_hiddens, zh_emb)
                            for zh_emb in embs])

        self.forward_gru.flatten_parameters()
        bi_output, bi_hidden = self.forward_gru(ctxs)

        rev_ctxs = ctxs[np.arange(sent_len - 1, -1, -1), :, :]

        self.backward_gru.flatten_parameters()
        rev_output, rev_hidden = self.backward_gru(rev_ctxs)
        return rev_output

class QualVecRNN(nn.Module):
    def __init__(self, *, emb_dim=100, num_layers=1):
        super().__init__()
        self.emb_dim = emb_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size=self.emb_dim, hidden_size=1, num_layers=self.num_layers)
    
    def forward(self, tensor):
        # self.lstm.flatten_parameters()
        output, hidden = self.lstm(tensor)
        return hidden[0]

In [0]:
def forward_pass(data_set, regressor):
  loss = 0
  pred_scores = []

  for (en_tensor, zh_tensor, score) in tqdm(data_set):
    en_hiddens = en_rnn(en_tensor.to(device))
    qual_vecs = zh_rnn(zh_tensor.to(device), en_hiddens)
    pred_score = regressor(qual_vecs).squeeze()
    pred_scores.append(pred_score.data.cpu().numpy())

    loss += loss_fn(pred_score, score)

  return loss, pred_scores

In [0]:
def train(num_epochs, regressor, components, opts):
    trainloader = DataLoader(dataset=train_set,batch_size=1)
    validloader = DataLoader(dataset=val_set,batch_size=1)

    # print(next(iter(trainloader)))
    for eidx in range(num_epochs):
        print(f'Epoch {eidx + 1}')

        for comp in components:
            comp.zero_grad()

        print(f'Training: {len(train_set)}')

        loss, _ = forward_pass(trainloader, regressor)
        loss /= len(train_set)
        print(f'Training Loss={loss:.5f}\t')

        val_loss, val_pred_scores = forward_pass(validloader, regressor)
        val_loss /= len(val_set)
        print(f'Validation loss = {val_loss:.5f}\t')

        pearson_score, _ = pearsonr(np.array(val_pred_scores), np.array(val_scores))
        print(f'Validation pearson = {pearson_score:.5f}\t')

        # Backpropagation
        loss.backward()

        for opt in opts:
            opt.step()


In [0]:
torch.cuda.empty_cache()
en_rnn = BiRNN(vocab_size=len(EN))
en_rnn.to(device)

zh_rnn = BiRNNAttention(vocab_size=len(ZH))
zh_rnn.to(device)

regressor = QualVecRNN()
regressor.to(device)

LR = 0.003

en_opt = torch.optim.Adam(en_rnn.parameters(), lr=LR)
zh_opt = torch.optim.Adam(zh_rnn.parameters(), lr=LR)
regr_opt = torch.optim.Adam(regressor.parameters(), lr=LR)

loss_fn = RMSELoss

print(en_rnn)
print(zh_rnn)
print(regressor)

components = (en_rnn, zh_rnn, regressor)
opts = (en_opt, zh_opt, regr_opt)

train(10, regressor, components, opts)

  0%|          | 0/7000 [00:00<?, ?it/s]

BiRNN(
  (embedding): Embedding(19249, 100)
  (forward_gru): GRU(100, 100, bidirectional=True)
  (backward_gru): GRU(100, 100)
)
BiRNNAttention(
  (embedding): Embedding(23852, 100)
  (forward_gru): GRU(100, 100, bidirectional=True)
  (backward_gru): GRU(100, 100)
)
QualVecRNN(
  (lstm): LSTM(100, 1)
)
Epoch 1
Training: 7000


 45%|████▍     | 3140/7000 [10:07<45:25:25, 42.36s/it]

### FFNN with trained embeddings

In [0]:
class FFNN(nn.Module):
    
    def __init__(self, *, en_vocab_size, zh_vocab_size, emb_dim):
        super().__init__()
        self.en_vocab_size = en_vocab_size
        self.zh_vocab_size = zh_vocab_size
        self.emb_dim = emb_dim

        self.en_embedding = nn.Embedding(self.en_vocab_size, self.emb_dim)
        self.zh_embedding = nn.Embedding(self.zh_vocab_size, self.emb_dim)

        self.en_hidden = nn.Linear(self.emb_dim, 1)
        self.zh_hidden = nn.Linear(self.emb_dim, 1)

        self.out = nn.Linear(2, 1)
    
    def forward(self, en_tensors, zh_tensors):
        en_emb = self.en_embedding(en_tensors)
        zh_emb = self.zh_embedding(zh_tensors)

        en_hid = F.relu(self.en_hidden(en_emb))
        zh_hid = F.relu(self.zh_hidden(en_emb))

        hid_concat = torch.stack((en_hid, zh_hid), axis=1).squeeze()
        score = self.out(hid_concat)
        return score.mean()

In [0]:
ffnn = FFNN(en_vocab_size=len(EN), zh_vocab_size=len(ZH), emb_dim=200)
ffnn.to(device)
print(ffnn)

ffnn_opt = torch.optim.Adam(ffnn.parameters(), lr=0.003)
loss_fn = RMSELoss

FFNN(
  (en_embedding): Embedding(19249, 200)
  (zh_embedding): Embedding(23852, 200)
  (en_hidden): Linear(in_features=200, out_features=1, bias=True)
  (zh_hidden): Linear(in_features=200, out_features=1, bias=True)
  (out): Linear(in_features=2, out_features=1, bias=True)
)


In [0]:
NUM_EPOCHS = 100

train_losses = []
val_losses = []
val_pearson = []

for eidx in range(NUM_EPOCHS):
    print(f'Epoch {eidx + 1}: \t', end=' ')
    ffnn.zero_grad()
    
    loss = 0
    for en_tensor, zh_tensor, score in train_set:
        pred = ffnn(en_tensor.to(device), zh_tensor.to(device))
        loss += loss_fn(pred, score)

    loss /= len(train_set)
    train_losses.append(loss)
    
    print(f'train loss = {loss:.5f}\t', end='')

    # Validation loss
    val_loss = 0
    for (en_tensor, zh_tensor), score in val_set:
        pred = ffnn(en_tensor.to(device), zh_tensor.to(device))
        val_loss += loss_fn(pred, score)
    val_loss /= len(val_set)
    print(f'validation loss = {val_loss:.5f}\t', end='')
    val_losses.append(val_loss)

    # Validation score
    val_preds, val_targets = unzip([(ffnn(en_tensor.to(device), zh_tensor.to(device)).detach().cpu().numpy(), score)
                              for (en_tensor, zh_tensor), score in val_set])
    
    val_preds = np.array(val_preds)
    val_targets = np.array(val_targets)

    pearson_score, _ = pearsonr(val_preds, val_targets)
    val_pearson.append(pearson_score)
    print(f'validation pearson = {pearson_score:.5f}\t')

    # Backpropagation
    loss.backward()
    ffnn_opt.step()

Epoch 1: 	 

AttributeError: ignored

### RNN Chain

In [0]:
class RNNChain(nn.Module):

    def __init__(self, *, en_vocab_size, zh_vocab_size, emb_dim):
        super().__init__()
        self.en_vocab_size = en_vocab_size
        self.zh_vocab_size = zh_vocab_size
        self.emb_dim = emb_dim

        self.en_embedding = nn.Embedding(self.en_vocab_size, self.emb_dim)
        self.zh_embedding = nn.Embedding(self.zh_vocab_size, self.emb_dim)

        self.en_rnn = nn.GRU(self.emb_dim, self.emb_dim, bidirectional=True)
        self.zh_rnn = nn.GRU(self.emb_dim, self.emb_dim, bidirectional=True)

        self.hidden = nn.Linear(self.emb_dim, 50)
        self.out = nn.Linear(50, 1)

    def forward(self, en_tensor, zh_tensor):
        en_emb = self.en_embedding(en_tensor)
        zh_emb = self.zh_embedding(zh_tensor)

        en_hidden = torch.zeros(2, 1, self.emb_dim, device=device)

        for word_idx in en_emb:
            word_idx = word_idx.view(1, 1, -1)
            _, en_hidden = self.en_rnn(word_idx, en_hidden)
    
        zh_hidden = en_hidden
        for word_idx in zh_emb:
            word_idx = word_idx.view(1, 1, -1)
            _, zh_hidden = self.zh_rnn(word_idx, zh_hidden)

        score = self.out(F.relu(self.hidden(zh_hidden[-1])))
        return score

In [0]:
USE_PREV = True

rnn = RNNChain(en_vocab_size=len(EN), zh_vocab_size=len(ZH), emb_dim=100)
rnn.to(device)

rnn_opt = torch.optim.Adam(rnn.parameters(), lr=0.003)
loss_fn = RMSELoss

NUM_EPOCHS = 100

state = {
    'curr_epoch': 1,
    'train_losses': [],
    'val_losses': [],
    'val_pearson': [],
}

if USE_PREV and os.path.exists(in_gdrive('rnn.pt')):
    print('Loading from Google Drive...', end=' ')
    rnn.load_state_dict(torch.load(in_gdrive('rnn.pt')))

    with open(in_gdrive('rnn.json'), 'r') as f:
        state = json.load(f)
    print('done!')


while state['curr_epoch'] <= NUM_EPOCHS:
    print(f'Epoch {state["curr_epoch"]}:')
    rnn.zero_grad()
    
    loss = 0
    print(f'Training {len(train_set)}: ', end='')
    for idx, ((en_tensor, zh_tensor), score) in enumerate(train_set):
        pred = rnn(en_tensor.to(device), zh_tensor.to(device)).squeeze()
        curr_loss = loss_fn(pred, score) 
        loss += curr_loss
        if idx % 500 == 0:
            print('.', end='')
    print()

    loss /= len(train_set)
    state['train_losses'].append(loss.detach().cpu().numpy().tolist())
    
    print(f'==>train loss = {loss:.5f}')

    # Validation loss
    val_loss = 0
    print(f'Validating loss {len(val_set)}: ', end='')
    for idx, ((en_tensor, zh_tensor), score) in enumerate(val_set):
        pred = rnn(en_tensor.to(device), zh_tensor.to(device))
        val_loss += loss_fn(pred, score)

        if idx % 100 == 0:
            print('.', end='')
    print()    

    val_loss /= len(val_set)
    print(f'==>validation loss = {val_loss:.5f}')
    state['val_losses'].append(val_loss.detach().cpu().numpy().tolist())

    # Validation score
    val_preds, val_targets = unzip([(rnn(en_tensor.to(device), zh_tensor.to(device)).squeeze().detach().cpu().numpy(), score)
                              for (en_tensor, zh_tensor), score in val_set])
    val_preds = np.array(val_preds)
    val_targets = np.array(val_targets)

    pearson_score, _ = pearsonr(val_preds, val_targets)
    state['val_pearson'].append(pearson_score)
    print(f'==>validation pearson = {pearson_score:.5f}')

    # Backpropagation
    print('Backpropagation...', end=' ')
    loss.backward()
    rnn_opt.step()
    print('done!')

    # Save
    print('Saving to Google Drive...', end=' ')
    torch.save(rnn.state_dict(), in_gdrive('rnn.pt'))

    state['curr_epoch'] += 1
    with open(in_gdrive('rnn.json'), 'w') as f:
        json.dump(state, f)

    print('done!\n')
    

In [0]:
class RNNChain(nn.Module):

    def __init__(self, *, vocab_size, emb_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim

        self.embedding = nn.Embedding(self.vocab_size, self.emb_dim)
        self.rnn = nn.GRU(self.emb_dim, self.emb_dim, bidirectional=True)

    def forward(self, x, hidden):
        emb = self.embedding(x)
        output = emb.view(1, 1, -1)
        output, hidden = self.rnn(output, hidden)

        return output, hidden

    def init_hidden(self):
        return torch.zeros(2, 1, self.emb_dim, device=device)

class RegressorLayer(nn.Module):

    def __init__(self, *, emb_dim):
        super().__init__()
        self.emb_dim = emb_dim

        self.hidden = nn.Linear(self.emb_dim, 50)
        self.out = nn.Linear(50, 1)

    def forward(self, x):
        return self.out(F.relu(self.hidden(x)))

In [0]:
en_model = RNNChain(vocab_size=len(EN), emb_dim=200)
zh_model = RNNChain(vocab_size=len(ZH), emb_dim=200)
regressor = RegressorLayer(emb_dim=200)

en_model.to(device)
zh_model.to(device)
regressor.to(device)

print(en_model)
print(zh_model)

LR = 0.003

en_opt = torch.optim.Adam(en_model.parameters(), lr=LR)
zh_opt = torch.optim.Adam(zh_model.parameters(), lr=LR)
regressor_opt = torch.optim.Adam(regressor.parameters(), lr=LR)

def RMSELoss(pred, target):
    return torch.sqrt(torch.mean((pred - target) ** 2))

loss_fn = RMSELoss

In [0]:
def train(en_tensor, zh_tensor, score):
    en_model.zero_grad()
    zh_model.zero_grad()

    en_hidden = en_model.init_hidden()

    for word_idx in en_tensor:
        hids, en_hidden = en_model(word_idx, en_hidden)
        print('Hids', hids.shape)
    
    # print('EN final hidden state', en_hidden)

    zh_hidden = en_hidden
    for word_idx in zh_tensor:
        _, zh_hidden = zh_model(word_idx, zh_hidden)
    
    # print('ZH final hidden state', zh_hidden)

    pred_score = regressor(zh_hidden).squeeze()

    loss = loss_fn(pred_score, score)
    
    # print('Loss', loss)    

    loss.backward()

    regressor_opt.step()
    zh_opt.step()
    en_opt.step()

    return loss.data

In [0]:

for eidx in range(100):
    loss = 0
    for (en_tensor, zh_tensor), score in train_set[:100]:
        loss += train(en_tensor.to(device), zh_tensor.to(device), score)
    loss /= 100
    print(loss)