<a href="https://colab.research.google.com/github/ansonmiu0214/C490CW/blob/master/Sentence_Level_QE_2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CO490 Coursework: Quality Estimation

__Team__
* Anson Miu (kcm116)
* Cheryl Chen (czc16)
* Clara Gila (acg116)

## Setup

In [0]:
# Imports
import os
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import sklearn
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
# Setup CUDA
# Select the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `torch_device` is the CUDA backend module on GPU runtimes and None on CPU
# runtimes, where there is no cache to empty and no memory summary to print.
torch_device = torch.cuda if device.type == 'cuda' else None

if torch_device is not None:
    torch_device.empty_cache()
    print(f'DEVICE={torch_device.get_device_name()}')
    print(torch.cuda.memory_summary(device=device))
else:
    print('DEVICE=cpu')

DEVICE=Tesla P100-PCIE-16GB
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|-----------------------------------

## Utilities

### Loading Data

In [11]:
# Download and unpack the EN-ZH archive only when the zip is not already present,
# so re-running this cell does not fetch it again.
if not os.path.exists('enzh_data.zip'):
    !wget -O enzh_data.zip https://competitions.codalab.org/my/datasets/download/03e23bd7-8084-4542-997b-6a1ca6dd8a5f
    !unzip enzh_data.zip

# File names for each split: `.src` is read as the English side, `.mt` as the
# Chinese side, `.scores` as one float per line. No scores file is declared
# for the test split.
# NOTE(review): presumably these files are produced by the unzip step above - confirm.
TRAIN_EN = 'train.enzh.src'
TRAIN_ZH = 'train.enzh.mt'
TRAIN_SCORES = 'train.enzh.scores'
VAL_EN = 'dev.enzh.src'
VAL_ZH = 'dev.enzh.mt'
VAL_SCORES = 'dev.enzh.scores'
TEST_EN = 'test.enzh.src'
TEST_ZH = 'test.enzh.mt'

--2020-02-25 10:38:06--  https://competitions.codalab.org/my/datasets/download/03e23bd7-8084-4542-997b-6a1ca6dd8a5f
Resolving competitions.codalab.org (competitions.codalab.org)... 129.175.22.230
Connecting to competitions.codalab.org (competitions.codalab.org)|129.175.22.230|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://newcodalab.lri.fr/prod-private/dataset_data_file/None/630ec/en-zh.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=a1907a9deeca8e4ee6c14b3a15238ed6d1ee1aa80cd794e18656aa56b494ee79&X-Amz-Date=20200225T103807Z&X-Amz-Credential=AZIAIOSAODNN7EX123LE%2F20200225%2Fnewcodalab%2Fs3%2Faws4_request [following]
--2020-02-25 10:38:07--  https://newcodalab.lri.fr/prod-private/dataset_data_file/None/630ec/en-zh.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=a1907a9deeca8e4ee6c14b3a15238ed6d1ee1aa80cd794e18656aa56b494ee79&X-Amz-Date=20200225T103

In [0]:
# Read from file

def _read_lines(path):
    """Return the lines of `path` (trailing newlines kept), decoded as UTF-8.

    The encoding is explicit so that the Chinese text is read the same way
    regardless of the platform's default locale.
    """
    with open(path, encoding='utf-8') as f:
        return f.readlines()


def _read_scores(path):
    """Return the scores in `path` as a list of floats, one per line."""
    with open(path, encoding='utf-8') as f:
        return [float(score.strip()) for score in f]


train_en = _read_lines(TRAIN_EN)
train_zh = _read_lines(TRAIN_ZH)
train_scores = _read_scores(TRAIN_SCORES)
val_en = _read_lines(VAL_EN)
val_zh = _read_lines(VAL_ZH)
val_scores = _read_scores(VAL_SCORES)
test_en = _read_lines(TEST_EN)
test_zh = _read_lines(TEST_ZH)

### Metrics

In [0]:
from scipy.stats import pearsonr

def RMSELoss(pred, target, *, is_numpy=False):
    """Root-mean-squared error between `pred` and `target`.

    Parameters:
        pred, target: operands supporting `-` and `**`.
        is_numpy: when true, reduce with NumPy's `mean`/`sqrt`;
            otherwise use the torch equivalents.

    Returns:
        sqrt(mean((pred - target) ** 2)) computed by the selected backend.
    """
    backend = np if is_numpy else torch
    squared_error = (pred - target) ** 2
    return backend.sqrt(backend.mean(squared_error))

def pearson():
    """Unimplemented placeholder: takes no arguments and returns None.

    NOTE(review): presumably intended to become a Pearson-correlation metric
    (e.g. wrapping `scipy.stats.pearsonr`, imported above) - TODO implement.
    """
    ...

### Model Training

In [0]:
def suppress_log(*args, **kwargs):
    """No-op logger: accepts any arguments and discards them."""
    return None

def debug_log(*args, **kwargs):
    """Print `args` (forwarding `kwargs` to `print`) after a '[Debug]:' prefix."""
    prefix = '[Debug]:'
    # The prefix is emitted without a newline so the message follows on the same line.
    print(prefix, end='')
    print(*args, **kwargs)


class SentencePairTestDataset(Dataset):
    """Unlabelled dataset of aligned (en, zh) items."""

    def __init__(self, en, zh):
        self.en, self.zh = en, zh

    def __len__(self):
        # The dataset size is taken from the `en` side.
        return len(self.en)

    def __getitem__(self, index):
        en_item = self.en[index]
        zh_item = self.zh[index]
        return en_item, zh_item

class SentencePairTrainDataset(Dataset):
    """Labelled dataset yielding ((en, zh), score) items."""

    def __init__(self, en, zh, scores):
        self.en, self.zh = en, zh
        self.scores = scores

    def __len__(self):
        # The dataset size is taken from the `en` side.
        return len(self.en)

    def __getitem__(self, index):
        pair = (self.en[index], self.zh[index])
        return pair, self.scores[index]

def build_dataset(en_inputs, zh_inputs, scores=None, idxs=None):
    """Wrap index-encoded sentence pairs in a torch `Dataset`.

    Parameters:
        en_inputs, zh_inputs: sequences of index sequences, one per sentence.
            When `idxs` is given they are subscripted with it directly, so
            they must support that kind of indexing (e.g. NumPy arrays).
        scores: optional per-sentence scores. When omitted, an unlabelled
            `SentencePairTestDataset` is returned.
        idxs: optional indices selecting a subset of the inputs (and of
            `scores`, when provided).

    Returns:
        `SentencePairTrainDataset` if `scores` is given, otherwise
        `SentencePairTestDataset`.
    """
    if idxs is not None:
        en_inputs = en_inputs[idxs]
        zh_inputs = zh_inputs[idxs]
        # Unlabelled data has no scores to subset; indexing None would raise.
        if scores is not None:
            scores = scores[idxs]

    en_tensors = [torch.LongTensor(data) for data in en_inputs]
    zh_tensors = [torch.LongTensor(data) for data in zh_inputs]

    if scores is None:
        return SentencePairTestDataset(en_tensors, zh_tensors)
    return SentencePairTrainDataset(en_tensors, zh_tensors, scores)

In [0]:
def torch_to_kfold(cls=None, *, opt, lr, num_epochs, batch_size, loss_fn, **metrics):
    """Class decorator that attaches `fit` and `predict` methods to a module class.

    Usable either as `@torch_to_kfold(opt=..., ...)` or by calling
    `torch_to_kfold(SomeClass, opt=..., ...)` directly.

    Parameters:
        cls: the class to decorate, or None when used with arguments.
        opt: optimiser factory called as `opt(parameters, lr=lr)`.
        lr: learning rate forwarded to `opt`.
        num_epochs: number of passes over the training dataset in `fit`.
        batch_size: mini-batch size used by `fit`.
        loss_fn: callable `(pred, target) -> loss`.
        **metrics: extra named callables `(preds, scores) -> value` that
            `predict` evaluates on the full set of predictions.

    Returns:
        The decorated class, or the decorator itself when `cls` is None.
    """
    def wrapper(cls):

        def fit(self, dataset):
            """Train the model in place on `dataset`, which yields `(X, scores)` items."""
            # Enter train mode
            self.train()
            self.to(device)

            # Construct data loader
            loader = DataLoader(dataset=dataset, batch_size=batch_size)

            optimiser = opt(self.parameters(), lr=lr)

            for epoch in range(1, num_epochs + 1):
                header = f'Epoch {epoch}'
                print(header)
                print('=' * len(header))

                for X, scores in tqdm(loader, desc='Mini-Batch'):
                    optimiser.zero_grad()

                    pred = self(*(x.to(device) for x in X)).squeeze()
                    loss = loss_fn(pred, scores.to(device))

                    loss.backward()
                    optimiser.step()

        def predict(self, dataset):
            """Evaluate on `dataset` and return `(loss, metric_values)`.

            `dataset` must yield `(X, score)` items; `metric_values` maps each
            name passed through `**metrics` to its computed value.
            """
            # Enter evaluation mode
            self.eval()
            self.to(device)

            # One item at a time, so every squeezed prediction is a scalar.
            loader = DataLoader(dataset=dataset, batch_size=1)

            preds = []
            scores = []
            with torch.no_grad():
                for X, score in tqdm(loader):
                    pred = self(*(x.to(device) for x in X)).squeeze().cpu()
                    preds.append(pred)
                    scores.append(score)

                preds = torch.stack(preds)
                scores = torch.cat(scores)

                loss = loss_fn(preds, scores)
                # Use a separate name for the results: assigning to `metrics`
                # here would shadow the decorator argument of the same name.
                metric_values = {name: metric_fn(preds, scores)
                                 for name, metric_fn in metrics.items()}
            return loss, metric_values

        cls.fit = fit
        cls.predict = predict

        return cls

    return wrapper if cls is None else wrapper(cls)

### Cross-Validation

In [0]:
from sklearn.model_selection import KFold

def kfold_cross_validate(model, train_en_inputs, train_zh_inputs, train_scores,
                         n_splits=2,
                         random_state=0,
                         **kwargs):
    """Run shuffled k-fold cross-validation and return the per-fold results.

    Parameters:
        model: object exposing `fit(dataset)`, `predict(dataset)`,
            `state_dict()` and `load_state_dict(state)`.
        train_en_inputs, train_zh_inputs: index-encoded sentences, forwarded
            to `build_dataset` together with the fold indices.
        train_scores: per-sentence scores; converted to a `torch.FloatTensor`
            when not one already.
        n_splits: number of folds.
        random_state: seed for the fold shuffling.
        **kwargs: accepted for call-site compatibility; currently unused.

    Returns:
        A list with one entry per fold, each being whatever
        `model.predict(test_set)` returned for that fold.
    """
    import copy  # local import: only this function needs it

    if not isinstance(train_scores, torch.FloatTensor):
        train_scores = torch.FloatTensor(train_scores)

    cv_split = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Snapshot the untrained weights so every fold starts from the same point.
    # Without this, later folds are evaluated on data the model already saw.
    initial_state = copy.deepcopy(model.state_dict())

    fold_results = []
    for train_idxs, test_idxs in cv_split.split(train_en_inputs):
        model.load_state_dict(initial_state)

        train_set = build_dataset(train_en_inputs, train_zh_inputs, train_scores,
                                  idxs=train_idxs)
        test_set = build_dataset(train_en_inputs, train_zh_inputs, train_scores,
                                 idxs=test_idxs)

        # Training
        model.fit(train_set)

        # Evaluation
        fold_results.append(model.predict(test_set))

    return fold_results

## Building Blocks

### Preprocessing

#### English

In [13]:
# Download the medium English spaCy model and register it under the shortcut
# name 'en300' (`--force` overwrites an existing link of that name).

!spacy download en_core_web_md
!spacy link en_core_web_md en300 --force

# Download the NLTK stopword corpus and keep the English list as a set,
# which gives constant-time membership tests during preprocessing.

from nltk import download
from nltk.corpus import stopwords

download('stopwords')
stop_words_en = set(stopwords.words('english'))

Collecting en_core_web_md==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4MB)
[K     |████████████████████████████████| 95.4MB 63.3MB/s 
[?25hBuilding wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.1.0-cp36-none-any.whl size=97126236 sha256=7b6f734d234a8a4f868235bb95b6995173d0435fc9055129f62126b0ccdaf0a9
  Stored in directory: /tmp/pip-ephem-wheel-cache-e6vmp3j0/wheels/c1/2c/5f/fd7f3ec336bf97b0809c86264d2831c5dfb00fc2e239d1bb01
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_md -->
/usr/local/lib/p

In [0]:
# Get tokenizer

import spacy

# 'en300' is the shortcut name created by the `spacy link` command in the
# download cell; `preprocess_en` uses only `nlp_en.tokenizer`.
nlp_en = spacy.load('en300')

def preprocess_en(sentence=None, *, keep_stopwords=False):
    """Lower-case, tokenise (spaCy) and lemmatise an English sentence.

    Only purely alphabetic lemmas are kept; stopwords are dropped unless
    `keep_stopwords=True`. Called without `sentence`, it returns a
    one-argument function with the options already bound.
    """

    def wrapper(sentence):
        lemmas = (token.lemma_ for token in nlp_en.tokenizer(sentence.lower()))
        return [
            lemma for lemma in lemmas
            if lemma.isalpha()
            and (keep_stopwords or lemma not in stop_words_en)
        ]

    if sentence is None:
        return wrapper
    return wrapper(sentence)

#### Chinese

In [15]:
# Download stopwords
FILE_STOP_WORDS_ZH = './chinese_stop_words.txt'

if not os.path.exists(FILE_STOP_WORDS_ZH):
    !wget -c https://github.com/Tony607/Chinese_sentiment_analysis/blob/master/data/chinese_stop_words.txt

with open(FILE_STOP_WORDS_ZH, 'r', encoding='utf-8') as f:
    stop_words_zh = [line.rstrip() for line in f]

--2020-02-25 10:44:18--  https://github.com/Tony607/Chinese_sentiment_analysis/blob/master/data/chinese_stop_words.txt
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘chinese_stop_words.txt’

chinese_stop_words.     [  <=>               ] 419.55K  1.30MB/s    in 0.3s    

2020-02-25 10:44:19 (1.30 MB/s) - ‘chinese_stop_words.txt’ saved [429623]



In [0]:
import jieba

def preprocess_zh(sentence=None, *, keep_stopwords=False):
    """Tokenise a Chinese sentence with jieba (`cut_all=False`).

    Only alphanumeric tokens are kept; stopwords are dropped unless
    `keep_stopwords=True`. Called without `sentence`, it returns a
    one-argument function with the options already bound.
    """

    def wrapper(sentence):
        return [
            token for token in jieba.cut(sentence, cut_all=False)
            if token.isalnum()
            and (keep_stopwords or token not in stop_words_zh)
        ]

    if sentence is None:
        return wrapper
    return wrapper(sentence)

### Pretrained Word Embeddings

#### English

In [0]:
import torchtext

# Pretrained English GloVe vectors: the '6B' set at 100 dimensions.
# NOTE(review): `glove` is not referenced by the model cells visible in this notebook.
glove = torchtext.vocab.GloVe(name='6B', dim=100)

#### Chinese

In [0]:
# Binary word2vec-format file holding the pretrained Chinese vectors.
# NOTE(review): presumably extracted from zh.zip by the unzip step below - confirm.
ZH_MODEL_BIN = 'model.bin'

# Download and unpack the vectors only when the binary is not already on disk.
if not os.path.exists(ZH_MODEL_BIN):
    !wget -O zh.zip http://vectors.nlpl.eu/repository/20/35.zip
    !unzip zh.zip 

# NOTE(review): `Word2Vec` is imported but not used in the cells visible here.
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Load the vectors from the binary word2vec format.
wv_from_bin = KeyedVectors.load_word2vec_format(ZH_MODEL_BIN, binary=True) 

### Sentence Representation

In [0]:
def pad_sentences(sents, *, pad_token=0, dtype=np.int64):
    """Pad sentences with `pad_token` to the longest sentence in `sents`.

    Parameters:
        sents: sized sequence of index sequences (one per sentence).
        pad_token: value used to right-pad the shorter sentences.
        dtype: dtype of the returned array. Defaults to a 64-bit integer
            because the values are vocabulary indices; pass `float` to get
            a floating-point array instead.

    Returns:
        Array of shape `(len(sents), max_sentence_length)`. An empty `sents`
        yields an array of shape `(0, 0)` instead of raising.
    """
    # Get max sentence length (0 when there are no sentences at all)
    sent_lengths = [len(sent) for sent in sents]
    max_sent_len = max(sent_lengths, default=0)

    # Create a matrix pre-filled with the padding token
    padded_sents = np.full((len(sents), max_sent_len), pad_token, dtype=dtype)

    # Copy each sentence into the left part of its row
    for i, (sent_len, sent) in enumerate(zip(sent_lengths, sents)):
        if sent_len:
            padded_sents[i, :sent_len] = sent
    return padded_sents

### Vocabulary Representation

In [0]:
class Language(object):
    """Bidirectional word <-> index vocabulary with reserved PAD/UNK entries."""

    PAD_TOKEN = '<PAD>'
    UNK_TOKEN = '<UNK>'
    # Reserved indices for the special tokens.
    PAD_IDX = 0
    UNK_IDX = 1

    def __init__(self, name):
        self.name = name
        # Register the special tokens in both directions so that lookups by
        # word and by index agree with each other.
        self.word2idx = {self.PAD_TOKEN: self.PAD_IDX,
                         self.UNK_TOKEN: self.UNK_IDX}
        self.idx2word = {self.PAD_IDX: self.PAD_TOKEN,
                         self.UNK_IDX: self.UNK_TOKEN}

    def __len__(self):
        """Return the vocabulary size, special tokens included."""
        return len(self.idx2word)

    def add_sentence(self, sentence):
        """Add every token of `sentence` (an iterable of words)."""
        for token in sentence:
            self.add_word(token)

    def add_word(self, word):
        """Assign the next free index to `word` if it is not known yet."""
        if word not in self.word2idx:
            idx = len(self)
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def sent_to_idxs(self, sent):
        """Map each word of `sent` to its index, unknown words to `UNK_IDX`."""
        return [self.word2idx.get(word, self.UNK_IDX) for word in sent]

    def __getitem__(self, key):
        """Return the word for an `int` key or the index for a `str` key.

        Raises:
            KeyError: if the key is unknown or is neither `int` nor `str`.
        """
        if isinstance(key, int):
            return self.idx2word[key]
        if isinstance(key, str):
            return self.word2idx[key]
        raise KeyError(key)

    def __repr__(self):
        return f'Language(name={self.name}) with {len(self)} words'

## Models

### 1) Fine-tuning Baseline Regressor
---

__Pipeline__

1. Manual preprocessing
    * EN - tokenisation with [spaCy](https://spacy.io),
    stopword removal
    * ZH - tokenisation with [jieba](https://github.com/fxsjy/jieba),
    stopword removal
2. Pretrained embeddings
    * EN - GloVe
    * ZH - TODO
3. Regression model
    * SVR
    * LinearRegression

__Model selection__

We perform 2-fold cross-validation to select between the candidate regression models listed above (SVR and LinearRegression).

__Evaluation__
...

### 2) Baseline with FFNN Regressor

### 3) Autoencoder with Quality Estimation Vectors

#### Pipeline

In [26]:
##########
# ENGLISH
##########

# Tokenise every split with stopword removal enabled.
preprocess_english = preprocess_en(keep_stopwords=False)
train_en_sents = list(map(preprocess_english, train_en))
val_en_sents = list(map(preprocess_english, val_en))
test_en_sents = list(map(preprocess_english, test_en))

# The vocabulary is built from the training split only.
EN = Language('EN')
for sent in train_en_sents:
    EN.add_sentence(sent)
print(EN)

# Show one preprocessed sentence next to its index encoding.
print('\nSample sentence')
sample_sent_en = train_en_sents[42]
print(sample_sent_en)
print(EN.sent_to_idxs(sample_sent_en))

# Index-encode each split and pad it to that split's longest sentence.
train_en_idxs = pad_sentences(list(map(EN.sent_to_idxs, train_en_sents)))
val_en_idxs = pad_sentences(list(map(EN.sent_to_idxs, val_en_sents)))
test_en_idxs = pad_sentences(list(map(EN.sent_to_idxs, test_en_sents)))

Language(name=EN) with 19141 words

Sample sentence
['artilleryman', 'record', 'wound', 'die']
[292, 293, 294, 295]


In [27]:
##########
# CHINESE
##########

# Tokenise every split with stopword removal enabled.
preprocess_chinese = preprocess_zh(keep_stopwords=False)
train_zh_sents = list(map(preprocess_chinese, train_zh))
val_zh_sents = list(map(preprocess_chinese, val_zh))
test_zh_sents = list(map(preprocess_chinese, test_zh))

# The vocabulary is built from the training split only.
ZH = Language('ZH')
for sent in train_zh_sents:
    ZH.add_sentence(sent)
print(ZH)

# Show one preprocessed sentence next to its index encoding.
print('\nSample sentence')
sample_sent_zh = train_zh_sents[0]
print(sample_sent_zh)
print(ZH.sent_to_idxs(sample_sent_zh))

# Index-encode each split and pad it to that split's longest sentence.
train_zh_idxs = pad_sentences(list(map(ZH.sent_to_idxs, train_zh_sents)))
val_zh_idxs = pad_sentences(list(map(ZH.sent_to_idxs, val_zh_sents)))
test_zh_idxs = pad_sentences(list(map(ZH.sent_to_idxs, test_zh_sents)))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.800 seconds.
Prefix dict has been built successfully.


Language(name=ZH) with 21992 words

Sample sentence
['最后', '的', '征服者', '骑着', '他', '的', '剑', '继续前进']
[2, 3, 4, 5, 6, 3, 7, 8]


#### Model Architecture

In [0]:
@torch_to_kfold(opt=torch.optim.Adam,
                lr=1e-3,
                num_epochs=10,
                batch_size=2,
                loss_fn=RMSELoss)
class AutoencoderQEV(nn.Module):
    """Encoder / attentive-decoder model regressing a score per sentence pair.

    The source sentence is encoded by a bidirectional GRU. The target sentence
    is read step by step by a unidirectional GRU whose hidden state attends
    (dot-product attention) over the source states. The per-step sums of
    context and hidden state ("quality vectors") are summarised by a further
    GRU and mapped to one score by a linear layer.
    """

    def __init__(self, *, en_vocab_size, zh_vocab_size, emb_dim):
        """
        Parameters:
            en_vocab_size: number of rows of the source embedding table.
            zh_vocab_size: number of rows of the target embedding table.
            emb_dim: embedding size; also sets the recurrent hidden sizes.
        """
        super().__init__()
        self.en_vocab_size = en_vocab_size
        self.zh_vocab_size = zh_vocab_size
        self.emb_dim = emb_dim

        # Source sentence: embedding layer + bidirectional encoder, so the
        # encoder outputs have size 2 * emb_dim.
        self.source_embedding = nn.Embedding(self.en_vocab_size, self.emb_dim)
        self.source_rnn = nn.GRU(input_size=self.emb_dim,
                                 hidden_size=self.emb_dim,
                                 bidirectional=True)

        # Target sentence: embedding layer + decoder with attention. The
        # hidden size is 2 * emb_dim so it can be dotted with encoder outputs.
        self.target_embedding = nn.Embedding(self.zh_vocab_size, self.emb_dim)
        self.target_rnn = nn.GRU(input_size=self.emb_dim,
                                 hidden_size=self.emb_dim * 2,
                                 bidirectional=False)

        # RNN for producing the summary unit from the quality vectors.
        self.qualvec_rnn = nn.GRU(input_size=self.emb_dim * 2,
                                  hidden_size=self.emb_dim,
                                  bidirectional=False)

        # Regression output layer.
        self.regressor_output = nn.Linear(in_features=self.emb_dim,
                                          out_features=1)

    def forward(self, en_sent, zh_sent, *, log=suppress_log, get_qualvecs=False):
        """Perform forward pass and returns the prediction scores.

        Parameters:
            en_sent: (batch_size, en_max_sent_len)
            zh_sent: (batch_size, zh_max_sent_len)

        Debug parameters:
            log: custom `print` function, defaults to suppressing messages
            get_qualvecs: if True, returns the quality vectors instead.

        Returns:
            Scores of shape (batch_size, 1), or the quality vectors of shape
            (zh_max_sent_len, batch_size, 2 * emb_dim) when `get_qualvecs`.
        """

        en_emb = self.source_embedding(en_sent)
        log('en_emb:', en_emb.shape)

        # The GRUs are time-major (batch_first=False). Swap the batch and time
        # axes with `transpose`; `view` would only reinterpret the memory
        # layout and mix tokens of different sentences.
        en_emb = en_emb.transpose(0, 1).contiguous()
        log('en_emb:', en_emb.shape)
        en_all_hids, en_last_hid = self.source_rnn(en_emb)

        log('en_all_hids:', en_all_hids.shape)
        log('en_last_hid:', en_last_hid.shape)

        ############################################
        def get_context(prev_state):
            """Dot-product attention of `prev_state` (batch, 2*emb) over the source states."""
            log('prev_state:', prev_state.shape)

            # Alignment scores, one per (source position, batch item). These
            # are computed with tensor ops so they stay on the model's device
            # and inside the autograd graph.
            s_s = (en_all_hids * prev_state.unsqueeze(0)).sum(dim=-1)
            log('s_s', s_s.shape)

            # Normalise over the source positions.
            a_s = F.softmax(s_s, dim=0)
            log('a_s', a_s.shape)

            # Attention-weighted sum of the source states.
            ctx_vecs = (a_s.unsqueeze(-1) * en_all_hids).sum(dim=0)
            log(f'ctx_vecs', ctx_vecs.shape)
            return ctx_vecs

        ############################################

        zh_sent_len = zh_sent.shape[1]
        log('zh_sent_len', zh_sent_len)
        zh_emb = self.target_embedding(zh_sent)

        log('zh_emb:', zh_emb.shape)
        zh_emb = zh_emb.transpose(0, 1).contiguous()
        log('zh_emb:', zh_emb.shape)

        qualvecs = []
        zh_hid = None  # nn.GRU treats a missing initial state as zeros
        for zh in zh_emb:
            log('zh:', zh.shape)
            zh = zh.unsqueeze(0)  # single time step: (1, batch, emb)
            log('zh:', zh.shape)
            _, zh_hid = self.target_rnn(zh, zh_hid)
            log('zh_hid:', zh_hid.shape)

            zh_hid_reshaped = zh_hid.squeeze(0)  # (batch, 2 * emb)
            log('zh_hid_reshaped:', zh_hid_reshaped.shape)

            ctx = get_context(zh_hid_reshaped)
            log('ctx:', ctx.shape)

            #TODO: fix the linear combination
            qualvecs.append(ctx + zh_hid_reshaped)

        qualvecs = torch.stack(qualvecs)
        log('qualvecs:', qualvecs.shape)

        if get_qualvecs:
            return qualvecs

        _, qualvec_hid = self.qualvec_rnn(qualvecs)
        log('qualvec_hid:', qualvec_hid.shape)

        qualvec_hid = qualvec_hid.squeeze(0)  # (batch, emb)
        log('qualvec_hid:', qualvec_hid.shape)

        qualvec_hid_act = torch.tanh(qualvec_hid)

        score = self.regressor_output(qualvec_hid_act)
        log('score', score.shape)

        return score

#### Training

In [0]:
# Size the embedding tables from the EN/ZH vocabularies built in the pipeline cells.
model = AutoencoderQEV(en_vocab_size=len(EN), zh_vocab_size=len(ZH), emb_dim=100)

# Cross-validate on the training split with the function's default fold settings.
kfold_cross_validate(model, train_en_idxs, train_zh_idxs, train_scores)

Mini-Batch:   0%|          | 0/1750 [00:00<?, ?it/s]

Epoch 1


Mini-Batch:  35%|███▌      | 615/1750 [03:23<06:00,  3.15it/s]

### 4) BERT with Sentence Embeddings

### 5) BERT with Sentence-Pair Embeddings