# Assignment 1
You should submit the **UniversityNumber.ipynb** file and your final prediction file **UniversityNumber.test.out** to moodle. Make sure your code does not use your local files and that the results are reproducible. Before submitting, please **run your notebook and keep all running logs** so that we can check.

## 1 $n$-gram Language Model

### 1.1 Building vocabulary

#### Code

In [4]:
import nltk

# --- Vocabulary settings ------------------------------------------------------
# Words seen fewer than this many times in the training data are folded into
# UNKNOWN_TOKEN by add_unk().
MINIMAL_FREQUENCY = 3
# Sentence boundary markers that load_file() puts around every line.
START_TOKEN, END_TOKEN = '<s>', '</s>'
UNKNOWN_TOKEN = '<UNK>'

# --- Corpus locations (relative paths) ----------------------------------------
FILE_PATH = 'data/lm'
TRAIN_FILENAME = f'{FILE_PATH}/train.txt'
DEV_FILENAME = f'{FILE_PATH}/dev.txt'
TEST_FILENAME = f'{FILE_PATH}/test.txt'

# token -> count; filled by cal_frequency() and replaced by add_unk().
word_frequency = dict()

def load_file(file_name):
    """Read a space-tokenised corpus, one sentence per line.

    Every line is stripped, lower-cased and split on single spaces, then
    wrapped in START_TOKEN / END_TOKEN.

    Note: because the split is on a single ' ', consecutive spaces or an
    empty line produce empty-string tokens.

    Args:
        file_name: path of a UTF-8 text file.

    Returns:
        A list of sentences, each one a list of tokens.
    """
    with open(file_name, encoding='utf-8') as corpus:
        return [
            [START_TOKEN, *raw_line.strip().lower().split(' '), END_TOKEN]
            for raw_line in corpus
        ]

def cal_frequency(texts):
    """Add the token counts of `texts` to the global `word_frequency` table.

    Counts accumulate: calling this again adds to the existing counts.

    Args:
        texts: iterable of token lists.
    """
    global word_frequency
    for sentence in texts:
        for token in sentence:
            word_frequency[token] = word_frequency.get(token, 0) + 1

def add_unk():
    """Fold rare words of the global `word_frequency` into UNKNOWN_TOKEN.

    Builds a new table in which every word counted fewer than
    MINIMAL_FREQUENCY times is dropped and its count is added to the
    UNKNOWN_TOKEN entry, then rebinds `word_frequency` to that table.
    """
    global word_frequency
    kept = {UNKNOWN_TOKEN: 0}
    for token, count in word_frequency.items():
        if count >= MINIMAL_FREQUENCY:
            kept[token] = count
        else:
            kept[UNKNOWN_TOKEN] += count
    word_frequency = kept

def tackle_unk(texts):
    """Replace, in place, every token missing from `word_frequency`.

    Args:
        texts: list of token lists; each out-of-vocabulary token is
            overwritten with UNKNOWN_TOKEN.
    """
    for sentence in texts:
        for position, token in enumerate(sentence):
            if token not in word_frequency:
                sentence[position] = UNKNOWN_TOKEN

# Load every split up front; each sentence comes back wrapped in <s> ... </s>.
train_texts, dev_texts, test_texts = (
    load_file(path) for path in (TRAIN_FILENAME, DEV_FILENAME, TEST_FILENAME)
)

# The vocabulary is counted on the training split only.
cal_frequency(train_texts)
print(f'have {len(word_frequency)} tokens.')

add_unk()
print(f'after add unknown token, have {len(word_frequency)} tokens.')

# Map out-of-vocabulary tokens to <UNK>, in place, in all three splits.
for split_texts in (train_texts, dev_texts, test_texts):
    tackle_unk(split_texts)


have 46310 tokens.
after add unknown token, have 20663 tokens.


#### Discussion

There are 46310 distinct tokens in the training dataset. After replacing the tokens that occur fewer than three times with `<UNK>`, 20663 distinct tokens remain.
Let $V$ be the size of the vocabulary. An $n$-gram model then has $V^{n}$ parameters, most of which are zero: the number of non-zero parameters is at most linear in the total length of the training dataset, because every position in the training text contributes at most one new $n$-gram.


### 1.2 $n$-gram Language Modeling

#### Code

In [5]:
## model
def unigram(train_texts):
    """Collect unigram counts.

    Args:
        train_texts: iterable of token lists.

    Returns:
        A dict with
          'c_up'   -- token -> number of occurrences,
          'c_down' -- an empty dict (kept so all models share one layout),
          'N'      -- total number of tokens, boundary markers included.
    """
    token_counts = {}
    total_tokens = 0
    for sentence in train_texts:
        total_tokens += len(sentence)
        for token in sentence:
            token_counts[token] = token_counts.get(token, 0) + 1
    return {'c_up': token_counts, 'c_down': {}, 'N': total_tokens}

def bigram(train_texts):
    """Collect bigram counts.

    Args:
        train_texts: iterable of token lists.

    Returns:
        A dict with
          'c_up'   -- (previous token, token) -> count,
          'c_down' -- token -> count (every token of every sentence).
    """
    history_counts = {}
    pair_counts = {}
    for sentence in train_texts:
        for token in sentence:
            history_counts[token] = history_counts.get(token, 0) + 1
        # Adjacent pairs: (sentence[i - 1], sentence[i]) for i >= 1.
        for pair in zip(sentence, sentence[1:]):
            pair_counts[pair] = pair_counts.get(pair, 0) + 1
    return {'c_up': pair_counts, 'c_down': history_counts}

def trigram(train_texts):
    """Collect trigram counts.

    Each sentence is treated as if one extra START_TOKEN preceded it, so the
    first real prediction also has a two-token history.

    Args:
        train_texts: iterable of token lists.

    Returns:
        A dict with
          'c_up'   -- (t[i-2], t[i-1], t[i]) -> count,
          'c_down' -- (t[i-1], t[i]) -> count,
        both computed over the START_TOKEN-prefixed sentence.
    """
    context_counts = {}
    triple_counts = {}
    for sentence in train_texts:
        padded = [START_TOKEN, *sentence]
        for context in zip(padded, padded[1:]):
            context_counts[context] = context_counts.get(context, 0) + 1
        for triple in zip(padded, padded[1:], padded[2:]):
            triple_counts[triple] = triple_counts.get(triple, 0) + 1
    return {'c_up': triple_counts, 'c_down': context_counts}

### model
# Count tables for the three model orders, all built from the training split.
unigram_model = unigram(train_texts)
bigram_model = bigram(train_texts)
trigram_model = trigram(train_texts)


In [6]:
### calculate ppl
import math

def _ngram_keys(text, i, order):
    """Return (n-gram key, history key) for predicting text[i] (order 2 or 3)."""
    if order == 2:
        return (text[i - 1], text[i]), text[i - 1]
    # order == 3: position 1 has a single real predecessor, so the history is
    # completed with an extra START_TOKEN, as in trigram().
    first = START_TOKEN if i == 1 else text[i - 2]
    return (first, text[i - 1], text[i]), (first, text[i - 1])


def _sentence_log2_prob(model, text, order, penalty):
    """Sum log2 MLE probabilities over one sentence, or `penalty` if any is zero."""
    total = 0
    for i in range(1, len(text)):
        ngram, history = _ngram_keys(text, i, order)
        up = model['c_up'].get(ngram, 0)
        down = model['c_down'].get(history, 0)
        if up == 0 or down == 0:
            # log2(0) is undefined: the whole sentence gets a fixed score.
            return penalty
        total += math.log2(up / down)
    return total


def cal_perplexity(model, dim, texts, bigram_penalty=-300, trigram_penalty=-200):
    """Perplexity of an unsmoothed n-gram model on `texts`.

    Args:
        model: count table from unigram(), bigram() or trigram().
        dim: n-gram order of `model`; must be 1, 2 or 3.
        texts: list of token lists (with boundary markers).
        bigram_penalty: log2 score given to a whole sentence that contains a
            bigram with zero probability (dim == 2).
        trigram_penalty: same, for dim == 3.

    Returns:
        2 ** (-total_log2 / M), where M is the total number of tokens in
        `texts`, boundary markers included.

    Raises:
        ValueError: if `dim` is not 1, 2 or 3 (previously this silently
            returned 1.0).
        ZeroDivisionError: if `texts` contains no tokens.

    NOTE(review): the unigram branch scores only the tokens between the first
    and last element of each sentence and skips tokens without a count, while
    M still counts every token; the penalties are arbitrary stand-ins for
    log2(0). Values are therefore not strictly comparable across orders.
    """
    if dim not in (1, 2, 3):
        raise ValueError(f'dim must be 1, 2 or 3, got {dim!r}')

    ret = 0.0
    if dim == 1:
        N = model['N']
        counts = model['c_up']
        for text in texts:
            for token in text[1:-1]:
                if token in counts:
                    ret += math.log2(counts[token] / N)
    else:
        penalty = bigram_penalty if dim == 2 else trigram_penalty
        for text in texts:
            ret += _sentence_log2_prob(model, text, dim, penalty)

    M = sum(len(text) for text in texts)
    return pow(2, -ret / M)


print('       unigram    bigram    trigram')

# One value per column of the header above; the order is the position of the
# model in the tuple (1 = unigram, 2 = bigram, 3 = trigram).
ppl1_train, ppl2_train, ppl3_train = (
    cal_perplexity(ngram_model, order, train_texts)
    for order, ngram_model in enumerate((unigram_model, bigram_model, trigram_model), start=1)
)

print('train:', ppl1_train, ppl2_train, ppl3_train)

ppl1_dev, ppl2_dev, ppl3_dev = (
    cal_perplexity(ngram_model, order, dev_texts)
    for order, ngram_model in enumerate((unigram_model, bigram_model, trigram_model), start=1)
)

print('dev:', ppl1_dev, ppl2_dev, ppl3_dev)

# Test-set evaluation is left disabled here:
# ppl1_test = cal_perplexity(unigram_model, 1, test_texts)
# ppl2_test = cal_perplexity(bigram_model, 2, test_texts)
# ppl3_test = cal_perplexity(trigram_model, 3, test_texts)

# print('test:', ppl1_test, ppl2_test, ppl3_test)

       unigram    bigram    trigram
train: 549.0812082590452 61.074203599247376 8.38145802944635
dev: 524.4927581303102 1441.3089611913326 158.77343109253673


#### Discussion

The perplexity of unigram, bigram and trigram can be seen above.
There are some problems when calculating perplexity. If a word pair doesn't appear in the training dataset, we run into the log-zero problem: $\log 0$ is negative infinity. In the calculation above, I treat the log-probability of a sentence containing such a pair as **-300** for bigram and **-200** for trigram, because this is less than the minimum log-probability of the sentences that can be scored.
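Normally, perplexity goes down as n (the order of the n-gram) increases, and the training-set results follow this. On the dev set the bigram result is contrary to it: as n grows, more n-grams have never appeared in the training dataset, so the unsmoothed model cannot score them and the perplexity grows. The dev values for bigram and trigram also depend on the penalties chosen above, so they are not directly comparable.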

### 1.3 Smoothing

#### 1.3.1 Add-one (Laplace) smoothing

##### Code

In [7]:
def cal_perplexity_bigram_addk(model, texts, k=1):
    """Perplexity of the bigram model with add-k smoothing.

    Each bigram probability is (c(prev, cur) + k) / (c(prev) + k * V), where
    V is the number of entries in the global `word_frequency` table.

    Args:
        model: count table from bigram().
        texts: list of token lists.
        k: smoothing constant; 1 gives add-one (Laplace) smoothing.

    Returns:
        2 ** (-total_log2 / M), with M the total number of tokens in `texts`.
    """
    smoothing_mass = len(word_frequency) * k
    log2_total = 0.0
    for sentence in texts:
        sentence_log2 = 0
        for prev, cur in zip(sentence, sentence[1:]):
            numerator = k + model['c_up'].get((prev, cur), 0)
            denominator = smoothing_mass + model['c_down'].get(prev, 0)
            sentence_log2 += math.log2(numerator / denominator)
        log2_total += sentence_log2
    token_count = sum(len(sentence) for sentence in texts)
    return pow(2, -log2_total / token_count)

print('add-one smoothing on bigram')

# k = 1 is add-one smoothing (it is also the function's default value).
ppl2_addone_train, ppl2_addone_dev = (
    cal_perplexity_bigram_addk(bigram_model, split_texts, 1)
    for split_texts in (train_texts, dev_texts)
)
# ppl2_addone_test = cal_perplexity_bigram_addk(bigram_model, test_texts)
print('   train          dev  ')
print(ppl2_addone_train, ppl2_addone_dev)

add-one smoothing on bigram
   train          dev  
617.8600408466235 703.3725830223763


##### Discussion

The perplexity of bigram on train and dev dataset is shown above. Compared to the result in 1.2, the ppl becomes more reliable because, after add-one smoothing, it is similar on the train and dev datasets. The ppl on the dev dataset is lower because after smoothing there is no $\log 0$. <br/>
It may seem strange that the ppl on the train dataset is larger, since intuitively I expected smoothing to make the model better (lower ppl). My explanation is that every factor of a sentence's probability becomes $\frac{c(x_1, x_2) + 1}{c(x_1) + |V|}$: the numerator of each factor only grows by 1 while its denominator grows by $|V|$, so the probability of the bigrams that were actually seen in training gets lower.

##### Optional: Add-k smoothing

###### Code

In [8]:
# Repeat the bigram evaluation with larger smoothing constants (k = 2, 3, 4).
for w in (2, 3, 4):
    ppl2_addk_train, ppl2_addk_dev = (
        cal_perplexity_bigram_addk(bigram_model, split_texts, k=w)
        for split_texts in (train_texts, dev_texts)
    )
    # ppl2_addk_test = cal_perplexity_bigram_addk(bigram_model, test_texts, k = w)

    print('k =', w, ': ', ppl2_addk_train, ppl2_addk_dev)

k = 2 :  896.1034059620703 966.4312619206362
k = 3 :  1114.562826222555 1173.4343263736675
k = 4 :  1300.0676609064171 1349.767958113951


###### Discussion

The ppl gets larger as k increases. Again, my explanation is that the probability is now $\frac{c(x_1, x_2) + k}{c(x_1) + k|V|}$. As k gets larger, the probability of the bigrams seen in training gets smaller, so the ppl gets larger.

#### 1.3.2 Linear Interpolation

##### Code

In [9]:
def cal_perplexity_linear(unigram, bigram, trigram, w, texts):
    """Perplexity of a linear interpolation of the three n-gram models.

    The unigram term is the raw relative frequency; the bigram and trigram
    terms are add-one smoothed, with V the number of entries in the global
    `word_frequency` table.

    Args:
        unigram, bigram, trigram: count tables from the builder functions of
            the same names.
        w: three weights, applied to the unigram, bigram and trigram
            probabilities in that order.
        texts: list of token lists.

    Returns:
        2 ** (-total_log2 / M), with M the total number of tokens in `texts`.

    Raises:
        KeyError: if a scored token has no unigram count.
    """
    token_total = unigram['N']
    vocab_size = len(word_frequency)
    log2_total = 0.0
    for sentence in texts:
        sentence_log2 = 0
        for pos in range(1, len(sentence)):
            prev, cur = sentence[pos - 1], sentence[pos]
            # The first prediction has one real predecessor only; its history
            # is completed with an extra START_TOKEN.
            prev2 = START_TOKEN if pos == 1 else sentence[pos - 2]

            p1 = unigram['c_up'][cur] / token_total
            p2 = (1 + bigram['c_up'].get((prev, cur), 0)) / (
                vocab_size + bigram['c_down'].get(prev, 0))
            p3 = (1 + trigram['c_up'].get((prev2, prev, cur), 0)) / (
                vocab_size + trigram['c_down'].get((prev2, prev), 0))

            sentence_log2 += math.log2(w[0] * p1 + w[1] * p2 + w[2] * p3)
        log2_total += sentence_log2
    token_count = sum(len(sentence) for sentence in texts)
    return pow(2, -log2_total / token_count)

mn_ppl = float('inf')
best_w = []

# Try every weight triple [i/10, j/10, k/10] with i + j + k == 10 and pick the
# one with the lowest dev perplexity. The boundary of the simplex (a weight of
# 0 or 1) is included: the previous loop stopped at i, j <= 9 and i + j < 10,
# so it could never try a zero trigram weight.
for i in range(0, 11):
    for j in range(0, 11 - i):
        k = 10 - i - j
        w = [i/10, j/10, k/10]
        ppl = cal_perplexity_linear(unigram_model, bigram_model, trigram_model, w, dev_texts)
        if ppl < mn_ppl:
            mn_ppl = ppl
            best_w = w
print(f'best parameters: {best_w}')
print(f'best ppl: {mn_ppl}')

best parameters: [0.5, 0.4, 0.1]
best ppl: 426.28508588292203


In [10]:
### use the best parameter on test dataset
# `best_w` is the weight vector selected on the dev set by the grid-search
# cell; it is reused here instead of being typed in again, so the two cells
# cannot drift apart.
print(best_w, ':', cal_perplexity_linear(unigram_model, bigram_model, trigram_model, best_w, test_texts))

[0.5, 0.4, 0.1] : 425.3966712794675


##### Discussion

I tried several weight vectors (unigram, bigram, trigram) on a grid with step 0.1, among which [0.5, 0.4, 0.1] gives the lowest perplexity on the dev dataset. I then use it on the test dataset and get the result above.

##### Optional: Optimization

###### Discussion

First, the problem can be viewed as selecting three parameters [w0, w1, w2] that minimize perplexity. The first idea is to assume the values move in steps of 0.1, so that we can iterate over all possible tuples and get the best parameters.
Another idea is what is called simulated annealing (SA), or just binary search (BS). We can fix two parameters and do SA or BS on the third parameter. We repeat this procedure three times, once for every w.

## 2 Preposition Prediction

In [11]:

def load_file_labels(file_name):
    """Read gold prepositions as one flat, lower-cased list.

    Tokens are split on any run of whitespace, so a blank line (a sentence
    without a preposition slot) contributes nothing. Splitting on a single
    ' ' would add an empty-string label for such a line and shift every
    following label by one.

    Note: a function with the same name is defined again later in the
    notebook; once that cell has run, the name refers to the later version.

    Args:
        file_name: path of a UTF-8 file with the labels of one sentence per line.

    Returns:
        A list of label strings in file order.
    """
    labels = []
    with open(file_name, encoding='utf-8') as label_file:
        for line in label_file:
            labels.extend(line.lower().split())
    return labels

# Candidate prepositions that predict() tries, in this order, for each '<prep>' slot.
PREP = ['at', 'in', 'of', 'for', 'on']

def predict(unigram, bigram, trigram, w, texts):
    """Fill every '<prep>' slot with the candidate of highest interpolated probability.

    For each slot, every preposition in PREP is scored with
    w[0] * p_unigram + w[1] * p_bigram + w[2] * p_trigram, given the two
    preceding tokens. The bigram and trigram terms are add-one smoothed with
    V = len(word_frequency). Ties go to the earlier candidate in PREP.

    `texts` is modified in place: each slot is overwritten with the chosen
    preposition, so later slots of the same sentence see earlier predictions
    as context.

    Args:
        unigram, bigram, trigram: count tables from the builder functions of
            the same names.
        w: three interpolation weights (unigram, bigram, trigram).
        texts: list of token lists. Index 0 is never treated as a slot.

    Returns:
        One list per sentence with the predicted prepositions in slot order
        (empty for a sentence without slots).
    """
    N = unigram['N']
    V = len(word_frequency)
    predictions = []
    for text in texts:
        chosen = []
        for i in range(1, len(text)):
            if text[i] != '<prep>':
                continue
            prev = text[i - 1]
            # The slot at position 1 has one real predecessor only.
            prev2 = START_TOKEN if i == 1 else text[i - 2]
            bigram_down = V + bigram['c_down'].get(prev, 0)
            trigram_down = V + trigram['c_down'].get((prev2, prev), 0)

            best_score = float('-inf')
            select_prep = ''
            for prep in PREP:
                # A candidate that never occurred in training scores 0 here
                # instead of raising KeyError.
                p1 = unigram['c_up'].get(prep, 0) / N
                p2 = (1 + bigram['c_up'].get((prev, prep), 0)) / bigram_down
                p3 = (1 + trigram['c_up'].get((prev2, prev, prep), 0)) / trigram_down
                score = w[0] * p1 + w[1] * p2 + w[2] * p3
                if score > best_score:
                    best_score = score
                    select_prep = prep
            text[i] = select_prep
            chosen.append(select_prep)
        predictions.append(chosen)
    return predictions

data_inputs = load_file('data/prep/dev.in')
data_labels = load_file_labels('data/prep/dev.out')
model_out = predict(unigram_model, bigram_model, trigram_model, [0.0, 0.02, 0.98], data_inputs)

# Flatten the per-sentence predictions so they line up with the flat label list.
ans = [prep for sentence_preps in model_out for prep in sentence_preps]
acc = sum(1 for idx in range(len(ans)) if ans[idx] == data_labels[idx])
print(f'accuracy = {acc / len(ans):.4f}')


### produce test.out
test_inputs = load_file('data/prep/test.in')
test_labels = predict(unigram_model, bigram_model, trigram_model, [0.0, 0.02, 0.98], test_inputs)

# One output line per input sentence, predictions separated by single spaces.
with open('test.out', 'w', encoding='utf-8') as f:
    f.writelines(' '.join(prep) + '\n' for prep in test_labels)


accuracy = 0.7061


The accuracy on the dev dataset is 70.61%. I use the linear interpolation method and manually choose the weights, which are different from the best weights for perplexity. My explanation is that it is somewhat ridiculous to use a unigram model to predict a preposition, since it ignores the context. Thus I only focus on bigram and trigram. I manually tried some weights and got the result.

------------------

Below is my previous work, written before I realized that we should use the model from section 1 to deal with section 2. I tried to use something learnt in class to do it. I think it would be a pity to just delete it, so I keep it.

I use a double LSTM. First we reorder the dataset, duplicating sentences so that every sentence now deals with only one preposition. I ensure the preposition is located in the middle of the sentence by adding '<pad>' at the beginning and at the end. I divide the dev dataset into 90% and 10%, which are used for training and validation. Then the data goes into the double LSTM network. <br/>
The network first uses word embeddings to represent the words. Then, because for every sentence we only need to predict the middle preposition, I use two LSTMs. One goes from the beginning to the middle and the other goes from the end to the middle. I then concatenate the results of the two LSTMs and feed them into a linear layer to predict the preposition.<br/>
By choosing proper hyperparameters, the model can get 66% accuracy on the dataset. 

In [12]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.autograd import Variable
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import nltk

import copy
import random
import time

from urllib3 import encode_multipart_formdata

def message(msg):
    """Print `msg` followed by a space and a rule of 100 dashes."""
    rule = '-' * 100
    print(f'{msg!s} {rule}')

def set_seed(seed=1120):
    """Seed Python's `random`, NumPy and PyTorch, and request deterministic cuDNN.

    Args:
        seed: integer passed to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        torch.cuda.manual_seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)
    # Turn off cuDNN auto-tuning and ask for deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed()

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### const
PREP_LABEL = '<prep>'      # marks the slot an example has to predict
UNKNOWN_LABEL = '<unk>'
PAD_LABEL = '<pad>'
PREP = ['at', 'in', 'of', 'for', 'on']
# Class id of each preposition = its position in PREP.
PREP_MAP = {prep: class_id for class_id, prep in enumerate(PREP)}
CLASSES_NUM = len(PREP)
MAX_LEN = 0                # placeholder; reassigned once the data is padded

### hyperparameter
SHUFFLE_DATASET = True
BATCH_SIZE = 64
WORD_DIM = 512
LEARNING_RATE = 0.01
EPOCH = 10


def load_file_inputs(file_name):
    """Read sentences and expand them into one example per preposition slot.

    A line with k slots yields k copies of the sentence; in each copy exactly
    one slot holds PREP_LABEL and the remaining slots hold UNKNOWN_LABEL.

    Slot positions are recorded before any token is rewritten. Using
    UNKNOWN_LABEL as a temporary marker would turn a literal '<unk>' token in
    the data into an extra example that the returned counts do not include.

    Args:
        file_name: path of a UTF-8 file, one space-separated sentence per line.

    Returns:
        (texts, cnts): `texts` is the list of expanded examples (token lists),
        `cnts` holds the number of slots of every input line, in file order.
    """
    slot_token = PREP_LABEL.lower()
    texts = []
    cnts = []
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().lower().split(' ')
            slots = [i for i, word in enumerate(tokens) if word == slot_token]
            cnts.append(len(slots))
            # Template in which every slot is hidden from the model.
            masked = [UNKNOWN_LABEL if word == slot_token else word for word in tokens]
            for i in slots:
                example = list(masked)
                example[i] = PREP_LABEL
                texts.append(example)
    return texts, cnts

def load_file_labels(file_name):
    """Read gold prepositions and return their class ids as one flat array.

    Tokens are split on any run of whitespace, so a blank line (a sentence
    without a slot) contributes no label instead of raising KeyError('').

    Note: this definition replaces the earlier `load_file_labels` of the
    notebook, which returned label strings.

    Args:
        file_name: path of a UTF-8 file with the labels of one sentence per line.

    Returns:
        numpy array of PREP_MAP ids, in file order.

    Raises:
        KeyError: if a token is not a key of PREP_MAP.
    """
    label_ids = []
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            label_ids.extend(PREP_MAP[prep] for prep in line.lower().split())
    return np.array(label_ids)

# Dev split: expanded examples (one per slot) and one class id per slot.
# Note that `data_inputs` / `data_labels` replace the variables of the same
# names created by the n-gram cell.
data_inputs, _ = load_file_inputs('data/prep/dev.in')
data_labels = load_file_labels('data/prep/dev.out')

# Test split: the per-line slot counts are kept so predictions can be regrouped.
test_inputs, test_cnts = load_file_inputs('data/prep/test.in')

# print(len(data_inputs))
# print(len(data_labels))
# Every expanded example must have exactly one label.
assert(len(data_inputs) == len(data_labels))

message('finish loading')

def add_pad(texts, init_maxlen=0):
    """Pad every example so that its PREP_LABEL sits exactly in the middle.

    Each returned row has length 2 * max_len + 1 with PREP_LABEL at index
    max_len. The part after the slot is reversed, so both halves start with
    their padding and end with the token closest to the slot.

    Args:
        texts: list of token lists, each containing PREP_LABEL.
        init_maxlen: lower bound for the context length.

    Returns:
        (padded, max_len): the new rows (inputs are not modified) and the
        context length used on each side.

    Raises:
        ValueError: if an example does not contain PREP_LABEL.
    """
    slot_positions = [text.index(PREP_LABEL) for text in texts]

    # The longest left or right context decides the common half-length.
    max_len = init_maxlen
    for text, slot in zip(texts, slot_positions):
        max_len = max(max_len, slot, len(text) - slot - 1)

    padded = []
    for text, slot in zip(texts, slot_positions):
        left_pad = [PAD_LABEL] * (max_len - slot)
        right_pad = [PAD_LABEL] * (max_len - (len(text) - slot - 1))
        row = left_pad + text + right_pad
        # Reverse everything to the right of the slot.
        row[max_len + 1:] = row[:max_len:-1]
        padded.append(row)
    return padded, max_len

def init_vocab(texts):
    """Build the word -> id table and convert `texts` to ids in place.

    Ids 0, 1 and 2 are reserved for PAD_LABEL, UNKNOWN_LABEL and PREP_LABEL;
    other words get ids in order of first appearance.

    Args:
        texts: list of token lists. Each inner list is replaced by its list
            of ids; the caller's outer list is the one being updated.

    Returns:
        (word2id, embeddings): the vocabulary dict and a
        (len(word2id), WORD_DIM) array drawn uniformly from [-0.25, 0.25),
        whose PAD_LABEL row is all zeros.
    """
    word2id = {PAD_LABEL: 0, UNKNOWN_LABEL: 1, PREP_LABEL: 2}
    for text in texts:
        for word in text:
            if word not in word2id:
                word2id[word] = len(word2id)

    embeddings = np.random.uniform(-0.25, 0.25, (len(word2id), WORD_DIM))
    embeddings[word2id[PAD_LABEL]] = np.zeros((WORD_DIM, ))  ## <pad> is zero

    # Rewrite through the index so the caller's list is updated. The earlier
    # `texts = np.array(texts)` only rebound a local name and was dropped.
    for i, text in enumerate(texts):
        texts[i] = [word2id[word] for word in text]
    return word2id, embeddings

### load test data
# First pass over the test examples only measures their context length.
_, test_maxlen = add_pad(test_inputs)
message(f'test_maxlen: {test_maxlen}')

### load train data
data_inputs, MAX_LEN = add_pad(data_inputs, test_maxlen)
# Pad the test examples to the final MAX_LEN as well. Padding them to
# test_maxlen only would leave the rows too short for the model's fixed
# [MAX_LEN, 1, MAX_LEN] split whenever the dev data has a longer context.
test_inputs, _ = add_pad(test_inputs, MAX_LEN)
word2id, embeddings = init_vocab(data_inputs)
message(f'max_len: {MAX_LEN}')

# Convert the test examples to ids; words unseen in the dev data map to <unk>.
unk_id = word2id[UNKNOWN_LABEL]
for text in test_inputs:
    text[:] = [word2id.get(word, unk_id) for word in text]


### split input into train and dev 
def data_loader(inputs_train, inputs_dev, labels_train, labels_dev, batch_size=BATCH_SIZE):
    """Wrap the train / dev arrays in DataLoaders.

    Args:
        inputs_train, inputs_dev: id sequences accepted by `torch.tensor`.
        labels_train, labels_dev: class ids; converted to `torch.long`.
        batch_size: batch size of both loaders.

    Returns:
        (dataloader_train, dataloader_dev): the training loader samples in
        random order, the dev loader iterates sequentially.
    """
    train_set = TensorDataset(
        torch.tensor(inputs_train),
        torch.tensor(labels_train, dtype=torch.long),
    )
    dev_set = TensorDataset(
        torch.tensor(inputs_dev),
        torch.tensor(labels_dev, dtype=torch.long),
    )
    train_loader = DataLoader(train_set, sampler=RandomSampler(train_set), batch_size=batch_size)
    dev_loader = DataLoader(dev_set, sampler=SequentialSampler(dev_set), batch_size=batch_size)
    return train_loader, dev_loader


# Hold out 10% of the labelled examples for validation. No random_state is
# passed to train_test_split.
inputs_train, inputs_dev, labels_train, labels_dev = train_test_split(
    data_inputs,
    data_labels,
    test_size=0.1,
    shuffle=SHUFFLE_DATASET,
)

dataloader_train, dataloader_dev = data_loader(
    inputs_train, inputs_dev, labels_train, labels_dev, BATCH_SIZE
)

message('finish data')




### model #######################################################################
# Hidden size of each of the two LSTMs; the classifier input is twice this.
HIDDEN_LAYER_SIZE = 256
# NOTE(review): this module-level name is not read in the visible part of the
# notebook (the model below is built from len(word2id) directly).
vocab_size = len(word2id)

class LSTM(nn.Module):
    """Single-layer, batch-first LSTM that returns only its final hidden state."""

    def __init__(self, word_dim=WORD_DIM):
        """Create the LSTM; `word_dim` is the size of each input vector."""
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=word_dim,
            hidden_size=HIDDEN_LAYER_SIZE,
            num_layers=1,
            batch_first=True,
        )

    def forward(self, input):
        """Run the sequence and return the last layer's final hidden state."""
        _outputs, (final_hidden, _final_cell) = self.lstm(input)
        return final_hidden[-1]

class Double_LSTM(nn.Module):
    """Preposition classifier built from two LSTMs, one per side of the slot.

    The input row is split into MAX_LEN tokens, the slot token, and MAX_LEN
    tokens. Each side is summarised by its own LSTM; the two summaries are
    concatenated, passed through dropout and a linear layer.
    """

    def __init__(self,
                 vocab_size,
                 word_dim=WORD_DIM,
                 pretrained_embedding=None,
                 classes_num=CLASSES_NUM,
                 dropout=0.5):
        """Build the layers.

        Args:
            vocab_size: number of rows of the embedding table.
            word_dim: embedding size.
            pretrained_embedding: accepted but not used by this class.
            classes_num: number of output classes.
            dropout: dropout probability applied before the linear layer.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=word_dim,
            max_norm=5.0,
            padding_idx=0,
        )
        self.lstm1 = LSTM(word_dim)
        self.lstm2 = LSTM(word_dim)
        self.fc = nn.Linear(2 * HIDDEN_LAYER_SIZE, classes_num)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input):
        """Return the class scores for a batch of id rows."""
        embedded = self.embedding(input)
        # The middle position (the slot itself) is not fed to either LSTM.
        left_ctx, _slot, right_ctx = torch.split(embedded, [MAX_LEN, 1, MAX_LEN], dim=1)
        features = torch.cat([self.lstm1(left_ctx), self.lstm2(right_ctx)], dim=1)
        return self.fc(self.dropout(features))


### train ############################################################################
message('start train')

# Loss used by train(); it is applied to the model output and the class ids.
loss_f = nn.CrossEntropyLoss()



def train(model, optimizer, dataloader_train, dataloader_dev, epochs=EPOCH):
    """Train `model`, validating after every epoch.

    After each epoch the dev accuracy is computed with evaluate(); whenever
    it beats the best value so far, the model's state_dict is saved to
    'double_lstm.pt'. One progress line is printed per epoch.

    Args:
        model: module to train.
        optimizer: optimizer built on the model's parameters.
        dataloader_train: loader yielding (inputs, labels) training batches.
        dataloader_dev: loader passed to evaluate().
        epochs: number of passes over the training loader.
    """
    best_accuracy = 0
    for epoch_i in range(epochs):
        start_time = time.time()
        model.train()

        total_error = 0
        for batch in dataloader_train:
            inputs, labels = (tensor.to(device) for tensor in batch)
            model.zero_grad()
            error = loss_f(model(inputs), labels)
            total_error += error.item()
            error.backward()
            optimizer.step()

        dev_accuracy = evaluate(model, dataloader_dev)
        avg_error = total_error / len(dataloader_train)

        # Keep only the checkpoint with the best dev accuracy so far.
        if dev_accuracy > best_accuracy:
            best_accuracy = dev_accuracy
            torch.save(model.state_dict(), 'double_lstm.pt')

        time_elapsed = time.time() - start_time
        print(f'time{time_elapsed:.2f}, epoch_{epoch_i}: dev_accuracy {dev_accuracy:.2f}, loss {avg_error}')

def evaluate(model, dataloader_dev):
    """Return the accuracy of `model` on `dataloader_dev`, in percent.

    Correct predictions and examples are counted over all batches. Averaging
    per-batch accuracies instead would give a smaller final batch the same
    weight as a full one.

    Args:
        model: module returning one score per class; switched to eval mode.
        dataloader_dev: loader yielding (inputs, labels) batches.

    Returns:
        100 * correct / total as a float, or NaN if the loader is empty.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader_dev:
            inputs, labels = (tensor.to(device) for tensor in batch)
            preds = torch.argmax(model(inputs), dim=1).flatten()
            correct += (preds == labels).sum().item()
            total += labels.numel()
    if total == 0:
        return float('nan')
    return 100.0 * correct / total


# Build the model with one embedding row per vocabulary entry and move it to
# the selected device.
double_lstm = Double_LSTM(len(word2id))
double_lstm.to(device)

# optimizer = optim.Adadelta(double_lstm.parameters(),  ## get all 2 if use Adadelta
#                                 lr=LEARNING_RATE,
#                                 rho=0.95)
optimizer = optim.Adam(double_lstm.parameters(), 
                        lr=LEARNING_RATE)

# Runs the default number of epochs (EPOCH); the best checkpoint is written by train().
train(double_lstm, optimizer, dataloader_train, dataloader_dev)
# exit()



# ### model on test data ##########################################################
# double_lstm.load_state_dict(torch.load('double_lstm.pt'))
# test_inputs = torch.tensor(test_inputs)
# # for i in range(len(test_inputs)):
# #     test_inputs[i].to(device)
# # test_inputs.to(device)

# test_labels = double_lstm(test_inputs)
# test_labels = torch.argmax(test_labels, dim=1).flatten()

# preps = []
# p = 0
# for cnt in test_cnts:
#     tmp = []
#     s = p
#     while p < s + cnt:
#         tmp.append(PREP[test_labels[p].item()])
#         p += 1
#     preps.append(tmp)

# with open('dblstm_test.out', 'w', encoding='utf-8') as f:
#     for prep in preps:
#         f.write(' '.join(prep))
#         f.write('\n')




finish loading ----------------------------------------------------------------------------------------------------
test_maxlen: 86 ----------------------------------------------------------------------------------------------------
max_len: 86 ----------------------------------------------------------------------------------------------------
finish data ----------------------------------------------------------------------------------------------------
start train ----------------------------------------------------------------------------------------------------
time1.09, epoch_0: dev_accuracy 64.06, loss 1.231169880964817
time0.74, epoch_1: dev_accuracy 62.75, loss 0.41890043860826737
time1.04, epoch_2: dev_accuracy 63.50, loss 0.09614764583798555
time0.86, epoch_3: dev_accuracy 64.62, loss 0.05361938773869322
time0.93, epoch_4: dev_accuracy 67.81, loss 0.030104539691446684
time0.68, epoch_5: dev_accuracy 64.06, loss 0.01125405211515056
time0.65, epoch_6: dev_accuracy 65.31, loss 0