In [None]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
os.chdir('/data2/home/zhaoyi/labs/USTC-labs/deeplearn_lab2')
import torch
import torchtext
import pandas as pd
import numpy as np
from torchtext.legacy import data, datasets
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Text field: lower-cased tokens, batch-major tensors, fixed length of 50 tokens per example.
TEXT = data.Field(lower=True, batch_first=True, fix_length=50)
# Label field: one non-sequential value per example.
LABEL = data.Field(sequential=False)
# Load the three CSV splits; the columns are mapped, in order, to ('Text', 'Label').
train, dev, test = data.TabularDataset.splits(
    path='/data2/home/zhaoyi/labs/USTC-labs/deeplearn_lab2/dataset/procd/',
    train='train.csv',
    validation='dev.csv',
    test='test.csv',
    format='csv',
    fields=[('Text', TEXT), ('Label', LABEL)],
)

Reference: https://zhuanlan.zhihu.com/p/447309785 (an overview of word2vec approaches)

Reference: https://zhuanlan.zhihu.com/p/562237953 (pretrained word vectors, e.g. the different versions of GloVe)

In [None]:
# Build the token vocabulary from the training split and attach the
# pretrained GloVe vectors to it.
# Location of the downloaded GloVe file: workspace/.vector_cache
TEXT.build_vocab(
    train,
    vectors="glove.6B.100d",
    max_size=10000,
    min_freq=10,
)
# Build the label vocabulary from the training split as well.
LABEL.build_vocab(train)

In [None]:
# Show the 20 most frequent tokens counted while building the TEXT vocabulary,
# together with their raw counts.
print(TEXT.vocab.freqs.most_common(20))

In [None]:
# Definition of the data loaders.
# Use the GPU when one is available, otherwise fall back to the CPU.
mydevice = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(mydevice)
# One iterator per split; batches are created directly on `mydevice`.
train_iter, dev_iter, test_iter = data.BucketIterator.splits(
    (train, dev, test),
    batch_size=16,
    device=mydevice,
    shuffle=True,
    sort=False,
)

In [None]:
# Pull a single batch from the training iterator to inspect what the
# train/dev/test iterators yield.
batch = next(iter(train_iter))
print(batch)
# Tensor stored under the 'Text' field of the batch.
print('batch.Text = \n',batch.Text)
# Tensor stored under the 'Label' field of the batch.
print('batch.Label = \n',batch.Label)

In [None]:
# model architecture for sentiment classification: LSTM + MLP
class LSTM(nn.Module):
    """Sentence classifier: embedding -> (stacked / bidirectional) LSTM -> linear.

    The sentence is represented by the final hidden state of the top LSTM
    layer (forward and backward final states concatenated when
    ``bidirectional`` is true) and projected to ``num_classes`` scores.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions when true.
        drop_out: dropout probability, applied between stacked LSTM layers
            and on the sentence vector before the classifier.
        pad_idx: index of the padding token in the embedding table.
        batch_first: True if inputs are (batch_size, seq_len), False if they
            are (seq_len, batch_size).
        num_classes: number of output classes (defaults to the original
            2-class setting).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 num_layers, bidirectional, drop_out, pad_idx, batch_first=False,
                 num_classes=2):
        super().__init__()
        self.bidirectional = bool(bidirectional)
        self.embedding = nn.Embedding(vocab_size, embedding_dim,
                                      padding_idx=pad_idx)

        # nn.LSTM only applies dropout *between* stacked layers; passing a
        # non-zero value with a single layer has no effect and only triggers
        # a warning, so it is disabled in that case.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            batch_first=batch_first,
                            bidirectional=self.bidirectional,
                            dropout=drop_out if num_layers > 1 else 0.0)

        num_direction = 2 if self.bidirectional else 1
        lstm_output_dim = num_direction * hidden_dim

        self.fc = nn.Linear(lstm_output_dim, num_classes)
        self.dropout = nn.Dropout(drop_out)

    def forward(self, x):
        """Return log-probabilities of shape (batch_size, num_classes).

        ``x`` holds token ids laid out as (batch_size, seq_len) when the model
        was built with ``batch_first=True`` and (seq_len, batch_size) otherwise.
        """
        embedded = self.embedding(x)

        # h_n has shape (num_layers * num_directions, batch_size, hidden_dim)
        # regardless of `batch_first`, so reading the sentence vector from it
        # is correct for both input layouts (slicing the output sequence with
        # [:, -1, :] is only valid for batch-first tensors).
        _, (h_n, _) = self.lstm(embedded)

        if self.bidirectional:
            # Last layer: h_n[-2] is the forward direction after the last
            # token, h_n[-1] the backward direction after the first token, so
            # both halves have read the whole sentence.
            sentence = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            # Identical to the output at the last time step.
            sentence = h_n[-1]
        # NOTE(review): if inputs are right-padded to a fixed length, this
        # state is computed after the pad tokens - consider packing the
        # sequences if that hurts accuracy.

        # Regularise the features, not the class scores: dropping logits
        # would randomly zero class scores right before the softmax.
        logits = self.fc(self.dropout(sentence))
        return F.log_softmax(logits, dim=1)

# Index of the padding token in the TEXT vocabulary.
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

# definition of model and optimizer
model = LSTM(
    # len(TEXT.vocab) is the number of vocabulary entries and therefore
    # matches the number of rows of TEXT.vocab.vectors.
    vocab_size=len(TEXT.vocab),
    embedding_dim=100,    # dimension of the "glove.6B.100d" vectors
    hidden_dim=128,
    num_layers=2,
    bidirectional=False,
    drop_out=0.4,
    pad_idx=pad_idx,
    batch_first=True,     # TEXT is declared with batch_first=True
)

# Initialise the embedding table with the pretrained vectors and freeze it.
# copy_ raises on a shape mismatch instead of silently replacing the table.
model.embedding.weight.data.copy_(TEXT.vocab.vectors)
model.embedding.weight.requires_grad = False

# Follow the device chosen for the iterators instead of requiring CUDA.
model = model.to(mydevice)

opt = torch.optim.Adam(model.parameters(),lr=1e-3)

In [None]:
# training function
def train_epoch(model, opt, data_loader, phase='training'):
    '''
    Run one pass over `data_loader` and report the mean loss and accuracy.

    Args:
        model: module mapping a batch of token ids to per-class
            log-probabilities of shape (batch_size, num_classes).
        opt: optimizer, stepped once per batch; only used when
            `phase == 'training'`.
        data_loader: iterable of batches exposing `.Text` and `.Label`; it
            must also expose `.dataset`, whose length is used as the number
            of samples.
        phase: 'training' updates the weights, 'validation' / 'testing' only
            evaluate the model.

    Returns:
        (loss, accuracy): 0-dim float tensors, both averaged over
        `len(data_loader.dataset)`.
    '''
    is_training = phase == 'training'
    if is_training:
        model.train()
    if (phase == 'validation') or (phase == 'testing'):
        model.eval()
    # model.train(): dropout is active and batch-norm uses batch statistics
    # model.eval(): dropout is disabled and batch-norm uses running statistics

    # Feed the batches to whatever device the model lives on, rather than
    # relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    running_loss = 0.0
    running_correct = 0
    for batch in data_loader:
        text, target = batch.Text, batch.Label
        if device is not None:
            text, target = text.to(device), target.to(device)

        # Label ids are 1-based (label '0' -> 1 and label '1' -> 2 in the
        # label vocab), so shift them to 0-based class indices for the loss.
        class_ids = target - 1

        # Only build the autograd graph when the weights are being updated.
        with torch.set_grad_enabled(is_training):
            output = model(text)
            loss = F.nll_loss(output, class_ids)

        if is_training:
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Accumulate the *summed* batch loss so the epoch mean is exact even
        # when the last batch is smaller than the others.
        running_loss += F.nll_loss(output.detach(), class_ids, reduction='sum').item()
        preds = output.detach().argmax(dim=1) + 1
        running_correct += preds.eq(target.view_as(preds)).sum().item()

    # Guard against an empty dataset so the averages stay defined.
    n_samples = max(len(data_loader.dataset), 1)
    loss = torch.tensor(running_loss / n_samples)
    accuracy = torch.tensor(running_correct / n_samples)

    print(f'{phase} loss is {loss:{5}.{2}} and {phase} accuracy is {running_correct}/{len(data_loader.dataset)} {accuracy:{10}.{4}}')
    return loss,accuracy

In [None]:
# collect results (one entry per epoch)
train_losses, train_accuracy = [], []
val_losses, val_accuracy = [], []

# Every iterator must stop after a single pass over its split; the validation
# iterator is consumed every epoch as well, so it gets the same setting.
train_iter.repeat = False
dev_iter.repeat = False
test_iter.repeat = False

epoch_max = 20
for epoch in range(1,epoch_max+1):
    print('---the ',epoch,"'s training starts---")
    # One optimisation pass over the training split, then one evaluation pass
    # over the validation split.
    epoch_loss, epoch_accuracy = train_epoch(model, opt, train_iter, phase='training')
    val_epoch_loss, val_epoch_accuracy = train_epoch(model, opt, dev_iter, phase='validation')
    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)
    print('---the ',epoch,"'s training ends---")

# test model's performance on the held-out test split
train_epoch(model, opt, test_iter, phase='testing')
