<center><font size =5>nlp pj 1</font></center>
<p align='right'>刘卓瀚 21307130254</p>

### dataset
> cite:
> @incollection{SocherEtAl2013:RNTN,
> title = {{Parsing With Compositional Vector Grammars}},
> author = {Richard Socher and Alex Perelygin and Jean Wu and Jason Chuang and Christopher Manning and Andrew Ng and Christopher Potts},
> booktitle = {{EMNLP}},
> year = {2013}
> }

dataset source: https://nlp.stanford.edu/sentiment/index.html

### model
input -> embedding(glove/random) -> encode(transformer_encoder/rnn) -> (attention) -> classifier(maxpooling + mlp) -> softmax -> output

### environment
- python==3.9
- torch==1.13.0+cu116
- torchtext==0.6.0 

### experiment
- glove + transformer_encoder + adam optimizer
- glove + transformer_encoder + attention before mlp + adam optimizer
- grid search for hyperparameters(learning rate, batch size)
- random(set the dim of vectors to 512) + transformer_encoder + sgd optimizer
- random(set the dim of vectors to 512) + transformer_encoder + adam optimizer + scheduler

import packages

In [1]:
import torch
import torchtext
from torch import nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

load the dataset and initial it in two ways:
- map each word to a vector in the pretrained glove vocabulary
- build the local vocabulary and map each word to a random vector(implemented in the encoder class)

In [2]:
config = dict()
config['batch_size'] = 16
config['init'] = 'GloVe'
config['maxtextlen'] = 200
config['vecdim'] = 100

def build_dataset(config):
    TEXT = torchtext.data.Field(lower=True,fix_length=config['maxtextlen'],batch_first=True)
    LABEL = torchtext.data.Field(sequential=False)
    train,valid,test = torchtext.datasets.SST.splits(TEXT,LABEL)
    if config['init'] == 'GloVe':
        TEXT.build_vocab(train,vectors=torchtext.vocab.GloVe(name='6B',dim=config['vecdim']),max_size=20000,min_freq=10)
        LABEL.build_vocab(train)
    elif config['init'] == 'random':
        TEXT.build_vocab(train,max_size=20000,min_freq=10)
        LABEL.build_vocab(train)
    train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits((train,valid,test),batch_size=config['batch_size'])
    return train_iter, valid_iter, test_iter, TEXT, LABEL

train_iter, valid_iter, test_iter, TEXT, LABEL = build_dataset(config)

two kinds of encoder:
- transformer_encoder
  - d_model: the dimension of the vector after embedding, 100 for glove, 512 for random
  - nhead: the number of heads in the multiheadattention models
  - num_encoder_layers: the number of sub-encoder-layers in the encoder
  - dim_feedforward: the dimension of the feedforward network model
  - dropout: the dropout value
- rnn_encoder:
  - d_model: the dimension of the vector after embedding, 100 for glove, 512 for random
  - num_encoder_layers: the number of sub-encoder-layers in the encoder

In [3]:
transformer_config = dict()
transformer_config['nvocab'] = len(TEXT.vocab)
transformer_config['d_model'] = config['vecdim']
transformer_config['nhead'] = 4
transformer_config['num_encoder_layers'] = 2
transformer_config['dim_feedforward'] = 2048
transformer_config['dropout'] = 0.1

rnn_config = dict()
rnn_config['d_model'] = config['vecdim']
rnn_config['num_encoder_layers'] = 2

config['encoder_config'] = transformer_config
config['encoder_name'] = 'transformer'

class encoder(nn.Module):
    def __init__(self,name,config,embedding) -> None:
        super().__init__()
        if(embedding == 'GloVe'):
            self.embedding = nn.Embedding(config['nvocab'],config['d_model'],_weight=TEXT.vocab.vectors)
        elif embedding == 'random':   
            self.embedding = nn.Embedding(config['nvocab'],config['d_model'])
        if(name == 'transformer'):
            encoder_layer = TransformerEncoderLayer(config['d_model'],config['nhead'],config['dim_feedforward'],config['dropout'])
            self.encoder = TransformerEncoder(encoder_layer,config['num_encoder_layers'])
        elif(name == 'RNN'):
            self.encoder = nn.RNN(config['d_model'],config['d_model'],config['num_encoder_layers'])
        
    def forward(self,x):
        x = self.embedding(x)
        return self.encoder(x)

classifier: mlp, only one layer  
num_classes is set to 4 due to the dataset  
feature_size is set the same as d_model above

In [4]:
mlp_config = dict()
mlp_config['feature_size'] = config['encoder_config']['d_model']
mlp_config['num_classes'] = 4

config['classifier_config'] = mlp_config
config['classifier_name'] = 'mlp'

class classifier(nn.Module):
    def __init__(self,config) -> None:
        super().__init__()
        self.linear = nn.Linear(config['feature_size'],config['num_classes'])
        
    def forward(self,x):
        x = self.linear(x)
        return x

the complete model:
- encoder
- maxpooling
- attention(optional)
- classifier
- softmax

In [5]:
config['attention'] = False
config['attention_nhead'] = 4

class MyModel(nn.Module):
    def __init__(self,config) -> None:
        super().__init__()
        self.encoder = encoder(config['encoder_name'],config['encoder_config'],config['init'])
        self.classifier = classifier(config['classifier_config'])
        self.attention = None
        if(config['attention']):
            self.attention = nn.MultiheadAttention(config['encoder_config']['d_model'],config['attention_nhead'])
        
    def forward(self,x):
        x = self.encoder(x)
        # print(x.shape)
        x = nn.MaxPool1d(x.shape[1])(x.permute(0,2,1)).squeeze(2)
        # print(x.shape)
        if(self.attention is not None):
            x = self.attention(x,x,x)[0]
        x = self.classifier(x)
        return torch.nn.functional.softmax(x,dim=1)

two optimizer:
- adam
- sgd

In [6]:
config['optimizer'] = 'adam'
config['learning_rate'] = 1e-4

def get_optimizer(choice,model,lr):
    if(choice == 'adam'):
        return torch.optim.Adam(model.parameters(),lr=lr)
    else:
        return torch.optim.SGD(model.parameters(),lr=lr)

train and test the model  
- for training, the loss is cross entropy loss, the loss and accuracy will be valuated each 10 epoch using the validation set
- for testing, the loss is cross entropy loss, the accuracy will be valuated using the test set
- the log option is to judge whether to print the loss and accuracy during training
- the number of epoch is set to 10

In [7]:
config['nepoch'] = 10
config['log'] = True


def train(config):
    model = MyModel(config)
    model = model.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = get_optimizer(config['optimizer'],model,config['learning_rate'])
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer,10,gamma=0.5)
    for epoch in range(config['nepoch']):
        for i,batch in enumerate(train_iter):
            optimizer.zero_grad()
            text,label = batch.text,batch.label
            # print(text.shape)
            # print(label.shape)
            # print(text.dtype)
            text = text.cuda()
            label = label.cuda()
            model = model.train()
            output = model(text)
            # print(output.shape)
            loss = criterion(output,label)
            # print('epoch:{},batch:{},loss:{}'.format(epoch,i,loss.item()))
            loss.backward()
            optimizer.step()
            if(i%10 == 0):
                _,batch = next(enumerate(valid_iter))
                text,label = batch.text,batch.label
                text = text.cuda()
                label = label.cuda()
                model = model.eval()
                output = model(text)
                loss = criterion(output,label)
                correct = torch.sum(torch.argmax(output,dim=1) == label).item()
                accuracy = correct/len(label)
                if(config['log']):
                    print('epoch:{},batch:{},valid loss:{},accuracy:{}'.format(epoch,i,loss.item(),accuracy))
                # print('valid loss:{}'.format(loss.item()))
        # scheduler.step()
    return model

def test(model):
    correct = 0
    total = test_iter.batch_size * len(test_iter)
    for i,batch in enumerate(test_iter):
        text,label = batch.text,batch.label
        text = text.cuda()
        label = label.cuda()
        model = model.eval()
        output = model(text)
        output = torch.argmax(output,dim=1)
        correct += torch.sum(output == label).item()
    accuracy = correct/total
    print('accuracy:{}'.format(accuracy))
    return accuracy

experiment starts  
the first is glove + transformer_encoder + adam optimizer  
with the accuracy of 62.5%

In [8]:
model = train(config)

epoch:0,batch:0,valid loss:1.3033047914505005,accuracy:0.5
epoch:0,batch:10,valid loss:1.4555109739303589,accuracy:0.125
epoch:0,batch:20,valid loss:1.4048328399658203,accuracy:0.25
epoch:0,batch:30,valid loss:1.3641164302825928,accuracy:0.375
epoch:0,batch:40,valid loss:1.373837947845459,accuracy:0.375
epoch:0,batch:50,valid loss:1.4061107635498047,accuracy:0.3125
epoch:0,batch:60,valid loss:1.3635584115982056,accuracy:0.375
epoch:0,batch:70,valid loss:1.3936657905578613,accuracy:0.375
epoch:0,batch:80,valid loss:1.371219277381897,accuracy:0.375
epoch:0,batch:90,valid loss:1.384311318397522,accuracy:0.375
epoch:0,batch:100,valid loss:1.4011622667312622,accuracy:0.375
epoch:0,batch:110,valid loss:1.3648145198822021,accuracy:0.375
epoch:0,batch:120,valid loss:1.3774707317352295,accuracy:0.375
epoch:0,batch:130,valid loss:1.408767819404602,accuracy:0.3125
epoch:0,batch:140,valid loss:1.364759922027588,accuracy:0.375
epoch:0,batch:150,valid loss:1.349884271621704,accuracy:0.4375
epoch:0,b

In [9]:
import time
t = time.localtime()
time_str = time.strftime("%Y-%m-%d-%H-%M-%S", t)
path = './exp/' + time_str + '.pth'
torch.save(model.state_dict(),path)

In [10]:
test(model)

accuracy:0.6245503597122302


0.6245503597122302

glove + transformer_encoder + attention before mlp + adam optimizer
accuracy: 60.8%  
seems that the attention module before mlp doesn't make it work better

In [11]:
config['attention'] = True
config['attention_nhead'] = 4
config['nepoch'] = 10
model = train(config)
test(model)

epoch:0,batch:0,valid loss:1.4013829231262207,accuracy:0.125
epoch:0,batch:10,valid loss:1.5273172855377197,accuracy:0.125
epoch:0,batch:20,valid loss:1.5101301670074463,accuracy:0.125
epoch:0,batch:30,valid loss:1.3949425220489502,accuracy:0.375
epoch:0,batch:40,valid loss:1.3703863620758057,accuracy:0.375
epoch:0,batch:50,valid loss:1.4521245956420898,accuracy:0.125
epoch:0,batch:60,valid loss:1.4810879230499268,accuracy:0.125
epoch:0,batch:70,valid loss:1.426999807357788,accuracy:0.125
epoch:0,batch:80,valid loss:1.4196863174438477,accuracy:0.125
epoch:0,batch:90,valid loss:1.4396880865097046,accuracy:0.125
epoch:0,batch:100,valid loss:1.459096908569336,accuracy:0.125
epoch:0,batch:110,valid loss:1.4447401762008667,accuracy:0.125
epoch:0,batch:120,valid loss:1.438862919807434,accuracy:0.125
epoch:0,batch:130,valid loss:1.4291874170303345,accuracy:0.125
epoch:0,batch:140,valid loss:1.426395058631897,accuracy:0.125
epoch:0,batch:150,valid loss:1.4324805736541748,accuracy:0.125
epoch:0

0.608363309352518

grid search for hyperparameters(learning rate, batch size)
- learning rate = 0.0001/0.001 works well, but 0.01 is too large
- batch size seems to have no effect on the accuracy

In [12]:
# 网格搜索
config['attention'] = False
lr = [1e-4,1e-3,1e-2]
batch_size = [16,32,64]
config['nepoch'] = 10
config['log'] = False
accuracy = []

for i in lr:
    for j in batch_size:
        config['learning_rate'] = i
        config['batch_size'] = j
        print('lr:{},batch_size:{}'.format(i,j))
        model = train(config)
        accuracy.append(test(model))

lr:0.0001,batch_size:16
accuracy:0.6299460431654677
lr:0.0001,batch_size:32
accuracy:0.6218525179856115
lr:0.0001,batch_size:64
accuracy:0.6294964028776978
lr:0.001,batch_size:16
accuracy:0.6007194244604317
lr:0.001,batch_size:32
accuracy:0.6011690647482014
lr:0.001,batch_size:64
accuracy:0.6065647482014388
lr:0.01,batch_size:16
accuracy:0.4087230215827338
lr:0.01,batch_size:32
accuracy:0.4087230215827338
lr:0.01,batch_size:64
accuracy:0.4087230215827338


random(set the dim of vectors to 512) + transformer_encoder + sgd optimizer  
accuracy: 55.8%

In [13]:
config['optimizer'] = 'sgd'
config['log'] = True
config['batch_size'] = 16
config['learning_rate'] = 1e-3
config['init'] = 'random'
config['encoder_config']['d_model'] = 512
config['classifier_config']['feature_size'] = 512
config['nepoch'] = 30
train_iter, valid_iter, test_iter, TEXT, LABEL = build_dataset(config)
model = train(config)
test(model)

epoch:0,batch:0,valid loss:1.3595155477523804,accuracy:0.3125
epoch:0,batch:10,valid loss:1.4074476957321167,accuracy:0.1875
epoch:0,batch:20,valid loss:1.426666498184204,accuracy:0.1875
epoch:0,batch:30,valid loss:1.3868436813354492,accuracy:0.1875
epoch:0,batch:40,valid loss:1.3735367059707642,accuracy:0.25
epoch:0,batch:50,valid loss:1.3854525089263916,accuracy:0.25
epoch:0,batch:60,valid loss:1.3614096641540527,accuracy:0.3125
epoch:0,batch:70,valid loss:1.364307165145874,accuracy:0.3125
epoch:0,batch:80,valid loss:1.3699952363967896,accuracy:0.25
epoch:0,batch:90,valid loss:1.394089937210083,accuracy:0.1875
epoch:0,batch:100,valid loss:1.3884210586547852,accuracy:0.1875
epoch:0,batch:110,valid loss:1.3846442699432373,accuracy:0.1875
epoch:0,batch:120,valid loss:1.3885319232940674,accuracy:0.1875
epoch:0,batch:130,valid loss:1.3985987901687622,accuracy:0.1875
epoch:0,batch:140,valid loss:1.4124358892440796,accuracy:0.1875
epoch:0,batch:150,valid loss:1.3850396871566772,accuracy:0.2

0.5575539568345323

random(set the dim of vectors to 512) + transformer_encoder + adam optimizer + scheduler  
accuracy: 61.6%  
seems that random initialization doesn't work well  
and sgd optimizer is less effective than adam optimizer

In [16]:
config['optimizer'] = 'adam'
config['log'] = True
config['batch_size'] = 16
config['learning_rate'] = 1e-4
config['init'] = 'random'
config['encoder_config']['d_model'] = 512
config['classifier_config']['feature_size'] = 512
config['nepoch'] = 10
train_iter, valid_iter, test_iter, TEXT, LABEL = build_dataset(config)


def train(config):
    model = MyModel(config)
    model = model.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = get_optimizer(config['optimizer'],model,config['learning_rate'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,1,gamma=0.9)
    for epoch in range(config['nepoch']):
        for i,batch in enumerate(train_iter):
            optimizer.zero_grad()
            text,label = batch.text,batch.label
            # print(text.shape)
            # print(label.shape)
            # print(text.dtype)
            text = text.cuda()
            label = label.cuda()
            model = model.train()
            output = model(text)
            # print(output.shape)
            loss = criterion(output,label)
            # print('epoch:{},batch:{},loss:{}'.format(epoch,i,loss.item()))
            loss.backward()
            optimizer.step()
            if(i%10 == 0):
                _,batch = next(enumerate(valid_iter))
                text,label = batch.text,batch.label
                text = text.cuda()
                label = label.cuda()
                model = model.eval()
                output = model(text)
                loss = criterion(output,label)
                correct = torch.sum(torch.argmax(output,dim=1) == label).item()
                accuracy = correct/len(label)
                if(config['log']):
                    print('epoch:{},batch:{},valid loss:{},accuracy:{}'.format(epoch,i,loss.item(),accuracy))
                # print('valid loss:{}'.format(loss.item()))
        scheduler.step()
    return model

model = train(config)
test(model)

epoch:0,batch:0,valid loss:1.2668275833129883,accuracy:0.5
epoch:0,batch:10,valid loss:1.3960175514221191,accuracy:0.3125
epoch:0,batch:20,valid loss:1.3965139389038086,accuracy:0.3125
epoch:0,batch:30,valid loss:1.3500370979309082,accuracy:0.375
epoch:0,batch:40,valid loss:1.3625673055648804,accuracy:0.375
epoch:0,batch:50,valid loss:1.3740121126174927,accuracy:0.375
epoch:0,batch:60,valid loss:1.3471786975860596,accuracy:0.4375
epoch:0,batch:70,valid loss:1.4055596590042114,accuracy:0.3125
epoch:0,batch:80,valid loss:1.3638920783996582,accuracy:0.3125
epoch:0,batch:90,valid loss:1.4305912256240845,accuracy:0.3125
epoch:0,batch:100,valid loss:1.390799880027771,accuracy:0.3125
epoch:0,batch:110,valid loss:1.3746086359024048,accuracy:0.3125
epoch:0,batch:120,valid loss:1.4038790464401245,accuracy:0.3125
epoch:0,batch:130,valid loss:1.409687876701355,accuracy:0.3125
epoch:0,batch:140,valid loss:1.3882880210876465,accuracy:0.375
epoch:0,batch:150,valid loss:1.3562270402908325,accuracy:0.3

0.6155575539568345