# Prepare

In [1]:
# download dataset
!git clone https://github.com/cardiffnlp/tweeteval.git
# install requirements
!pip install fasttext torch==1.6.0 transformers torchtext==0.2.3 nltk

/bin/bash: git: command not found
Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement fasttext[0m
[31mERROR: No matching distribution found for fasttext[0m


In [2]:
# better to save in google drive and mount
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gzip -d cc.en.300.bin.gz

--2022-03-26 06:46:22--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... failed: Name or service not known.
wget: unable to resolve host address 'dl.fbaipublicfiles.com'
gzip: cc.en.300.bin.gz: No such file or directory


# Process data and create data loader

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import fasttext
from tqdm import tqdm
from math import log
from itertools import chain
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')

In [2]:
# check cuda + torch + GPU compability
a = torch.arange(256).reshape((16, 16)).to(device, dtype=torch.float)
l = nn.Linear(16, 2).to(device)
a = l(a)

In [3]:
ft = fasttext.load_model('cc.en.300.bin')  # may use 1 min
# ft = fasttext.load_model('/content/drive/MyDrive/cc.en.300.bin')
ft.get_word_vector('hello').shape



(300,)

In [46]:
train_data = []
task = "offensive"
with open(f"tweeteval/datasets/{task}/train_text.txt") as xs,\
open(f"tweeteval/datasets/{task}/train_labels.txt") as ys:
    for y, x in zip(ys.readlines(), xs.readlines()):
        train_data.append((int(y.strip()), x.strip().split(' ')))

val_data = []
with open(f"tweeteval/datasets/{task}/val_text.txt") as xs,\
open(f"tweeteval/datasets/{task}/val_labels.txt") as ys:
    for y, x in zip(ys.readlines(), xs.readlines()):
        val_data.append((int(y.strip()), x.strip().split(' ')))

test_data = []
with open(f"tweeteval/datasets/{task}/test_text.txt") as xs,\
open(f"tweeteval/datasets/{task}/test_labels.txt") as ys:
    for y, x in zip(ys.readlines(), xs.readlines()):
        test_data.append((int(y.strip()), x.strip().split(' ')))

In [47]:
train_data[:3]

[(0,
  ['@user',
   'Bono...',
   'who',
   'cares.',
   'Soon',
   'people',
   'will',
   'understand',
   'that',
   'they',
   'gain',
   'nothing',
   'from',
   'following',
   'a',
   'phony',
   'celebrity.',
   'Become',
   'a',
   'Leader',
   'of',
   'your',
   'people',
   'instead',
   'or',
   'help',
   'and',
   'support',
   'your',
   'fellow',
   'countrymen.']),
 (1,
  ['@user',
   'Eight',
   'years',
   'the',
   'republicans',
   'denied',
   'obama’s',
   'picks.',
   'Breitbarters',
   'outrage',
   'is',
   'as',
   'phony',
   'as',
   'their',
   'fake',
   'president.']),
 (0,
  ['@user',
   'Get',
   'him',
   'some',
   'line',
   'help.',
   'He',
   'is',
   'gonna',
   'be',
   'just',
   'fine.',
   'As',
   'the',
   'game',
   'went',
   'on',
   'you',
   'could',
   'see',
   'him',
   'progressing',
   'more',
   'with',
   'his',
   'reads.',
   'He',
   'brought',
   'what',
   'has',
   'been',
   'missing.',
   'The',
   'deep',
   'ball',
 

In [48]:
vocabulary = set()
for _, ws in train_data:
    vocabulary |= set(ws)

In [49]:
vocab_size = len(vocabulary) + 1
tkn2idx = {w: i for i, w in enumerate(vocabulary, start=1)}
print(vocab_size)

36465


In [66]:
from torch.nn.utils.rnn import pad_sequence, pack_sequence
from torch.utils.data import DataLoader

PAD_IDX = 0
ft_text_pipeline = lambda words:np.array([ft.get_word_vector(word) for word in words if word != "@user"])
tkn_text_pipeline = lambda words:[tkn2idx[word] for word in words if word in vocabulary]
def collate_batch(batch):
    '''
    input: List[(label, sentence)]
    '''
    label_list, text_list, seq_len = [], [], []
    for label, words in batch:
        label_list.append(label)
        processed_text = torch.tensor(tkn_text_pipeline(words), dtype=torch.int64)
        text_list.append(processed_text)
        seq_len.append(processed_text.shape[0])
    # print(label_list, text_list, seq_len)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    # print(len(text_list))
    text_list = pad_sequence(text_list, padding_value=PAD_IDX)
    # print(text_list.shape)
    seq_len = torch.tensor(seq_len, dtype=torch.int64)
    return label_list.to(device), text_list.to(device), seq_len.to(device)

def collate_ft(batch):
    '''
    input: List[(label, sentence)]
    '''
    label_list, text_list, seq_len = [], [], []
    for label, words in batch:
        label_list.append(label)

        # use fasttext embedding
        processed_text = torch.tensor(ft_text_pipeline(words), dtype=torch.float)
        # print("processed_text.shape,", processed_text.shape)
        text_list.append(processed_text)
        seq_len.append(processed_text.shape[0])

    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = pad_sequence(text_list, padding_value=0.)
    seq_len = torch.tensor(seq_len, dtype=torch.int64)
    return label_list.to(device), text_list.to(device), seq_len.to(device)

In [56]:
# test data process procedure
ttt = []
for l, w in train_data[:3]:
    processed_text = torch.tensor(tkn_text_pipeline(w), dtype=torch.int64)
    print("processed_text.shape,", processed_text.shape)
    ttt.append(processed_text)
text_list_tmp = pad_sequence(ttt, padding_value=PAD_IDX)
print("text_list.shape", text_list_tmp.shape)

processed_text.shape, torch.Size([31])
processed_text.shape, torch.Size([17])
processed_text.shape, torch.Size([44])
text_list.shape torch.Size([44, 3])


Transformer Model

In [57]:
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim: int, drop_rate=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=drop_rate)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-log(10000.0) / embed_dim))
        pe = torch.zeros(1, max_len, embed_dim)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        """
#         print("PositionalEncoding input x shape", x.shape, "self.pe[:, :x.size(1)]", self.pe[:, :x.size(1)].shape)
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [58]:
class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = PositionalEncoding(embed_dim)
        
    def forward(self, inputs):
        # todo
        inputs = self.token_emb(inputs)
        inputs = self.pos_emb(inputs)  # in, out B T hidden
        return inputs

In [68]:
class TransformerClassfication(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim, n_heads, attn_drop_rate, layer_drop_rate, dense_dim):
        super().__init__()
        # self.emb = PositionalEncoding(embed_dim)
        self.emb = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)

        self.transformer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=n_heads, dim_feedforward=4*embed_dim, activation="gelu")
        # self.transformer = TransformerBlock(embed_dim, n_heads, attn_drop_rate, layer_drop_rate)
#         self.pool = nn.AvgPool1d(kernel_size=embed_dim)
        self.d1 = nn.Dropout(layer_drop_rate)
        self.fc1 = nn.Linear(embed_dim, dense_dim)
        self.act1 = nn.Sequential(nn.ReLU(), nn.Dropout(layer_drop_rate))
        self.fc2 = nn.Linear(dense_dim, 2)
#         self.out = nn.Softmax() # included in loss
        
    def init_weight(self):
        nn.init.xavier_uniform_(self.fc1)
        nn.init.xavier_uniform_(self.fc2)
        nn.init.uniform_(self.emb)
    
    def forward(self, x, seq_len):
        '''
        x: S, B, embed_dim
        '''
        x = self.emb(x.transpose(0,1))
        x = self.transformer(x)  # B, S, H
        # print("after transformer shape,", x.shape)
        # after transformer shape, torch.Size([32, 38, 64])
        
        # take average along seq l
        masks = (torch.arange(x.shape[1], device=device)[None, :] >= seq_len[:, None]).to(device)
        masked_x = x.transpose(1,2).masked_fill(masks[:, None]==1, 0)  # 4, 8, 5 B, H, SEQL
        # hidden =8, after average = 8; seql =5, after avg = 1
        avg_t = torch.sum(masked_x, dim=2, dtype=torch.float) / seq_len[:, None] # 4, 8, 1 / 4, 1
        x = avg_t.to(device)
#         print("after pool shape,", x.shape)  # B, H
        
        x = self.d1(x)
        x = self.fc1(x)
        x = self.act1(x)
        x = self.fc2(x)
        return x

In [69]:
class FTTransformerClassfication(TransformerClassfication):
    def __init__(self, maxlen, vocab_size, embed_dim, n_heads, attn_drop_rate, layer_drop_rate, dense_dim):
        super().__init__(maxlen, vocab_size, embed_dim, n_heads, attn_drop_rate, layer_drop_rate, dense_dim)
        self.emb = PositionalEncoding(embed_dim)

In [70]:
import time

def train(dataloader, log_interval=5):
    model.train()
    total_acc, total_count = 0, 0
    start_time = time.time()

    for idx, (label, text, seq_len) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, seq_len)
        # print("predcited", predicted_label)
        loss = criterion(predicted_label, label)
        # loss.requres_grad = True
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| loss {:5.3f} '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              loss.item(),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, seq_len) in enumerate(dataloader):
            predicted_label = model(text, seq_len)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [76]:
# use embedding layer
EPOCHS = 10
LR = 1e-3
BATCH_SIZE = 128
EMBED_DIM = 128
N_HEADS = 2
DENSE_DIM = 32

model = TransformerClassfication(maxlen=5000, vocab_size=vocab_size, embed_dim=EMBED_DIM, 
                                 n_heads=N_HEADS, attn_drop_rate=0.1, layer_drop_rate=0.1, 
                                 dense_dim=DENSE_DIM).to(device)

train_dl = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
dev_dl = DataLoader(val_data, batch_size=1, shuffle=False, collate_fn=collate_batch)

In [74]:
# use fast text word embedding
EMBED_DIM = 300
DENSE_DIM = 128

model = FTTransformerClassfication(maxlen=5000, vocab_size=vocab_size, embed_dim=EMBED_DIM, 
                                 n_heads=N_HEADS, attn_drop_rate=0.1, layer_drop_rate=0.1, 
                                 dense_dim=DENSE_DIM).to(device)

train_dl = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_ft)
dev_dl = DataLoader(val_data, batch_size=1, shuffle=False, collate_fn=collate_ft)

In [77]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dl, log_interval=30)
    accu_val = evaluate(dev_dl)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid acc {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |    30/   94 batches | loss 0.662 | accuracy    0.658
| epoch   1 |    60/   94 batches | loss 0.639 | accuracy    0.671
| epoch   1 |    90/   94 batches | loss 0.639 | accuracy    0.671
-----------------------------------------------------------
| end of epoch   1 | time:  2.15s | valid acc    0.653 
-----------------------------------------------------------
| epoch   2 |    30/   94 batches | loss 0.655 | accuracy    0.666
| epoch   2 |    60/   94 batches | loss 0.590 | accuracy    0.678
| epoch   2 |    90/   94 batches | loss 0.641 | accuracy    0.674
-----------------------------------------------------------
| end of epoch   2 | time:  2.13s | valid acc    0.665 
-----------------------------------------------------------
| epoch   3 |    30/   94 batches | loss 0.631 | accuracy    0.680
| epoch   3 |    60/   94 batches | loss 0.550 | accuracy    0.705
| epoch   3 |    90/   94 batches | loss 0.636 | accuracy    0.711
---------------------------------------------

KeyboardInterrupt: 