2. Document Classification

In [126]:
import torch
from torch.utils.data import Dataset,DataLoader
from torch import nn
from tqdm import tqdm
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import pandas as pd
import torchtext
import math
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import csv
import torch.nn.functional as F


In [127]:
class Dataset(Dataset):
    def __init__(self, root='train.csv'):

        self.data=pd.read_csv(root)
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        #id = row['id']
        label = row['category']-1
        headline = str(row['headline'])
        short_description = str(row['short_description'])
        text=str(headline+' '+short_description)
        text=text.replace('“','').replace('‘','').replace('”','')
        return label,text
    
class TestDataset(Dataset):
    def __init__(self, root='test.csv'):

        self.data=pd.read_csv(root)
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        #id = row['id']
        #label = row['category']
        headline = str(row['headline'])
        short_description = str(row['short_description'])
        text=str(headline+' '+short_description)
        text=text.replace('“','').replace('‘','').replace('”','')
        return 0,text

2.1 Data Preprocessing 

1.

選擇tokenzier需要考慮data語言、詞彙大小、上下文、特殊字符和模型等因素


我們可以使用空白鍵來做tokenize

In [128]:
tokenizer = get_tokenizer('basic_english')
tokenizer("Analysts predict strong sales, but some investors are worried about the company's high debt load.")

['analysts',
 'predict',
 'strong',
 'sales',
 ',',
 'but',
 'some',
 'investors',
 'are',
 'worried',
 'about',
 'the',
 'company',
 "'",
 's',
 'high',
 'debt',
 'load',
 '.']

In [129]:
tokenizer = get_tokenizer('spacy')
tokenizer("Analysts predict strong sales, but some investors are worried about the company's high debt load.")



['Analysts',
 'predict',
 'strong',
 'sales',
 ',',
 'but',
 'some',
 'investors',
 'are',
 'worried',
 'about',
 'the',
 'company',
 "'s",
 'high',
 'debt',
 'load',
 '.']

可以看出使用 get_tokenizer('basic_english') 會只把單詞和標點符號分割開，而使用 get_tokenizer('spacy') 則會更詳細地分割，包括各種詞類和標點符號。

2.

⟨pad⟩是用來填充句子長度的，因為每個句子長度不一樣，所以當你處理資料時為了讓他們長度一樣，需要將不夠長的句子補上⟨pad⟩。


⟨unk⟩是用在當vocab沒看過的字時，會將它視為⟨unk⟩。

3.

我使用basic_english當作我的tokenizer，因為此次作業資料相對簡單，不複雜，所以不需要使用額外的tokenizer。我在collate_fn處理文字長度不一問題，torch.nn.utils.rnn.pad_sequence會將每個batch的文字自動補到最長的文字。

In [130]:
tokenizer = get_tokenizer('basic_english')

dataset=Dataset()

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

myvocab = build_vocab_from_iterator(yield_tokens(dataset), specials=["<pad>","<unk>",])
myvocab.set_default_index(myvocab["<unk>"])

glove_vectors = torchtext.vocab.GloVe(name='6B',dim=100)
embedding_matrix=torch.stack([glove_vectors.get_vecs_by_tokens(myvocab.get_itos()) ]).squeeze(0)
# embedding_matrix = glove_vectors.vectors
# embedding_matrix = torch.cat([torch.zeros(2, 100), embedding_matrix])

# myvocab = torchtext.vocab.vocab(glove_vectors.stoi, min_freq = 0,specials=["<pad>","<unk>"])
# myvocab.set_default_index(myvocab["<unk>"])
# ret = vec.get_vecs_by_tokens(examples, lower_case_backup=True)

In [131]:
def collate_batch(batch):
    #batch:[(label,text),...]
    #return:label(b,),text(b,seq_lenth) 
    
    labels,texts = zip(*batch)

    text_tensors = []
    for text in texts:
        text_tensor = torch.tensor(myvocab(tokenizer(text)), dtype=torch.int)
    
        text_tensors.append(text_tensor)

    
    text_tensors = torch.nn.utils.rnn.pad_sequence(text_tensors, batch_first=True)

    
    label_tensor = torch.tensor(labels, dtype=torch.long)
    
    return label_tensor, text_tensors

In [132]:

dataset=Dataset()

train_size = int(0.8* len(dataset))
val_size = len(dataset) - train_size

traindataset, valdataset = torch.utils.data.random_split(dataset, [train_size, val_size])
traindataloader = DataLoader(traindataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
valdataloader = DataLoader(valdataset, batch_size=32, shuffle=True, collate_fn=collate_batch)

2.2 Transformer

我的Transformer超參數為:d_model=100, nhead=4, d_hid=200 , nlayers=2, dropout=0.1

d_model代表模型的輸入的特徵的尺寸，因為越大模型也越大且模型能夠更好地捕捉序列中的關係，但是模型的計算量和內存需求也會增加，所以我選擇100。

nhead是用來控制注意力機制中頭數的超參數，數值越大可以提高模型的表現，因為它可以更好地捕捉不同位置之間的依賴關係，但是也有可能造成overfitting，此數值的選擇我是以實驗慢慢進行調整。

d_hid也就是Transformer裡面linear layer的維度，大小與訓練資料大小和複雜度有關，此數值選擇也是透過經驗法則，實驗中慢慢進行調整，直到選擇到滿意的參數。

nlayers代表總共有幾層Transformer，越多層，模型需要做的運算也越多，但是訓練資料如果很簡單，需要的層數也就沒必要太多，避免出現overfitting現象，此參數我選擇2，是因為我認為此次訓練資料不算太複雜，層數也不需太多。

dropout可以避免overfitting現象，我選擇0.1。

In [133]:
config={
    'd_model':100,
    'nhead':4,
    'd_hid':200,
    'dropout':0.1,
    'num_layers':2,
    'epochs':200,
    'lr':5e-4,
    'wd':1e-4,

}

In [134]:
class Transformer(nn.Module):

    def __init__(self, d_model,dropout,nhead,dim_hid, num_layers,vocab_size,num_class):
        super(Transformer, self).__init__()
        #nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
        self.emb=nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.pos_encoder = PositionalEncoding(d_model=d_model, dropout=dropout)
        encoder_layers = TransformerEncoderLayer(d_model=d_model, nhead=nhead,\
                                                  dim_feedforward=dim_hid, dropout=dropout,batch_first=True,activation=F.silu)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.fc=nn.Sequential(
            nn.Linear(d_model,num_class),
        )
        
        self.d_model = d_model
    def forward(self, text):
        
        text=self.emb(text)#* math.sqrt(self.d_model) #8xntokenxd_model
        text=torch.transpose(text, 0, 1)
        text=self.pos_encoder(text)
        text=torch.transpose(text, 0, 1)
        text=self.transformer_encoder(text)
        
        text=torch.mean(text,dim=1)
       
        text=self.fc(text)
        
        return text
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x) :
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [135]:
vocab_size = len(myvocab)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(d_model=config['d_model']\
    ,dropout=config['dropout'],nhead=config['nhead'],dim_hid=config['d_hid']\
        ,num_layers=config['num_layers'],vocab_size=vocab_size, num_class=4).to(device)

In [136]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'],weight_decay=config['wd'])
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, config['epochs'], eta_min=1e-4)

In [137]:
def train(epoch):
  model.train()
  acc_list=[]
  loss_list=[]
  for label,text  in (bar:=tqdm(traindataloader,ncols=0)):
      optimizer.zero_grad()
      #print('text',text.shape)
      text=text.to(device)
      label=label.to(device)
      predicted_label = model(text)
      
      loss = criterion(predicted_label, label)
      loss.backward()
      optimizer.step()
      acc = (predicted_label.argmax(1) == label).sum().item()/label.shape[0]
      #print(acc)
      acc_list.append(acc)
      loss_list.append(loss.item())
      bar.set_description(f"epochs[{epoch+1}/{config['epochs']}]|training")
      bar.set_postfix({'loss ': '{:.4f}'.format(sum(loss_list)/len(loss_list)),
            'acc': '{:.4f}'.format(sum(acc_list)/len(acc_list))
    })
  scheduler.step()
  #return sum(acc_list)/len(acc_list)
def val():
  model.eval()
  acc_list=[]
  with torch.no_grad():
    for label,text  in (bar:=tqdm(valdataloader,ncols=0)):
        
        text=text.to(device)
        label=label.to(device)
        predicted_label = model(text)
        
        acc = (predicted_label.argmax(1) == label).sum().item()/label.shape[0]
        
        acc_list.append(acc)
        
        bar.set_description(f"validation")
        bar.set_postfix({'acc': '{:.4f}'.format(sum(acc_list)/len(acc_list))})
  
  return sum(acc_list)/len(acc_list)


In [138]:
best_acc=0.7
for epoch in range(config['epochs']):
  train(epoch)
  acc=val()
  if acc>best_acc:
      torch.save(model.state_dict(), 'best.pt')
      best_acc=acc

epochs[1/200]|training: 100% 50/50 [00:00<00:00, 114.91it/s, loss =1.2571, acc=0.4075]
validation: 100% 13/13 [00:00<00:00, 237.59it/s, acc=0.6635]
epochs[2/200]|training: 100% 50/50 [00:00<00:00, 127.99it/s, loss =0.7445, acc=0.6850]
validation: 100% 13/13 [00:00<00:00, 231.88it/s, acc=0.7933]
epochs[3/200]|training: 100% 50/50 [00:00<00:00, 114.37it/s, loss =0.5090, acc=0.8075]
validation: 100% 13/13 [00:00<00:00, 238.54it/s, acc=0.8245]
epochs[4/200]|training: 100% 50/50 [00:00<00:00, 127.19it/s, loss =0.4191, acc=0.8438]
validation: 100% 13/13 [00:00<00:00, 238.47it/s, acc=0.8630]
epochs[5/200]|training: 100% 50/50 [00:00<00:00, 128.07it/s, loss =0.3598, acc=0.8694]
validation: 100% 13/13 [00:00<00:00, 238.82it/s, acc=0.8678]
epochs[6/200]|training: 100% 50/50 [00:00<00:00, 126.43it/s, loss =0.2848, acc=0.9012]
validation: 100% 13/13 [00:00<00:00, 237.10it/s, acc=0.8822]
epochs[7/200]|training: 100% 50/50 [00:00<00:00, 119.34it/s, loss =0.2551, acc=0.9125]
validation: 100% 13/13 [0

In [139]:
best_acc

0.8846153846153846

In [140]:
testdataset=TestDataset()
testdataloader = DataLoader(testdataset, batch_size=1, shuffle=False, collate_fn=collate_batch)

In [141]:
def test(model):
  model.eval()
  predict=[]
  with torch.no_grad():
    for no_use,text  in (bar:=tqdm(testdataloader)):
      
      text=text.cpu()#.to(device)
      predicted_label = model(text)
      # print(predicted_label.shape)
      output=predicted_label.argmax(1).cpu().numpy()
      # print(output)
      predict.extend(output)
    
  return predict

In [142]:
vocab_size = len(myvocab)
model = Transformer(d_model=config['d_model']\
    ,dropout=config['dropout'],nhead=config['nhead'],dim_hid=config['d_hid']\
        ,num_layers=config['num_layers'],vocab_size=vocab_size, num_class=4).cpu()#


model.load_state_dict(torch.load('best.pt'))



<All keys matched successfully>

In [143]:
predict=test(model)

data = [(i+1, val+1) for i, val in enumerate(predict)]
with open('submission.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['id', 'category'])  
    writer.writerows(data)  

100%|██████████| 400/400 [00:00<00:00, 1262.89it/s]
