In [136]:
import argparse
import glob
import os
import time
import pandas as pd
import numpy as np
from itertools import chain
from tqdm import notebook

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer
from transformers import (
    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

In [2]:
# Prefer the GPU whenever CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Directory (relative to the working directory) that holds train.csv and test.csv.
folder_path = '../kaggle-nlp-disaster/data/raw'

In [4]:
# data_path feeds the train/test splits; eval_path feeds the 'eval' split.
data_path, eval_path = (
    os.path.join(folder_path, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [5]:
def generate_model(args, num_labels):
    """Load the pretrained config, tokenizer and sequence classifier named by ``args``.

    Args:
        args: object exposing ``model_name_or_path``, ``task_name`` and
            ``do_lower_case`` attributes.
        num_labels: number of target classes, forwarded to the config.

    Returns:
        A ``(config, tokenizer, model)`` tuple; the model is built from the
        same config object that is returned.
    """
    checkpoint = args.model_name_or_path

    config = AutoConfig.from_pretrained(
        checkpoint, num_labels=num_labels, finetuning_task=args.task_name
    )
    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint, do_lower_case=args.do_lower_case
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, config=config
    )

    return config, tokenizer, model

In [6]:
class DisasterDataset(Dataset):
    """Map-style dataset over the disaster CSVs with a switchable active split.

    The labelled file is cut, in file order, into the first 80% of rows
    ('train') and the remaining rows ('test'); the second file is exposed as
    'eval'. ``set_split`` selects which of the three ``__getitem__`` and
    ``__len__`` operate on.

    Attributes:
        tokenizer: object whose ``encode(text, return_tensors="pt")`` result
            is indexed with ``[0]`` to obtain the token ids of one text.
        dataset: mapping of split name -> ``(DataFrame, row_count)``.
        num_labels: number of distinct ``target`` values in the train split.
        split / data / length: name, frame and size of the active split.
    """

    def __init__(self, data_path, eval_path, tokenizer):
        """Read both CSV files, build the three splits and activate 'train'.

        Args:
            data_path: CSV with at least ``text`` and ``target`` columns.
            eval_path: CSV with at least ``id`` and ``text`` columns.
            tokenizer: see the class docstring.
        """
        d_data = pd.read_csv(data_path)
        d_eval = pd.read_csv(eval_path)

        # reset_index returns fresh frames, so nothing is mutated in place on
        # a slice of d_data (which is what triggers SettingWithCopy warnings).
        n_train = int(len(d_data) * 0.8)
        d_train = d_data.iloc[:n_train].reset_index(drop=True)
        d_test = d_data.iloc[n_train:].reset_index(drop=True)

        self.tokenizer = tokenizer
        self.dataset = {'train': (d_train, len(d_train)),
                        'test': (d_test, len(d_test)),
                        'eval': (d_eval, len(d_eval))}
        self.num_labels = len(d_train["target"].unique().tolist())
        self.set_split('train')

    def get_vocab(self):
        """Write the active split's distinct lower-cased tokens to ``vocab.txt``.

        Tokens are whitespace-separated words, written one per line in order
        of first appearance; duplicates and empty tokens are dropped.

        Returns:
            The file name ``'vocab.txt'`` (created in the working directory).
        """
        text = " ".join(self.data["text"].tolist()).lower()
        # dict.fromkeys de-duplicates while keeping first-seen order.
        vocab = list(dict.fromkeys(text.split()))
        with open('vocab.txt', 'w', encoding='utf-8') as file:
            file.writelines(word + '\n' for word in vocab)
        return 'vocab.txt'

    def set_split(self, split='train'):
        """Make ``split`` ('train', 'test' or 'eval') the active split."""
        self.split = split
        self.data, self.length = self.dataset[split]

    def __getitem__(self, idx):
        """Return the encoded example at row ``idx`` of the active split.

        Returns:
            ``{'id': idx, 'x': token_ids, 'y': target}`` for 'train'/'test',
            or ``{'id': <id column value>, 'x': token_ids}`` for 'eval'.

        Raises:
            IndexError: if ``idx`` is outside ``[0, len(self))``; this is what
                lets plain ``for item in dataset`` iteration terminate.
        """
        if not 0 <= idx < self.length:
            raise IndexError(
                f"index {idx} out of range for split '{self.split}' "
                f"of size {self.length}"
            )

        x = self.data.loc[idx, "text"].lower()
        # encode(...) yields a batch of one sequence; keep that single row.
        x = self.tokenizer.encode(x, return_tensors="pt")[0]

        if self.split != 'eval':
            y = self.data.loc[idx, "target"]
            return {'id': idx, 'x': x, 'y': y}
        else:
            id_ = self.data.loc[idx, "id"]
            return {'id': id_, 'x': x}

    def __len__(self):
        """Number of rows in the active split."""
        return self.length

In [7]:
# Settings read by generate_model: checkpoint to load, task label and casing.
args = argparse.Namespace(**{
    'model_name_or_path': 'bert-base-uncased',
    'task_name': "binary_classification",
    'do_lower_case': True,
})

In [8]:
# Progress marker emitted before the pretrained artefacts are loaded.
print("generate config, tokenizer, model")

generate config, tokenizer, model


In [9]:
# Load config, tokenizer and classifier for the checkpoint in `args`, with 2 output labels.
config, tokenizer, model = generate_model(args, 2)

In [10]:
# Progress marker emitted before the dataset is built.
print("running dataset")

running dataset


In [11]:
# Read both CSVs; the constructor leaves the 'train' split active.
dd = DisasterDataset(data_path, eval_path, tokenizer)

In [12]:
# Move the model to the selected device; as the cell's last expression, the returned module's repr is displayed.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [13]:
# Adam over every model parameter with a small fine-tuning learning rate.
optimizer = optim.Adam(model.parameters(), lr = 1e-5)
# StepLR multiplies the learning rate by `gamma` once every `step_size` calls to scheduler.step().
# NOTE(review): AdamW and get_linear_schedule_with_warmup are imported but unused here — confirm this choice is intended.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.01)

In [14]:
# nn.CrossEntropyLoss applies log-softmax internally, so it expects raw (unnormalised) class scores.
criterion = nn.CrossEntropyLoss()

In [112]:
def padded(batch):
    """Collate dataset items into a single zero-padded batch.

    Args:
        batch: sequence of dicts, each holding 'id', 'x' (a 1-D tensor of
            token ids) and, for labelled splits, 'y'.

    Returns:
        A dict with:
            'id': list of the items' ids, in batch order;
            'x': tensor of shape (longest_sequence, batch_size), because
                 pad_sequence defaults to batch_first=False; shorter
                 sequences are filled with 0;
            'y': float tensor of the labels, empty when no item has a 'y'.
    """
    idx = [data['id'] for data in batch]
    tensors = [data['x'] for data in batch]
    # Unlabelled items simply have no 'y' key. Testing for the key (instead of
    # a bare `except: pass`) keeps genuine errors visible.
    y = [data['y'] for data in batch if 'y' in data]

    x = nn.utils.rnn.pad_sequence(tensors)

    return {'id': idx, 'x': x, 'y': torch.Tensor(y)}

In [16]:
def compute_accuracy(y_true, y_pred):
    """Percentage (0-100) of rows whose arg-max class matches the true label.

    Args:
        y_true: tensor of labels, one per row.
        y_pred: 2-D tensor of per-class scores; the class is taken along dim 1.

    Returns:
        Accuracy as a Python float in percent.
    """
    predicted = y_pred.argmax(dim=1).type(torch.LongTensor)
    expected = y_true.type(torch.LongTensor)

    hits = int((predicted == expected).sum())
    return (hits / expected.shape[0]) * 100

In [17]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# Stays True unless the training loop stops early and saves the model itself;
# checked after training to decide whether a save is still needed.
status = True

In [None]:
# Progress marker emitted before the training loop starts.
print("training...")

In [None]:
# Per-epoch metrics, appended to at the end of every epoch.
history_dict = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}


def _run_epoch(split, train):
    """Run one pass over ``split`` and return its ``(mean_loss, mean_accuracy)``.

    Args:
        split: name of the dataset split to activate on ``dd``.
        train: when True the model is put in training mode and the optimizer
            is stepped on every batch; when False the pass is evaluation-only
            and runs without autograd.

    Returns:
        Running means, over the batches, of the loss and of the accuracy (%).
    """
    dd.set_split(split)
    # Only training batches need a random order.
    loader = DataLoader(dd, batch_size=64, shuffle=train, collate_fn=padded)
    model.train(train)  # train(False) is the same as eval()

    mean_loss = 0
    mean_accuracy = 0
    # Autograd bookkeeping is only needed when backward() will be called.
    with torch.set_grad_enabled(train):
        for batch_index, batch_dict in enumerate(loader, 1):
            # The collate function yields (seq_len, batch); flip to (batch, seq_len).
            x = batch_dict['x'].permute(1, 0).to(device)
            y = batch_dict['y'].long().to(device)

            if train:
                optimizer.zero_grad()

            # NOTE(review): no attention_mask is passed, so padded positions are
            # attended to. Adding one must be done consistently for training
            # and inference.
            logits = model(x)[0]
            # CrossEntropyLoss applies log-softmax itself: feed it the raw
            # logits. A softmax here would squash the scores a second time.
            # No squeeze() either, so a batch of one keeps its batch dimension.
            loss = criterion(logits, y)

            if train:
                loss.backward()
                optimizer.step()

            # Incremental mean: m_k = m_{k-1} + (v_k - m_{k-1}) / k
            mean_loss += (loss.item() - mean_loss) / batch_index
            accuracy = compute_accuracy(y, logits)
            mean_accuracy += (accuracy - mean_accuracy) / batch_index

    return mean_loss, mean_accuracy


for epoch in range(1, 101):
    start_time = time.time()

    running_loss, running_accuracy = _run_epoch('train', train=True)
    running_loss_val, running_accuracy_val = _run_epoch('test', train=False)

    scheduler.step()

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'epoch {epoch} | time {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain loss: {running_loss:.2f} | Train accuracy: {running_accuracy:.2f}')
    print(f'\t  Val loss: {running_loss_val:.2f} |   Val accuracy: {running_accuracy_val:.2f}')

    history_dict['train_loss'].append(running_loss)
    history_dict['train_acc'].append(running_accuracy)
    history_dict['val_loss'].append(running_loss_val)
    history_dict['val_acc'].append(running_accuracy_val)

    # Early stop once both accuracies clear their thresholds; the model is
    # saved here, and `status` records that no later save is required.
    if running_accuracy > 90 and running_accuracy_val > 85:
        status = False
        model.save_pretrained('kaggle-disaster/')
        break
    

In [None]:
# `status` is still True when the loop never hit its early-stop branch (which saves on its own),
# so persist the model here in that case.
if status:
    model.save_pretrained('kaggle-disaster/')

### Evaluation: predict the `eval` split and write the submission file

In [18]:
# Reload the weights saved under 'kaggle-disaster/' and rebind `model` to the loaded instance.
model = model.from_pretrained('kaggle-disaster/')

In [19]:
# The reloaded model must be moved to the selected device again; the returned module's repr is displayed.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [159]:
# Switch the shared dataset object to the 'eval' split, whose items carry the id column and no label.
dd.set_split('eval')

In [160]:
# Batches of 10 in dataset order (DataLoader does not shuffle by default), padded by `padded`.
data_gen = DataLoader(dd, batch_size=10, collate_fn=padded)

In [161]:
# Progress bar sized to the number of rows in the active split.
eval_bar = notebook.tqdm(desc="eval", total=len(dd))

HBox(children=(FloatProgress(value=0.0, description='eval', max=3263.0, style=ProgressStyle(description_width=…

In [162]:
# Collected ids and predicted classes, in the order the batches are produced.
submission_dict = {'id': [], 'target': []}
model.eval()
# Inference only: with autograd off, no activations are kept for a backward pass.
with torch.no_grad():
    for batch_dict in data_gen:
        # The collate function yields (seq_len, batch); flip to (batch, seq_len).
        x = batch_dict['x'].permute(1, 0).to(device)
        # NOTE(review): no attention_mask is passed, so padded positions are
        # attended to. Adding one must be done consistently for training and
        # inference.
        logits = model(x)[0]
        # Softmax is monotonic, so the arg-max of the raw logits is already
        # the predicted class.
        y_pred = logits.argmax(dim=1).cpu()

        submission_dict['id'].extend(batch_dict['id'])
        submission_dict['target'].extend(y_pred.tolist())

        # Advance by the real batch size: the final batch can be shorter than
        # the loader's batch size, and a fixed step would overshoot the total.
        eval_bar.update(len(batch_dict['id']))

In [163]:
# Two-column frame ('id', 'target') built from the collected predictions.
d_submission = pd.DataFrame(submission_dict)

In [164]:
# Sanity check: how many rows were predicted for each class.
d_submission.target.value_counts()

0    1949
1    1314
Name: target, dtype: int64

In [165]:
# Display the submission frame as the cell output.
d_submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [166]:
# Write the predictions without the DataFrame index, leaving only the 'id' and 'target' columns.
d_submission.to_csv('submission.csv', index=False)