## Setup

In [4]:
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=DeprecationWarning)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import nlpaug.augmenter.word as naw

import torch
from torch import tensor
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

from collections import OrderedDict
import random
random.seed(42)

In [10]:
p = {
    'BATCH_SZ': 10,
    'MAX_LEN': 120,
    'MODEL': 'bert-base-cased', #xlnet-base-cased xlm-mlm-en-2048
    'TRAIN_VALID_SPLIT': 0.07,
    'DROPOUT_0': 0.8,
    'DROPOUT_1': 0.8,
    'N_CLASSES': 2,
    'CLIPPING': True,
    'SCHEDULER': True,
    'LR': 2e-5,
    'LIN_0_HIDDEN_SZ': 256,
    'ADDED_AUGMENTED_TWEETS': 4000,
    'AUGMENTATION_MODEL': 'bert-base-cased'
}

## Dataset

In [94]:
DATA = Path('/home/sharif/Documents/Challenges/nlp-with-disaster-tweets/data')
train_df, test_df = pd.read_csv(DATA/'train.csv'), pd.read_csv(DATA/'test.csv')

In [5]:
class DS(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self): return len(self.texts)
    
    def __getitem__(self, item):
        text = str(self.texts[item])
        target = self.targets[item]
        
        encoding = self.tokenizer.encode_plus(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          return_token_type_ids=False,
          pad_to_max_length=True,
          return_attention_mask=True,
          return_tensors='pt',
          truncation=True
        )
        
        return {
          'text': text,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'targets': torch.tensor(target, dtype=torch.long)
        }

In [95]:
train_df, valid_df = train_test_split(train_df, test_size=p['TRAIN_VALID_SPLIT'], random_state=42)

In [7]:
train_df.reset_index(inplace=True)
valid_df.reset_index(inplace=True)

In [8]:
len(train_df), len(valid_df)

(7080, 533)

In [9]:
train_df.head()

Unnamed: 0,index,id,keyword,location,text,target
0,965,1395,body%20bag,"Greenville,SC",@TR_jdavis Bruh you wanna fight I'm down meet ...,0
1,7231,10355,weapons,,@NRO Except when ordered not to carry unauthor...,1
2,4421,6292,hostage,,Egyptian Militants Tied to ISIS Threaten to Ki...,1
3,7133,10217,volcano,,Eruption of Indonesian volcano sparks transpor...,1
4,6276,8967,storm,"Wilmington, NC",New item: Pillow Covers ANY SIZE Pillow Cover ...,0


## Augmenatation

In [10]:
aug = naw.ContextualWordEmbsAug(model_path=p['AUGMENTATION_MODEL'], action="substitute")

  and should_run_async(code)


In [11]:
def augment(train_df):
    n_tweets = p['ADDED_AUGMENTED_TWEETS']
    for i in range(n_tweets):
        if i % 100 == 0: print(f'{i}/{n_tweets}')
        idx = random.randint(0, len(train_df))
        target = train_df.target[idx]
        text = train_df.text[idx]
        text_aug = aug.augment(text)
        train_df = train_df.append({'text': text_aug, 'target': target}, ignore_index=True)
    return train_df

In [12]:
train_df = augment(train_df); len(train_df)

0/4000
100/4000
200/4000
300/4000
400/4000
500/4000
600/4000
700/4000
800/4000
900/4000
1000/4000
1100/4000
1200/4000
1300/4000
1400/4000
1500/4000
1600/4000
1700/4000
1800/4000
1900/4000
2000/4000
2100/4000
2200/4000
2300/4000
2400/4000
2500/4000
2600/4000
2700/4000
2800/4000
2900/4000
3000/4000
3100/4000
3200/4000
3300/4000
3400/4000
3500/4000
3600/4000
3700/4000
3800/4000
3900/4000


11080

In [13]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = DS(
        texts=df.text.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
        targets=df.target.to_numpy()
    )
    
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4,
        shuffle=True
    )

## Model + Training

In [14]:
tokenizer = BertTokenizer.from_pretrained(p['MODEL'])

In [6]:
class BertClassifier(pl.LightningModule):
    def __init__(self, train_df, valid_df, n_c=2, params=p):
        super().__init__()
        self.hparams = p
        self.train_df, self.valid_df = train_df, valid_df
        
        self.bert = BertModel.from_pretrained(p['MODEL'])
        self.drop0 = nn.Dropout(p=p['DROPOUT_0'])
        self.drop1 = nn.Dropout(p=p['DROPOUT_1'])
        self.lin0 = nn.Linear(self.bert.config.hidden_size, p['LIN_0_HIDDEN_SZ'])
        self.lin1 = nn.Linear(p['LIN_0_HIDDEN_SZ'], p['N_CLASSES'])
    
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        y = self.drop0(pooled_output)
        y = self.lin0(y)
        y = self.drop1(y)
        return self.lin1(y)
    
    def step(self, batch):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        targets = batch["targets"]
        
        outputs = self(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        
        _, preds = torch.max(outputs, dim=1)
        
        acc = (preds == targets).float().mean()
        loss =  F.cross_entropy(outputs, targets)
        return OrderedDict({
            'loss': loss,
            'accuracy': acc
        })
    
    def training_step(self, batch, batch_idx):
        return self.step(batch)
    
    def training_epoch_end(self, outputs):
        loss_mean = torch.stack([output['loss'] for output in outputs]).float().mean()
        acc_mean = torch.stack([output['accuracy'] for output in outputs]).float().mean()
        self.log('train_loss', loss_mean)
        self.log('train_accuracy', acc_mean, prog_bar=True)
        
        if p['CLIPPING']: nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)
        if p['SCHEDULER']: self.scheduler.step()
    
    def validation_step(self, batch, batch_idx):
        return self.step(batch)
    
    def validation_epoch_end(self, outputs):
        loss_mean = torch.stack([output['loss'] for output in outputs]).float().mean()
        acc_mean = torch.stack([output['accuracy'] for output in outputs]).float().mean()
        self.log('valid_loss', loss_mean, prog_bar=True)
        self.log('valid_accuracy', acc_mean, prog_bar=True)
        
    def configure_optimizers(self):
        optim = AdamW(self.parameters(), lr=p['LR'], correct_bias=False) 
        self.scheduler = get_linear_schedule_with_warmup(
          optim,
          num_warmup_steps=0,
          num_training_steps=len(self.train_dataloader())*100
        )
        return optim
            
    def train_dataloader(self): return create_data_loader(self.train_df, tokenizer, p['MAX_LEN'], p['BATCH_SZ'])
    def val_dataloader(self): return create_data_loader(self.valid_df, tokenizer, p['MAX_LEN'], p['BATCH_SZ'])

In [16]:
classifier = BertClassifier(train_df, valid_df, params=p)

In [17]:
checkpoint = ModelCheckpoint(
    monitor='valid_accuracy',
    filename='transformer-{epoch:02d}-{valid_accuracy:.4f}',
    save_top_k=3,
    mode='max'
)

early_stopping = EarlyStopping(
    monitor='valid_accuracy',
    patience=30,
    verbose=True,
    mode='max'
)

In [18]:
trainer = pl.Trainer(gpus=1, max_epochs=100, deterministic=True, callbacks=[checkpoint, early_stopping])#, fast_dev_run=True)
trainer.fit(classifier)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params
------------------------------------
0 | bert  | BertModel | 108 M 
1 | drop0 | Dropout   | 0     
2 | drop1 | Dropout   | 0     
3 | lin0  | Linear    | 196 K 
4 | lin1  | Linear    | 514   


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…




1

## Inference

In [134]:
ckp_f = '/home/sharif/Documents/Challenges/nlp-with-disaster-tweets/lightning_logs/architecture_1/version_0/checkpoints/transformer-epoch=10-valid_accuracy=0.8136.ckpt'

In [135]:
classifier = BertClassifier(None, None, params=p)

In [136]:
def load_model(p):
    ckp = torch.load(p)
    classifier.load_state_dict(ckp['state_dict'])

In [137]:
load_model(ckp_f)

In [138]:
tokenizer = BertTokenizer.from_pretrained(p['MODEL'])

In [139]:
def predict(text):
    encoding = tokenizer.encode_plus(
              text,
              add_special_tokens=True,
              max_length=120,
              return_token_type_ids=False,
              pad_to_max_length=True,
              return_attention_mask=True,
              return_tensors='pt',
              truncation=True
    )
    y_hat = classifier(encoding['input_ids'], encoding['attention_mask'])
    return y_hat.argmax().item()

In [140]:
def accuracy():
    correct = 0.0
    for i, row in enumerate(valid_df.values):
        if i % 100 == 0: 
            print(f'{i}/{len(valid_df)}')
            print(correct/(i+1))
        s = row[3]
        y = int(row[4])
        y_hat = predict(s)
        correct += int(y_hat == y)

In [130]:
accuracy()

0/533
0.0
100/533
0.8217821782178217
200/533
0.8009950248756219
300/533
0.8006644518272426
400/533
0.8179551122194514
500/533
0.8183632734530938


## Submission

In [141]:
preds = []
for i, row in enumerate(test_df.values):
    if i % 100 == 0: print(f'{i}/{len(test_df)}')
    id = row[0]
    s = row[-1]
    y_hat = predict(s)
    preds.append([id, y_hat])

0/3263
100/3263
200/3263
300/3263
400/3263
500/3263
600/3263
700/3263
800/3263
900/3263
1000/3263
1100/3263
1200/3263
1300/3263
1400/3263
1500/3263
1600/3263
1700/3263
1800/3263
1900/3263
2000/3263
2100/3263
2200/3263
2300/3263
2400/3263
2500/3263
2600/3263
2700/3263
2800/3263
2900/3263
3000/3263
3100/3263
3200/3263


In [142]:
submission = pd.DataFrame(preds, columns=['id', 'target']).set_index('id')
submission.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1


In [143]:
submission.to_csv('submission.csv')