<a href="https://colab.research.google.com/github/adeepH/SPDRA_2021_SharedTask/blob/main/SPDRA_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers==3.3.1
!pip install pytorch_lightning



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoModel,AutoTokenizer,AdamW,get_linear_schedule_with_warmup

import torch
from torch import tensor
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping,ModelCheckpoint
from pytorch_lightning.metrics.functional import accuracy
#import pytorch_lightning.metrics.sklearns.F1 as F1
from collections import OrderedDict 
import random
random.seed(42) 

In [4]:
p = {
    'BATCH_SIZE' : 32,
     'MAX_LEN' : 128,
     'MODEL' : 'allenai/scibert_scivocab_uncased',
     'TRAIN_VALID_SPLIT' : 0.1,
     'DROPOUT_0' : 0.4,
     'DROPOUT_1' : 0.4,
     'N_CLASSES' : 7,
     'CLIPPING' : True,
     'SCHEDULER' : True,
     'LR' : 2e-5,
     'INTER_HIDDEN_DIM' : 128,
     'MAX_EPOCH' : 5

}

In [5]:
import pandas as pd
train = pd.read_csv('/content/drive/MyDrive/sdpra2021/train.csv',delimiter=',',
                 header=None,names=['sentence','label'])

train.label = train.label.apply({'CL':0,'CR':1,'DC':2,
'DS':3,'LO':4, 'NI':5, 'SE':6}.get)
val = pd.read_csv('/content/drive/MyDrive/sdpra2021/validation.csv',delimiter=',',
                 header=None,names=['sentence','label'])

val.label = val.label.apply({'CL':0,'CR':1,'DC':2,
'DS':3,'LO':4, 'NI':5, 'SE':6}.get)

test = pd.read_csv('/content/drive/MyDrive/sdpra2021/test.csv',delimiter=',',
                 header=None,names=['sentence'])
print('Training set size:',train.shape)
print('Testing set size:',test.shape)
print('validation set size:',val.shape)

Training set size: (16800, 2)
Testing set size: (7000, 1)
validation set size: (11200, 2)


In [6]:
class TaskDataset(Dataset): 
    
  def __init__(self,sentence,label,tokenizer,max_len):
    self.sentence = sentence
    self.label = label
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.sentence)
  
  def __getitem__(self,item):
    sentence = str(self.sentence[item])
    label = self.label[item]

    encoding = self.tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length = self.max_len,
        return_token_type_ids = False,
        padding = 'max_length',
        return_attention_mask= True,
        return_tensors='pt',
        truncation=True
    )

    return {
        'sentences' : sentence,
        'input_ids' : encoding['input_ids'].flatten(),
        'attention_mask' : encoding['attention_mask'].flatten(),
        'label' : torch.tensor(label,dtype=torch.long)

    }
 

In [7]:
def create_data_loader(df,tokenizer,max_len,batch_size):

    ds = TaskDataset(
        sentence = df.sentence.to_numpy(),         
        tokenizer = tokenizer,
        max_len = max_len,
        label = df.label.to_numpy()
         
    )

    return DataLoader(
        ds,
        batch_size = batch_size,
        num_workers = 4,
        shuffle = True
    )

In [8]:
tokenizer = AutoTokenizer.from_pretrained(p['MODEL'])


In [9]:
class Classifier(pl.LightningModule):
    
    def __init__(self,train_df,valid_df,test_df,n_c=7,params=p):
        
        super().__init__()
        self.hparams = p
        self.train_df = train_df
        self.valid_df = valid_df
        self.test_df = test_df
        self.model = AutoModel.from_pretrained(p['MODEL'])
        self.drop0 = nn.Dropout(p=p['DROPOUT_0'])
        self.drop1 = nn.Dropout(p=p['DROPOUT_1'])
        #self.log_softmax = F.log_softmax()
        self.lin0 = nn.Linear(self.model.config.hidden_size,p['INTER_HIDDEN_DIM'])
        self.lin1 = nn.Linear(p['INTER_HIDDEN_DIM'],p['N_CLASSES'])
        

    def forward(self,input_ids,attention_mask):

        _, pooled_output = self.model(
            input_ids = input_ids,
            attention_mask = attention_mask
        )

        x = self.drop0(pooled_output)
        x = self.lin0(x)
        x = F.softmax(x)
        x = self.drop1(x)
        return self.lin1(x)
    
    
    def step(self,batch):

        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        label = batch["label"]

        outputs = self(
            input_ids = input_ids,
            attention_mask = attention_mask
        )

        _, preds = torch.max(outputs,dim=1)

        acc = (preds == label).float().mean()
        loss = F.cross_entropy(outputs,label)

        return OrderedDict({
            'loss' : loss,
            'accuracy' : acc
        })

    
    def training_step(self,batch,batch_idx):
        
        return self.step(batch)

    
    def training_epoch_end(self,outputs):
        
        loss_mean = torch.stack([output['loss'] for output in outputs]).float().mean()
        acc_mean = torch.stack([output['accuracy'] for output in outputs]).float().mean()

        self.log('train_loss' , loss_mean)
        self.log('train_accuracy', acc_mean)

        if p['CLIPPING'] : nn.utils.clip_grad_norm_(self.parameters(),max_norm=1.0)
        if p['SCHEDULER'] : self.scheduler.step()

    
    def validation_step(self,batch,batch_idx):

        return self.step(batch)
    

    def validation_epoch_end(self,outputs):

        loss_mean = torch.stack([output['loss'] for output in outputs]).float().mean()
        acc_mean = torch.stack([output['accuracy'] for output in outputs]).float().mean()
        #metrics = 
        self.log('valid_loss' , loss_mean, prog_bar = True)
        self.log('valid_accuracy', acc_mean,prog_bar=True)

    def configure_optimizers(self):

        optim = AdamW(self.parameters(),lr=p['LR'],correct_bias=False)

        self.scheduler = get_linear_schedule_with_warmup(
            optim,
            num_warmup_steps = 0,
            num_training_steps = len(self.train_dataloader()) * 100
        )
        return optim
    
    #def test_step(self,batch):

    #    return self.step(batch)
   
    def test_step(self,batch,batch_idx):
#
        return self.step(batch),self.log_dict


    #def test_epoch_end(self,outputs):

    #    loss_mean = torch.stack([output['loss'] for output in outputs]).float().mean()
    #    acc_mean = torch.stack([output['accuracy'] for output in outputs]).float().mean()

    #    self.log('test_loss',loss_mean,prog_bar=True)
    #    self.log('test_acc',acc_mean,prog_bar=True)
        
#
#        metrics = self.validation_step(batch,batch_idx)
#        metrics = {
#            'test_acc' : metrics['val_acc'],
#            'test_loss' : metrics['val_loss']
#        }
#        self.log_dict(metrics)"""
    
    def train_dataloader(self):

        return create_data_loader(self.train_df,tokenizer,p['MAX_LEN'],p['BATCH_SIZE'])
    
    
    def val_dataloader(self):
        
        return create_data_loader(self.valid_df,tokenizer,p['MAX_LEN'],p['BATCH_SIZE'])
    
    def test_dataloader(self):

        return create_data_loader(self.test_df,tokenizer,p['MAX_LEN'],p['BATCH_SIZE']) 



In [10]:
classifier = Classifier(train,val,test,7,params=p)

In [11]:
checkpoint = ModelCheckpoint(
    monitor='valid_accuracy',
    filename='bert',
    mode='max'
)

early_stopping = EarlyStopping(
    monitor='valid_accuracy',
    patience=2,
    verbose=True,
    mode='max'
)

In [12]:
trainer = pl.Trainer(gpus=1, max_epochs=p['MAX_EPOCH'], deterministic=True,progress_bar_refresh_rate=30,callbacks=[checkpoint, early_stopping])
trainer.fit(classifier)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params
------------------------------------
0 | model | BertModel | 109 M 
1 | drop0 | Dropout   | 0     
2 | drop1 | Dropout   | 0     
3 | lin0  | Linear    | 98.4 K
4 | lin1  | Linear    | 903   
------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1

# ***Inference***

In [21]:
ckp_f = '/content/lightning_logs/version_0/checkpoints/'

In [14]:
classifier = Classifier(None,None,None,None,params=p)

In [15]:
def load_model(p):
    ckp = torch.load(p)
    classifier.load_state_dict(ckp['state_dict'])

In [22]:
load_model(ckp_f)

IsADirectoryError: ignored

In [None]:
test = create_test_data_loader(test,tokenizer,p['MAX_LEN'],p['BATCH_SIZE'])

In [None]:
trainer.test(test_dataloaders=test)