<a href="https://colab.research.google.com/github/am-khan/affect/blob/master/BertBaseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
# Mount Google Drive at /content/drive (this import only exists inside Google Colab).
# NOTE(review): the dataset directory used later is given relative to `drive/`,
# so it presumably relies on this mount — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
!pip install torch
!pip install transformers
!pip install pandas
!pip install tqdm
!pip install pytorch-lightning

Collecting pytorch-lightning
[?25l  Downloading https://files.pythonhosted.org/packages/71/ab/561d1fa6e5af30b2fd7cb4001f93eb08531e1b72976f13eebf7f7cdc021c/pytorch_lightning-0.7.6-py3-none-any.whl (248kB)
[K     |████████████████████████████████| 256kB 2.8MB/s 
Collecting future>=0.17.1
[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)
[K     |████████████████████████████████| 829kB 44.2MB/s 
Building wheels for collected packages: future
  Building wheel for future (setup.py) ... [?25l[?25hdone
  Created wheel for future: filename=future-0.18.2-cp36-none-any.whl size=491057 sha256=5991540081a19b7cd1c3c798965142f958ff5cb5cbca222599e1ade3791b7e4e
  Stored in directory: /root/.cache/pip/wheels/8b/99/a0/81daf51dcd359a9377b110a8a886b3895921802d2fc1b2397e
Successfully built future
Installing collected packages: future, pytorch-lightning
  Found existing installation: future 0.16.0
 

In [0]:
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import logging
#logging.basicConfig(level=logging.INFO)

In [0]:
# Load the pre-trained tokenizer and the BERT sequence-classification model
# from the same 'bert-base-uncased' checkpoint.
# NOTE(review): no `num_labels` is passed, so the classification head uses the
# library default — presumably 2, matching num_classes in the fine-tuning
# config; confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Data Imports


In [0]:
import os
import pandas as pd
import re
from tqdm.notebook import tqdm

# Directory holding the dataset CSVs. The path is relative, so os.path.abspath
# resolves it against the current working directory.
# NOTE(review): assumes the working directory is the one containing the mounted
# `drive/` folder (/content on Colab) — confirm.
DATA_DIR = os.path.abspath('drive/My Drive/AffectData')

# Names of the CSV columns holding the input text and its label.
TEXT_COL = "text"
LABEL_COL = "label"

def read_imdb(data_dir: str):
    """Load the IMDB train and test partitions from CSV files.

    Args:
        data_dir: Directory containing `imdb_train.csv` and `imdb_test.csv`.

    Returns:
        Dict mapping the partition name ("train", "test") to the DataFrame
        read from the corresponding file.

    Raises:
        FileNotFoundError: If either CSV is missing from `data_dir`.
    """
    # Read from the directory the caller asked for, not from a global path.
    return {
        partition: pd.read_csv(os.path.join(data_dir, f'imdb_{partition}.csv'))
        for partition in ("train", "test")
    }

datasets = read_imdb(DATA_DIR)

In [0]:
# Distinct labels found in the training split. They are sorted so the
# label -> integer mapping is the same on every run: iterating a bare set of
# strings can yield a different order after each kernel restart.
labels = sorted(set(datasets["train"][LABEL_COL].tolist()))

# labels to integers mapping
label2int = {label: i for i, label in enumerate(labels)}

In [0]:
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader
from tqdm.notebook import tqdm
from typing import Tuple

# Passed to the tokenizer as `max_length`; every example is padded to this many
# token positions.
# NOTE(review): must not exceed the model's position-embedding limit
# (presumably 512 for bert-base-uncased) — confirm.
NUM_MAX_POSITIONS = 500
# Default mini-batch size for the DataLoaders; also stored in the fine-tuning config.
BATCH_SIZE = 16

class TextProcessor:
    """Converts a raw `(label, text)` pair into fixed-length model inputs.

    Attributes:
        tokenizer: Object exposing `encode_plus(text, max_length=...,
            pad_to_max_length=..., return_attention_mask=...,
            return_token_type_ids=...)` and returning a mapping with the keys
            "input_ids", "token_type_ids" and "attention_mask".
        label2id: Mapping from label value to integer class id.
        num_max_positions: `max_length` handed to the tokenizer.
    """

    def __init__(self, tokenizer, label2id: dict, num_max_positions: int = 512):
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.num_max_positions = num_max_positions

    def process_example(self, example: Tuple[str, str]):
        """Encode one example.

        Args:
            example: Pair whose first item is the label (`example[0]`) and
                whose second item is the text (`example[1]`).

        Returns:
            Tuple `(input_ids, token_type_ids, attention_mask, label_id)`.

        Raises:
            AssertionError: If `example` does not have exactly two items or
                the text is not a `str`.
            KeyError: If the label is not present in `label2id`.
        """
        assert len(example) == 2
        label, text = example[0], example[1]
        assert isinstance(text, str)
        # Use the tokenizer given to this instance rather than whatever global
        # happens to be named `tokenizer` in the notebook.
        encoded = self.tokenizer.encode_plus(text,
                                             max_length=self.num_max_positions,
                                             pad_to_max_length=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=True)

        # Subscript access works for both plain dicts and encoding objects.
        return (encoded["input_ids"],
                encoded["token_type_ids"],
                encoded["attention_mask"],
                self.label2id[label])


# Initialize the shared TextProcessor: it pairs the tokenizer loaded earlier
# with the label -> integer mapping and pads every example to NUM_MAX_POSITIONS.
processor = TextProcessor(tokenizer, label2int, num_max_positions=NUM_MAX_POSITIONS)

# Data Processing

In [10]:
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count
from itertools import repeat

# CPU count reported by the machine; used as the worker-pool size when
# tokenizing examples in parallel.
num_cores = cpu_count()

def process_row(processor, row):
    """Adapt one `DataFrame.iterrows()` item for `processor.process_example`.

    `row` is indexed as `row[1]` to reach the row's data; its label and text
    columns are packed into a `(label, text)` pair and passed on as the example.
    """
    record = row[1]
    label_and_text = (record[LABEL_COL], record[TEXT_COL])
    return processor.process_example(example=label_and_text)

def create_dataloader(df: pd.DataFrame,
                      processor: TextProcessor,
                      batch_size: int = BATCH_SIZE,
                      validation_pct: float = 0,
                      shuffle: bool = True):
    """Process rows in `df` with `num_cores` workers using `processor`.

    Args:
        df: Frame whose rows are fed to `process_row`.
        processor: Object passed to every `process_row` call.
        batch_size: Batch size of the returned loader(s).
        validation_pct: Fraction of the examples held out for validation.
            A value of 0 (or 0.0) disables the split.
        shuffle: Whether the single, unsplit loader reshuffles each epoch.
            When a validation split is requested the training loader always
            shuffles and the validation loader never does.

    Returns:
        `(train_loader, valid_loader)` when `validation_pct` is greater than
        zero, otherwise a single `DataLoader` over all examples.
    """
    # executor.map rejects chunksize < 1, which len(df) // 10 yields for
    # frames with fewer than 10 rows.
    chunksize = max(1, len(df) // 10)

    with ProcessPoolExecutor(max_workers=num_cores) as executor:
        result = list(
            tqdm(executor.map(process_row,
                              repeat(processor),
                              df.iterrows(),
                              chunksize=chunksize),
                 desc=f"Processing {len(df)} examples on {num_cores} cores",
                 total=len(df)))

    # Each result is (input_ids, token_type_ids, attention_mask, label_id);
    # regroup them column-wise so each becomes one tensor.
    ids = [r[0] for r in result]
    token_types = [r[1] for r in result]
    attention_masks = [r[2] for r in result]
    label_ids = [r[3] for r in result]

    dataset = TensorDataset(torch.tensor(ids, dtype=torch.long),
                            torch.tensor(token_types, dtype=torch.long),
                            torch.tensor(attention_masks, dtype=torch.long),
                            torch.tensor(label_ids, dtype=torch.long))

    # Compare by value: an identity test against the literal 0 treats 0.0 as
    # "split requested" and is flagged by newer Python versions.
    if validation_pct > 0:
        valid_size = int(validation_pct * len(df))
        train_size = len(df) - valid_size
        valid_dataset, train_dataset = random_split(dataset,
                                                    [valid_size, train_size])
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=batch_size,
                                  shuffle=False)
        train_loader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)

        return train_loader, valid_loader

    return DataLoader(dataset,
                      batch_size=batch_size,
                      num_workers=0,
                      shuffle=shuffle)


# Tokenize the training frame and hold out 10% of it for validation; a non-zero
# validation_pct makes create_dataloader return a (train, validation) pair.
train_loader, validation_loader = create_dataloader(datasets['train'], processor, validation_pct=0.1)
# The test frame is tokenized whole and comes back as a single loader
# (shuffled, because `shuffle` defaults to True).
test_loader = create_dataloader(datasets['test'], processor)

HBox(children=(FloatProgress(value=0.0, description='Processing 25000 examples on 2 cores', max=25000.0, style…




HBox(children=(FloatProgress(value=0.0, description='Processing 25000 examples on 2 cores', max=25000.0, style…




# Model

# Fine-tuning

In [12]:
from collections import namedtuple
import torch

# Output locations referenced by the fine-tuning configuration.
LOG_DIR = "./logs/"
CACHE_DIR = "./cache/"

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Immutable record of every fine-tuning hyper-parameter.
FineTuningConfig = namedtuple(
    'FineTuningConfig',
    field_names=[
        "num_classes", "dropout", "init_range", "batch_size", "lr", "max_norm",
        "n_epochs", "n_warmup", "valid_pct", "gradient_acc_steps", "device",
        "log_dir", "dataset_cache",
    ])

# Keyword construction keeps each value next to the field it sets.
finetuning_config = FineTuningConfig(
    num_classes=2,
    dropout=0.1,
    init_range=0.02,
    batch_size=BATCH_SIZE,
    lr=2e-5,
    max_norm=1.0,
    n_epochs=1,
    n_warmup=0,
    valid_pct=0.1,
    gradient_acc_steps=1,
    device=device,
    log_dir=LOG_DIR,
    dataset_cache=f"{CACHE_DIR}dataset_cache.bin",
)

# Bare expression so the notebook displays the resulting configuration.
finetuning_config

FineTuningConfig(num_classes=2, dropout=0.1, init_range=0.02, batch_size=10, lr=2e-05, max_norm=1.0, n_epochs=1, n_warmup=0, valid_pct=0.1, gradient_acc_steps=1, device='cuda', log_dir='./logs/', dataset_cache='./cache/dataset_cache.bin')

In [0]:
# Bert optimizer
from transformers import AdamW, get_linear_schedule_with_warmup

# Stand-alone optimizer over the classifier loaded earlier. `bert` is the model
# object this notebook defines; no name `model` exists, so referencing it
# raises NameError on a fresh kernel.
# NOTE: training itself uses the optimizer and scheduler built inside
# BertIMDBFinetuner.configure_optimizers; the objects here are not passed to
# the Trainer.
optimizer = AdamW(bert.parameters(), lr=finetuning_config.lr, correct_bias=False)

# PyTorch scheduler, sized for one scheduler step per training batch over
# n_epochs epochs.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=finetuning_config.n_warmup,
                                            num_training_steps=len(train_loader)*finetuning_config.n_epochs)


In [0]:
from sklearn.metrics import accuracy_score
import pytorch_lightning as pl

class BertIMDBFinetuner(pl.LightningModule):
    """LightningModule that fine-tunes the notebook-level `bert` classifier.

    Every batch is expected in the order produced by `create_dataloader`'s
    TensorDataset: `(input_ids, token_type_ids, attention_mask, labels)`.
    Data comes from the notebook-level `train_loader`, `validation_loader`
    and `test_loader`; hyper-parameters come from `finetuning_config`.
    """

    def __init__(self):
        super().__init__()
        self.bert = bert

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the wrapped model.

        `labels` is optional so the module can also be called for pure
        inference. The steps below read the result as `(loss, logits, ...)`
        when labels are supplied and take element 0 as the logits otherwise.
        """
        return self.bert(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         labels=labels)

    @staticmethod
    def _batch_accuracy(logits, labels):
        """Return the fraction of correct arg-max predictions as a tensor."""
        _, predictions = torch.max(logits, dim=1)
        # accuracy_score takes (y_true, y_pred).
        return torch.tensor(accuracy_score(labels.cpu(), predictions.cpu()))

    def training_step(self, batch, batch_nb):
        """Compute the training loss for one batch and log it as `train_loss`."""
        input_ids, token_type_ids, attention_mask, labels = batch

        outputs = self(input_ids, attention_mask, token_type_ids, labels)
        loss = outputs[0]

        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_nb):
        """Return the loss and accuracy of one validation batch."""
        input_ids, token_type_ids, attention_mask, labels = batch

        outputs = self(input_ids, attention_mask, token_type_ids, labels)
        loss, logits = outputs[:2]

        val_acc = self._batch_accuracy(logits, labels)

        return {'val_loss': loss, 'val_acc': val_acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation loss and accuracy."""
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_val_acc = torch.stack([x['val_acc'] for x in outputs]).mean()

        metrics = {'val_loss': avg_loss, 'avg_val_acc': avg_val_acc}
        # 'log' sends the metrics to the logger as well as the progress bar,
        # mirroring test_epoch_end.
        return {'val_loss': avg_loss, 'log': metrics, 'progress_bar': metrics}

    def test_step(self, batch, batch_nb):
        """Return the accuracy of one test batch."""
        # Same tensor order as the training/validation batches; swapping the
        # mask and the token types would feed each into the wrong argument.
        input_ids, token_type_ids, attention_mask, labels = batch

        # No labels are passed, so the first output element holds the logits.
        logits = self(input_ids, attention_mask, token_type_ids)[0]

        return {'test_acc': self._batch_accuracy(logits, labels)}

    def test_epoch_end(self, outputs):
        """Average the per-batch test accuracy."""
        avg_test_acc = torch.stack([x['test_acc'] for x in outputs]).mean()

        tensorboard_logs = {'avg_test_acc': avg_test_acc}
        return {'avg_test_acc': avg_test_acc, 'log': tensorboard_logs, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        """Build AdamW plus a linear warm-up schedule stepped on every batch."""
        optimizer = AdamW(self.bert.parameters(), lr=finetuning_config.lr, correct_bias=False)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=finetuning_config.n_warmup,
                                                    num_training_steps=len(train_loader)*finetuning_config.n_epochs)
        # num_training_steps counts batches, so the scheduler has to advance
        # once per optimizer step rather than once per epoch.
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    def train_dataloader(self):
        return train_loader

    def val_dataloader(self):
        return validation_loader

    def test_dataloader(self):
        return test_loader

In [0]:
torch.cuda.empty_cache()
bert_finetuner = BertIMDBFinetuner()

# Train for exactly the number of epochs the learning-rate schedule was sized
# for (finetuning_config.n_epochs); without max_epochs the Trainer falls back
# to its own default and keeps going after the schedule has decayed to zero.
# Use one GPU when available, otherwise run on the CPU instead of failing.
trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else None,
                     max_epochs=finetuning_config.n_epochs)
trainer.fit(bert_finetuner)

GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]

    | Name                                                  | Type                          | Params
----------------------------------------------------------------------------------------------------
0   | bert                                                  | BertForSequenceClassification | 109 M 
1   | bert.bert                                             | BertModel                     | 109 M 
2   | bert.bert.embeddings                                  | BertEmbeddings                | 23 M  
3   | bert.bert.embeddings.word_embeddings                  | Embedding                     | 23 M  
4   | bert.bert.embeddings.position_embeddings              | Embedding                     | 393 K 
5   | bert.bert.embeddings.token_type_embeddings            | Embedding                     | 1 K   
6   | bert.bert.embeddings.LayerNorm                        | LayerNorm     

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


In [0]:
# Start tensorboard.
# NOTE(review): `lightning_logs/` is presumably the directory the Trainer
# writes its event files to by default — confirm.
%load_ext tensorboard
%tensorboard --logdir lightning_logs/