<a href="https://colab.research.google.com/github/amrtanair/master_thesis/blob/main/BERT_linguistic_acceptability_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook-wide configuration read by the cells below.
# Datasets this notebook knows how to load; `dataset` selects one of them.
all_datasets = [ "CoLA", "MegaAcceptability"]
dataset = all_datasets[0]
# debug=False: hyperparameters come from a wandb sweep (see the last cell).
# debug=True: run() is called directly with fixed hyperparameters, evaluates
# the test split and, if `model_save` is set, saves the model.
debug = False
model_save = True
# Hugging Face checkpoint name used for both the tokenizer and the model.
model_name = 'bert-large-uncased'

# wandb is only installed and authenticated when a sweep will actually run.
if not debug:
  !pip install wandb
  import wandb
  wandb.login()

!pip install wget
import wget
import os

import time
import datetime
import random
import json
import shutil
from tqdm import tqdm
import re
import math

import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import matthews_corrcoef

from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup

from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import logging
logging.set_verbosity_error()

# Fetch and unpack the CoLA archive once; both steps are skipped when their
# output already exists in the current working directory.
if dataset == "CoLA":
  url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'
  if not os.path.exists('./cola_public_1.1.zip'):
    print('Downloading dataset...')
    wget.download(url, './cola_public_1.1.zip')
  if not os.path.exists('./cola_public/'):
    !unzip cola_public_1.1.zip

  # NOTE(review): the archive is unpacked relative to the working directory
  # but read back through absolute /content paths; this assumes the notebook
  # runs with /content as its working directory - confirm.
  # These three paths are only defined for CoLA; run() reads them as globals.
  train_path = '/content/cola_public/raw/in_domain_train.tsv'
  dev_path = '/content/cola_public/raw/in_domain_dev.tsv'
  test_path = '/content/cola_public/raw/out_of_domain_dev.tsv'

# Mount Google Drive; the MegaAcceptability file and the saved-model copy
# destination both live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Collecting wandb
  Downloading wandb-0.17.4-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.10.0-py2.py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.1/302.1 kB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x8

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Collecting wget
  Using cached wget-3.2-py3-none-any.whl
Installing collected packages: wget
Successfully installed wget-3.2
Downloading dataset...
Archive:  cola_public_1.1.zip
   creating: cola_public/
  inflating: cola_public/README      
   creating: cola_public/tokenized/
  inflating: cola_public/tokenized/in_domain_dev.tsv  
  inflating: cola_public/tokenized/in_domain_train.tsv  
  inflating: cola_public/tokenized/out_of_domain_dev.tsv  
   creating: cola_public/raw/
  inflating: cola_public/raw/in_domain_dev.tsv  
  inflating: cola_public/raw/in_domain_train.tsv  
  inflating: cola_public/raw/out_of_domain_dev.tsv  
Mounted at /content/drive


In [None]:
class AcceptabilityDataset():
  """Indexable collection of (sentence, label) pairs tokenized on access.

  The tokenizer is loaded from the checkpoint named by the module-level
  ``model_name``. Every lookup tokenizes a single sentence, padded or
  truncated to a fixed length.
  """

  def __init__(self, texts, labels, length):
    """Keep the raw data and load the tokenizer.

    Args:
      texts: indexable collection of sentences.
      labels: indexable collection of labels, parallel to ``texts``.
      length: token length each example is padded/truncated to.
    """
    self.texts, self.labels = texts, labels
    self.max_length = length
    self.tokenizer = BertTokenizer.from_pretrained(model_name)
    self.n_examples = len(labels)

  def __len__(self):
    """Return the number of examples (taken from the label collection)."""
    return self.n_examples

  def __getitem__(self, idx):
    """Tokenize example ``idx`` and return its tensors.

    Returns:
      dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
      scalar ``labels`` tensor of dtype ``torch.long``.
    """
    sentence = self.texts[idx]
    target = self.labels[idx]

    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=self.max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = self.tokenizer.encode_plus(sentence, **tokenizer_options)

    # The tokenizer returns batch-shaped tensors; flatten them to 1-D.
    item = {key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')}
    item['labels'] = torch.tensor(target, dtype=torch.long)
    return item

class FocalLoss(nn.Module):
  """Sigmoid focal loss over one-hot encoded class targets.

  Each logit is treated as an independent binary prediction: the integer
  targets are expanded to one-hot rows, the element-wise binary
  cross-entropy is computed on the logits, and every element is scaled by
  ``alpha * (1 - p_t) ** gamma``, where ``p_t`` is the sigmoid probability
  assigned to the correct side of that element.

  Args:
    alpha: scalar multiplier applied to the loss. The default of 1 leaves
      the loss unscaled.
    gamma: focusing exponent; larger values down-weight well-classified
      elements more strongly.
    reduction: ``'mean'`` or ``'sum'`` to reduce over all elements; any
      other value returns the unreduced element-wise loss.
  """

  def __init__(self, alpha=1, gamma=2, reduction='mean'):
    super().__init__()
    self.alpha = alpha
    self.gamma = gamma
    self.reduction = reduction

  def forward(self, inputs, targets):
    """Compute the focal loss.

    Args:
      inputs: logits with one row per example and one column per class.
      targets: integer class index per example.

    Returns:
      A scalar tensor for ``'mean'``/``'sum'`` reduction, otherwise a tensor
      shaped like ``inputs``. The result lives on the device of ``inputs``.
    """
    # Build the one-hot targets next to the logits so the whole computation
    # stays on one device (no host copy, no cross-device indexing), and take
    # the class count from the logits instead of assuming two classes.
    y_targets = F.one_hot(targets.long(), num_classes=inputs.shape[1])
    y_targets = y_targets.to(device=inputs.device, dtype=inputs.dtype)

    p = torch.sigmoid(inputs)
    ce_loss = F.binary_cross_entropy_with_logits(inputs, y_targets, reduction="none")
    # Probability assigned to the correct side of each binary decision.
    p_t = p * y_targets + (1 - p) * (1 - y_targets)
    focal_loss = self.alpha * ce_loss * ((1 - p_t) ** self.gamma)

    if self.reduction == 'mean':
      return focal_loss.mean()
    if self.reduction == 'sum':
      return focal_loss.sum()
    return focal_loss

def train(model, dataloader, optimizer, device, scheduler):
  """Run one training epoch with focal loss.

  Args:
    model: classifier whose output exposes ``.logits``.
    dataloader: iterable of batches with ``input_ids``, ``attention_mask``
      and ``labels`` entries; must support ``len()``.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.

  Returns:
    Tuple ``(model, avg_loss, accuracy)`` where ``avg_loss`` is the mean
    per-batch loss and ``accuracy`` the fraction of correct predictions.
  """
  criterion = FocalLoss()
  model.train()
  running_loss, n_correct, n_seen = 0, 0, 0

  for batch in dataloader:
    optimizer.zero_grad()

    input_ids, attention_mask, labels = (
        batch[key].to(device)
        for key in ('input_ids', 'attention_mask', 'labels'))

    logits = model(input_ids, attention_mask=attention_mask).logits
    loss = criterion(logits, labels)
    running_loss += loss.item()

    loss.backward()
    optimizer.step()
    scheduler.step()

    # Predicted class is the column with the largest logit.
    n_correct += (torch.argmax(logits, dim=1) == labels).sum().item()
    n_seen += labels.size(0)

  return model, running_loss / len(dataloader), n_correct / n_seen

def validate(model, dataloader, device):
  """Evaluate the model on a dataloader without updating weights.

  Args:
    model: classifier whose output exposes ``.logits``.
    dataloader: iterable of batches with ``input_ids``, ``attention_mask``
      and ``labels`` entries; must support ``len()``.
    device: device the batch tensors are moved to.

  Returns:
    Tuple ``(avg_loss, accuracy, all_preds, all_labels)``: the mean
    per-batch focal loss, the fraction of correct predictions, and the
    flat lists of predicted and true labels in dataloader order.
  """
  criterion = FocalLoss()
  model.eval()
  total_loss = 0
  total_correct = 0
  total_samples = 0

  all_preds = []
  all_labels = []

  with torch.no_grad():
    for batch in dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      # Labels are not passed to the model: the loss reported here is the
      # focal loss below, so a model-internal loss would be computed only to
      # be discarded. This also matches how train() calls the model.
      outputs = model(input_ids, attention_mask=attention_mask)
      logits = outputs.logits
      loss = criterion(logits, labels)
      total_loss += loss.item()

      predictions = torch.argmax(logits, dim=1)
      all_preds.extend(predictions.cpu().numpy())
      all_labels.extend(labels.cpu().numpy())

      total_correct += (predictions == labels).sum().item()
      total_samples += labels.size(0)

  avg_loss = total_loss / len(dataloader)
  accuracy = total_correct / total_samples
  return avg_loss, accuracy, all_preds, all_labels

def save_model(model, path):
  """Write ``model`` to ``path`` through its ``save_pretrained`` method."""
  model.save_pretrained(path)

def load_model(path):
  """Return a ``BertForSequenceClassification`` loaded from ``path``."""
  restored = BertForSequenceClassification.from_pretrained(path)
  return restored


In [None]:
def run():
  """Fine-tune and evaluate BERT on the selected acceptability dataset.

  Hyperparameters come from ``wandb.config`` when the module-level ``debug``
  flag is false (sweep mode) and from fixed defaults otherwise. Training
  proceeds epoch by epoch until the validation loss stops decreasing or
  five epochs have completed. In sweep mode the final metrics are logged to
  wandb. In debug mode the model is also scored on the test split and, if
  ``model_save`` is set, written to ``./<dataset>-<model_name>/`` together
  with JSON files holding the training arguments and the per-epoch
  loss/accuracy history.

  Raises:
    ValueError: if ``dataset`` or the configured optimizer name is not one
      of the supported values.
  """
  if not debug:
    wandb.init()
    learning_rate = wandb.config.learning_rate
    batch_size = wandb.config.batch_size
    length = wandb.config.length
    optimizer_name = wandb.config.optimizer
    weight_decay = wandb.config.weight_decay
    hidden_dropout_prob = wandb.config.hidden_dropout_prob
  else:
    learning_rate = 5e-05
    batch_size = 16
    length = 32
    optimizer_name = 'AdamW'
    weight_decay = 0.2
    hidden_dropout_prob = 0.1

  # Seed every random number generator in use.
  seed = 42
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    device = torch.device("cuda")
    device_name = torch.cuda.get_device_name(0)
    print('GPU:', device_name)
  else:
    print('Using CPU')
    device = torch.device("cpu")
    device_name = 'cpu'

  # Both datasets are headerless TSV files read with the same four columns.
  columns = ['sentence_source', 'label', 'label_notes', 'sentence']
  if dataset == "CoLA":
    train_df = pd.read_csv(train_path, delimiter='\t', header=None, names=columns)
    val_df = pd.read_csv(dev_path, delimiter='\t', header=None, names=columns)
    test_df = pd.read_csv(test_path, delimiter='\t', header=None, names=columns)
  elif dataset in ["MegaAcceptability"]:
    path = "/content/drive/MyDrive/thesis/mega_acceptability.tsv"
    df = pd.read_csv(path, delimiter='\t', header=None, names=columns)

    # Shuffle, then split 80/10/10 into train/test/validation.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
    test_df, val_df = train_test_split(temp_df, test_size=0.5, random_state=42)
  else:
    # Fail with a clear message instead of an unbound-variable error below.
    raise ValueError(f"Unsupported dataset: {dataset!r}")

  def make_dataloader(frame, sampler_cls):
    # Wrap one split in a tokenizing dataset and a batched loader.
    split = AcceptabilityDataset(frame.sentence.values, frame.label.values, length)
    return DataLoader(split, sampler=sampler_cls(split), batch_size=batch_size)

  # Only the training split is shuffled; evaluation order stays fixed.
  train_dataloader = make_dataloader(train_df, RandomSampler)
  val_dataloader = make_dataloader(val_df, SequentialSampler)
  test_dataloader = make_dataloader(test_df, SequentialSampler)

  model = BertForSequenceClassification.from_pretrained(model_name,
                                                        num_labels = 2,
                                                        hidden_dropout_prob = hidden_dropout_prob)
  model.to(device)

  if optimizer_name == 'AdamW':
    optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate, weight_decay = weight_decay)
  elif optimizer_name == 'Adam':
    optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, weight_decay = weight_decay)
  else:
    # Previously an unknown name left a plain string where an optimizer
    # object was expected.
    raise ValueError(f"Unsupported optimizer: {optimizer_name!r}")

  # The schedule is laid out over two epochs of optimizer steps.
  # NOTE(review): training may continue for up to five epochs; confirm that
  # a two-epoch schedule is intended.
  total_steps = len(train_dataloader) * 2
  num_warmup_steps = 10000
  if num_warmup_steps >= total_steps:
    # A warmup at least as long as the whole schedule would keep the
    # learning rate from ever reaching its configured value; fall back to
    # warming up over the first 10% of the schedule instead.
    num_warmup_steps = total_steps // 10
  scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps = num_warmup_steps,
                                              num_training_steps = total_steps)

  all_loss = {'train_loss':[], 'val_loss':[]}
  all_acc = {'train_acc':[], 'val_acc':[]}

  max_epochs = 5
  prev_val_loss = None
  epoch = 1

  # Dynamic early stopping based on validation loss.
  while True:
    print("Epoch: ", epoch)
    print('Training...')
    train_dataloader_tqdm = tqdm(train_dataloader, desc=f"Training Epoch {epoch}")
    model, train_loss, train_accuracy = train(model, train_dataloader_tqdm, optimizer, device, scheduler)

    print('Validation...')
    val_loss, val_accuracy, val_predict, val_labels = validate(model, val_dataloader, device)

    print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(train_loss, val_loss, train_accuracy, val_accuracy))
    all_loss['train_loss'].append(train_loss)
    all_loss['val_loss'].append(val_loss)
    all_acc['train_acc'].append(train_accuracy)
    all_acc['val_acc'].append(val_accuracy)

    # From the second epoch on, stop as soon as validation loss fails to
    # improve, or once the epoch cap is reached.
    if prev_val_loss is not None:
      if val_loss >= prev_val_loss:
        print('Early stopping due to no improvement in validation loss.')
        break
      if epoch >= max_epochs:
        print('Early stopping due to maximum number of epochs exceeded(maximum is 5).')
        break
    prev_val_loss = val_loss
    epoch = epoch + 1

  # MCC is computed on the predictions of the last completed epoch.
  mcc = matthews_corrcoef(val_labels, val_predict)
  print(f'Final MCC is {mcc} for dev data')

  if not debug:
    wandb.log({"epoch": epoch, "train_loss": train_loss,
              "val_loss": val_loss, "train_acc": train_accuracy,
              "val_acc": val_accuracy, "MCC": mcc})
  else:
    _, test_accuracy, test_predict, test_labels = validate(model, test_dataloader, device)
    test_mcc = matthews_corrcoef(test_labels, test_predict)
    print(f'Final MCC is {test_mcc} for test data')
    print('Difference in MCC is: ', abs(mcc-test_mcc))

    if model_save:
      output_dir =  './' + dataset + '-' + model_name + '/'
      training_args = {'created': datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S'),
              'dataset': dataset,
              'model': model_name,
              'device': device_name,
              'batch_size': batch_size,
              'epochs': epoch,
              'learning_rate': learning_rate,
              'optimizer': type(optimizer).__name__,
              'seed': seed,
              'test_MCC': test_mcc,
              'dev_MCC': mcc,
              'accuracy': test_accuracy,
              'weight_decay': weight_decay,
              'hidden_dropout_prob': hidden_dropout_prob,
              'length': length,
              }
      print("Saving model to: ", output_dir)
      # Remove any previous export first, then create the directory. The old
      # order (create, then remove-if-exists) raised FileExistsError whenever
      # the directory already existed and always deleted the fresh directory.
      if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
      os.makedirs(output_dir, exist_ok=True)
      save_model(model, output_dir)

      with open(os.path.join(output_dir, 'training_args.json'), "w") as json_file:
        json.dump(training_args, json_file)
      with open(os.path.join(output_dir, 'all_loss.json'), "w") as json_file:
        json.dump(all_loss, json_file)
      with open(os.path.join(output_dir, 'all_acc.json'), "w") as json_file:
        json.dump(all_acc, json_file)


In [None]:
# Single training run with the fixed debug hyperparameters; skipped entirely
# when a wandb sweep is driving run() instead.
if __name__ == "__main__" and debug:
  run()

In [None]:
if debug and model_save:
  # Copy the saved model to Google Drive. The directory name is derived from
  # `dataset` and `model_name`, matching the name run() saves under, instead
  # of being hard-coded to one dataset/model pair.
  # NOTE(review): assumes run() was executed with /content as the working
  # directory, as the original hard-coded source path did - confirm.
  # Unlike `cp -r`, an existing destination is merged into rather than
  # receiving a nested copy.
  saved_dir_name = dataset + '-' + model_name
  shutil.copytree('/content/' + saved_dir_name,
                  '/content/drive/MyDrive/thesis/models/' + saved_dir_name,
                  dirs_exist_ok=True)

In [None]:
if __name__ == "__main__":
  # Bayesian hyperparameter search that maximizes the "MCC" value logged by
  # run(); each parameter name matches an attribute run() reads from
  # wandb.config.
  sweep_configuration = dict(
      method="bayes",
      name="sweep",
      metric=dict(goal="maximize", name="MCC"),
      parameters=dict(
          optimizer=dict(values=['Adam', 'AdamW']),
          batch_size=dict(values=[16, 32]),
          learning_rate=dict(values=[2e-5, 3e-5, 5e-5]),
          length=dict(values=[32, 64]),
          weight_decay=dict(min=0.01, max=0.3),
          hidden_dropout_prob=dict(min=0.01, max=0.3),
      ),
  )
  project_name = f"{dataset}-{model_name}"
  if not debug:
    # Register the sweep, then let this process execute 20 trials of run().
    sweep_id = wandb.sweep(sweep=sweep_configuration, project=project_name)
    wandb.agent(sweep_id, function=run, count=20)