<a href="https://colab.research.google.com/github/amrtanair/sentence_Deepex/blob/master/cola_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# code from stack overflow that allows code output to overflow to next line
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
!pip install wget
!pip install transformers

import wget
import os

import time
import datetime
import random
import json
from tqdm import tqdm


import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import matthews_corrcoef

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

print('Downloading dataset...')

url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

if not os.path.exists('./cola_public_1.1.zip'):
    wget.download(url, './cola_public_1.1.zip')

if not os.path.exists('./cola_public/'):
    !unzip cola_public_1.1.zip

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')
    device = torch.device("cpu")

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=95b9e4ff85e11cbf437d722f12ff2393878f55435834ce1d5b8a07b0983cd32e
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
Co

In [3]:
# Function to calculate the accuracy of predictions vs labels

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [4]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [5]:
def _balance_classes(df):
  grouped = df.groupby('label')
  dfs = []
  min_group_size = grouped.size().min()

  for _, group_df in grouped:
      dfs.append(group_df.iloc[:min_group_size])

  df = pd.concat(dfs, ignore_index=True)
  df = df.sample(frac=1).reset_index(drop=True)

  return df

In [6]:
def create_dataset(sentences, labels):
  input_ids = []
  attention_masks = []

  for sent in sentences:
      encoded_dict = tokenizer.encode_plus(
                          sent,
                          add_special_tokens = True,
                          max_length = train_max_length,
                          truncation = True,
                          padding='max_length',
                          return_attention_mask = True,
                          return_tensors = 'pt',
                    )
      input_ids.append(encoded_dict['input_ids'])
      attention_masks.append(encoded_dict['attention_mask'])

  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  labels = torch.tensor(labels)
  dataset = TensorDataset(input_ids, attention_masks, labels)

  return dataset

In [7]:
# hyperparameters

train_batch_size = 32
eval_batch_size = 32
split = 0.9
model_name = "bert-large-uncased"
learning_rate = 2e-05
epochs = 4
train_max_length = 64
eval_max_length = 64
balanced_classes = False
lr_scheduler = False
hidden_dropout_prob = 0.235
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if torch.cuda.is_available():
	torch.cuda.manual_seed_all(seed_val)

inference = False

In [8]:
df = pd.read_csv("cola_public/raw/in_domain_train.tsv",
                 delimiter='\t',
                 header=None,
                 names=['sentence_source', 'label', 'label_notes', 'sentence']
                 )

if balanced_classes:
  print('OG Number of training sentences: ', df.shape[0])
  df = _balance_classes(df)
  print('Number of training sentences after balancing the classes: ', df.shape[0])

tokenizer = BertTokenizer.from_pretrained(model_name)
dataset = create_dataset(df.sentence.values, df.label.values)


if not inference:
  train_size = int(split * len(dataset))
  val_size = len(dataset) - train_size
  train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

  print('{:>5,} training samples'.format(train_size))
  print('{:>5,} validation samples'.format(val_size))
else:
  train_dataset = dataset

train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = train_batch_size
        )

if not inference:
  validation_dataloader = DataLoader(
              val_dataset,
              sampler = SequentialSampler(val_dataset),
              batch_size = eval_batch_size
          )

model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels = 2,
    hidden_dropout_prob = hidden_dropout_prob)

if torch.cuda.is_available():
	model.cuda()

# optimizer = torch.optim.AdamW(model.parameters(),
#                   lr = learning_rate,
#                   eps = 1e-8)

# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9,0.999), eps=1e-08)

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = len(train_dataloader) * epochs)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

7,695 training samples
  856 validation samples


Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def focal_loss(logits, targets, alpha = 0.25, gamma = 2, reduction='mean'):
    # ce_loss = nn.CrossEntropyLoss(reduction='none')(logits, targets)
    bce_loss = F.binary_cross_entropy_with_logits(logits, targets.float(), reduction='none')
    pt = torch.exp(-bce_loss)
    focal_loss = (alpha * (1 - pt) ** gamma * bce_loss)

    if reduction == 'mean':
        return focal_loss.mean()
    else:
        return focal_loss
    return loss

In [10]:
# training
if lr_scheduler:
  print("using lr scheduler")

training_stats = []

for epoch_i in range(epochs):
  print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
  print('Training...')

  total_train_loss = 0
  model.train()

  for step, batch in enumerate(train_dataloader):
    if step % 50 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

    input_ids = batch[0].to(device)
    input_mask = batch[1].to(device)
    labels = batch[2].to(device)

    model.zero_grad()
    result = model(input_ids = input_ids, attention_mask = input_mask, labels = labels)
    logits = result.logits

    loss = focal_loss(logits[:, 1], labels, alpha = 0.5, gamma = 2)

    total_train_loss = total_train_loss + loss.item()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    if lr_scheduler:
      scheduler.step()

  avg_train_loss = total_train_loss / len(train_dataloader)
  print("Average training loss: ", avg_train_loss)

  if not inference:
    print("Running Validation...")
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:
      input_ids = batch[0].to(device)
      input_mask = batch[1].to(device)
      labels = batch[2].to(device)

      with torch.no_grad():
        result = model(input_ids = input_ids, attention_mask = input_mask, labels = labels)
        loss = result.loss
        logits = result.logits

        total_eval_loss = total_eval_loss + loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy = total_eval_accuracy + flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    print("  Validation Accuracy: {0:.3f}".format(avg_val_accuracy))
    print("  Validation Loss: {0:.3f}".format(avg_val_loss))

  if not inference:
  # Record all statistics from this epoch.
    training_stats.append({'epoch': epoch_i + 1,
                          'Training Loss': avg_train_loss,
                          'Valid. Loss': avg_val_loss,
                          'Valid. Accur.': avg_val_accuracy})
  else:
    training_stats.append({'epoch': epoch_i + 1,
                          'Training Loss': avg_train_loss})

print("Training complete!")
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('epoch')

print(df_stats)

Training...
  Batch    50  of    241.
  Batch   100  of    241.
  Batch   150  of    241.
  Batch   200  of    241.
Average training loss:  0.07318701392572945
Running Validation...
  Validation Accuracy: 0.796
  Validation Loss: 0.494
Training...
  Batch    50  of    241.
  Batch   100  of    241.
  Batch   150  of    241.
  Batch   200  of    241.
Average training loss:  0.05883855035690846
Running Validation...
  Validation Accuracy: 0.824
  Validation Loss: 0.433
Training...
  Batch    50  of    241.
  Batch   100  of    241.
  Batch   150  of    241.
  Batch   200  of    241.
Average training loss:  0.04212156451342017
Running Validation...
  Validation Accuracy: 0.836
  Validation Loss: 0.413
Training...
  Batch    50  of    241.
  Batch   100  of    241.
  Batch   150  of    241.
  Batch   200  of    241.
Average training loss:  0.02940220180106701
Running Validation...
  Validation Accuracy: 0.846
  Validation Loss: 0.379
Training complete!
       Training Loss  Valid. Loss  Va

In [11]:
# evaluation
df = pd.read_csv("cola_public/raw/out_of_domain_dev.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

print('Number of test sentences: {:,}\n'.format(df.shape[0]))

sentences = df.sentence.values
labels = df.label.values
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,
                        max_length = eval_max_length,
                        truncation = True,
                        padding='max_length',
                        return_attention_mask = True,
                        return_tensors = 'pt',
                  )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=eval_batch_size)

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))
model.eval()

predictions , true_labels = [], []

for batch in prediction_dataloader:
  batch = tuple(t.to(device) for t in batch)

  b_input_ids, b_input_mask, b_labels = batch

  with torch.no_grad():
      result = model(b_input_ids,
                     attention_mask=b_input_mask,
                     return_dict=True)

  logits = result.logits
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

print('Positive samples: %d of %d (%.3f%%)' % (df.label.sum(), len(df.label), (df.label.sum() / len(df.label) * 100.0)))

matthews_set = []
print('Calculating Matthews Corr. Coef. for each batch...')

for i in range(len(true_labels)):
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  matthews = matthews_corrcoef(true_labels[i], pred_labels_i)
  matthews_set.append(matthews)

flat_predictions = np.concatenate(predictions, axis=0)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
print('Total MCC: %.3f' % mcc)

Number of test sentences: 516

Predicting labels for 516 test sentences...
    DONE.
Positive samples: 354 of 516 (68.605%)
Calculating Matthews Corr. Coef. for each batch...
Total MCC: 0.576


In [12]:
now = datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')
output_dir = './model_save_' + now + '/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

args = {"model_name": model_name,
        "split": split,
        "train_max_length": train_max_length,
        "eval_max_length": eval_max_length,
        "optimizer": str(type (optimizer).__name__),
        "train_size": train_size,
        "validation_size": val_size,
        "train_batch_size": train_batch_size,
        "eval_batch_size": eval_batch_size,
        "learning_rate": learning_rate,
        "hidden_dropout_prob": hidden_dropout_prob,
        "epochs": epochs,
        "mcc": mcc,
        "output_dir": 'training_args_' + now + '.json',
        "balance_classes": balanced_classes,
        "lr_scheduler": lr_scheduler
        }
print(args)

with open(os.path.join(output_dir, 'training_args.json'), "w") as json_file:
    json.dump(args, json_file)

Saving model to ./model_save_02_11_2023_17_12_52/
{'model_name': 'bert-large-uncased', 'split': 0.9, 'train_max_length': 64, 'eval_max_length': 64, 'optimizer': 'Adam', 'train_size': 7695, 'validation_size': 856, 'train_batch_size': 32, 'eval_batch_size': 32, 'learning_rate': 2e-05, 'hidden_dropout_prob': 0.235, 'epochs': 4, 'mcc': 0.5761476398334989, 'output_dir': 'training_args_02_11_2023_17_12_52.json', 'balance_classes': False, 'lr_scheduler': False}


In [13]:
input_text = "Pittsburgh 'S History Democrat, he became the youngest mayor in September"
# input_text = input_text.replace("[SEP] ", "")
input_id = tokenizer(input_text, return_tensors="pt")["input_ids"].squeeze(1).cpu()
model = model_to_save.to('cpu')
output = model(input_id)
#pos = (torch.softmax(output.logits, dim=1)[0][1].item() + value[1])/2
pos = torch.softmax(output.logits, dim=1)[0][0].item()
print(pos)

0.7001261711120605


In [14]:
print(torch.softmax(output.logits, dim=1))

tensor([[0.7001, 0.2999]], grad_fn=<SoftmaxBackward0>)
