<a href="https://colab.research.google.com/github/amrtanair/sentence_Deepex/blob/master/gpt2_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install transformers
import tensorflow as tf

# Ask TensorFlow which GPU device (if any) it can see.
device_name = tf.test.gpu_device_name()

# Stop right away unless the first GPU was reported; otherwise announce it.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

!pip install wget
import wget
import os

import zipfile

print('Downloading dataset...')

# The URL for the dataset zip file.
url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

# Download the archive only when no local copy exists, so re-running the cell is cheap.
if not os.path.exists('./cola_public_1.1.zip'):
    wget.download(url, './cola_public_1.1.zip')

# Extract with the standard library rather than shelling out to `unzip`:
# this keeps the cell valid Python and independent of a system `unzip` binary.
# Extraction is skipped when the target directory is already present.
if not os.path.exists('./cola_public/'):
    with zipfile.ZipFile('./cola_public_1.1.zip') as archive:
        archive.extractall('.')

import torch

# Choose the PyTorch device: CPU fallback first, CUDA otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

import time
import datetime
import datetime
import random
import os
import json

import torch
import pandas as pd
import numpy as np
from sklearn.metrics import matthews_corrcoef

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

Found GPU at: /device:GPU:0
Downloading dataset...
There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [13]:
# ---- Experiment configuration ----

# Model and optimisation
model_name = "bert-large-uncased"
learning_rate = 1e-5
epochs = 3
lr_scheduler = False        # when True, the training loop steps the LR scheduler after each batch

# Data
split = 0.9                 # fraction of the dataset used for training; the rest is validation
balanced_classes = True     # when True, the training frame is passed through _balance_classes
train_max_length = 64       # max_length passed to the tokenizer for training sentences
eval_max_length = 64        # max_length passed to the tokenizer for evaluation sentences
train_batch_size = 8
eval_batch_size = 8

# ---- Reproducibility: seed every random number generator in use ----
seed_val = 42

torch.manual_seed(seed_val)
np.random.seed(seed_val)
random.seed(seed_val)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_val)

In [14]:
def flat_accuracy(preds, labels):
    """Return the fraction of examples whose top-scoring class equals its label.

    Args:
        preds: 2-D array of per-class scores; the predicted class is the
            argmax along axis 1.
        labels: NumPy array of class ids; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

In [15]:
def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [16]:
def _balance_classes(df):
  grouped = df.groupby('label')
  dfs = []
  min_group_size = grouped.size().min()

  for _, group_df in grouped:
      dfs.append(group_df.iloc[:min_group_size])

  df = pd.concat(dfs, ignore_index=True)
  df = df.sample(frac=1).reset_index(drop=True)

  return df

In [17]:
# Load the in-domain CoLA training split. The TSV has no header row, so the
# column names are supplied here.
df = pd.read_csv(
    "cola_public/raw/in_domain_train.tsv",
    sep='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)
print('OG Number of training sentences: {:,}\n'.format(len(df)))

# Optionally downsample so every label has the same number of rows.
if balanced_classes:
    df = _balance_classes(df)
    print('Number of training sentences after processing: {:,}\n'.format(len(df)))

# Raw arrays consumed by the tokenization step below.
sentences = df['sentence'].values
labels = df['label'].values

print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(model_name)

# Length of the longest tokenized sentence (special tokens included);
# 0 when there are no sentences. Printed as a sanity check.
max_len = max(
    (len(tokenizer.encode(sent, add_special_tokens=True)) for sent in sentences),
    default=0,
)

print('Max sentence length: ', max_len)

# Tokenize every training sentence, padding/truncating to `train_max_length`,
# and collect the token ids and attention masks.
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=train_max_length,
        truncation=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-sentence tensors along dim 0 and bundle them with the labels.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)
dataset = TensorDataset(input_ids, attention_masks, labels)

# Split the dataset into train/validation parts according to `split`.
train_size = int(split * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=train_batch_size,
)

# Validation batches are read sequentially. Use the evaluation batch size
# here (previously this mistakenly reused `train_batch_size`).
validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=eval_batch_size,
)

# Pretrained BERT with a two-label classification head; attention maps and
# hidden states are not returned, and hidden dropout is set to 0.3.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
    hidden_dropout_prob=0.3,
)

# Place the model on the device selected at the top of the notebook
# (CUDA when available, otherwise it stays on the CPU).
model.to(device)

# AdamW over all model parameters, using the notebook-level `learning_rate`.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

# Alternative optimizers kept for reference:
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9,0.999), eps=1e-08)

# One optimizer step per training batch, for every epoch.
total_steps = epochs * len(train_dataloader)

# Linear schedule without warm-up. The training loop only steps it when
# `lr_scheduler` is True.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Per-epoch statistics are appended here by the training loop.
training_stats = []

# Reference timestamp used to report the total elapsed time.
total_t0 = time.time()

OG Number of training sentences: 8,551

Number of training sentences after processing: 5,056

Loading BERT tokenizer...
Max sentence length:  47




4,550 training samples
  506 validation samples


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Restart the wall-clock timer here so the reported total covers only the
# training run, not model setup or idle time between cell executions.
total_t0 = time.time()

for epoch_i in range(epochs):
    # ----------------------------- Training -----------------------------
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress report every 100 batches (skipping the very first one).
        if step % 100 == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch is (input ids, attention mask, labels).
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels)

        loss = result.loss
        total_train_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        if lr_scheduler:
            scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ---------------------------- Validation ----------------------------
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_correct = 0
    total_eval_examples = 0
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels)

        loss = result.loss
        logits = result.logits

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch's accuracy by its size so a short final batch
        # does not count as much as a full one.
        total_eval_correct += flat_accuracy(logits, label_ids) * len(label_ids)
        total_eval_examples += len(label_ids)

    avg_val_accuracy = total_eval_correct / total_eval_examples
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("Training complete!")

total_elapsed = format_time(time.time() - total_t0)
print("Total training took {:} (h:mm:ss)".format(total_elapsed))

# Tabulate the per-epoch statistics, one row per epoch.
df_stats = pd.DataFrame(training_stats).set_index('epoch')

print(df_stats)


Training...
  Batch   100  of    569.    Elapsed: 0:00:39.
  Batch   200  of    569.    Elapsed: 0:01:17.
  Batch   300  of    569.    Elapsed: 0:01:56.
  Batch   400  of    569.    Elapsed: 0:02:35.
  Batch   500  of    569.    Elapsed: 0:03:14.

  Average training loss: 0.61
  Training epcoh took: 0:03:41

Running Validation...
  Accuracy: 0.67
  Validation Loss: 0.68
  Validation took: 0:00:07

Training...
  Batch   100  of    569.    Elapsed: 0:00:40.
  Batch   200  of    569.    Elapsed: 0:01:19.
  Batch   300  of    569.    Elapsed: 0:01:59.
  Batch   400  of    569.    Elapsed: 0:02:39.
  Batch   500  of    569.    Elapsed: 0:03:19.

  Average training loss: 0.51
  Training epcoh took: 0:03:46

Running Validation...
  Accuracy: 0.76
  Validation Loss: 0.59
  Validation took: 0:00:07

Training...
  Batch   100  of    569.    Elapsed: 0:00:40.
  Batch   200  of    569.    Elapsed: 0:01:20.
  Batch   300  of    569.    Elapsed: 0:02:00.
  Batch   400  of    569.    Elapsed: 0:02:4

In [19]:
# evaluation
# Load the out-of-domain CoLA dev split (tab-separated, no header row).
df = pd.read_csv("cola_public/raw/out_of_domain_dev.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

print('Number of test sentences: {:,}\n'.format(df.shape[0]))

sentences = df.sentence.values
labels = df.label.values
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,                           # Sentence to encode.
        add_special_tokens=True,        # Add '[CLS]' and '[SEP]'
        max_length=eval_max_length,     # Pad & truncate all sentences to this length.
        truncation=True,
        padding='max_length',           # Replaces the deprecated `pad_to_max_length=True`.
        return_attention_mask=True,     # Construct attn. masks.
        return_tensors='pt',            # Return pytorch tensors.
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-sentence tensors along dim 0.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Sequential batches of `eval_batch_size` examples for prediction.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=eval_batch_size)

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Switch the model to evaluation mode before predicting.
model.eval()

# Per-batch logits and the matching gold labels, both as NumPy arrays.
predictions, true_labels = [], []

for batch in prediction_dataloader:
    # Move every tensor of the batch to the compute device, then unpack it.
    batch = tuple(tensor.to(device) for tensor in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # Forward pass only; labels are not passed, so no loss is requested.
    with torch.no_grad():
        result = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            return_dict=True,
        )

    logits = result.logits.detach().cpu().numpy()
    label_ids = b_labels.cpu().numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

# Share of positive labels in the evaluation frame.
n_positive = df.label.sum()
n_total = len(df.label)
print('Positive samples: %d of %d (%.2f%%)' % (n_positive, n_total, (n_positive / n_total * 100.0)))

print('Calculating Matthews Corr. Coef. for each batch...')

# One MCC value per prediction batch (predicted class = argmax over axis 1).
matthews_set = [
    matthews_corrcoef(batch_labels, np.argmax(batch_logits, axis=1).flatten())
    for batch_labels, batch_logits in zip(true_labels, predictions)
]

# MCC over the whole evaluation set: join all batches, take the argmax class
# per example and compare against the joined gold labels.
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
print('Total MCC: %.3f' % mcc)

Number of test sentences: 516





Predicting labels for 516 test sentences...
    DONE.
Positive samples: 354 of 516 (68.60%)
Calculating Matthews Corr. Coef. for each batch...
Total MCC: 0.506


In [20]:
# Timestamp used to give this run's artefacts unique names.
now = datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')
output_dir = './model_save_' + now + '/'

# Saving the model weights is currently disabled; kept for reference.
# Create output directory if needed
# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# print("Saving model to %s" % output_dir)

# model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
# model_to_save.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)

# Settings and headline metric of this run.
# NOTE(review): the "output_dir" entry holds the JSON file name written below,
# not the `output_dir` variable defined above.
args = {
    "model_name": model_name,
    "split": split,
    "train_max_length": train_max_length,
    "eval_max_length": eval_max_length,
    "optimizer": str(type(optimizer).__name__),
    "train_size": train_size,
    "validation_size": val_size,
    "train_batch_size": train_batch_size,
    "eval_batch_size": eval_batch_size,
    "learning_rate": learning_rate,
    "epochs": epochs,
    "mcc": mcc,
    "output_dir": 'training_args_' + now + '.json',
    "balance_classes": balanced_classes,
    "lr_scheduler": lr_scheduler,
}
print(args)

# Persist the run description next to the notebook.
with open(args["output_dir"], "w") as json_file:
    json.dump(args, json_file)

{'model_name': 'bert-large-uncased', 'split': 0.9, 'train_max_length': 64, 'eval_max_length': 64, 'optimizer': 'AdamW', 'train_size': 4550, 'validation_size': 506, 'train_batch_size': 8, 'eval_batch_size': 8, 'learning_rate': 1e-05, 'epochs': 3, 'mcc': 0.5059672171779087, 'output_dir': 'training_args_23_10_2023_20_41_43.json', 'balance_classes': True, 'lr_scheduler': False}
