<a href="https://colab.research.google.com/github/amrtanair/sentence_Deepex/blob/master/cola_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# code from stack overflow that allows code output to overflow to next line
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
!pip install wget
!pip install transformers

import wget
import os

import time
import datetime
import random
import json
from tqdm import tqdm


import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import matthews_corrcoef

from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW,
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)

print('Downloading dataset...')

url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

if not os.path.exists('./cola_public_1.1.zip'):
    wget.download(url, './cola_public_1.1.zip')

if not os.path.exists('./cola_public/'):
    !unzip cola_public_1.1.zip

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')
    device = torch.device("cpu")



Downloading dataset...
GPU: Tesla T4


In [3]:
#hyperparameters

epochs = 6
batch_size = 24
max_length = 48
model_name_or_path = 'gpt2-medium'
warmup = True
learning_rate = 2e-05
seed_val = 42

labels_ids = {
    "0" : 0,
    "1" : 1,
    }

n_labels = len(labels_ids)

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

if torch.cuda.is_available():
	torch.cuda.manual_seed_all(seed_val)

In [4]:
class PromptDataset(Dataset):
  def __init__(self, path, use_tokenizer, test = False):
    df = pd.read_csv(path, delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])
    df['label'] = df['label'].astype(str)
    grouped = df.groupby('label')
    dfs = []
    min_group_size = grouped.size().min()

    for _, group_df in grouped:
        dfs.append(group_df.iloc[:min_group_size])

    df = pd.concat(dfs, ignore_index=True)
    df = df.sample(frac=1).reset_index(drop=True)

    self.dataframe = df
    # if not test:
    #     self.dataframe = self.dataframe.head(int(0.85 * len(self.dataframe)))
    self.texts = self.dataframe.sentence.values
    self.dataframe['label'] = self.dataframe['label'].astype(str)
    self.labels = self.dataframe.label.values

    self.n_examples = len(self.labels)
    return

  def __len__(self):
    return self.n_examples

  def __getitem__(self, item):
    return {'text':self.texts[item],
            'label':self.labels[item]}

class PromptCollator(object):
    def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):
        self.use_tokenizer = use_tokenizer
        self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
        self.labels_encoder = labels_encoder

        return

    def __call__(self, sequences):
        texts = [sequence['text'] for sequence in sequences]
        labels = [sequence['label'] for sequence in sequences]
        labels = [self.labels_encoder[label] for label in labels]

        inputs = self.use_tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True,  max_length=self.max_sequence_len)
        inputs.update({'labels': torch.tensor(labels)})

        return inputs

In [5]:
def train(dataloader, optimizer_, device_, scheduler_):
    global model
    predictions_labels = []
    true_labels = []

    total_loss = 0
    model.train()

    for batch in dataloader:
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}
        model.zero_grad()
        outputs = model(**batch)
        loss, logits = outputs[:2]
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer_.step()
        scheduler_.step()

        logits = logits.detach().cpu().numpy()
        predictions_labels += logits.argmax(axis=-1).flatten().tolist()
    avg_epoch_loss = total_loss / len(dataloader)
    return true_labels, predictions_labels, avg_epoch_loss

def validation(dataloader, device_):
    global model
    predictions_labels = []
    true_labels = []
    total_loss = 0
    model.eval()

    for batch in dataloader:
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

        with torch.no_grad():
            outputs = model(**batch)
            loss, logits = outputs[:2]
            logits = logits.detach().cpu().numpy()
            total_loss += loss.item()
            predict_content = logits.argmax(axis=-1).flatten().tolist()
            predictions_labels += predict_content
    avg_epoch_loss = total_loss / len(dataloader)
    return true_labels, predictions_labels, avg_epoch_loss


In [6]:
print('Loading model and tokenizer')

model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

prompt_collator = PromptCollator(use_tokenizer=tokenizer,
                                                          labels_encoder=labels_ids,
                                                          max_sequence_len=max_length)


train_dataset = PromptDataset(path='./cola_public/raw/in_domain_train.tsv', use_tokenizer=tokenizer)
print('Created `train_dataset` with %d examples!'%len(train_dataset))

train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle=True, collate_fn = prompt_collator)
print('Created `train_dataloader` with %d batches!'%len(train_dataloader))

valid_dataset =  PromptDataset(path='./cola_public/raw/out_of_domain_dev.tsv',
                                use_tokenizer=tokenizer, test= True)
print('Created `valid_dataset` with %d examples!'%len(valid_dataset))

valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=prompt_collator)
print('Created `valid_dataloader` with %d batches!'%len(valid_dataloader))

# optimizer = AdamW(model.parameters(),
#                   lr = learning_rate,
#                   eps = 1e-08
#                   )

optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, betas=(0.9, 0.999), eps=1e-08)

# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

# optimizer = torch.optim.Rprop(model.parameters(), lr=0.01, etas=(0.5, 1.2), step_sizes=(1e-06, 50))

total_steps = len(train_dataloader) * epochs

if warmup:
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = total_steps)


for epoch in range(epochs):
    print(f'Epoch {epoch + 1}')
    print('Training on batches...')
    train_labels, train_predict, train_loss = train(train_dataloader, optimizer, device, scheduler)
    train_acc = accuracy_score(train_labels, train_predict)

    print('Validation on batches...')
    valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
    val_acc = accuracy_score(valid_labels, valid_predict)

    print("  \n train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(train_loss, val_loss, train_acc, val_acc))

true_labels, predictions_labels, avg_epoch_loss = validation(valid_dataloader, device)
evaluation_report = classification_report(true_labels, predictions_labels, labels=list(labels_ids.values()), target_names=list(labels_ids.keys()))
print(evaluation_report)

mcc = matthews_corrcoef(true_labels, predictions_labels)
print('Total MCC: %.5f' % mcc)

args = {
        "epochs": epochs,
        "batch_size": batch_size,
        "optimizer": str(type (optimizer).__name__),
        "learning_rate": learning_rate,
        "max_length": max_length,
        "model_name": model_name_or_path,
        "warmup": str(warmup),
        "mcc": mcc
        }
print(args)

now = datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')
output_dir = './model_' + model_name_or_path+ "_" + now + '/'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)

with open(os.path.join(output_dir, 'training_args.json'), "w") as json_file:
    json.dump(args, json_file)


Loading model and tokenizer


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created `train_dataset` with 5056 examples!
Created `train_dataloader` with 211 batches!
Created `valid_dataset` with 324 examples!
Created `valid_dataloader` with 14 batches!
Epoch 1
Training on batches...
Validation on batches...
  
 train_loss: 0.78836 - val_loss: 0.69695 - train_acc: 0.52927 - valid_acc: 0.53086
Epoch 2
Training on batches...
Validation on batches...
  
 train_loss: 0.69301 - val_loss: 0.71679 - train_acc: 0.56507 - valid_acc: 0.45679
Epoch 3
Training on batches...
Validation on batches...
  
 train_loss: 0.66705 - val_loss: 0.72223 - train_acc: 0.59256 - valid_acc: 0.50000
Epoch 4
Training on batches...
Validation on batches...
  
 train_loss: 0.65009 - val_loss: 0.68881 - train_acc: 0.61946 - valid_acc: 0.54938
Epoch 5
Training on batches...
Validation on batches...
  
 train_loss: 0.62668 - val_loss: 0.69455 - train_acc: 0.63726 - valid_acc: 0.55556
Epoch 6
Training on batches...
Validation on batches...
  
 train_loss: 0.61226 - val_loss: 0.70592 - train_acc: 0