<a href="https://colab.research.google.com/github/camilotorron/NLP/blob/main/BERT_CoLA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pytorch-pretrained-bert pytorch-nlp


Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 5.1 MB/s 
[?25hCollecting pytorch-nlp
  Downloading pytorch_nlp-0.5.0-py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 8.0 MB/s 
Collecting boto3
  Downloading boto3-1.21.2-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 45.1 MB/s 
Collecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.1-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 6.7 MB/s 
[?25hCollecting botocore<1.25.0,>=1.24.2
  Downloading botocore-1.24.2-py3-none-any.whl (8.5 MB)
[K     |████████████████████████████████| 8.5 MB 54.4 MB/s 
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.8-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 68.0 MB/

In [2]:
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/nyu-mll/CoLA-baselines/master/acceptability_corpus/cola_public/raw/in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

In [4]:
df.sample(20)

Unnamed: 0,sentence_source,label,label_notes,sentence
820,bc01,0,*,John wagered a stranger to have been in that h...
3359,l-93,0,*,The lecture squirmed Sylvia.
4583,ks08,1,,I have never been spoken to so rudely!
6467,d_98,0,*,There's anything Mary had asked for in this st...
4334,ks08,1,,Gregory appears to have wanted to be loyal to ...
2903,l-93,1,,I detached the handle from the box.
5382,b_73,1,,There is enough bread for all of you.
2354,l-93,0,*,We rummaged the desk for papers.
4663,ks08,1,,John kicked him.
6940,m_02,1,,Harriet talked to Emma for hours.


In [5]:
df.shape

(8551, 4)

In [6]:
sentences = df.sentence.values

In [19]:
sentences = ["[CLS] "+sentence+" [SEP]" for sentence in sentences]
labels = df.label.values

In [8]:
sentences[10:15]

['[CLS] The critics laughed the play off the stage. [SEP]',
 '[CLS] The pond froze solid. [SEP]',
 '[CLS] Bill rolled out of the room. [SEP]',
 '[CLS] The gardener watered the flowers flat. [SEP]',
 '[CLS] The gardener watered the flowers. [SEP]']

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print("Tokenize the first sentence:")
print(tokenized_texts[0])

100%|██████████| 231508/231508 [00:00<00:00, 3001109.34B/s]


Tokenize the first sentence:
['[CLS]', 'our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', '[SEP]']


In [11]:
MAX_LEN = 128

input_ids=[tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [12]:
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [14]:
attention_masks = []

for seq in input_ids:
  seq_masks = [float(i>0) for i in seq]
  attention_masks.append(seq_masks)

In [22]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2018,test_size=0.1)
train_masks, validation_masks, _, _= train_test_split(attention_masks, input_ids, random_state=2019, test_size=0.1)

In [23]:
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [25]:
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

Data is prepared for finetuning the BERT Model

In [26]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

100%|██████████| 407873900/407873900 [00:12<00:00, 32457765.52B/s]


In [28]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],'weight_decay_rate':0.0}]

In [29]:
optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1)


t_total value of -1 results in schedule not being applied


In [30]:
def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [6]:
t=[]
train_loss_set = []
epochs = 2

for _ in trange(epochs, desc="Epoch"):
  model.train()
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0,0

  for step, batch in enumerate(train_dataloader):
    b_input_ids, b_input_mask, b_labels = batch
    optimizer.zero_grad()
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())
    loss.backward()
    optimizer.step()

    tr_loss +=loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps +=1
  print("train loss: {}".format(tr_loss/nb_tr_steps))

  model.eval()

  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:

    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))


NameError: ignored