In [48]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
!pip install transformers==2.5.1



In [50]:
import torch
import torch.nn as nn 
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
# from transformers import AutoTokenizer, BertTokenizer, EvalPrediction, BertPreTrainedModel, BertConfig, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

import random
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [51]:
    import torch
    torch.cuda.empty_cache()

In [52]:
cd '/content/drive/MyDrive/'

/content/drive/MyDrive


## Data pre-processing

Relaxed-matched

In [53]:
# circa_og = pd.read_csv('NLU_Project/circa-data.tsv', sep='\t', index_col='id')
circa_og = pd.read_csv('/content/drive/MyDrive/AMBER_TENG_NYU_DRIVE_TO_BROWN/NYU_SPRING2021/NLU/NLU_Project/circa-data.tsv', sep='\t', index_col='id')
circa_r = circa_og.drop(circa_og.loc[circa_og['goldstandard2']=='Other'].index)
circa_r = circa_r.drop(circa_r.loc[circa_r['goldstandard2'].isnull()].index)

In [54]:
YN_r = (circa_r['question-X'].map(str)+' '+circa_r['answer-Y']).apply(lambda row: row.strip())
relaxed_labels = circa_r['goldstandard2'].unique()
relaxed_label = circa_r['goldstandard2']
relaxed_dict = {}
for idx, label in enumerate(relaxed_labels):
    relaxed_dict[label] = idx
circa_r['relaxed'] = circa_r.goldstandard2.replace(relaxed_dict)
relaxed = circa_r['relaxed']

## Modeling

In [55]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()
print(device_name)

if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

/device:GPU:0


### Relaxed

In [56]:
from transformers import PreTrainedModel
from transformers import BertConfig, BertModel
from transformers.modeling_tf_bert import TFBertForSequenceClassification
from transformers import BertForSequenceClassification

## Model Specs
- BERT-MNLI-YN Relaxed learning rate: 5e-5
- MNLI learning rate: 2e-5 ; 3 epochs; batch size 16 vs 32 
  - https://huggingface.co/ishan/bert-base-uncased-mnli 
  - The training parameters were kept the same as Devlin et al., 2019 (learning rate = 2e-5, training epochs = 3, max_sequence_len = 128 and batch_size = 32).
- BERT-MNLI-YN Strict learning rate: 2e-5

In [57]:
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 
tokenizer = AutoTokenizer.from_pretrained('ishan/bert-base-uncased-mnli') 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 3, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
# device = torch.device("cpu")
# Tell pytorch to run this model on the GPU.
# model.cuda()
model.to(device)

learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

In [58]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [59]:
max_len = 0
for entry in YN_r.values:
    input_ids = tokenizer.encode(entry,  add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
print(max_len)

43


In [60]:
df = pd.concat([YN_r, relaxed_label, relaxed], axis=1).rename(columns={0:'YN_r'})
df

Unnamed: 0_level_0,YN_r,goldstandard2,relaxed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Are you employed? I'm a veterinary technician.,Yes,0
1,Are you a fan of Korean food? I wouldn't say so,No,1
2,Are you bringing any pets into the flat? I do ...,No,1
3,Would you like to get some fresh air in your f...,Yes,0
4,Is your family still living in the neighborhoo...,"In the middle, neither yes nor no",2
...,...,...,...
34263,Do you like to drink? I am in AA.,No,1
34264,Do you like pie? My favorite pie is pecan.,Yes,0
34265,Want to go to a concert with me? I'd rather do...,No,1
34266,Do you like hip/hop music? I can't dance to hi...,No,1


In [61]:
train_relaxed, val_relaxed, trainy_relaxed, valy_relaxed = train_test_split(df.index.values, df.relaxed.values, test_size=.4, stratify=df.relaxed.values)
test_relaxed, dev_relaxed, testy_relaxed, devy_relaxed = train_test_split(val_relaxed, valy_relaxed, test_size=.5, stratify=valy_relaxed)

In [62]:
df['data_type'] = ['not_set']*df.shape[0]
df.loc[train_relaxed,'data_type'] = 'train'
df.loc[dev_relaxed,'data_type'] = 'dev'
df.loc[test_relaxed,'data_type'] = 'test'

In [63]:
df.groupby(['goldstandard2','relaxed','data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,YN_r
goldstandard2,relaxed,data_type,Unnamed: 3_level_1
"In the middle, neither yes nor no",2,dev,190
"In the middle, neither yes nor no",2,test,190
"In the middle, neither yes nor no",2,train,569
No,1,dev,2566
No,1,test,2567
No,1,train,7700
Yes,0,dev,3326
Yes,0,test,3326
Yes,0,train,9976
"Yes, subject to some conditions",3,dev,517


In [64]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].YN_r.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_dev = tokenizer.batch_encode_plus(
    df[df.data_type=='dev'].YN_r.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].relaxed.values)

input_ids_dev = encoded_data_dev['input_ids']
attention_masks_dev = encoded_data_dev['attention_mask']
labels_dev = torch.tensor(df[df.data_type=='dev'].relaxed.values)

In [65]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_dev = TensorDataset(input_ids_dev, attention_masks_dev, labels_dev)

In [66]:
len(dataset_train)

19795

In [67]:
# model
# relaxed_dict
len(relaxed_dict)

4

In [68]:
model.classifier

Linear(in_features=768, out_features=3, bias=True)

In [69]:
# model

In [70]:
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
#                                                       num_labels=len(relaxed_dict),
#                                                       output_attentions=False,
#                                                       output_hidden_states=False)

# model = BertForSequenceClassification.from_pretrained('ishan/bert-base-uncased-mnli',
#                                                       num_labels=4,
#                                                       # num_labels=len(relaxed_dict),
#                                                       output_attentions=False,
#                                                       output_hidden_states=False)
model.classifier = torch.nn.Linear(model.classifier.in_features, 4)
model.num_labels = 4
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [71]:
# batch_size = 32
batch_size = 16

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_dev, 
                                   sampler=SequentialSampler(dataset_dev), 
                                   batch_size=batch_size)

optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = 1e-8)

In [72]:
# epochs = 3
epochs = 2
total_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

In [73]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in relaxed_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)} = {len(y_preds[y_preds==label])/len(y_true)}\n')

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [74]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [75]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
model.to(device)

print(device)

cuda


In [76]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [77]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BERT_relaxed_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    dev_loss, predictions, true_vals = evaluate(dataloader_validation)
    dev_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {dev_loss}')
    tqdm.write(f'F1 Score (Weighted): {dev_f1}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Epoch 1:  33%|███▎      | 407/1238 [09:23<19:10,  1.38s/it, training_loss=0.288][A[A[A


Epoch 1:  33%|███▎      | 408/1238 [09:23<19:07,  1.38s/it, training_loss=0.288][A[A[A


Epoch 1:  33%|███▎      | 408/1238 [09:25<19:07,  1.38s/it, training_loss=0.137][A[A[A


Epoch 1:  33%|███▎      | 409/1238 [09:25<19:03,  1.38s/it, training_loss=0.137][A[A[A


Epoch 1:  33%|███▎      | 409/1238 [09:26<19:03,  1.38s/it, training_loss=0.124][A[A[A


Epoch 1:  33%|███▎      | 410/1238 [09:26<19:04,  1.38s/it, training_loss=0.124][A[A[A


Epoch 1:  33%|███▎      | 410/1238 [09:27<19:04,  1.38s/it, training_loss=0.245][A[A[A


Epoch 1:  33%|███▎      | 411/1238 [09:27<19:02,  1.38s/it, training_loss=0.245][A[A[A


Epoch 1:  33%|███▎      | 411/1238 [09:29<19:02,  1.38s/it, training_loss=0.237][A[A[A


Epoch 1:  33%|███▎      | 412/1238 [09:29<18:58,  1.38s/it, training_loss=0.237][A[A[A


Epoch 1:  33%|


Epoch 1
Training loss: 0.5769116283425992




[A[A
  0%|          | 0/1 [1:26:08<?, ?it/s]

  0%|          | 0/2 [31:59<?, ?it/s][A[A


[A[A
  0%|          | 0/1 [1:26:08<?, ?it/s]

  0%|          | 0/2 [31:59<?, ?it/s][A[A
Epoch 1:  51%|█████     | 10037/19795 [1:26:08<36:40,  4.43it/s, training_loss=1.225][A

 50%|█████     | 1/2 [31:59<31:59, 1919.15s/it][A[A


Epoch 2:   0%|          | 0/1238 [00:00<?, ?it/s][A[A[A

Validation loss: 0.4541814619039363
F1 Score (Weighted): 0.8272012721755291


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Epoch 2:  33%|███▎      | 407/1238 [09:23<19:10,  1.38s/it, training_loss=0.060][A[A[A


Epoch 2:  33%|███▎      | 408/1238 [09:23<19:11,  1.39s/it, training_loss=0.060][A[A[A


Epoch 2:  33%|███▎      | 408/1238 [09:24<19:11,  1.39s/it, training_loss=0.029][A[A[A


Epoch 2:  33%|███▎      | 409/1238 [09:24<19:10,  1.39s/it, training_loss=0.029][A[A[A


Epoch 2:  33%|███▎      | 409/1238 [09:26<19:10,  1.39s/it, training_loss=0.016][A[A[A


Epoch 2:  33%|███▎      | 410/1238 [09:26<19:05,  1.38s/it, training_loss=0.016][A[A[A


Epoch 2:  33%|███▎      | 410/1238 [09:27<19:05,  1.38s/it, training_loss=0.102][A[A[A


Epoch 2:  33%|███▎      | 411/1238 [09:27<19:00,  1.38s/it, training_loss=0.102][A[A[A


Epoch 2:  33%|███▎      | 411/1238 [09:28<19:00,  1.38s/it, training_loss=0.133][A[A[A


Epoch 2:  33%|███▎      | 412/1238 [09:28<19:00,  1.38s/it, training_loss=0.133][A[A[A


Epoch 2:  33%|


Epoch 2
Training loss: 0.3635261633082839




[A[A
  0%|          | 0/1 [1:58:05<?, ?it/s]

 50%|█████     | 1/2 [1:03:56<31:59, 1919.15s/it][A[A


[A[A
  0%|          | 0/1 [1:58:05<?, ?it/s]

 50%|█████     | 1/2 [1:03:56<31:59, 1919.15s/it][A[A
Epoch 1:  51%|█████     | 10037/19795 [1:58:05<36:40,  4.43it/s, training_loss=1.225][A

100%|██████████| 2/2 [1:03:56<00:00, 1918.10s/it]

Validation loss: 0.49066849989189654
F1 Score (Weighted): 0.8334374480214185





In [78]:
accuracy_per_class(predictions, true_vals)

Class: Yes
Accuracy: 3113/3326 = 0.9359591100420926

Class: No
Accuracy: 1941/2566 = 0.7564302416212003

Class: In the middle, neither yes nor no
Accuracy: 37/190 = 0.19473684210526315

Class: Yes, subject to some conditions
Accuracy: 455/517 = 0.8800773694390716



In [79]:
print('Dev Accuracy:', end = ' ')
flat_accuracy(predictions, true_vals)

Dev Accuracy: 

0.8404303682376117

In [80]:
encoded_data_test = tokenizer.batch_encode_plus(
    df[df.data_type=='test'].YN_r.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(df[df.data_type=='test'].relaxed.values)

dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=batch_size)

test_loss, test_predictions, test_true_vals = evaluate(dataloader_test)
test_f1 = f1_score_func(test_predictions, test_true_vals)

In [81]:
accuracy_per_class(test_predictions, test_true_vals)
print('Test Accuracy:', end = ' ')
flat_accuracy(test_predictions, test_true_vals)

Class: Yes
Accuracy: 3156/3326 = 0.9488875526157546

Class: No
Accuracy: 1941/2567 = 0.7561355668095052

Class: In the middle, neither yes nor no
Accuracy: 33/190 = 0.1736842105263158

Class: Yes, subject to some conditions
Accuracy: 442/516 = 0.8565891472868217

Test Accuracy: 

0.8443703591453251