In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers==2.5.1

Collecting transformers==2.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/13/33/ffb67897a6985a7b7d8e5e7878c3628678f553634bd3836404fef06ef19b/transformers-2.5.1-py3-none-any.whl (499kB)
[K     |▋                               | 10kB 28.6MB/s eta 0:00:01[K     |█▎                              | 20kB 31.7MB/s eta 0:00:01[K     |██                              | 30kB 21.6MB/s eta 0:00:01[K     |██▋                             | 40kB 24.6MB/s eta 0:00:01[K     |███▎                            | 51kB 24.1MB/s eta 0:00:01[K     |████                            | 61kB 26.8MB/s eta 0:00:01[K     |████▋                           | 71kB 18.1MB/s eta 0:00:01[K     |█████▎                          | 81kB 19.3MB/s eta 0:00:01[K     |██████                          | 92kB 18.0MB/s eta 0:00:01[K     |██████▋                         | 102kB 18.0MB/s eta 0:00:01[K     |███████▏                        | 112kB 18.0MB/s eta 0:00:01[K     |███████▉                   

In [3]:
import torch
import torch.nn as nn 
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
# from transformers import AutoTokenizer, BertTokenizer, EvalPrediction, BertPreTrainedModel, BertConfig, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

import random
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [4]:
cd '/content/drive/MyDrive/'

/content/drive/MyDrive


## Data pre-processing

Relaxed-matched

In [5]:
circa_og = pd.read_csv('NLU_Project/circa-data.tsv', sep='\t', index_col='id')
circa_r = circa_og.drop(circa_og.loc[circa_og['goldstandard2']=='Other'].index)
circa_r = circa_r.drop(circa_r.loc[circa_r['goldstandard2'].isnull()].index)

In [6]:
YN_r = (circa_r['question-X'].map(str)+' '+circa_r['answer-Y']).apply(lambda row: row.strip())
relaxed_labels = circa_r['goldstandard2'].unique()
relaxed_label = circa_r['goldstandard2']
relaxed_dict = {}
for idx, label in enumerate(relaxed_labels):
    relaxed_dict[label] = idx
circa_r['relaxed'] = circa_r.goldstandard2.replace(relaxed_dict)
relaxed = circa_r['relaxed']

## Modeling

In [7]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()
print(device_name)

if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

/device:GPU:0


### Relaxed

In [8]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
saved = torch.load('DS-GA 1012/boolq.pth')
model.load_state_dict(saved['model'])
# optimizer = model['optimizer']
model.to(device) # Send the model to the GPU if we have one

learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [9]:
max_len = 0
for entry in YN_r.values:
    input_ids = tokenizer.encode(entry,  add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
print(max_len)

43


In [10]:
df = pd.concat([YN_r, relaxed_label, relaxed], axis=1).rename(columns={0:'YN_r'})
df

Unnamed: 0_level_0,YN_r,goldstandard2,relaxed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Are you employed? I'm a veterinary technician.,Yes,0
1,Are you a fan of Korean food? I wouldn't say so,No,1
2,Are you bringing any pets into the flat? I do ...,No,1
3,Would you like to get some fresh air in your f...,Yes,0
4,Is your family still living in the neighborhoo...,"In the middle, neither yes nor no",2
...,...,...,...
34263,Do you like to drink? I am in AA.,No,1
34264,Do you like pie? My favorite pie is pecan.,Yes,0
34265,Want to go to a concert with me? I'd rather do...,No,1
34266,Do you like hip/hop music? I can't dance to hi...,No,1


In [11]:
train_relaxed, val_relaxed, trainy_relaxed, valy_relaxed = train_test_split(df.index.values, df.relaxed.values, test_size=.4, stratify=df.relaxed.values)
test_relaxed, dev_relaxed, testy_relaxed, devy_relaxed = train_test_split(val_relaxed, valy_relaxed, test_size=.5, stratify=valy_relaxed)

In [12]:
df['data_type'] = ['not_set']*df.shape[0]
df.loc[train_relaxed,'data_type'] = 'train'
df.loc[dev_relaxed,'data_type'] = 'dev'
df.loc[test_relaxed,'data_type'] = 'test'

In [13]:
df.groupby(['goldstandard2','relaxed','data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,YN_r
goldstandard2,relaxed,data_type,Unnamed: 3_level_1
"In the middle, neither yes nor no",2,dev,190
"In the middle, neither yes nor no",2,test,190
"In the middle, neither yes nor no",2,train,569
No,1,dev,2567
No,1,test,2566
No,1,train,7700
Yes,0,dev,3326
Yes,0,test,3326
Yes,0,train,9976
"Yes, subject to some conditions",3,dev,516


In [14]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].YN_r.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_dev = tokenizer.batch_encode_plus(
    df[df.data_type=='dev'].YN_r.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].relaxed.values)

input_ids_dev = encoded_data_dev['input_ids']
attention_masks_dev = encoded_data_dev['attention_mask']
labels_dev = torch.tensor(df[df.data_type=='dev'].relaxed.values)

In [15]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_dev = TensorDataset(input_ids_dev, attention_masks_dev, labels_dev)

In [16]:
len(dataset_train)

19795

In [17]:
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
#                                                       num_labels=len(relaxed_dict),
#                                                       output_attentions=False,
#                                                       output_hidden_states=False)

model.classifier = torch.nn.Linear(model.classifier.in_features, 4)
model.num_labels = 4
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [18]:
batch_size = 32

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_dev, 
                                   sampler=SequentialSampler(dataset_dev), 
                                   batch_size=batch_size)

optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = 1e-8)

In [19]:
epochs = 3
total_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

In [20]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in relaxed_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)} = {len(y_preds[y_preds==label])/len(y_true)}\n')

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [21]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [23]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [24]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BERT_relaxed_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    dev_loss, predictions, true_vals = evaluate(dataloader_validation)
    dev_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {dev_loss}')
    tqdm.write(f'F1 Score (Weighted): {dev_f1}')

  0%|          | 0/3 [00:00<?, ?it/s]
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)

Epoch 1:   0%|          | 0/619 [00:01<?, ?it/s, training_loss=0.487][A
Epoch 1:   0%|          | 1/619 [00:01<15:28,  1.50s/it, training_loss=0.487][A
Epoch 1:   0%|          | 1/619 [00:02<15:28,  1.50s/it, training_loss=0.387][A
Epoch 1:   0%|          | 2/619 [00:02<14:34,  1.42s/it, training_loss=0.387][A
Epoch 1:   0%|          | 2/619 [00:03<14:34,  1.42s/it, training_loss=0.376][A
Epoch 1:   0%|          | 3/619 [00:03<13:57,  1.36s/it, training_loss=0.376][A
Epoch 1:   0%|          | 3/619 [00:05<13:57,  1.36s/it, training_loss=0.370][A
Epoch 1:   1%|          | 4/619 [00:05<13:31,  1.32s/it, training_loss=0.370][A
Epoch 1:   1%|          | 4/619 [00:06<13:31,  1.32s/it, training_l


Epoch 1
Training loss: 0.5581654872570746


 33%|███▎      | 1/3 [16:38<33:17, 998.72s/it]
Epoch 2:   0%|          | 0/619 [00:00<?, ?it/s][A

Validation loss: 0.44115540349253135
F1 Score (Weighted): 0.8387910141023737



Epoch 2:   0%|          | 0/619 [00:01<?, ?it/s, training_loss=0.075][A
Epoch 2:   0%|          | 1/619 [00:01<14:47,  1.44s/it, training_loss=0.075][A
Epoch 2:   0%|          | 1/619 [00:02<14:47,  1.44s/it, training_loss=0.056][A
Epoch 2:   0%|          | 2/619 [00:02<14:50,  1.44s/it, training_loss=0.056][A
Epoch 2:   0%|          | 2/619 [00:04<14:50,  1.44s/it, training_loss=0.126][A
Epoch 2:   0%|          | 3/619 [00:04<14:52,  1.45s/it, training_loss=0.126][A
Epoch 2:   0%|          | 3/619 [00:05<14:52,  1.45s/it, training_loss=0.135][A
Epoch 2:   1%|          | 4/619 [00:05<14:52,  1.45s/it, training_loss=0.135][A
Epoch 2:   1%|          | 4/619 [00:07<14:52,  1.45s/it, training_loss=0.122][A
Epoch 2:   1%|          | 5/619 [00:07<14:51,  1.45s/it, training_loss=0.122][A
Epoch 2:   1%|          | 5/619 [00:08<14:51,  1.45s/it, training_loss=0.076][A
Epoch 2:   1%|          | 6/619 [00:08<14:50,  1.45s/it, training_loss=0.076][A
Epoch 2:   1%|          | 6/619 [00


Epoch 2
Training loss: 0.3337677720940171


 67%|██████▋   | 2/3 [33:33<16:43, 1003.55s/it]
Epoch 3:   0%|          | 0/619 [00:00<?, ?it/s][A

Validation loss: 0.41227923132083266
F1 Score (Weighted): 0.8531241809482162



Epoch 3:   0%|          | 0/619 [00:01<?, ?it/s, training_loss=0.086][A
Epoch 3:   0%|          | 1/619 [00:01<14:44,  1.43s/it, training_loss=0.086][A
Epoch 3:   0%|          | 1/619 [00:02<14:44,  1.43s/it, training_loss=0.050][A
Epoch 3:   0%|          | 2/619 [00:02<14:45,  1.43s/it, training_loss=0.050][A
Epoch 3:   0%|          | 2/619 [00:04<14:45,  1.43s/it, training_loss=0.034][A
Epoch 3:   0%|          | 3/619 [00:04<14:47,  1.44s/it, training_loss=0.034][A
Epoch 3:   0%|          | 3/619 [00:05<14:47,  1.44s/it, training_loss=0.039][A
Epoch 3:   1%|          | 4/619 [00:05<14:47,  1.44s/it, training_loss=0.039][A
Epoch 3:   1%|          | 4/619 [00:07<14:47,  1.44s/it, training_loss=0.045][A
Epoch 3:   1%|          | 5/619 [00:07<14:49,  1.45s/it, training_loss=0.045][A
Epoch 3:   1%|          | 5/619 [00:08<14:49,  1.45s/it, training_loss=0.112][A
Epoch 3:   1%|          | 6/619 [00:08<14:49,  1.45s/it, training_loss=0.112][A
Epoch 3:   1%|          | 6/619 [00


Epoch 3
Training loss: 0.22066833620653253


100%|██████████| 3/3 [50:29<00:00, 1009.96s/it]

Validation loss: 0.43640579412812774
F1 Score (Weighted): 0.8533916048425736





In [25]:
accuracy_per_class(predictions, true_vals)

Class: Yes
Accuracy: 3051/3326 = 0.9173180998196031

Class: No
Accuracy: 2083/2567 = 0.8114530580444098

Class: In the middle, neither yes nor no
Accuracy: 68/190 = 0.35789473684210527

Class: Yes, subject to some conditions
Accuracy: 445/516 = 0.8624031007751938



In [26]:
print('Dev Accuracy:', end = ' ')
flat_accuracy(predictions, true_vals)

Dev Accuracy: 

0.8557357175329595

In [27]:
encoded_data_test = tokenizer.batch_encode_plus(
    df[df.data_type=='test'].YN_r.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(df[df.data_type=='test'].relaxed.values)

dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=batch_size)

test_loss, test_predictions, test_true_vals = evaluate(dataloader_test)
test_f1 = f1_score_func(test_predictions, test_true_vals)
print(test_f1)

0.8551599146117593


In [28]:
accuracy_per_class(test_predictions, test_true_vals)
print('Test Accuracy:', end = ' ')
flat_accuracy(test_predictions, test_true_vals)

Class: Yes
Accuracy: 3021/3326 = 0.9082982561635599

Class: No
Accuracy: 2118/2566 = 0.8254091971940763

Class: In the middle, neither yes nor no
Accuracy: 70/190 = 0.3684210526315789

Class: Yes, subject to some conditions
Accuracy: 444/517 = 0.8588007736943907

Test Accuracy: 

0.8566449462039702