In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers==2.5.1

Collecting transformers==2.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/13/33/ffb67897a6985a7b7d8e5e7878c3628678f553634bd3836404fef06ef19b/transformers-2.5.1-py3-none-any.whl (499kB)
[K     |▋                               | 10kB 25.0MB/s eta 0:00:01[K     |█▎                              | 20kB 33.0MB/s eta 0:00:01[K     |██                              | 30kB 23.7MB/s eta 0:00:01[K     |██▋                             | 40kB 27.4MB/s eta 0:00:01[K     |███▎                            | 51kB 23.2MB/s eta 0:00:01[K     |████                            | 61kB 19.2MB/s eta 0:00:01[K     |████▋                           | 71kB 17.3MB/s eta 0:00:01[K     |█████▎                          | 81kB 17.9MB/s eta 0:00:01[K     |██████                          | 92kB 16.8MB/s eta 0:00:01[K     |██████▋                         | 102kB 16.2MB/s eta 0:00:01[K     |███████▏                        | 112kB 16.2MB/s eta 0:00:01[K     |███████▉                   

In [3]:
import torch
import torch.nn as nn 
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
# from transformers import AutoTokenizer, BertTokenizer, EvalPrediction, BertPreTrainedModel, BertConfig, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

import random
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [4]:
cd '/content/drive/MyDrive/'

/content/drive/MyDrive


## Data pre-processing

Strict-matched

In [5]:
circa_og = pd.read_csv('NLU_Project/circa-data.tsv', sep='\t', index_col='id')
circa_s = circa_og.drop(circa_og.loc[circa_og['goldstandard1']=='Other'].index)
circa_s = circa_s.drop(circa_s.loc[circa_s['goldstandard1'].isnull()].index)

In [6]:
YN_s = (circa_s['question-X'].map(str)+' '+circa_s['answer-Y']).apply(lambda row: row.strip())
strict_labels = circa_s['goldstandard1'].unique()
strict_label = circa_s['goldstandard1']
strict_dict = {}
for idx, label in enumerate(strict_labels):
    strict_dict[label] = idx
circa_s['strict'] = circa_s.goldstandard1.replace(strict_dict)
strict = circa_s['strict']

In [7]:
strict_dict

{'I am not sure how X will interpret Y’s answer': 6,
 'In the middle, neither yes nor no': 2,
 'No': 1,
 'Probably no': 4,
 'Probably yes / sometimes yes': 3,
 'Yes': 0,
 'Yes, subject to some conditions': 5}

## Modeling

In [8]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()
print(device_name)

if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

/device:GPU:0


### Strict

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
saved = torch.load('DS-GA 1012/boolq.pth')
model.load_state_dict(saved['model'])
# optimizer = model['optimizer']
model.to(device) # Send the model to the GPU if we have one

learning_rate = 3e-5
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [10]:
max_len = 0
for entry in YN_s.values:
    input_ids = tokenizer.encode(entry,  add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
print(max_len)

43


In [11]:
df = pd.concat([YN_s, strict_label, strict], axis=1).rename(columns={0:'YN_s'})
df

Unnamed: 0_level_0,YN_s,goldstandard1,strict
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Are you employed? I'm a veterinary technician.,Yes,0
1,Are you a fan of Korean food? I wouldn't say so,No,1
2,Are you bringing any pets into the flat? I do ...,No,1
3,Would you like to get some fresh air in your f...,Yes,0
4,Is your family still living in the neighborhoo...,"In the middle, neither yes nor no",2
...,...,...,...
34263,Do you like to drink? I am in AA.,No,1
34264,Do you like pie? My favorite pie is pecan.,Yes,0
34265,Want to go to a concert with me? I'd rather do...,No,1
34266,Do you like hip/hop music? I can't dance to hi...,Probably no,4


In [12]:
train_strict, val_strict, trainy_strict, valy_strict = train_test_split(df.index.values, df.strict.values, test_size=.4, stratify=df.strict.values)
test_strict, dev_strict, testy_strict, devy_strict = train_test_split(val_strict, valy_strict, test_size=.5, stratify=valy_strict)

In [13]:
df['data_type'] = ['not_set']*df.shape[0]
df.loc[train_strict,'data_type'] = 'train'
df.loc[dev_strict,'data_type'] = 'dev'
df.loc[test_strict,'data_type'] = 'test'

In [14]:
df.groupby(['goldstandard1','strict','data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,YN_s
goldstandard1,strict,data_type,Unnamed: 3_level_1
I am not sure how X will interpret Y’s answer,6,dev,12
I am not sure how X will interpret Y’s answer,6,test,13
I am not sure how X will interpret Y’s answer,6,train,38
"In the middle, neither yes nor no",2,dev,128
"In the middle, neither yes nor no",2,test,127
"In the middle, neither yes nor no",2,train,383
No,1,dev,2166
No,1,test,2166
No,1,train,6497
Probably no,4,dev,232


In [15]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].YN_s.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_dev = tokenizer.batch_encode_plus(
    df[df.data_type=='dev'].YN_s.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].strict.values)

input_ids_dev = encoded_data_dev['input_ids']
attention_masks_dev = encoded_data_dev['attention_mask']
labels_dev = torch.tensor(df[df.data_type=='dev'].strict.values)

In [16]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_dev = TensorDataset(input_ids_dev, attention_masks_dev, labels_dev)

In [17]:
len(dataset_train)

18612

In [18]:
model.classifier = torch.nn.Linear(model.classifier.in_features, 7)
model.num_labels = 7
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [19]:
batch_size = 32

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_dev, 
                                   sampler=SequentialSampler(dataset_dev), 
                                   batch_size=batch_size)

optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = 1e-8)

In [20]:
epochs = 3
total_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

In [21]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in strict_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)} = {len(y_preds[y_preds==label])/len(y_true)}\n')

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [22]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [24]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [25]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BERT_strict_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    dev_loss, predictions, true_vals = evaluate(dataloader_validation)
    dev_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {dev_loss}')
    tqdm.write(f'F1 Score (Weighted): {dev_f1}')

  0%|          | 0/3 [00:00<?, ?it/s]
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)

Epoch 1:   0%|          | 0/582 [00:01<?, ?it/s, training_loss=0.564][A
Epoch 1:   0%|          | 1/582 [00:01<14:50,  1.53s/it, training_loss=0.564][A
Epoch 1:   0%|          | 1/582 [00:02<14:50,  1.53s/it, training_loss=0.592][A
Epoch 1:   0%|          | 2/582 [00:02<14:00,  1.45s/it, training_loss=0.592][A
Epoch 1:   0%|          | 2/582 [00:04<14:00,  1.45s/it, training_loss=0.499][A
Epoch 1:   1%|          | 3/582 [00:04<13:22,  1.39s/it, training_loss=0.499][A
Epoch 1:   1%|          | 3/582 [00:05<13:22,  1.39s/it, training_loss=0.503][A
Epoch 1:   1%|          | 4/582 [00:05<12:57,  1.35s/it, training_loss=0.503][A
Epoch 1:   1%|          | 4/582 [00:06<12:57,  1.35s/it, training_l


Epoch 1
Training loss: 0.7972609580484862


 33%|███▎      | 1/3 [15:02<30:04, 902.32s/it]
Epoch 2:   0%|          | 0/582 [00:00<?, ?it/s][A

Validation loss: 0.6589933294303638
F1 Score (Weighted): 0.7698438166093962



Epoch 2:   0%|          | 0/582 [00:01<?, ?it/s, training_loss=0.142][A
Epoch 2:   0%|          | 1/582 [00:01<13:12,  1.36s/it, training_loss=0.142][A
Epoch 2:   0%|          | 1/582 [00:02<13:12,  1.36s/it, training_loss=0.147][A
Epoch 2:   0%|          | 2/582 [00:02<13:14,  1.37s/it, training_loss=0.147][A
Epoch 2:   0%|          | 2/582 [00:04<13:14,  1.37s/it, training_loss=0.136][A
Epoch 2:   1%|          | 3/582 [00:04<13:15,  1.37s/it, training_loss=0.136][A
Epoch 2:   1%|          | 3/582 [00:05<13:15,  1.37s/it, training_loss=0.126][A
Epoch 2:   1%|          | 4/582 [00:05<13:15,  1.38s/it, training_loss=0.126][A
Epoch 2:   1%|          | 4/582 [00:06<13:15,  1.38s/it, training_loss=0.187][A
Epoch 2:   1%|          | 5/582 [00:06<13:16,  1.38s/it, training_loss=0.187][A
Epoch 2:   1%|          | 5/582 [00:08<13:16,  1.38s/it, training_loss=0.245][A
Epoch 2:   1%|          | 6/582 [00:08<13:16,  1.38s/it, training_loss=0.245][A
Epoch 2:   1%|          | 6/582 [00


Epoch 2
Training loss: 0.5204353871726498


 67%|██████▋   | 2/3 [30:13<15:04, 904.96s/it]
Epoch 3:   0%|          | 0/582 [00:00<?, ?it/s][A

Validation loss: 0.6060280970384165
F1 Score (Weighted): 0.7892480509405266



Epoch 3:   0%|          | 0/582 [00:01<?, ?it/s, training_loss=0.059][A
Epoch 3:   0%|          | 1/582 [00:01<13:16,  1.37s/it, training_loss=0.059][A
Epoch 3:   0%|          | 1/582 [00:02<13:16,  1.37s/it, training_loss=0.115][A
Epoch 3:   0%|          | 2/582 [00:02<13:14,  1.37s/it, training_loss=0.115][A
Epoch 3:   0%|          | 2/582 [00:04<13:14,  1.37s/it, training_loss=0.159][A
Epoch 3:   1%|          | 3/582 [00:04<13:14,  1.37s/it, training_loss=0.159][A
Epoch 3:   1%|          | 3/582 [00:05<13:14,  1.37s/it, training_loss=0.087][A
Epoch 3:   1%|          | 4/582 [00:05<13:13,  1.37s/it, training_loss=0.087][A
Epoch 3:   1%|          | 4/582 [00:06<13:13,  1.37s/it, training_loss=0.083][A
Epoch 3:   1%|          | 5/582 [00:06<13:12,  1.37s/it, training_loss=0.083][A
Epoch 3:   1%|          | 5/582 [00:08<13:12,  1.37s/it, training_loss=0.283][A
Epoch 3:   1%|          | 6/582 [00:08<13:14,  1.38s/it, training_loss=0.283][A
Epoch 3:   1%|          | 6/582 [00


Epoch 3
Training loss: 0.3560123329412487


100%|██████████| 3/3 [45:24<00:00, 908.08s/it]

Validation loss: 0.6711228408457077
F1 Score (Weighted): 0.799392932238563





In [26]:
accuracy_per_class(predictions, true_vals)

Class: Yes
Accuracy: 2462/2901 = 0.848672871423647

Class: No
Accuracy: 1920/2166 = 0.8864265927977839

Class: In the middle, neither yes nor no
Accuracy: 55/128 = 0.4296875

Class: Probably yes / sometimes yes
Accuracy: 91/249 = 0.3654618473895582

Class: Probably no
Accuracy: 39/232 = 0.16810344827586207

Class: Yes, subject to some conditions
Accuracy: 464/517 = 0.8974854932301741

Class: I am not sure how X will interpret Y’s answer
Accuracy: 0/12 = 0.0



In [27]:
print('Dev Accuracy:', end = ' ')
flat_accuracy(predictions, true_vals)

Dev Accuracy: 

0.8107977437550362

In [28]:
encoded_data_test = tokenizer.batch_encode_plus(
    df[df.data_type=='test'].YN_s.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(df[df.data_type=='test'].strict.values)

dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=batch_size)

test_loss, test_predictions, test_true_vals = evaluate(dataloader_test)
test_f1 = f1_score_func(test_predictions, test_true_vals)
print(test_f1)

0.8108468193210941


In [29]:
accuracy_per_class(test_predictions, test_true_vals)
print('Test Accuracy:', end = ' ')
flat_accuracy(test_predictions, test_true_vals)

Class: Yes
Accuracy: 2495/2901 = 0.8600482592209583

Class: No
Accuracy: 1934/2166 = 0.8928901200369345

Class: In the middle, neither yes nor no
Accuracy: 58/127 = 0.4566929133858268

Class: Probably yes / sometimes yes
Accuracy: 103/249 = 0.41365461847389556

Class: Probably no
Accuracy: 39/232 = 0.16810344827586207

Class: Yes, subject to some conditions
Accuracy: 466/516 = 0.9031007751937985

Class: I am not sure how X will interpret Y’s answer
Accuracy: 0/13 = 0.0

Test Accuracy: 

0.821244358478401