In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd '/content/drive/MyDrive/'

/content/drive/MyDrive


In [4]:
!pip install transformers
!pip install sentencepiece
import torch
import torch.nn as nn 
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
# from transformers import AutoTokenizer, BertTokenizer, EvalPrediction, BertPreTrainedModel, BertConfig, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

import random
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 8.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 47.4MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 53.3MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Collecting sentencepiece
[?25l  Downloading https://files.pythonh

In [5]:
circa_og = pd.read_csv('NLU_Project/circa-data.tsv', sep='\t', index_col='id')
circa_r = circa_og.drop(circa_og.loc[circa_og['goldstandard2']=='Other'].index)
circa_r = circa_r.drop(circa_r.loc[circa_r['goldstandard2'].isnull()].index)

In [6]:
circa_r

Unnamed: 0_level_0,context,question-X,canquestion-X,answer-Y,judgements,goldstandard1,goldstandard2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Y has just travelled from a different city to ...,Are you employed?,I am employed .,I'm a veterinary technician.,Yes#Yes#Yes#Yes#Yes,Yes,Yes
1,X wants to know about Y's food preferences.,Are you a fan of Korean food?,I am a fan of Korean food .,I wouldn't say so,Probably no#No#No#No#Probably yes / sometimes yes,No,No
2,Y has just told X that he/she is thinking of b...,Are you bringing any pets into the flat?,I am bringing pets into the flat .,I do not own any pets,No#No#No#No#No,No,No
3,X wants to know what activities Y likes to do ...,Would you like to get some fresh air in your f...,I would like to get fresh air in my free time .,I am desperate to get out of the city.,"Yes#Yes, subject to some conditions#Probably y...",Yes,Yes
4,X and Y are childhood neighbours who unexpecte...,Is your family still living in the neighborhood?,My family is living in the neighborhood .,My parents are snowbirds now.,"No#In the middle, neither yes nor no#Probably ...","In the middle, neither yes nor no","In the middle, neither yes nor no"
...,...,...,...,...,...,...,...
34263,X wants to know what activities Y likes to do ...,Do you like to drink?,I like to drink .,I am in AA.,No#No#No#Probably no#No,No,No
34264,X wants to know about Y's food preferences.,Do you like pie?,I like pie .,My favorite pie is pecan.,"Yes#Yes#Yes, subject to some conditions#Yes#Yes",Yes,Yes
34265,X wants to know about Y's music preferences.,Want to go to a concert with me?,I want to go to a concert with me .,I'd rather do something else.,"No#In the middle, neither yes nor no#Probably ...",No,No
34266,X wants to know about Y's music preferences.,Do you like hip/hop music?,I like hip/hop music .,I can't dance to hip/hop music,"Probably no#Probably no#In the middle, neither...",Probably no,No


In [7]:
import re
def normalize_text(text):
    """Lowercase and remove quotes from a TensorFlow string."""
    text = text.lower()
    text = re.sub("'(.*)'", r"\1",text)
    return text

In [8]:
circa_r['question-X'] = circa_r['question-X'].map(normalize_text)
circa_r['answer-Y'] = circa_r['answer-Y'].map(normalize_text)
circa_r['goldstandard2'] = circa_r['goldstandard2'].map(normalize_text)
df = circa_r.drop(columns=['context', 'canquestion-X', 'judgements', 'goldstandard1'])

In [9]:
df

Unnamed: 0_level_0,question-X,answer-Y,goldstandard2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,are you employed?,i'm a veterinary technician.,yes
1,are you a fan of korean food?,i wouldn't say so,no
2,are you bringing any pets into the flat?,i do not own any pets,no
3,would you like to get some fresh air in your f...,i am desperate to get out of the city.,yes
4,is your family still living in the neighborhood?,my parents are snowbirds now.,"in the middle, neither yes nor no"
...,...,...,...
34263,do you like to drink?,i am in aa.,no
34264,do you like pie?,my favorite pie is pecan.,yes
34265,want to go to a concert with me?,i'd rather do something else.,no
34266,do you like hip/hop music?,i can't dance to hip/hop music,no


In [10]:
candidates = df['goldstandard2'].unique()
candidates = " ".join([f"({chr(ord('A') + i)}) {x}" for i, x in enumerate(candidates)])
candidates

'(A) yes (B) no (C) in the middle, neither yes nor no (D) yes, subject to some conditions'

In [11]:
def format_input(row, candidates):
  question = row['question-X']
  paragraph = row['answer-Y']
  return f"{question} \\n {candidates} \\n {paragraph}"

df['uqa_input'] = df.apply(lambda x: format_input(x, candidates), axis = 1)


In [12]:
from transformers import AutoTokenizer, T5ForConditionalGeneration

model_name = "allenai/unifiedqa-t5-small" # you can specify the model size here
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = model.generate(input_ids, **generator_args)
    return tokenizer.batch_decode(res, skip_special_tokens=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1233.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1786.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242089239.0, style=ProgressStyle(descri…




In [13]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()
print(device_name)

if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

train_relaxed, val_relaxed, trainy_relaxed, valy_relaxed = train_test_split(df.index.values, df.goldstandard2.values, test_size=.4, stratify=df.goldstandard2.values)
test_relaxed, dev_relaxed, testy_relaxed, devy_relaxed = train_test_split(val_relaxed, valy_relaxed, test_size=.5, stratify=valy_relaxed)

/device:GPU:0


In [14]:
df['data_type'] = ['not_set']*df.shape[0]
df.loc[train_relaxed,'data_type'] = 'train'
df.loc[dev_relaxed,'data_type'] = 'dev'
df.loc[test_relaxed,'data_type'] = 'test'

In [15]:
df.head()

Unnamed: 0_level_0,question-X,answer-Y,goldstandard2,uqa_input,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,are you employed?,i'm a veterinary technician.,yes,are you employed? \n (A) yes (B) no (C) in the...,train
1,are you a fan of korean food?,i wouldn't say so,no,are you a fan of korean food? \n (A) yes (B) n...,train
2,are you bringing any pets into the flat?,i do not own any pets,no,are you bringing any pets into the flat? \n (A...,dev
3,would you like to get some fresh air in your f...,i am desperate to get out of the city.,yes,would you like to get some fresh air in your f...,train
4,is your family still living in the neighborhood?,my parents are snowbirds now.,"in the middle, neither yes nor no",is your family still living in the neighborhoo...,train


In [16]:
df[df.data_type=='train'].uqa_input.values

array(["are you employed? \\n (A) yes (B) no (C) in the middle, neither yes nor no (D) yes, subject to some conditions \\n i'm a veterinary technician.",
       "are you a fan of korean food? \\n (A) yes (B) no (C) in the middle, neither yes nor no (D) yes, subject to some conditions \\n i wouldn't say so",
       'would you like to get some fresh air in your free time? \\n (A) yes (B) no (C) in the middle, neither yes nor no (D) yes, subject to some conditions \\n i am desperate to get out of the city.',
       ...,
       "want to go to a concert with me? \\n (A) yes (B) no (C) in the middle, neither yes nor no (D) yes, subject to some conditions \\n i'd rather do something else.",
       "do you like hip/hop music? \\n (A) yes (B) no (C) in the middle, neither yes nor no (D) yes, subject to some conditions \\n i can't dance to hip/hop music",
       'do you see yourself raising a family in new york? \\n (A) yes (B) no (C) in the middle, neither yes nor no (D) yes, subject to some co

In [17]:
# list(df[df.data_type=='train'].goldstandard2.values)

In [18]:
encoded_data_train = tokenizer.batch_encode_plus(
    list(df[df.data_type=='train'].uqa_input.values),
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256,  
    return_tensors='pt'
)

encoded_labels_train = tokenizer.batch_encode_plus(
    list(df[df.data_type=='train'].goldstandard2.values),
    add_special_tokens=False, 
    return_attention_mask=False, 
    pad_to_max_length=True, 
    max_length=100, 
    return_tensors='pt'
)

encoded_data_dev = tokenizer.batch_encode_plus(
    list(df[df.data_type=='dev'].uqa_input.values),
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256,  
    return_tensors='pt'
)

encoded_labels_dev = tokenizer.batch_encode_plus(
    list(df[df.data_type=='dev'].goldstandard2.values),
    add_special_tokens=False, 
    return_attention_mask=False, 
    pad_to_max_length=True, 
    max_length=100, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
# labels_train = df[df.data_type=='train'].goldstandard2.values
labels_train = encoded_labels_train['input_ids']

input_ids_dev = encoded_data_dev['input_ids']
attention_masks_dev = encoded_data_dev['attention_mask']
labels_dev = encoded_labels_dev['input_ids']


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [19]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_dev = TensorDataset(input_ids_dev, attention_masks_dev, labels_dev)

In [20]:
batch_size = 32

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_dev, 
                                   sampler=SequentialSampler(dataset_dev), 
                                   batch_size=batch_size)

optimizer = AdamW(model.parameters(),
                  eps = 1e-8)

In [21]:
epochs = 3
total_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

In [22]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in relaxed_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)} = {len(y_preds[y_preds==label])/len(y_true)}\n')

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [23]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [24]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

In [25]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

saved = torch.load('finetuned_UQA_relaxed_epoch_3.model')
model.load_state_dict(saved)
model.to(device)

print(device)

cuda


In [26]:
import gc
gc.collect()
torch.cuda.empty_cache()

### Training

In [None]:
for epoch in tqdm(range(3, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_UQA_relaxed_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    dev_loss, predictions, true_vals = evaluate(dataloader_validation)
    dev_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {dev_loss}')
    tqdm.write(f'F1 Score (Weighted): {dev_f1}')

  0%|          | 0/1 [00:00<?, ?it/s]
Epoch 3:   0%|          | 0/619 [00:00<?, ?it/s][A
Epoch 3:   0%|          | 0/619 [00:01<?, ?it/s, training_loss=0.001][A
Epoch 3:   0%|          | 1/619 [00:01<10:41,  1.04s/it, training_loss=0.001][A
Epoch 3:   0%|          | 1/619 [00:01<10:41,  1.04s/it, training_loss=0.002][A
Epoch 3:   0%|          | 2/619 [00:01<09:29,  1.08it/s, training_loss=0.002][A
Epoch 3:   0%|          | 2/619 [00:02<09:29,  1.08it/s, training_loss=0.002][A
Epoch 3:   0%|          | 3/619 [00:02<08:44,  1.17it/s, training_loss=0.002][A
Epoch 3:   0%|          | 3/619 [00:03<08:44,  1.17it/s, training_loss=0.002][A
Epoch 3:   1%|          | 4/619 [00:03<08:08,  1.26it/s, training_loss=0.002][A
Epoch 3:   1%|          | 4/619 [00:03<08:08,  1.26it/s, training_loss=0.001][A
Epoch 3:   1%|          | 5/619 [00:03<07:45,  1.32it/s, training_loss=0.001][A
Epoch 3:   1%|          | 5/619 [00:04<07:45,  1.32it/s, training_loss=0.001][A
Epoch 3:   1%|          | 6


Epoch 3
Training loss: 0.004476959314588487


### Evaluation

In [27]:
model.to('cpu')

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [47]:
preds = []
dev_data = df[df.data_type=='dev'].uqa_input.values

for i in tqdm(range(len(dev_data))):
  preds.append(run_model(dev_data[i]))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 24%|██▍       | 1600/6599 [12:36<39:32,  2.11it/s][A
 24%|██▍       | 1601/6599 [12:36<40:04,  2.08it/s][A
 24%|██▍       | 1602/6599 [12:37<40:29,  2.06it/s][A
 24%|██▍       | 1603/6599 [12:37<40:18,  2.07it/s][A
 24%|██▍       | 1604/6599 [12:38<40:32,  2.05it/s][A
 24%|██▍       | 1605/6599 [12:38<40:02,  2.08it/s][A
 24%|██▍       | 1606/6599 [12:39<40:00,  2.08it/s][A
 24%|██▍       | 1607/6599 [12:39<39:52,  2.09it/s][A
 24%|██▍       | 1608/6599 [12:40<39:35,  2.10it/s][A
 24%|██▍       | 1609/6599 [12:40<39:39,  2.10it/s][A
 24%|██▍       | 1610/6599 [12:41<39:26,  2.11it/s][A
 24%|██▍       | 1611/6599 [12:41<39:28,  2.11it/s][A
 24%|██▍       | 1612/6599 [12:42<39:41,  2.09it/s][A
 24%|██▍       | 1613/6599 [12:42<39:15,  2.12it/s][A
 24%|██▍       | 1614/6599 [12:43<39:40,  2.09it/s][A
 24%|██▍       | 1615/6599 [12:43<39:30,  2.10it/s][A
 24%|██▍       | 1616/6599 [12:44<39:26,  2.11it/s][A


In [48]:
dev_df = df[df.data_type=='dev']
predictions = [i[0] for i in preds]
dev_df['predictions'] = predictions
dev_df.to_csv('uqa_relaxed_dev.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [49]:
print('Dev Accuracy:', sum(dev_df['goldstandard2'] == dev_df['predictions'])/dev_df.shape[0])

Dev Accuracy: 0.8930140930444007


In [50]:
preds = []
test_data = df[df.data_type=='test'].uqa_input.values

for i in tqdm(range(len(test_data))):
  preds.append(run_model(test_data[i]))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 24%|██▍       | 1600/6599 [12:34<39:44,  2.10it/s][A
 24%|██▍       | 1601/6599 [12:35<39:41,  2.10it/s][A
 24%|██▍       | 1602/6599 [12:35<39:48,  2.09it/s][A
 24%|██▍       | 1603/6599 [12:36<39:29,  2.11it/s][A
 24%|██▍       | 1604/6599 [12:36<39:28,  2.11it/s][A
 24%|██▍       | 1605/6599 [12:37<39:30,  2.11it/s][A
 24%|██▍       | 1606/6599 [12:37<39:54,  2.09it/s][A
 24%|██▍       | 1607/6599 [12:38<39:44,  2.09it/s][A
 24%|██▍       | 1608/6599 [12:38<39:27,  2.11it/s][A
 24%|██▍       | 1609/6599 [12:38<39:00,  2.13it/s][A
 24%|██▍       | 1610/6599 [12:39<39:14,  2.12it/s][A
 24%|██▍       | 1611/6599 [12:39<39:27,  2.11it/s][A
 24%|██▍       | 1612/6599 [12:40<40:01,  2.08it/s][A
 24%|██▍       | 1613/6599 [12:40<39:48,  2.09it/s][A
 24%|██▍       | 1614/6599 [12:41<39:45,  2.09it/s][A
 24%|██▍       | 1615/6599 [12:41<39:56,  2.08it/s][A
 24%|██▍       | 1616/6599 [12:42<40:10,  2.07it/s][A


In [51]:
test_df = df[df.data_type=='test']
predictions = [i[0] for i in preds]
test_df['predictions'] = predictions
test_df.to_csv('uqa_relaxed_test.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [53]:
print('Test Accuracy:', sum(test_df['goldstandard2'] == test_df['predictions'])/test_df.shape[0])

Test Accuracy: 0.8964994696166085
