In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd '/content/drive/MyDrive/NYU_class/NLU/Project'

/content/drive/MyDrive/NYU_class/NLU/Project


In [3]:
!pip install --quiet simpletransformers

[K     |████████████████████████████████| 215kB 7.5MB/s 
[K     |████████████████████████████████| 51kB 7.9MB/s 
[K     |████████████████████████████████| 1.2MB 47.5MB/s 
[K     |████████████████████████████████| 225kB 53.5MB/s 
[K     |████████████████████████████████| 122kB 49.4MB/s 
[K     |████████████████████████████████| 2.1MB 54.0MB/s 
[K     |████████████████████████████████| 3.3MB 49.8MB/s 
[K     |████████████████████████████████| 81kB 11.5MB/s 
[K     |████████████████████████████████| 8.2MB 27.7MB/s 
[K     |████████████████████████████████| 1.8MB 41.0MB/s 
[K     |████████████████████████████████| 245kB 56.1MB/s 
[K     |████████████████████████████████| 112kB 61.3MB/s 
[K     |████████████████████████████████| 901kB 37.7MB/s 
[K     |████████████████████████████████| 112kB 60.2MB/s 
[K     |████████████████████████████████| 163kB 54.2MB/s 
[K     |████████████████████████████████| 4.2MB 47.2MB/s 
[K     |████████████████████████████████| 81kB 11.6MB/s 
[

In [4]:
import random
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path
import os
import pickle
import matplotlib.pyplot as plt

import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset

import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    classification_report
)

## Data pre-processing

Relaxed-matched

In [5]:
circa_og = pd.read_csv('circa-data.tsv', sep='\t', index_col='id')
circa_r = circa_og.drop(circa_og.loc[circa_og['goldstandard2']=='Other'].index)
circa_r = circa_r.drop(circa_r.loc[circa_r['goldstandard2'].isnull()].index)
circa_r = circa_r.drop(circa_r.loc[circa_r['goldstandard2']=='I am not sure how X will interpret Y’s answer'].index)

In [6]:
YN_s = (circa_r['question-X'].map(str)+' '+circa_r['answer-Y']).apply(lambda row: row.strip())
relaxed_labels = circa_r['goldstandard2'].unique()
relaxed_label = circa_r['goldstandard2']
relaxed_dict = {}
for idx, label in enumerate(relaxed_labels):
    relaxed_dict[label] = idx
circa_r['relaxed'] = circa_r.goldstandard2.replace(relaxed_dict)
relaxed = circa_r['relaxed']

In [7]:
circa_r

Unnamed: 0_level_0,context,question-X,canquestion-X,answer-Y,judgements,goldstandard1,goldstandard2,relaxed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Y has just travelled from a different city to ...,Are you employed?,I am employed .,I'm a veterinary technician.,Yes#Yes#Yes#Yes#Yes,Yes,Yes,0
1,X wants to know about Y's food preferences.,Are you a fan of Korean food?,I am a fan of Korean food .,I wouldn't say so,Probably no#No#No#No#Probably yes / sometimes yes,No,No,1
2,Y has just told X that he/she is thinking of b...,Are you bringing any pets into the flat?,I am bringing pets into the flat .,I do not own any pets,No#No#No#No#No,No,No,1
3,X wants to know what activities Y likes to do ...,Would you like to get some fresh air in your f...,I would like to get fresh air in my free time .,I am desperate to get out of the city.,"Yes#Yes, subject to some conditions#Probably y...",Yes,Yes,0
4,X and Y are childhood neighbours who unexpecte...,Is your family still living in the neighborhood?,My family is living in the neighborhood .,My parents are snowbirds now.,"No#In the middle, neither yes nor no#Probably ...","In the middle, neither yes nor no","In the middle, neither yes nor no",2
...,...,...,...,...,...,...,...,...
34263,X wants to know what activities Y likes to do ...,Do you like to drink?,I like to drink .,I am in AA.,No#No#No#Probably no#No,No,No,1
34264,X wants to know about Y's food preferences.,Do you like pie?,I like pie .,My favorite pie is pecan.,"Yes#Yes#Yes, subject to some conditions#Yes#Yes",Yes,Yes,0
34265,X wants to know about Y's music preferences.,Want to go to a concert with me?,I want to go to a concert with me .,I'd rather do something else.,"No#In the middle, neither yes nor no#Probably ...",No,No,1
34266,X wants to know about Y's music preferences.,Do you like hip/hop music?,I like hip/hop music .,I can't dance to hip/hop music,"Probably no#Probably no#In the middle, neither...",Probably no,No,1


In [8]:
relaxed_label = circa_r['goldstandard2']
YN_s = (circa_r['question-X'].map(str)+' '+circa_r['answer-Y']).apply(lambda row: row.strip())
df = pd.concat([YN_s, relaxed_label], axis=1).rename(columns={0:'input_text','goldstandard2':'target_text'})
df['prefix'] = 'multilabel classification'
df = df[['prefix','input_text','target_text']]
df

Unnamed: 0_level_0,prefix,input_text,target_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,multilabel classification,Are you employed? I'm a veterinary technician.,Yes
1,multilabel classification,Are you a fan of Korean food? I wouldn't say so,No
2,multilabel classification,Are you bringing any pets into the flat? I do ...,No
3,multilabel classification,Would you like to get some fresh air in your f...,Yes
4,multilabel classification,Is your family still living in the neighborhoo...,"In the middle, neither yes nor no"
...,...,...,...
34263,multilabel classification,Do you like to drink? I am in AA.,No
34264,multilabel classification,Do you like pie? My favorite pie is pecan.,Yes
34265,multilabel classification,Want to go to a concert with me? I'd rather do...,No
34266,multilabel classification,Do you like hip/hop music? I can't dance to hi...,No


## Modeling

In [9]:
train_relaxed, val_relaxed = train_test_split(df, test_size=.4)
test_relaxed, dev_relaxed = train_test_split(val_relaxed, test_size=.5)

In [10]:
from simpletransformers.t5 import T5Model

model_args = {
    "max_seq_length": 196,
    "train_batch_size": 16,
    "eval_batch_size": 64,
    "num_train_epochs": 1,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 1500,
    "evaluate_during_training_verbose": True,
    
    "use_multiprocessing": False,
    "fp16": False,

    "save_steps": -1,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,

    "reprocess_input_data": True,
    "overwrite_output_dir": True,

}

model = T5Model("t5", "t5-base", args=model_args)

model.train_model(train_data=train_relaxed, eval_data=dev_relaxed)

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

  0%|          | 0/19795 [00:00<?, ?it/s]



Using Adafactor for T5


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1238 [00:00<?, ?it/s]

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)
  exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1))


  0%|          | 0/6599 [00:00<?, ?it/s]

(1238,
 {'eval_loss': [0.20543217952721393],
  'global_step': [1238],
  'train_loss': [0.20562894642353058]})

In [11]:
import json
from datetime import datetime
from pprint import pprint
from statistics import mean

from scipy.stats import pearsonr, spearmanr

from transformers.data.metrics.squad_metrics import compute_exact, compute_f1


def f1(truths, preds):
    return mean([compute_f1(truth, pred) for truth, pred in zip(truths, preds)])


def exact(truths, preds):
    return mean([compute_exact(truth, pred) for truth, pred in zip(truths, preds)])



model_args = {
    "overwrite_output_dir": True,
    "max_seq_length": 196,
    "eval_batch_size": 32,
    "num_train_epochs": 1,
    "use_multiprocessing": False,
    "num_beams": None,
    "do_sample": True,
    "max_length": 50,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 3,
}

model = T5Model("t5", "outputs", args=model_args)

df = dev_relaxed

to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(df["prefix"].tolist(), df["input_text"].tolist())
]
truth = df["target_text"].tolist()
tasks = df["prefix"].tolist()

preds = model.predict(to_predict)


preds = [pred[0] for pred in preds]
df["predicted"] = preds

# Evaluating the tasks separately
output_dict = {
    "multilabel classification": {"truth": [], "preds": [],},
}

results_dict = {}

for task, truth_value, pred in zip(tasks, truth, preds):
    output_dict[task]["truth"].append(truth_value)
    output_dict[task]["preds"].append(pred)

print("-----------------------------------")
print("Results: ")
for task, outputs in output_dict.items():
    task_truth = output_dict[task]["truth"]
    task_preds = output_dict[task]["preds"]
    results_dict[task] = {
        "F1 Score": f1(task_truth, task_preds),
        "Accuracy score": exact(task_truth, task_preds),
    }
    print(f"F1 score: {f1(task_truth, task_preds)}")
    print(f"Accuracy score: {exact(task_truth, task_preds)}")
    print()

Generating outputs:   0%|          | 0/207 [00:00<?, ?it/s]



Decoding outputs:   0%|          | 0/19797 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


-----------------------------------
Results: 
F1 score: 0.7680942081608849
Accuracy score: 0.7469313532353387

