In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import tensorflow as tf
from tensorflow import keras
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, AutoConfig
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm_notebook
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split

In [3]:
%%time
judgments = pd.read_csv('Data/tira-qrels', delim_whitespace=True, names=['topic','iteration','id','relevance'])
arguments = pd.read_pickle('Data/dataset.pkl')

tree = ET.parse('Data/topics-automatic-runs-task-1.xml')
root = tree.getroot()

topics = []
for child in root:
    d = {'topic':int(child[0].text), 'query':child[1].text}
    topics.append(d)
topics = pd.DataFrame(topics)

relevance = judgments.merge(topics)
relevance['relevance_binary'] = (relevance['relevance'] != -2).astype(int)
relevance = relevance.merge(arguments[['id', 'text', 'conclusion']])
relevance = relevance[['id', 'query', 'text', 'conclusion', 'relevance','relevance_binary']]
relevance = relevance.sample(frac=1, random_state=42)
display(topics[:5])
display(relevance[:5])

Unnamed: 0,topic,query
0,1,Should Teachers Get Tenure?
1,2,Is Vaping with E-Cigarettes Safe?
2,3,Should Insider Trading Be Allowed?
3,4,Should Corporal Punishment Be Used in Schools?
4,5,Should Social Security Be Privatized?


Unnamed: 0,id,query,text,conclusion,relevance,relevance_binary
353,d267a5af-2019-04-18T18:07:23Z-00009-000,Should Marijuana Be a Medical Option?,Marijuana is a major concern to the United Sta...,Medical Marijuana,3,1
864,7839a8e-2019-04-18T13:02:10Z-00000-000,Is Sexual Orientation Determined at Birth?,Why did you accept my debate if you agreed wit...,Sexual Orientation is a choice.,-2,0
1312,e100392e-2019-04-18T19:19:21Z-00000-000,Do Standardized Tests Improve Education?,"You don't it, you have to provide PROOF you di...",Cannabis Sativa Enhances my Life,-2,0
192,5339b784-2019-04-18T15:45:56Z-00005-000,Should the Federal Minimum Wage Be Increased?,I accept this challenge and negate the resolut...,Resolved: Minimum wages in the United States s...,2,1
782,61bcba6c-2019-04-18T15:04:19Z-00004-000,Should Animals Be Used for Scientific or Comme...,"Well, first of all, thanks for accepting.-----...",Animal Testing,3,1


CPU times: user 843 ms, sys: 483 ms, total: 1.33 s
Wall time: 1.32 s


In [11]:
data_train, data_valid = train_test_split(relevance, shuffle=False, random_state=42, train_size=0.85)

In [12]:
data_train

Unnamed: 0,id,query,text,conclusion,relevance,relevance_binary
353,d267a5af-2019-04-18T18:07:23Z-00009-000,Should Marijuana Be a Medical Option?,Marijuana is a major concern to the United Sta...,Medical Marijuana,3,1
864,7839a8e-2019-04-18T13:02:10Z-00000-000,Is Sexual Orientation Determined at Birth?,Why did you accept my debate if you agreed wit...,Sexual Orientation is a choice.,-2,0
1312,e100392e-2019-04-18T19:19:21Z-00000-000,Do Standardized Tests Improve Education?,"You don't it, you have to provide PROOF you di...",Cannabis Sativa Enhances my Life,-2,0
192,5339b784-2019-04-18T15:45:56Z-00005-000,Should the Federal Minimum Wage Be Increased?,I accept this challenge and negate the resolut...,Resolved: Minimum wages in the United States s...,2,1
782,61bcba6c-2019-04-18T15:04:19Z-00004-000,Should Animals Be Used for Scientific or Comme...,"Well, first of all, thanks for accepting.-----...",Animal Testing,3,1
...,...,...,...,...,...,...
262,f1d02517-2019-04-18T14:24:26Z-00000-000,Should People Become Vegetarian?,"First of all, some people are so twisted that ...",There should not be a death penalty,3,1
150,d55c8fd6-2019-04-18T16:31:33Z-00005-000,Should the Federal Minimum Wage Be Increased?,Resolution: The Federal Minimum Wage Should be...,The minimum wage should be increased,2,1
472,bea71e7b-2019-04-17T11:47:42Z-00093-000,Should the Death Penalty Be Allowed?,The death penalty is not cruel,Death penalty,-2,0
640,61842a89-2019-04-18T18:56:38Z-00003-000,Should Felons Who Have Completed Their Sentenc...,Con wins this debate i surrender,Resolved: that guns should not be allowed to f...,-2,0


In [5]:
def make_dataset_for_encoding(df, batch_size=128):
    df = df.copy()
    dataset = tf.data.Dataset.from_tensor_slices(dict(df[['query','text','conclusion']]))
    dataset = dataset.batch(batch_size)
    return dataset

# With Huggingface

In [4]:
MODEL_TO_USE = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_TO_USE, use_fast=True)

In [6]:
%%time
tokenized = []
for chunk in tqdm_notebook(np.array_split(relevance, 1), total=1):
    tokenized_chunk = tokenizer.batch_encode_plus(list(zip(list(chunk['query'].values), list(chunk['text'].values))), max_length=tokenizer.max_len, pad_to_max_length=True, return_overflowing_tokens=True)
    tokenized_chunk.pop('token_type_ids')

    overflow_index = tokenized_chunk.pop('overflow_to_sample_mapping')

    # Repeating indices are included as lists of the corresponding index eg: [0,1, [2,2,2,2], [3,3]...]
    overflow_index = np.hstack(overflow_index)
    text_ids = chunk['id'].values
    text_ids = text_ids[overflow_index]
    
    old_index = chunk.index.to_series().values
    old_index = old_index[overflow_index]
    
    relevance_score = chunk['relevance'].values
    relevance_score = relevance_score[overflow_index]
    
    binary_score = chunk['relevance_binary'].values
    binary_score = binary_score[overflow_index]

    df = pd.DataFrame(tokenized_chunk)
    df[['input_ids', 'attention_mask']] = df[['input_ids', 'attention_mask']].applymap(np.array)
    df['id'] = text_ids
    df['old_index'] = old_index
    df['relevance_score'] = relevance_score
    df['binary_score'] = binary_score
    tokenized.append(df)
tokenized = pd.concat(tokenized)
tokenized.reset_index(inplace=True, drop=True)
tokenized[:3]

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


CPU times: user 2.45 s, sys: 427 ms, total: 2.88 s
Wall time: 686 ms


Unnamed: 0,input_ids,attention_mask,id,old_index,relevance_score,binary_score
0,"[101, 2323, 16204, 2022, 1037, 2966, 5724, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",d267a5af-2019-04-18T18:07:23Z-00009-000,353,3,1
1,"[101, 2003, 4424, 10296, 4340, 2012, 4182, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",7839a8e-2019-04-18T13:02:10Z-00000-000,864,-2,0
2,"[101, 2079, 16367, 5852, 5335, 2495, 1029, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",e100392e-2019-04-18T19:19:21Z-00000-000,1312,-2,0


In [9]:
tokenized_train, tokenized_valid = train_test_split(tokenized, shuffle=False, random_state=42, train_size=0.85)

In [7]:
config = AutoConfig.from_pretrained(MODEL_TO_USE)
config.num_labels=2
seq_model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_TO_USE, config=config)

In [12]:
i_train = np.stack(tokenized_train['input_ids'])
m_train = np.stack(tokenized_train['attention_mask'])
target_train = np.stack(tokenized_train[['relevance_score', 'binary_score']].values)

i_valid = np.stack(tokenized['input_ids'])
m_valid = np.stack(tokenized['attention_mask'])
target_valid = np.stack(tokenized_train[['relevance_score', 'binary_score']].values)

## Define a multiple input - multiple output keras neural network using the functional API, with a huggingface sequence classifier with two outputs

In [31]:
keras.backend.clear_session()

input_ids = keras.layers.Input(shape=[512], name="input_ids", dtype=tf.int64)
input_masks = keras.layers.Input(shape=[512], name="masks", dtype=tf.int64)
two_outputs = seq_model(input_ids, attention_mask=input_masks)[0]
output_relevance = keras.layers.Lambda(lambda x: x[:,0])(two_outputs)
output_binary = keras.layers.Lambda(lambda x: x[:,1])(two_outputs)
output_binary = keras.layers.Activation('sigmoid')(output_binary)
model = keras.Model(inputs=[input_ids, input_masks], outputs=[output_relevance, output_binary])

In [32]:
model.predict([i_train[:1], m_train[:1]])

ResourceExhaustedError:  OOM when allocating tensor with shape[1,12,512,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node model/tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._0/attention/MatMul (defined at /usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_distilbert.py:238) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_distributed_function_283946]

Errors may have originated from an input operation.
Input Source operations connected to node model/tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._0/attention/MatMul:
 model/tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._0/attention/truediv (defined at /usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_distilbert.py:237)	
 model/tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._0/attention/transpose_1 (defined at /usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_distilbert.py:227)

Function call stack:
distributed_function


In [None]:
model.compile(loss=["mse", ""])

In [47]:
dataset = tf.data.Dataset.from_tensor_slices(dict(tokenized))
dataset = dataset.batch(32)

In [48]:
for thing in dataset.take(1):
    bla = thing

In [58]:
seq(inputs=bla['input_ids'], attention_mask=bla['attention_mask'])

(<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
 array([[0.09643522, 0.06895541],
        [0.10445672, 0.08956073]], dtype=float32)>,)

# With TF-Hub

In [30]:
%%time
keras.backend.clear_session()

module = hub.load('https://tfhub.dev/google/universal-sentence-encoder-qa/3')
query_embedder = module.signatures['question_encoder']
arg_embedder = module.signatures['response_encoder']

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run Identity: Dst tensor is not initialized. [Op:Identity]

In [32]:
dataset = make_dataset_for_encoding(relevance, batch_size=32)

query_encodings = []
arg_encodings = []
dot_ps = []
for batch in tqdm_notebook(dataset, total=tf.data.experimental.cardinality(dataset).numpy()):
    query_encoding = query_embedder(batch['query'])
    query_encoding = query_encoding['outputs'].numpy()
    
    arg_encoding = arg_embedder(input=batch['text'], context=batch['conclusion'])
    arg_encoding = arg_encoding['outputs'].numpy()
    
    #Use einstein notation to perform row-by-row dot-product
    dot_p = np.einsum('ij,ij->i',query_encoding, arg_encoding)
    
    query_encodings.append(query_encoding)
    arg_encodings.append(arg_encoding)
    dot_ps.append(dot_p)
query_encodings = np.vstack(query_encodings)
arg_encodings = np.vstack(arg_encodings)
encodings = np.hstack([query_encodings, arg_encodings])

#Add a column dimension to dot_ps to be able to vstack it
dot_ps = np.vstack([item[:, np.newaxis] for item in dot_ps])

HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))




In [34]:
encodings.shape

(1411, 1024)