In [1]:
# Widen the notebook container to 95% of the browser window.
# `IPython.display` is the public import path for these helpers.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [28]:
import tensorflow as tf
from tensorflow import keras
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, AutoConfig
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm_notebook
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split

In [3]:
%%time
judgments = pd.read_csv('Data/tira-qrels', delim_whitespace=True, names=['topic','iteration','id','relevance'])
arguments = pd.read_pickle('Data/dataset.pkl')

tree = ET.parse('Data/topics-automatic-runs-task-1.xml')
root = tree.getroot()

topics = []
for child in root:
    d = {'topic':int(child[0].text), 'query':child[1].text}
    topics.append(d)
topics = pd.DataFrame(topics)

relevance = judgments.merge(topics)
relevance['relevance_binary'] = (relevance['relevance'] != -2).astype(int)
relevance = relevance.merge(arguments[['id', 'text', 'conclusion']])
relevance = relevance[['id', 'query', 'text', 'conclusion', 'relevance','relevance_binary']]
relevance = relevance.sample(frac=1, random_state=42)
relevance

CPU times: user 885 ms, sys: 432 ms, total: 1.32 s
Wall time: 1.31 s


Unnamed: 0,id,query,text,conclusion,relevance,relevance_binary
353,d267a5af-2019-04-18T18:07:23Z-00009-000,Should Marijuana Be a Medical Option?,Marijuana is a major concern to the United Sta...,Medical Marijuana,3,1
864,7839a8e-2019-04-18T13:02:10Z-00000-000,Is Sexual Orientation Determined at Birth?,Why did you accept my debate if you agreed wit...,Sexual Orientation is a choice.,-2,0
1312,e100392e-2019-04-18T19:19:21Z-00000-000,Do Standardized Tests Improve Education?,"You don't it, you have to provide PROOF you di...",Cannabis Sativa Enhances my Life,-2,0
192,5339b784-2019-04-18T15:45:56Z-00005-000,Should the Federal Minimum Wage Be Increased?,I accept this challenge and negate the resolut...,Resolved: Minimum wages in the United States s...,2,1
782,61bcba6c-2019-04-18T15:04:19Z-00004-000,Should Animals Be Used for Scientific or Comme...,"Well, first of all, thanks for accepting.-----...",Animal Testing,3,1
...,...,...,...,...,...,...
1095,36f68365-2019-04-18T15:05:04Z-00000-000,Do Electronic Voting Machines Improve the Voti...,"As you can see, my opponent has failed to back...",Earth's orbit around the Sun,-2,0
1130,2c05e9fb-2019-04-15T20:23:05Z-00005-000,Do Electronic Voting Machines Improve the Voti...,Remote electronic voting can be conducted very...,allow the use of electronic and internet votin...,-2,0
1294,b760077b-2019-04-18T13:01:46Z-00003-000,Do Standardized Tests Improve Education?,"Challenge accepted, Standardized tests should ...",Standardized Tests,3,1
860,b9c0e12b-2019-04-18T13:51:22Z-00005-000,Is Sexual Orientation Determined at Birth?,Lol well that was a waste of a round. But I gu...,there is nothing wrong with zoosexuality/beast...,3,1


In [4]:
# `relevance` was already shuffled with `sample(frac=1, random_state=42)`,
# so the split is taken in order with shuffling disabled.
data_train, data_valid = train_test_split(
    relevance,
    train_size=0.85,
    shuffle=False,
    random_state=42,
)

In [5]:
def make_dataset_for_encoding(df, batch_size=128):
    """Build a batched ``tf.data.Dataset`` from the text columns of ``df``.

    Args:
        df: DataFrame providing the ``query``, ``text`` and ``conclusion``
            columns; any other columns are ignored and ``df`` is not modified.
        batch_size: Number of rows per batch (default 128).

    Returns:
        A dataset whose elements are dicts keyed by ``query``, ``text`` and
        ``conclusion``, batched by ``batch_size``.
    """
    text_columns = ['query', 'text', 'conclusion']
    features = {column: df[column] for column in text_columns}
    return tf.data.Dataset.from_tensor_slices(features).batch(batch_size)

# With Hugging Face

In [5]:
# Checkpoint name shared by the tokenizer and the models loaded further down.
MODEL_TO_USE = "distilbert-base-uncased"
# Request the fast tokenizer implementation for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_TO_USE,
    use_fast=True,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=546.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [42]:
%%time
tokenized = []
for chunk in tqdm_notebook(np.array_split(relevance, 1), total=1):
    tokenized_chunk = tokenizer.batch_encode_plus(list(zip(list(chunk['query'].values), list(chunk['text'].values))), max_length=tokenizer.max_len, pad_to_max_length=True, return_overflowing_tokens=True)
    tokenized_chunk.pop('token_type_ids')

    overflow_index = tokenized_chunk.pop('overflow_to_sample_mapping')

    # Repeating indices are included as lists of the corresponding index eg: [0,1, [2,2,2,2], [3,3]...]
    overflow_index = np.hstack(overflow_index)
    text_ids = chunk['id'].values
    text_ids = text_ids[overflow_index]
    
    old_index = chunk.index.to_series().values
    old_index = old_index[overflow_index]
    
    relevance_score = chunk['relevance'].values
    relevance_score = relevance_score[overflow_index]
    
    binary_score = chunk['relevance_binary'].values
    binary_score = binary_score[overflow_index]

    df = pd.DataFrame(tokenized_chunk)
    df[['input_ids', 'attention_mask']] = df[['input_ids', 'attention_mask']].applymap(np.array)
    df['id'] = text_ids
    df['old_index'] = old_index
    df['relevance_score'] = relevance_score
    df['binary_score'] = binary_score
    tokenized.append(df)
tokenized = pd.concat(tokenized)
tokenized.reset_index(inplace=True, drop=True)
tokenized[:3]

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


CPU times: user 1.64 s, sys: 18.1 ms, total: 1.66 s
Wall time: 548 ms


Unnamed: 0,input_ids,attention_mask,id,old_index,relevance_score,binary_score
0,"[101, 2323, 16204, 2022, 1037, 2966, 5724, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",d267a5af-2019-04-18T18:07:23Z-00009-000,353,3,1
1,"[101, 2003, 4424, 10296, 4340, 2012, 4182, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",7839a8e-2019-04-18T13:02:10Z-00000-000,864,-2,0
2,"[101, 2079, 16367, 5852, 5335, 2495, 1029, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",e100392e-2019-04-18T19:19:21Z-00000-000,1312,-2,0


In [36]:
# Start from the checkpoint's own configuration and request two output labels.
config = AutoConfig.from_pretrained(MODEL_TO_USE)
config.num_labels = 2
# Sequence-classification model built from that configuration.
seq = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_TO_USE,
    config=config,
)

In [37]:
# i = np.stack(tokenized['input_ids'])
# m = np.stack(tokenized['attention_mask'])

# dataset = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(i), tf.data.Dataset.from_tensor_slices(m)))
# dataset = dataset.batch(32)

In [47]:
# The token columns hold one NumPy array per row (object dtype). Stack them
# into dense 2-D arrays so TensorFlow receives regular numeric tensors; every
# row is padded to the same length, so stacking is well defined. The remaining
# columns pass through as plain 1-D arrays. Keys are unchanged.
ARRAY_COLUMNS = ('input_ids', 'attention_mask')
features = {
    name: np.stack(column.tolist()) if name in ARRAY_COLUMNS else column.values
    for name, column in tokenized.items()
}
dataset = tf.data.Dataset.from_tensor_slices(features)
dataset = dataset.batch(32)

In [48]:
# Pull a single batch out of `dataset` and keep it in `bla` for ad-hoc
# inspection in the following cells.
for thing in dataset.take(1):
    bla = thing

In [59]:
# Leftover shell check: the recorded output shows `ping` is not installed in
# this environment, so this cell has no effect on the analysis.
!ping

/bin/sh: 1: ping: not found


In [58]:
# Run the classifier on the batch held in `bla` and display the raw outputs.
seq(
    inputs=bla['input_ids'],
    attention_mask=bla['attention_mask'],
)

(<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
 array([[0.09643522, 0.06895541],
        [0.10445672, 0.08956073]], dtype=float32)>,)

# With TF-Hub

In [6]:
%%time
module = hub.load('https://tfhub.dev/google/universal-sentence-encoder-qa/3')
query_embedder = module.signatures['question_encoder']
arg_embedder = module.signatures['response_encoder']

INFO:absl:Using /tmp/tfhub_modules to cache modules.


CPU times: user 19.2 s, sys: 2.64 s, total: 21.9 s
Wall time: 20 s


In [32]:
dataset = make_dataset_for_encoding(relevance, batch_size=32)
n_batches = tf.data.experimental.cardinality(dataset).numpy()

query_encodings, arg_encodings, dot_ps = [], [], []
for batch in tqdm_notebook(dataset, total=n_batches):
    # Embed the queries with the question encoder.
    query_vectors = query_embedder(batch['query'])['outputs'].numpy()

    # Embed the argument text, passing its conclusion as context.
    arg_vectors = arg_embedder(
        input=batch['text'],
        context=batch['conclusion'],
    )['outputs'].numpy()

    query_encodings.append(query_vectors)
    arg_encodings.append(arg_vectors)
    # Einstein notation: dot product of matching rows, one value per pair.
    dot_ps.append(np.einsum('ij,ij->i', query_vectors, arg_vectors))

# Stack the per-batch results; `encodings` puts query and argument vectors
# side by side on each row.
query_encodings = np.vstack(query_encodings)
arg_encodings = np.vstack(arg_encodings)
encodings = np.hstack([query_encodings, arg_encodings])

# Join the per-batch dot products and expose them as a single column.
dot_ps = np.concatenate(dot_ps)[:, np.newaxis]

HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))




In [34]:
# Sanity check: number of rows and combined width of the side-by-side
# query and argument encodings.
encodings.shape

(1411, 1024)

In [4]:
# Only the question-answering class is new here; TFAutoModelForSequenceClassification
# is already imported in the notebook's main import cell. The dangling comma
# after the name was a syntax error and has been removed.
from transformers import TFAutoModelForQuestionAnswering

In [6]:
# Load the same checkpoint with two different task heads so their layer
# stacks can be compared in the cells below.
qa_model = TFAutoModelForQuestionAnswering.from_pretrained(MODEL_TO_USE)
seq_model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_TO_USE)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




In [22]:
# List the layers of the question-answering model.
qa_model.layers

[<transformers.modeling_tf_distilbert.TFDistilBertMainLayer at 0x7f5e1f2aeeb8>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f5e1e2008d0>,
 <tensorflow.python.keras.layers.core.Dropout at 0x7f5e1e200c50>]

In [23]:
# List the layers of the sequence-classification model for comparison.
seq_model.layers

[<transformers.modeling_tf_distilbert.TFDistilBertMainLayer at 0x7f5e1e2313c8>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f5e1f0fa0b8>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f5e1f0fa2b0>,
 <tensorflow.python.keras.layers.core.Dropout at 0x7f5e1f0fa438>]

In [26]:
# This loop needs (input_ids, attention_mask) pairs. By this point `dataset`
# has been rebound to the raw-text dataset used for the TF-Hub encoder, so
# build the pairs here directly from `tokenized` instead of relying on
# whichever `dataset` happens to be live in the kernel.
token_pairs = tf.data.Dataset.from_tensor_slices((
    np.stack(tokenized['input_ids'].tolist()),
    np.stack(tokenized['attention_mask'].tolist()),
)).batch(32)

# Run the classifier in inference mode on the first batch only.
for i, m in token_pairs.take(1):
    thing = seq_model(i, attention_mask=m, training=False)

In [27]:
# Display the model output captured for the first batch.
thing

(<tf.Tensor: shape=(32, 2), dtype=float32, numpy=
 array([[ 0.02370459,  0.04156683],
        [ 0.00032923, -0.02047816],
        [ 0.03233599,  0.00640122],
        [ 0.01530294,  0.00575107],
        [ 0.00542534,  0.01057596],
        [ 0.00386707,  0.0109483 ],
        [-0.02191666, -0.01352489],
        [ 0.05955591,  0.03452582],
        [-0.02115268,  0.05518474],
        [ 0.04437988,  0.03044362],
        [ 0.01202991, -0.00052051],
        [-0.00044268,  0.01128567],
        [ 0.00499162,  0.03732803],
        [ 0.03562927, -0.01012373],
        [-0.01354843,  0.01092469],
        [ 0.05125595,  0.00094387],
        [ 0.03309708, -0.0178038 ],
        [ 0.02648746,  0.01509987],
        [ 0.05976138, -0.00176021],
        [ 0.05338664,  0.03857746],
        [ 0.00203231, -0.00394597],
        [-0.01573052, -0.01564515],
        [-0.0220802 ,  0.04258042],
        [ 0.01276951, -0.00522505],
        [ 0.03225153,  0.03452024],
        [ 0.05629495,  0.03115635],
        [ 0.07