In [2]:
!pip install --disable-pip-version-check -q pandas transformers seaborn tensorflow_hub elasticsearch elasticsearch-dsl annoy faiss-gpu
!pip install --disable-pip-version-check -Uq scikit-learn
!pip -q install --disable-pip-version-check --no-warn-script-location --user tensorflow-text 
!pip -q uninstall -y tensorflow

In [3]:
# Widen the notebook's main container to 95% of the browser window.
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
from pathlib import Path
from urllib.request import urlopen

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
from transformers import TFAutoModel, AutoTokenizer, AutoConfig
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm_notebook

In [4]:
# Apply seaborn's default styling to the matplotlib figures created from here on.
sns.set()

In [3]:
# Hugging Face model identifier; passed to tokenize(), encode() and make_dataset() below
# and also used to name the cached artefacts (pickles / .npy files / exported model).
MODEL_TO_USE = "distilbert-base-uncased"

In [4]:
# Record the installed transformers version alongside the results for reproducibility.
import transformers
transformers.__version__

'2.5.1'

# Read the JSON file into a Pandas DataFrame

In [8]:
%%time
current_directory = Path('.')
if not (current_directory / 'Data/dataset.pkl').exists():
    print("Pickled dataset doesn't already exists. Now reading JSON file.")

    #Read in JSON file if pickled dataframe doesn't already exist
    with open('./Data/args-me.json') as f:
        d = json.load(f)
        d = d['arguments']
        context_subfields = [['context', k] for k in d[0]['context'].keys()]
        dataset = pd.json_normalize(d, record_path='premises', meta=['id', 'conclusion', *context_subfields])
        print("Now pickling Pandas DataFrame into dataset.pkl.")
        dataset.to_pickle('Data/dataset.pkl')
        print("DataFrame pickled.")
        print(" ")
else:
    #If pickle already exists, read it into dataframe
    print("Pickled dataset already exists. Now loading dataset.pkl into Pandas DataFrame")
    dataset = pd.read_pickle('Data/dataset.pkl')
    print(" ")

Pickled dataset already exists. Now loading dataset.pkl into Pandas DataFrame
 
CPU times: user 673 ms, sys: 640 ms, total: 1.31 s
Wall time: 1.76 s


In [20]:
# (rows, columns) of the flattened dataset — one row per premise.
dataset.shape

(387692, 11)

# Pick a Model to use

In [6]:
def tokenize(dataset, model_to_use, chunks=5, folder_name="Tokenized"):
    """Tokenize the ``text`` column of ``dataset`` with the tokenizer of ``model_to_use``.

    The result is cached as ``<folder_name>/<model_to_use>.pkl``. When that file already
    exists it is loaded and returned without touching ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``text`` (strings to tokenize) and ``id``.
        Not accessed when the cache file exists.
    model_to_use : str
        Identifier passed to ``AutoTokenizer.from_pretrained``; also the cache file stem.
    chunks : int
        Number of pieces the frame is split into so tokenization runs chunk by chunk.
    folder_name : str
        Directory holding the cache file. It is created if missing.

    Returns
    -------
    pandas.DataFrame
        Columns ``input_ids`` and ``attention_mask`` (one numpy array per row) and ``id``.
        Texts longer than the tokenizer's maximum length overflow into several rows
        that share the same ``id``.
    """
    pickle_path = Path(folder_name) / f'{model_to_use}.pkl'

    if pickle_path.exists():
        # NOTE: unpickling executes arbitrary code — only load pickles produced by this notebook.
        print(f"Pickled tokenized dataset already exists. Now loading {folder_name}/{model_to_use}.pkl into Pandas Dataframe")
        return pd.read_pickle(pickle_path)

    print("Pickled dataset doesn't already exists. Now Tokenizing arguments dataframe.")
    # Create the output directory up front so a missing folder is discovered before,
    # not after, the expensive tokenization pass.
    pickle_path.parent.mkdir(parents=True, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(model_to_use, use_fast=True)
    tokenized = []
    for chunk in tqdm_notebook(np.array_split(dataset, chunks), total=chunks):
        tokenized_chunk = tokenizer.batch_encode_plus(list(chunk['text'].values), max_length=tokenizer.max_len, pad_to_max_length=True, return_overflowing_tokens=True)
        # Not every tokenizer emits token_type_ids; drop them when present.
        tokenized_chunk.pop('token_type_ids', None)

        overflow_index = tokenized_chunk.pop('overflow_to_sample_mapping')

        # Repeating indices are included as lists of the corresponding index eg: [0,1, [2,2,2,2], [3,3]...]
        overflow_index = np.hstack(overflow_index)
        # Map every encoded window back to the id of the text it came from.
        text_ids = chunk['id'].values
        text_ids = text_ids[overflow_index]

        df = pd.DataFrame(tokenized_chunk)
        df[['input_ids', 'attention_mask']] = df[['input_ids', 'attention_mask']].applymap(np.array)
        df['id'] = text_ids
        tokenized.append(df)
    tokenized = pd.concat(tokenized)
    tokenized.reset_index(inplace=True, drop=True)
    print(f"Now Pickling DataFrame to {folder_name}/{model_to_use}.pkl")
    tokenized.to_pickle(pickle_path)
    return tokenized

# Tokenize the dataset with the pretrained tokenizer of MODEL_TO_USE. If it was tokenized previously, load the corresponding DataFrame from the saved pickle instead.

In [16]:
%%time
# Tokenize the premises (or load the cached tokenization for MODEL_TO_USE).
tokenized = tokenize(dataset, MODEL_TO_USE)

Pickled tokenized dataset already exists. Now loading Tokenized/distilbert-base-uncased.pkl into Pandas Dataframe
CPU times: user 3.32 s, sys: 2.48 s, total: 5.8 s
Wall time: 8.9 s


In [17]:
# Preview the tokenized frame; it has more rows than `dataset` because long texts
# overflow into several rows sharing one id.
tokenized

Unnamed: 0,input_ids,attention_mask,id
0,"[101, 2026, 7116, 2005, 21156, 2098, 2296, 246...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",c67482ba-2019-04-18T13:32:05Z-00000-000
1,"[101, 2129, 2079, 2017, 16599, 1996, 2082, 209...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",c67482ba-2019-04-18T13:32:05Z-00001-000
2,"[101, 2816, 2031, 2053, 17075, 3037, 1999, 434...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",c67482ba-2019-04-18T13:32:05Z-00002-000
3,"[101, 2004, 1037, 3026, 2012, 2026, 2082, 1012...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",c67482ba-2019-04-18T13:32:05Z-00003-000
4,"[101, 1996, 5813, 2109, 2011, 4013, 1008, 1598...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4d3d4471-2019-04-18T11:45:01Z-00000-000
...,...,...,...
555578,"[101, 6662, 5472, 28212, 4801, 1012, 1000, 212...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",671509c8-2019-04-17T11:47:34Z-00067-000
555579,"[101, 6111, 2816, 4013, 15509, 9250, 2205, 285...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",671509c8-2019-04-17T11:47:34Z-00052-000
555580,"[101, 2270, 2816, 2064, 3749, 2673, 6111, 2816...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...",671509c8-2019-04-17T11:47:34Z-00037-000
555581,"[101, 6111, 2816, 2024, 15011, 2797, 2816, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",671509c8-2019-04-17T11:47:34Z-00022-000


# Turn the 'tokenized' Dataframe columns input_ids and attention_mask into a Tensorflow Dataset to feed into MODEL_TO_USE and encode the arguments into dense vectors

In [11]:
def encode(tokenized_df=None, model_to_use=None, batch_size=32, folder_name="Encoded", dtype=np.float16):
    """Encode tokenized texts into dense vectors with ``model_to_use``, with disk caching.

    For every row, the hidden state at token position 0 is taken from each of the model's
    hidden-state outputs (last layer first, input embeddings last) and concatenated.
    The raw matrix is cached as ``<folder_name>/<model_to_use>.npy``; when that file
    exists it is loaded instead of running the model.

    Parameters
    ----------
    tokenized_df : pandas.DataFrame, optional
        Must provide ``input_ids`` and ``attention_mask`` columns holding equal-length
        arrays. Only required when no cached encodings exist.
    model_to_use : str
        Identifier passed to ``AutoConfig`` / ``TFAutoModel``; also the cache file stem.
    batch_size : int
        Number of rows fed to the model per forward pass.
    folder_name : str
        Directory holding the cache file. It is created if missing.
    dtype : numpy dtype
        dtype of the returned array (the cached file keeps its own dtype).

    Returns
    -------
    (numpy.ndarray, config)
        The encodings with the trailing ``config.dim`` columns (the input-embedding
        block) removed, i.e. shape ``(n_rows, config.dim * config.n_layers)``, and the
        model configuration.

    Raises
    ------
    ValueError
        If no cached encodings exist and ``tokenized_df`` is None.
    """
    # Use the configuration of the requested model, not of a notebook-level global.
    config = AutoConfig.from_pretrained(model_to_use)
    config.output_hidden_states = True
    # NOTE(review): `config.dim` / `config.n_layers` are the attribute names read here;
    # other model families may name these differently — confirm before switching models.
    encodings_path = Path(folder_name) / f'{model_to_use}.npy'

    if not encodings_path.exists():
        if tokenized_df is None:
            raise ValueError(f"No cached encodings at {encodings_path} and no tokenized_df was given to compute them.")
        print(f"Encodings don't yet exist. Now feeding tokens into {model_to_use}. This will take a while")
        # Make sure the output directory exists before the long-running loop starts.
        encodings_path.parent.mkdir(parents=True, exist_ok=True)

        model = TFAutoModel.from_pretrained(model_to_use, config=config)

        i = np.stack(tokenized_df['input_ids'])
        m = np.stack(tokenized_df['attention_mask'])
        n_rows = i.shape[0]

        dataset = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(i), tf.data.Dataset.from_tensor_slices(m)))
        dataset = dataset.batch(batch_size)
        # One block of config.dim columns per hidden-state output (n_layers + input embeddings).
        # NOTE(review): np.zeros defaults to float64; a narrower dtype would shrink this
        # buffer and the saved file, but would change the on-disk format.
        encoded = np.zeros((n_rows, config.dim*(config.n_layers+1)))
        n_batches = int(np.ceil(n_rows / batch_size))

        #Feed tokens in batches of batch_size into the huggingface model
        for iteration, (input_tensor, mask_tensor) in tqdm_notebook(enumerate(dataset), total=n_batches):
            #SET TRAINING TO FALSE TO TURN OFF DROPOUT
            output = model(input_tensor, attention_mask=mask_tensor, training=False)
            # output[-1] holds the hidden states; keep position 0 of each, last layer first.
            hidden = np.hstack([state.numpy()[:, 0, :] for state in reversed(output[-1])])
            start = iteration * batch_size
            encoded[start:start + hidden.shape[0]] = hidden
        print(f"Saving encodings in ./{folder_name}. Now writing ./{folder_name}/{model_to_use}.npy")

        np.save(encodings_path, encoded)
    else:
        print(f"Encodings are in ./{folder_name}. Now reading ./{folder_name}/{model_to_use}.npy")
        encoded = np.load(encodings_path)

    # The same post-processing applies whether the encodings were just computed or loaded.
    #last part of encoded is the embeddings at the input, so they're all the same: the input embedding for [CLS] before it's fed into the network
    encoded = encoded[:, :-config.dim]
    encoded = encoded.astype(dtype=dtype, copy=False)

    print(f"Loaded model embeddings are {config.dim}-dimensional, and {model_to_use} has {config.n_layers} hidden layers.")
    print(f"Using dtype={dtype}")
    return encoded, config

In [12]:
%%time
# Encode the tokenized premises (or load cached encodings), returned as float32.
encoded, config = encode(tokenized_df=tokenized, model_to_use=MODEL_TO_USE, dtype=np.float32)

Encodings are in ./Encoded. Now reading ./Encoded/distilbert-base-uncased.npy
Loaded model embeddings are 768-dimensional, and distilbert-base-uncased has 6 hidden layers.
Using dtype=<class 'numpy.float32'>
CPU times: user 2.11 s, sys: 24.7 s, total: 26.8 s
Wall time: 52.3 s


# Use an Autoencoder to reduce the dimensionality of the embedding down to 1024

In [21]:
def make_dataset(model_to_use,test_size=0.15, dtype=np.float32, batch_size=32, random_state=None, folder_name="Encoded"):
    """Build autoencoder training and validation ``tf.data`` datasets from cached encodings.

    Loads ``./<folder_name>/<model_to_use>.npy``, drops the trailing ``config.dim``
    columns, casts to ``dtype``, splits into train/validation and wraps both parts as
    shuffled, batched datasets of ``(x, x)`` pairs (input equals target).

    Parameters
    ----------
    model_to_use : str
        Identifier used for the model configuration and for the encodings file stem.
    test_size : float or int
        Forwarded to ``train_test_split``; share (or count) of rows held out for validation.
    dtype : numpy dtype
        dtype the encodings are cast to.
    batch_size : int
        Batch size of both returned datasets.
    random_state : int or None
        Forwarded to ``train_test_split``; set it to make the split reproducible.
    folder_name : str
        Directory containing the encodings file.

    Returns
    -------
    (tf.data.Dataset, tf.data.Dataset)
        Training and validation datasets.
    """
    print(f"Now reading ./{folder_name}/{model_to_use}.npy...")
    encoded = np.load(f'./{folder_name}/{model_to_use}.npy')
    # Use the configuration of the requested model, not of a notebook-level global.
    config = AutoConfig.from_pretrained(model_to_use)
    #last part of encoded is the embeddings at the input, so they're all the same: the input embedding for [CLS] at before it's fed into the network
    encoded = encoded[:, :-config.dim]
    encoded = encoded.astype(dtype=dtype, copy=False)

    # Honour the caller's test_size instead of a hard-coded 0.15.
    X_train, X_valid = train_test_split(encoded, test_size=test_size, random_state=random_state)
    print("Now creating TensorFlow Dataset")
    dataset_train = tf.data.Dataset.from_tensor_slices(X_train)
    dataset_valid = tf.data.Dataset.from_tensor_slices(X_valid)

    # An autoencoder reconstructs its input, so every sample is its own target.
    dataset_train = dataset_train.map(lambda x: (x,x))
    dataset_train = dataset_train.shuffle(10000)
    dataset_train = dataset_train.batch(batch_size)

    dataset_valid = dataset_valid.map(lambda x: (x,x))
    dataset_valid = dataset_valid.shuffle(10000)
    dataset_valid = dataset_valid.batch(batch_size)

    return dataset_train, dataset_valid

#From Aurelien Geron's Hands-on Machine Learning 2nd ed. https://github.com/ageron/handson-ml2/blob/master/17_autoencoders_and_gans.ipynb
class DenseTranspose(keras.layers.Layer):
    """Layer that applies the transposed weight matrix of an existing layer.

    The layer owns only a bias vector; the matrix it multiplies by is read from the
    wrapped ``dense`` layer at call time (``dense.weights[0]``), so both layers share it.

    Parameters
    ----------
    dense : keras layer
        Layer whose first weight is reused, transposed.
    activation : str, callable or None
        Resolved through ``keras.activations.get`` and applied to the output.
    """

    def __init__(self, dense, activation=None, **kwargs):
        # Attribute assignment deliberately precedes the base-class initialiser,
        # mirroring the reference implementation this class was taken from.
        self.dense = dense
        self.activation = keras.activations.get(activation)
        super().__init__(**kwargs)

    def build(self, batch_input_shape):
        # The output width equals the wrapped layer's input width: one bias per output unit.
        output_units = self.dense.input_shape[-1]
        self.biases = self.add_weight(name="bias", initializer="zeros", shape=[output_units])
        super().build(batch_input_shape)

    def call(self, inputs):
        shared_matrix = self.dense.weights[0]
        projected = tf.matmul(inputs, shared_matrix, transpose_b=True)
        return self.activation(projected + self.biases)

In [22]:
# Build the (input, target) training and validation datasets for the autoencoder.
d_train, d_valid = make_dataset(model_to_use=MODEL_TO_USE, batch_size=32)

Now reading ./Encoded/distilbert-base-uncased.npy...
Now creating TensorFlow Dataset


In [23]:
# Build and train a tied-weights autoencoder that compresses the encodings to 1024 dimensions.
keras.backend.clear_session()

# Encoder layers; the decoder below reuses their weight matrices (transposed).
dense_1 = keras.layers.Dense(3072, activation="selu")
dense_2 = keras.layers.Dense(2048, activation="selu")
dense_3 = keras.layers.Dense(1024, activation="selu")

encoder = keras.models.Sequential([
    # Feature width is taken from the training dataset's element spec (batch axis dropped).
    keras.layers.InputLayer(input_shape=d_train.element_spec[0].shape[1:]),
    dense_1,
    dense_2,
    dense_3
])

tied_decoder = keras.models.Sequential([
    DenseTranspose(dense_3, activation="selu"),
    DenseTranspose(dense_2, activation="selu"),
    # Linear output: the reconstruction targets are the transformer encodings exactly as
    # make_dataset produces them (sliced and cast, never rescaled), so they are not
    # confined to (0, 1). A sigmoid here would make negative or >1 targets unreachable.
    DenseTranspose(dense_1, activation=None)
])

tied_ae = keras.models.Sequential([encoder, tied_decoder])

# Cut the learning rate by 10x when validation loss has not improved for 3 epochs.
callback_list = [keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3)]

# This is a regression (reconstruction) task: report mean absolute error next to the MSE
# loss instead of classification accuracy, which carries no meaning here.
tied_ae.compile(loss="mse", optimizer=keras.optimizers.Adagrad(), metrics=["mae"])

hist = tied_ae.fit(d_train, epochs=10, validation_data=d_valid, callbacks=callback_list)

Train for 14758 steps, validate for 2605 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [4]:
# Export location for the trained encoder: <folder>/<model name>/<version>.
folder_name = "Autoencoder_encoder"
model_name = f"Encoder{MODEL_TO_USE}_1024"
model_version = "0001"
model_path = Path(folder_name, model_name, model_version)
model_path

PosixPath('Autoencoder_encoder/Encoderdistilbert-base-uncased_1024/0001')

In [67]:
# Export only the encoder half of the autoencoder as a SavedModel under model_path.
tf.saved_model.save(encoder, str(model_path))

INFO:tensorflow:Assets written to: Autoencoder_encoder/Encoderdistilbert-base-uncased_1024/0001/assets


INFO:tensorflow:Assets written to: Autoencoder_encoder/Encoderdistilbert-base-uncased_1024/0001/assets


In [74]:
# Sanity check: the reloaded SavedModel must give exactly the same output as the
# in-memory encoder for the first encoded row.
saved = tf.saved_model.load(str(model_path))
tf.reduce_all(saved(encoded[:1], training=False) == encoder.predict(encoded[:1])).numpy()

True

In [77]:
# Confirm the encoder output width for a single row.
encoder.predict(encoded[:1]).shape

(1, 1024)

In [13]:
# Run every encoding through the exported encoder in batches of 512 and store the
# stacked result next to the other encodings.
ae_e = tf.saved_model.load(str(model_path))
to_autoencode = tf.data.Dataset.from_tensor_slices(encoded).batch(512)
n_batches = tf.data.experimental.cardinality(to_autoencode).numpy()

autoencoded = np.vstack([
    ae_e(batch, training=False).numpy()
    for batch in tqdm_notebook(to_autoencode, total=n_batches)
])
np.save(f"./Encoded/autoencoded_{MODEL_TO_USE}_1024.npy", autoencoded)

HBox(children=(FloatProgress(value=0.0, max=1086.0), HTML(value='')))




In [14]:
# (rows, reduced dimensionality) of the autoencoded matrix.
autoencoded.shape

(555583, 1024)

# Use Google's Universal Sentence Encoder to encode the arguments

In [5]:
# Load version 4 of the Universal Sentence Encoder from TensorFlow Hub.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 170.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 350.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 530.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 710.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 890.00MB
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


In [54]:
# Smoke test: embed two sentences and inspect the returned tensor (one vector per sentence).
embed(["This is a sentence", "This is another sentence"])

<tf.Tensor: shape=(2, 512), dtype=float32, numpy=
array([[ 0.02881766, -0.02020015,  0.01069627, ..., -0.02896921,
         0.00876467,  0.08242127],
       [ 0.04333361, -0.01821983,  0.01752458, ..., -0.02157544,
        -0.02861957,  0.05987324]], dtype=float32)>

In [10]:
# Collector for the per-batch embeddings, plus the raw premise texts served in batches of 1024.
embeddings = []
to_embed = tf.data.Dataset.from_tensor_slices(dataset['text'].values).batch(1024)

In [12]:
# Embed every batch of texts with the Universal Sentence Encoder.
# The collector is reset here so this cell can be re-run safely: without the reset a
# second run would append duplicate batches, and a run after `embeddings` has been
# stacked into an ndarray would fail because arrays have no `.append`.
embeddings = []
for batch in tqdm_notebook(to_embed, total=tf.data.experimental.cardinality(to_embed).numpy()):
    embedding = embed(batch)
    embedding = embedding.numpy()
    embeddings.append(embedding)

HBox(children=(FloatProgress(value=0.0, max=379.0), HTML(value='')))




In [13]:
# Stack the per-batch arrays into one 2-D matrix.
# NOTE: this rebinds `embeddings` from a list to an ndarray, so the cell is not re-runnable as-is.
embeddings = np.vstack(embeddings)

In [16]:
# Persist the Universal Sentence Encoder embeddings next to the other encodings.
np.save('./Encoded/UniversalSentenceEncoderEmbeddings.npy', embeddings)