In [7]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [4]:
from pathlib import Path

import faiss
from annoy import AnnoyIndex

import pandas as pd
import numpy as np
# import tensorflow as tf
# from tensorflow import keras
# import tensorflow_hub as hub
# from transformers import TFAutoModel, AutoTokenizer, AutoConfig

In [2]:
MODEL_TO_USE = "distilbert-base-uncased"

# Import the original args dataset for argument lookup

In [8]:
%%time
current_directory = Path('.')
if not (current_directory / 'Data/dataset.pkl').exists():
    print("Pickled dataset doesn't already exists. Now reading JSON file.")

    #Read in JSON file if pickled dataframe doesn't already exist
    with open('./Data/args-me.json') as f:
        d = json.load(f)
        d = d['arguments']
        context_subfields = [['context', k] for k in d[0]['context'].keys()]
        dataset = pd.json_normalize(d, record_path='premises', meta=['id', 'conclusion', *context_subfields])
        print("Now pickling Pandas DataFrame into dataset.pkl.")
        dataset.to_pickle('Data/dataset.pkl')
        print("DataFrame pickled.")
        print(" ")
else:
    #If pickle already exists, read it into dataframe
    print("Pickled dataset already exists. Now loading dataset.pkl into Pandas DataFrame")
    dataset = pd.read_pickle('Data/dataset.pkl')
    print(" ")

Pickled dataset already exists. Now loading dataset.pkl into Pandas DataFrame
 
CPU times: user 579 ms, sys: 414 ms, total: 992 ms
Wall time: 990 ms


# See what dense embeddings exist in the Encoded folder

In [9]:
current_dir = Path(".")
encoded_dir = current_dir / "Encoded"
tokenized_dir = current_dir / "Tokenized"
print([f"{x.stem}{x.suffix}" for x in encoded_dir.iterdir()])
print("")
print(list(encoded_dir.iterdir()))

['UniversalSentenceEncoderEmbeddings.npy', 'autoencoded_distilbert-base-uncased_1024.npy', 'distilbert-base-uncased.npy']

[PosixPath('Encoded/UniversalSentenceEncoderEmbeddings.npy'), PosixPath('Encoded/autoencoded_distilbert-base-uncased_1024.npy'), PosixPath('Encoded/distilbert-base-uncased.npy')]


# Create an Annoy index for every embedding

In [36]:
def make_or_load_annoy(embedding_posix, n_trees=500, index_type="angular", a=0, b=3): #a and b determine how many hidden layers of [CLS] to include in the embeddings for transformer-based embeddings
    current_dir= Path(".")
    embedding_name= embedding_posix.stem
    suffix= ""
    if not (current_dir / "Annoy" / f'{embedding_name}_{index_type}_{n_trees}.ann').exists() and not (current_dir / "Annoy" / f'{embedding_name}_{index_type}_{n_trees}_{a}_to_{b}.ann').exists():
        print(f"Now reading embeddings into numpy array from {str(embedding_posix)}")
        
        embeddings = np.load(str(embedding_posix))
        if embedding_name in [file.stem for file in tokenized_dir.iterdir()]:
            config = AutoConfig.from_pretrained(embedding_name)
            dim = config.dim
            print(f"Taking the Pooler (CLS) embeddings from hidden layers {a} to {b}")
            embeddings = embeddings[:, a*dim:b*dim]
            suffix = f"_{a}_to_{b}"
        d = embeddings.shape[1]
        print(f"Embeddings are {d}-dimensional")
        annoy = AnnoyIndex(d, index_type)
        
        print("Now adding vectors to index")
        for i, embedding in tqdm_notebook(enumerate(embeddings), total=embeddings.shape[0]):
            annoy.add_item(i, embedding)
        print(f"Now building index for {embedding_name}")
        annoy.build(n_trees)
        annoy.save(f'./Annoy/{embedding_name}_{index_type}_{n_trees}{suffix}.ann')
        print((f'Saved to ./Annoy/{embedding_name}_{index_type}_{n_trees}{suffix}'))
    else: #True when Index already exists
        #Read header of numpy array from disk without loading it to get its shape
        d = 512
        if embedding_name in [file.stem for file in tokenized_dir.iterdir()]: #True for huggingface models    
            suffix = f"_{a}_to_{b}"
            config = AutoConfig.from_pretrained(embedding_name)
            dim = config.dim
            d = (b-a)*dim
        elif (embedding_name.split("_")[0] == "autoencoded"):
            with open(f"./Encoded/{embedding_name}.npy", "rb") as npy:                
                version = np.lib.format.read_magic(npy)
                shape,_,_ = np.lib.format._read_array_header(npy, version)
                d = shape[1]

        print(f"Index already exists, now loading {embedding_name}_{index_type}_{n_trees}{suffix}")
        annoy = AnnoyIndex(d, index_type)
        annoy.load(f'./Annoy/{embedding_name}_{index_type}_{n_trees}{suffix}.ann')
        
    print("Now reading in argument ids")
    
    
    if embedding_name in [file.stem for file in tokenized_dir.iterdir()]: #True for huggingface models    
        print("Found tokenized pickle.1")
        tokenized = pd.read_pickle(f"./Tokenized/{embedding_name}.pkl")
        arg_ids = tokenized[["id"]]
    
    elif (embedding_name.split("_")[0] == "autoencoded"):
        print("Found tokenized pickle.2")
        tokenized = pd.read_pickle(f"./Tokenized/{embedding_name.split('_')[1]}.pkl")
        arg_ids = tokenized[["id"]]
    else:
        print("No tokenized pickle. Reading original dataset.")
        dataset = pd.read_pickle("./Data/dataset.pkl")
        arg_ids = dataset[["id"]]
    print("-------------------")
    print("-------------------")
    print("-------------------")
    print("-------------------")

    return annoy, arg_ids

## Building the indices:

In [39]:
%%time

indices = {}
for file in encoded_dir.iterdir():
    index, tokenized = make_or_load_annoy(embedding_posix=file, n_trees=5, b=2)
    indices[file.stem] = {"index":index, "tokenized":tokenized}

Now reading embeddings into numpy array from Encoded/UniversalSentenceEncoderEmbeddings.npy
Embeddings are 512-dimensional
Now adding vectors to index


HBox(children=(FloatProgress(value=0.0, max=387692.0), HTML(value='')))


Now building index for UniversalSentenceEncoderEmbeddings
Saved to ./Annoy/UniversalSentenceEncoderEmbeddings_angular_5
Now reading in argument ids
No tokenized pickle. Reading original dataset.
-------------------
-------------------
-------------------
-------------------
Now reading embeddings into numpy array from Encoded/autoencoded_distilbert-base-uncased_1024.npy
Embeddings are 1024-dimensional
Now adding vectors to index


HBox(children=(FloatProgress(value=0.0, max=555583.0), HTML(value='')))


Now building index for autoencoded_distilbert-base-uncased_1024
Saved to ./Annoy/autoencoded_distilbert-base-uncased_1024_angular_5
Now reading in argument ids
Found tokenized pickle.2
-------------------
-------------------
-------------------
-------------------
Now reading embeddings into numpy array from Encoded/distilbert-base-uncased.npy
Taking the Pooler (CLS) embeddings from hidden layers 0 to 2
Embeddings are 1536-dimensional
Now adding vectors to index


HBox(children=(FloatProgress(value=0.0, max=555583.0), HTML(value='')))


Now building index for distilbert-base-uncased
Saved to ./Annoy/distilbert-base-uncased_angular_5_0_to_2
Now reading in argument ids
Found tokenized pickle.1
-------------------
-------------------
-------------------
-------------------
CPU times: user 7min 1s, sys: 22.6 s, total: 7min 24s
Wall time: 7min 26s


## Loading the indices from the Annoy folder:

In [49]:
%%time

indices = {}
for file in encoded_dir.iterdir():
    index, tokenized = make_or_load_annoy(embedding_posix=file, n_trees=5, b=2)
    indices[file.stem] = {"index":index, "arg_ids":tokenized}

Index already exists, now loading UniversalSentenceEncoderEmbeddings_angular_5
Now reading in argument ids
No tokenized pickle. Reading original dataset.
-------------------
-------------------
-------------------
-------------------
Index already exists, now loading autoencoded_distilbert-base-uncased_1024_angular_5
Now reading in argument ids
Found tokenized pickle.2
-------------------
-------------------
-------------------
-------------------
Index already exists, now loading distilbert-base-uncased_angular_5_0_to_2
Now reading in argument ids
Found tokenized pickle.1
-------------------
-------------------
-------------------
-------------------
CPU times: user 5.79 s, sys: 1.59 s, total: 7.38 s
Wall time: 7.83 s


In [50]:
print(list(indices.keys()))
print(list(indices['distilbert-base-uncased'].keys()))

['UniversalSentenceEncoderEmbeddings', 'autoencoded_distilbert-base-uncased_1024', 'distilbert-base-uncased']
['index', 'arg_ids']


# Searching with the Annoy indices

In [189]:
query = "Abortion is not a crime and should be legalized."
num_vectors_to_find = 10000

## Searching with Distilbert embeddings

In [51]:
annoy_d = indices['distilbert-base-uncased']['index']
ids_d = indices['distilbert-base-uncased']['arg_ids']

In [55]:
config = AutoConfig.from_pretrained('distilbert-base-uncased')
config.output_hidden_states=True
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
dbert = TFAutoModel.from_pretrained('distilbert-base-uncased', config=config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




In [190]:
mod_input = tokenizer.encode_plus(query)
i = tf.constant(mod_input['input_ids'])[None, :]
m = tf.constant(mod_input['attention_mask'])[None, :]
output = dbert(i, attention_mask=m, training=False)
embedding = np.hstack([thing.numpy()[:,0,:] for thing in reversed(output[-1])])
embedding = embedding[:, 0*config.dim:2*config.dim]

In [191]:
result_indices = annoy_d.get_nns_by_vector(embedding.squeeze(), num_vectors_to_find, search_k=-1, include_distances=False)
set_d = set(ids_d['id'][result_indices].values)
set_d

{'27d3a1e7-2019-04-15T20:22:30Z-00006-000',
 '4ace0a08-2019-04-19T12:47:51Z-00002-000',
 'fffbcdf0-2019-04-17T11:47:46Z-00000-000',
 'aedf8562-2019-04-19T12:45:18Z-00017-000',
 '89e52114-2019-04-17T11:47:41Z-00017-000',
 '48c4322d-2019-04-19T12:46:43Z-00000-000',
 '4da47715-2019-04-17T11:47:27Z-00059-000',
 '9860f93c-2019-04-17T11:47:22Z-00034-000',
 '1378c40-2019-04-17T11:47:41Z-00095-000',
 '52fbc112-2019-04-19T12:47:46Z-00005-000',
 'd397393b-2019-04-18T18:08:56Z-00009-000',
 '1d99ec20-2019-04-17T11:47:28Z-00032-000',
 '25edced3-2019-04-17T11:47:33Z-00024-000',
 '72e845b6-2019-04-17T11:47:38Z-00012-000',
 'e5cb277d-2019-04-19T12:46:50Z-00006-000',
 'b477d5a-2019-04-17T11:47:43Z-00040-000',
 '6967e902-2019-04-19T12:46:02Z-00026-000',
 '5b858825-2019-04-19T12:48:05Z-00009-000',
 '89c45bda-2019-04-17T11:47:42Z-00009-000',
 '60e6af4e-2019-04-15T20:24:21Z-00006-000',
 'e4848164-2019-04-17T11:47:33Z-00021-000',
 '40f44a1b-2019-04-17T11:47:22Z-00035-000',
 '281ab12-2019-04-17T11:47:28Z-000

## Searching with Autoencoder embeddings

In [176]:
annoy_a = indices['autoencoded_distilbert-base-uncased_1024']['index']
ids_a= indices['autoencoded_distilbert-base-uncased_1024']['arg_ids']

In [111]:
MODEL_TO_USE = 'distilbert-base-uncased'
model_version = "0001"
model_name = f"Encoder{MODEL_TO_USE}_1024"
folder_name = "Autoencoder_encoder"
model_path = Path('.') / folder_name / model_name / model_version
ae_e = tf.saved_model.load(str(model_path))

In [192]:
mod_input = tokenizer.encode_plus(query)
i = tf.constant(mod_input['input_ids'])[None, :]
m = tf.constant(mod_input['attention_mask'])[None, :]
output = dbert(i, attention_mask=m, training=False)
embedding = np.hstack([thing.numpy()[:,0,:] for thing in reversed(output[-1])])
embedding = embedding[:,:-config.dim]
embedding = ae_e(tf.constant(embedding)).numpy()

In [193]:
result_indices = annoy_a.get_nns_by_vector(embedding.squeeze(), num_vectors_to_find, search_k=-1, include_distances=False)
set_a = set(ids_d['id'][result_indices].values)
set_a

{'27d3a1e7-2019-04-15T20:22:30Z-00006-000',
 '4ace0a08-2019-04-19T12:47:51Z-00002-000',
 'aedf8562-2019-04-19T12:45:18Z-00017-000',
 '89e52114-2019-04-17T11:47:41Z-00017-000',
 '48c4322d-2019-04-19T12:46:43Z-00000-000',
 '4da47715-2019-04-17T11:47:27Z-00059-000',
 '9860f93c-2019-04-17T11:47:22Z-00034-000',
 '1378c40-2019-04-17T11:47:41Z-00095-000',
 '52fbc112-2019-04-19T12:47:46Z-00005-000',
 '25edced3-2019-04-17T11:47:33Z-00024-000',
 '72e845b6-2019-04-17T11:47:38Z-00012-000',
 '1d99ec20-2019-04-17T11:47:28Z-00032-000',
 'e5cb277d-2019-04-19T12:46:50Z-00006-000',
 '5b858825-2019-04-19T12:48:05Z-00009-000',
 '89c45bda-2019-04-17T11:47:42Z-00009-000',
 '60e6af4e-2019-04-15T20:24:21Z-00006-000',
 'e4848164-2019-04-17T11:47:33Z-00021-000',
 '40f44a1b-2019-04-17T11:47:22Z-00035-000',
 '70ffe88-2019-04-17T11:47:25Z-00039-000',
 '2e729a4f-2019-04-17T11:47:44Z-00045-000',
 'e5ccda7-2019-04-17T11:47:44Z-00081-000',
 'cc95487f-2019-04-15T20:22:50Z-00001-000',
 'e5ccda7-2019-04-17T11:47:44Z-0003

## Searching with Google Universal Sentence Encoder embeddings

In [52]:
annoy_g = indices['UniversalSentenceEncoderEmbeddings']['index']
ids_g= indices['UniversalSentenceEncoderEmbeddings']['arg_ids']

In [119]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 180.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 360.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 540.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 720.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 900.00MB
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


In [194]:
embedding = embed([query]).numpy()

In [195]:
result_indices = annoy_g.get_nns_by_vector(embedding.squeeze(), num_vectors_to_find, search_k=-1, include_distances=False)
set_g = set(ids_d['id'][result_indices].values)
set_g

{'8717e219-2019-04-18T17:24:55Z-00000-000',
 '9545eb6e-2019-04-18T19:52:07Z-00000-000',
 'b3be5091-2019-04-18T15:08:14Z-00003-000',
 '69d63e81-2019-04-18T19:10:12Z-00002-000',
 'b94e9d7f-2019-04-18T18:57:03Z-00001-000',
 '6e6ae6b4-2019-04-18T14:19:43Z-00001-000',
 'dc58e662-2019-04-18T12:27:48Z-00005-000',
 '8b084077-2019-04-18T16:30:45Z-00005-000',
 '4e763ff6-2019-04-18T16:49:19Z-00009-000',
 '902fed27-2019-04-18T11:10:01Z-00001-000',
 '113e39e8-2019-04-18T11:14:56Z-00003-000',
 'd23a6ca-2019-04-18T17:33:02Z-00004-000',
 'da41e898-2019-04-18T15:04:47Z-00003-000',
 'ee77be25-2019-04-18T16:23:17Z-00001-000',
 '9bb25fb0-2019-04-18T16:45:11Z-00004-000',
 'f49fbe84-2019-04-18T19:09:34Z-00003-000',
 '6113ce7b-2019-04-18T19:51:24Z-00001-000',
 'f659d4fd-2019-04-18T13:11:17Z-00005-000',
 'e03bb5aa-2019-04-18T14:46:15Z-00004-000',
 '66bca23e-2019-04-18T11:23:05Z-00005-000',
 'aedf4296-2019-04-18T18:38:32Z-00002-000',
 '35d7506c-2019-04-18T12:16:22Z-00000-000',
 '36eef26f-2019-04-18T18:20:17Z-0

## Combining resultsets set_d, set_a and set_g

In [196]:
union = set.union(*[set_d, set_a, set_g])

from itertools import combinations
pairwise_intersection = set()
for set_1, set_2 in combinations([set_d, set_a, set_g], r=2):
    pairwise_intersection.update(set_1.intersection(set_2))

## Returning arguments from a resultset

In [183]:
def return_args(id_set):
    l = list(id_set)
    return dataset[dataset['id'].isin(l)].copy()

In [197]:
return_args(set_g)[:5]

Unnamed: 0,text,stance,id,conclusion,context.sourceId,context.previousArgumentInSourceId,context.acquisitionTime,context.discussionTitle,context.sourceTitle,context.sourceUrl,context.nextArgumentInSourceId
21,"Words to Begin the Debate with First off, I ap...",PRO,c3e9c4a6-2019-04-18T14:36:11Z-00001-000,Debates are harder when you are the first one ...,c3e9c4a6-2019-04-18T14:36:11Z,c3e9c4a6-2019-04-18T14:36:11Z-00000-000,2019-04-18T14:36:11Z,Debates are harder when you are the first one ...,Debate Issue: Debates are harder when you are ...,https://www.debate.org/debates/Debates-are-har...,c3e9c4a6-2019-04-18T14:36:11Z-00002-000
39,Reasons why this program wouldnt work in a mil...,CON,9bb25fb0-2019-04-18T16:45:11Z-00004-000,My Proposal for the Eradication of Extreme Pov...,9bb25fb0-2019-04-18T16:45:11Z,9bb25fb0-2019-04-18T16:45:11Z-00003-000,2019-04-18T16:45:11Z,My Proposal for the Eradication of Extreme Pov...,Online Debate: My Proposal for the Eradication...,https://www.debate.org/debates/My-Proposal-for...,9bb25fb0-2019-04-18T16:45:11Z-00005-000
50,Framework: In order to show that the Ontologic...,PRO,f4a9d491-2019-04-18T18:51:26Z-00009-000,Resolved: The Ontological Argument is Sound,f4a9d491-2019-04-18T18:51:26Z,f4a9d491-2019-04-18T18:51:26Z-00008-000,2019-04-18T18:51:26Z,Resolved: The Ontological Argument is Sound,Debate Topic: Resolved: The Ontological Argume...,https://www.debate.org/debates/Resolved-The-On...,
51,Thanks for your attemopt and yeah you have a g...,PRO,806480f-2019-04-18T20:02:33Z-00001-000,We should a limited number of voters or limete...,806480f-2019-04-18T20:02:33Z,,2019-04-18T20:02:33Z,We should a limited number of voters or limete...,Online Debate: We should a limited number of v...,https://www.debate.org/debates/We-should-a-lim...,806480f-2019-04-18T20:02:33Z-00002-000
66,"First of all, I thank my opponent for acceptin...",PRO,a60d2aa5-2019-04-18T17:14:53Z-00003-000,Russell Hantz Should Have Won Survivor: Samoa,a60d2aa5-2019-04-18T17:14:53Z,a60d2aa5-2019-04-18T17:14:53Z-00002-000,2019-04-18T17:14:53Z,Russell Hantz Should Have Won Survivor: Samoa,Debate Argument: Russell Hantz Should Have Won...,https://www.debate.org/debates/Russell-Hantz-S...,a60d2aa5-2019-04-18T17:14:53Z-00004-000


In [198]:
return_args(pairwise_intersection)

Unnamed: 0,text,stance,id,conclusion,context.sourceId,context.previousArgumentInSourceId,context.acquisitionTime,context.discussionTitle,context.sourceTitle,context.sourceUrl,context.nextArgumentInSourceId
801,Atheists believe in mythological concepts. Goo...,PRO,c958fed8-2019-04-18T13:00:26Z-00005-000,Atheists believe in mythological concepts,c958fed8-2019-04-18T13:00:26Z,c958fed8-2019-04-18T13:00:26Z-00004-000,2019-04-18T13:00:26Z,Atheists believe in mythological concepts,Debate: Atheists believe in mythological conce...,https://www.debate.org/debates/Atheists-believ...,
1030,The continued use of sweatshops represents a v...,CON,b0659314-2019-04-18T19:13:49Z-00003-000,Sweatshops,b0659314-2019-04-18T19:13:49Z,b0659314-2019-04-18T19:13:49Z-00002-000,2019-04-18T19:13:49Z,Sweatshops,Debate Argument: Sweatshops | Debate.org,https://www.debate.org/debates/Sweatshops/2/,b0659314-2019-04-18T19:13:49Z-00004-000
1641,My opponent brought up an example of people dr...,PRO,f7b04e85-2019-04-18T14:37:44Z-00004-000,Censorship is good,f7b04e85-2019-04-18T14:37:44Z,f7b04e85-2019-04-18T14:37:44Z-00003-000,2019-04-18T14:37:44Z,Censorship is good,Debate Topic: Censorship is good | Debate.org,https://www.debate.org/debates/Censorship-is-g...,
1808,Mayo can't be considered a human being because...,CON,d8a8657d-2019-04-18T13:49:30Z-00001-000,Should Mayo Be Considered A Human Being,d8a8657d-2019-04-18T13:49:30Z,d8a8657d-2019-04-18T13:49:30Z-00000-000,2019-04-18T13:49:30Z,Should Mayo Be Considered A Human Being,Debate Argument: Should Mayo Be Considered A H...,https://www.debate.org/debates/Should-Mayo-Be-...,d8a8657d-2019-04-18T13:49:30Z-00002-000
1996,Civil Unions should be put as marriage and LGB...,PRO,f7fc785e-2019-04-18T18:01:55Z-00001-000,Civil Unions,f7fc785e-2019-04-18T18:01:55Z,f7fc785e-2019-04-18T18:01:55Z-00000-000,2019-04-18T18:01:55Z,Civil Unions,Debate Topic: Civil Unions | Debate.org,https://www.debate.org/debates/Civil-Unions/3/,f7fc785e-2019-04-18T18:01:55Z-00002-000
...,...,...,...,...,...,...,...,...,...,...,...
387491,Islam is a religion of violence and conquest.,CON,2772ce32-2019-04-17T11:47:25Z-00010-000,Ground zero mosque,2772ce32-2019-04-17T11:47:25Z,2772ce32-2019-04-17T11:47:25Z-00025-000,2019-04-17T11:47:25Z,Ground zero mosque,Debate: Ground zero mosque - Debatepedia,http://www.debatepedia.org/en/index.php/Debate...,2772ce32-2019-04-17T11:47:25Z-00041-000
387496,9/11 victims are not entitled to make bigoted ...,PRO,2772ce32-2019-04-17T11:47:25Z-00041-000,Ground zero mosque,2772ce32-2019-04-17T11:47:25Z,2772ce32-2019-04-17T11:47:25Z-00010-000,2019-04-17T11:47:25Z,Ground zero mosque,Debate: Ground zero mosque - Debatepedia,http://www.debatepedia.org/en/index.php/Debate...,2772ce32-2019-04-17T11:47:25Z-00026-000
387512,Islam is a religion of peace.,PRO,2772ce32-2019-04-17T11:47:25Z-00013-000,Ground zero mosque,2772ce32-2019-04-17T11:47:25Z,2772ce32-2019-04-17T11:47:25Z-00028-000,2019-04-17T11:47:25Z,Ground zero mosque,Debate: Ground zero mosque - Debatepedia,http://www.debatepedia.org/en/index.php/Debate...,2772ce32-2019-04-17T11:47:25Z-00044-000
387592,The disadvantaged deserve access to better sch...,PRO,671509c8-2019-04-17T11:47:34Z-00009-000,Charter schools,671509c8-2019-04-17T11:47:34Z,671509c8-2019-04-17T11:47:34Z-00024-000,2019-04-17T11:47:34Z,Charter schools,Debate: Charter schools - Debatepedia,http://www.debatepedia.org/en/index.php/Debate...,671509c8-2019-04-17T11:47:34Z-00055-000


In [199]:
return_args(pairwise_intersection)['conclusion'].value_counts()

Abortion                                                                  144
Single-payer universal health care                                         81
Animal testing                                                             80
Gay marriage                                                               78
Death penalty                                                              78
                                                                         ... 
Gas appliances do not create unhealthy electrical fields in the home.       1
it is sometimes right for the government to restrict freedom of speech      1
Life in prison without parole is preferable to the death penatly.           1
A ban would be simple to enforce.                                           1
The death penalty                                                           1
Name: conclusion, Length: 1854, dtype: int64

In [200]:
return_args(pairwise_intersection)['stance'].value_counts()

PRO    3074
CON    2562
Name: stance, dtype: int64

# Create a FAISS PQ index for every embedding

In [10]:
current_dir = Path(".")
encoded_dir = current_dir / "Encoded"
tokenized_dir = current_dir / "Tokenized"
print([f"{x.stem}{x.suffix}" for x in encoded_dir.iterdir()])
print("")
print(list(encoded_dir.iterdir()))

['UniversalSentenceEncoderEmbeddings.npy', 'autoencoded_distilbert-base-uncased_1024.npy', 'distilbert-base-uncased.npy']

[PosixPath('Encoded/UniversalSentenceEncoderEmbeddings.npy'), PosixPath('Encoded/autoencoded_distilbert-base-uncased_1024.npy'), PosixPath('Encoded/distilbert-base-uncased.npy')]


In [15]:
res = faiss.StandardGpuResources()

## Distilbert

In [16]:
def make_or_load_pq(embedding_posix, m=64, n_bits=8, a=0, b=3): #a and b determine how many hidden layers of [CLS] to include in the embeddings for transformer-based embeddings
    current_dir= Path(".")
    embedding_name= embedding_posix.stem
    suffix= ""
    if not (current_dir / "Faiss" / f'{embedding_name}_{m}_{n_bits}.faiss').exists() and not (current_dir / "Faiss" / f'{embedding_name}_{m}_{n_bits}_{a}_to_{b}.faiss').exists():
        print(f"Now reading embeddings into numpy array from {str(embedding_posix)}")
        
        embeddings = np.load(str(embedding_posix))
        if embedding_name in [file.stem for file in tokenized_dir.iterdir()]:
            config = AutoConfig.from_pretrained(embedding_name)
            dim = config.dim
            print(f"Taking the Pooler (CLS) embeddings from hidden layers {a} to {b}")
            embeddings = embeddings[:, a*dim:b*dim]
            suffix = f"_{a}_to_{b}"
        d = embeddings.shape[1]
        print(f"Embeddings are {d}-dimensional")
        pq = faiss.IndexPQ(d, m, n_bits)
        print("Now trying to switch to GPU")
        gpu_pq = faiss.index_cpu_to_gpu(res, 0, pq)
        print("Now training PQ-Index")
        gpu_pq.train(embeddings)
        print("Now adding embeddings to PQ-Index")
        gpu_pq.add(embeddings)
        print("Now switching back to CPU")
        pq = faiss.index_gpu_to_cpu(gpu_pq)
        faiss.write_index(pq, f'./Faiss/{embedding_name}_{m}_{n_bits}{suffix}.faiss')
        print((f'Saved to ./Faiss/{embedding_name}_{m}_{n_bits}{suffix}'))
        
    else: #True when Index already exists


        print(f"Index already exists, now loading {embedding_name}_{m}_{n_bits}{suffix}")
        pq = faiss.read_index(f'./Faiss/{embedding_name}_{m}_{n_bits}{suffix}.faiss')
        
    print("Now reading in argument ids")
    
    
    if embedding_name in [file.stem for file in tokenized_dir.iterdir()]: #True for huggingface models    
        print("Found tokenized pickle.1")
        tokenized = pd.read_pickle(f"./Tokenized/{embedding_name}.pkl")
        arg_ids = tokenized[["id"]]
    
    elif (embedding_name.split("_")[0] == "autoencoded"):
        print("Found tokenized pickle.2")
        tokenized = pd.read_pickle(f"./Tokenized/{embedding_name.split('_')[1]}.pkl")
        arg_ids = tokenized[["id"]]
    else:
        print("No tokenized pickle. Reading original dataset.")
        dataset = pd.read_pickle("./Data/dataset.pkl")
        arg_ids = dataset[["id"]]
    print("-------------------")
    print("-------------------")
    print("-------------------")
    print("-------------------")

    return pq, arg_ids

In [None]:
%%time

pq_indices = {}
for file in encoded_dir.iterdir():
    index, tokenized = make_or_load_pq(embedding_posix=file)
    pq_indices[file.stem] = {"index":index, "arg_ids":tokenized}

Now reading embeddings into numpy array from Encoded/UniversalSentenceEncoderEmbeddings.npy
Embeddings are 512-dimensional
Now trying to switch to GPU
Now training PQ-Index
Now adding embeddings to PQ-Index
Now switching back to CPU
Saved to ./Faiss/UniversalSentenceEncoderEmbeddings_64_8
Now reading in argument ids
No tokenized pickle. Reading original dataset.
-------------------
-------------------
-------------------
-------------------
Now reading embeddings into numpy array from Encoded/autoencoded_distilbert-base-uncased_1024.npy
Embeddings are 1024-dimensional
Now trying to switch to GPU
Now training PQ-Index
Now adding embeddings to PQ-Index
Now switching back to CPU
Saved to ./Faiss/autoencoded_distilbert-base-uncased_1024_64_8
Now reading in argument ids
Found tokenized pickle.2
-------------------
-------------------
-------------------
-------------------
Now reading embeddings into numpy array from Encoded/distilbert-base-uncased.npy
