In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
from ipywidgets import Output
out = Output()

In [2]:
!pip install --disable-pip-version-check -q pandas transformers seaborn tensorflow_hub elasticsearch elasticsearch-dsl torch annoy faiss-cpu
!pip install --disable-pip-version-check -qU scikit-learn nltk spacy
!pip -q install --disable-pip-version-check --no-warn-script-location --user tensorflow-text 
with out:
    !python -m spacy download en_core_web_sm
#UNCOMMENT IF RUNNING ON GPU BECAUSE TENSORFLOW TEXT DISABLES GPU ACCESS 
#!pip -q uninstall -y tensorflow

In [40]:
from utils import *

def return_args(id_set):
    l = list(id_set)
    return arguments[arguments['id'].isin(l)].copy()

# Make sure Elasticsearch is alive

In [3]:
es = Elasticsearch(timeout=100)
es.ping()

True

In [6]:
display(es.indices.get_mapping(index="arg_index"))

{'arg_index': {'mappings': {'properties': {'conclusion': {'type': 'text'},
    'context': {'properties': {'sourceId': {'type': 'keyword'}}},
    'stance': {'type': 'keyword'},
    'text': {'type': 'text', 'similarity': 'my_dirichlet'}}}}}

# Load the Topics, Judgments and Arguments into DataFrames

In [8]:
%%time
judgments = pd.read_csv('Data/tira-qrels', delim_whitespace=True, names=['topic','iteration','id','relevance'])
arguments = pd.read_pickle('Data/dataset.pkl')

tree = ET.parse('Data/topics-automatic-runs-task-1.xml')
root = tree.getroot()

topics = []
for child in root:
    d = {'topic':int(child[0].text), 'query':child[1].text}
    topics.append(d)
topics = pd.DataFrame(topics)
display(topics[:2])
display(judgments[:2])
display(arguments[:2])

Unnamed: 0,topic,query
0,1,Should Teachers Get Tenure?
1,2,Is Vaping with E-Cigarettes Safe?


Unnamed: 0,topic,iteration,id,relevance
0,9,0,5f1c7022-2019-04-18T15:36:48Z-00005-000,3
1,30,0,8e2fdd07-2019-04-18T15:40:20Z-00003-000,3


Unnamed: 0,text,stance,id,conclusion,context.sourceId,context.previousArgumentInSourceId,context.acquisitionTime,context.discussionTitle,context.sourceTitle,context.sourceUrl,context.nextArgumentInSourceId
0,My opponent forfeited every round. None of my ...,CON,c67482ba-2019-04-18T13:32:05Z-00000-000,Contraceptive Forms for High School Students,c67482ba-2019-04-18T13:32:05Z,,2019-04-18T13:32:05Z,Contraceptive Forms for High School Students,Debate Argument: Contraceptive Forms for High ...,https://www.debate.org/debates/Contraceptive-F...,c67482ba-2019-04-18T13:32:05Z-00001-000
1,How do you propose the school will fund your p...,CON,c67482ba-2019-04-18T13:32:05Z-00001-000,Contraceptive Forms for High School Students,c67482ba-2019-04-18T13:32:05Z,c67482ba-2019-04-18T13:32:05Z-00000-000,2019-04-18T13:32:05Z,Contraceptive Forms for High School Students,Debate Argument: Contraceptive Forms for High ...,https://www.debate.org/debates/Contraceptive-F...,c67482ba-2019-04-18T13:32:05Z-00002-000


CPU times: user 691 ms, sys: 443 ms, total: 1.13 s
Wall time: 1.13 s


# Load the semantic indices

In [9]:
%%time
current_dir = Path(".")
encoded_dir = current_dir / "Encoded"
tokenized_dir = current_dir / "Tokenized"
indices = {}
for file in encoded_dir.iterdir():
    annoy_index, arg_ids = make_or_load_annoy(embedding_posix=file, n_trees=500, b=2)
    pq_index, _ = make_or_load_pq(embedding_posix=file, b=2)
    indices[file.stem] = {"annoy_index":annoy_index, "pq_index":pq_index, "arg_ids":arg_ids}

annoy_d = indices['distilbert-base-uncased']['annoy_index']
pq_d = indices['distilbert-base-uncased']['pq_index']
ids_d = indices['distilbert-base-uncased']['arg_ids']

annoy_a = indices['autoencoded_distilbert-base-uncased_1024']['annoy_index']
pq_a = indices['autoencoded_distilbert-base-uncased_1024']['pq_index']
ids_a = indices['autoencoded_distilbert-base-uncased_1024']['arg_ids']

annoy_g = indices['UniversalSentenceEncoderEmbeddings']['annoy_index']
pq_g = indices['UniversalSentenceEncoderEmbeddings']['pq_index']
ids_g = indices['UniversalSentenceEncoderEmbeddings']['arg_ids']

Index already exists, now loading UniversalSentenceEncoderEmbeddings_angular_500
Now reading in argument ids
No tokenized pickle. Reading original dataset.
-------------------
-------ANNOY-------
-------------------
Index already exists, now loading UniversalSentenceEncoderEmbeddings_64_8
Now reading in argument ids
No tokenized pickle. Reading original dataset.
-------------------
-------FAISS-------
-------------------
Index already exists, now loading autoencoded_distilbert-base-uncased_1024_angular_500
Now reading in argument ids
Found tokenized pickle.
-------------------
-------ANNOY-------
-------------------
Index already exists, now loading autoencoded_distilbert-base-uncased_1024_64_8
Now reading in argument ids
Found tokenized pickle.
-------------------
-------FAISS-------
-------------------
Index already exists, now loading distilbert-base-uncased_angular_500_0_to_2
Now reading in argument ids
Found tokenized pickle.
-------------------
-------ANNOY-------
---------------

# Load the embedding neural networks

In [10]:
%%time

#LOAD DISTILBERT
config = AutoConfig.from_pretrained('distilbert-base-uncased')
config.output_hidden_states=True
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
dbert = TFAutoModel.from_pretrained('distilbert-base-uncased', config=config)

#LOAD UNIVERSAL SENTENCE ENCODER
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

#LOAD AUTOENCODER
MODEL_TO_USE = 'distilbert-base-uncased'
model_version = "0001"
model_name = f"Encoder{MODEL_TO_USE}_1024"
folder_name = "Autoencoder_encoder"
model_path = Path('.') / folder_name / model_name / model_version
ae_e = tf.saved_model.load(str(model_path))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 180.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 350.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 530.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 700.00MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 880.00MB
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


# Enter a query or choose one at random from the topics dataframe

In [21]:
topic_number, query = topics.sample(1).values.squeeze()
print(query)

Can Alternative Energy Effectively Replace Fossil Fuels?


In [22]:
num_vectors_to_find = 10000

# Embed the query using every one of the embedders

In [23]:
%%time
#GET EMBEDDING FROM DISTILBERT
mod_input = tokenizer.encode_plus(query)
i = tf.constant(mod_input['input_ids'])[None, :]
m = tf.constant(mod_input['attention_mask'])[None, :]
output = dbert(i, attention_mask=m, training=False)
embedding = np.hstack([thing.numpy()[:,0,:] for thing in reversed(output[-1])])
d_embedding = embedding[:, 0*config.dim:2*config.dim]

#GET EMBEDDING FROM UNIVERSAL SENTENCE ENCODER
g_embedding = embed([query]).numpy()

#GET EMBEDDING FROM AUTOENCODER
a_embedding = embedding[:,:-config.dim]
a_embedding = ae_e(tf.constant(a_embedding)).numpy()

CPU times: user 535 ms, sys: 38.7 ms, total: 574 ms
Wall time: 460 ms


# Query FAISS and ANNOY for every embedding (6 different indices: 2 per embedding) and store the returned argument ids in a resultset corresponding to the index

In [26]:
%%time
result_indices = annoy_d.get_nns_by_vector(d_embedding.squeeze(), num_vectors_to_find, search_k=-1, include_distances=False)
set_d_annoy = set(ids_d['id'][result_indices].values)

result_indices = annoy_a.get_nns_by_vector(a_embedding.squeeze(), num_vectors_to_find, search_k=-1, include_distances=False)
set_a_annoy = set(ids_d['id'][result_indices].values)

result_indices = annoy_g.get_nns_by_vector(g_embedding.squeeze(), num_vectors_to_find, search_k=-1, include_distances=False)
set_g_annoy = set(ids_d['id'][result_indices].values)

_, result_indices = pq_d.search(d_embedding, num_vectors_to_find)  
result_indices = result_indices.squeeze()
set_d_pq = set(ids_d['id'][result_indices].values)

_, result_indices = pq_a.search(a_embedding, num_vectors_to_find)  
result_indices = result_indices.squeeze()
set_a_pq = set(ids_a['id'][result_indices].values)

_, result_indices = pq_g.search(g_embedding, num_vectors_to_find)  
result_indices = result_indices.squeeze()
set_g_pq = set(ids_g['id'][result_indices].values)

CPU times: user 4.33 s, sys: 0 ns, total: 4.33 s
Wall time: 1.43 s


## Inspect the top most frequent (not necessarily closest to the query) conclusion titles for each one of the 6 indices:

In [63]:
#Display the top-K most frequent discussion titles
k = 25

for resultset, name in zip([set_d_annoy, set_a_annoy, set_g_annoy, set_d_pq, set_a_pq, set_g_pq], ["Distilbert with Annoy", "Autoencoder with Annoy", "Google Encoder with Annoy", "Distilbert with PQ", "Autoencoder with PQ", "Google Encoder with PQ"]):
    print(f"{name}:")
    print(len(name)*"-")
    print("")
    print(list(return_args(resultset)['conclusion'].value_counts().index)[:k])
    print("")

Distilbert with Annoy:
---------------------

['Single-payer universal health care', 'Animal testing', 'Capitalism vs socialism', 'Nuclear energy', 'Free trade', 'Public health insurance option', 'Abortion', 'Progressive tax vs. flat tax', 'Solar energy', 'Legalization of Marijuana', 'Gay marriage', 'Vegetarianism', 'Corporate free speech', 'Multiculturalism vs. assimilation', 'Health insurance cooperatives', 'Hydrogen vehicles', 'Bailout of US automakers', 'Dollarization', 'Kyoto Protocol', 'Abolition of nuclear weapons', 'Libertarianism', 'Withdrawing from Iraq', 'Medical marijuana dispensaries', 'US offshore oil drilling', 'Hunting for sport']

Autoencoder with Annoy:
----------------------

['Single-payer universal health care', 'Animal testing', 'Abortion', 'Capitalism vs socialism', 'Public health insurance option', 'Gay marriage', 'Legalization of Marijuana', 'Progressive tax vs. flat tax', 'Free trade', 'Nuclear energy', 'Medical marijuana dispensaries', 'Death penalty', 'Veget

# Run query against the Elasticsearch index and store the resulting argument ids in a set

In [64]:
%%time
s = Search(using=es, index="arg_index")
s.query = Q("match", text=query)
s = s[:10000]
response = s.execute()

elastic_set = set()
elastic_dic = {}
for hit in response:
    elastic_set.add(hit.meta.id)
    elastic_dic[hit.meta.id] = hit.meta.score

for hit in response[:20]:
    print(f"{hit.stance} ARGUMENT CONCLUSION: {hit.conclusion}")    

CON ARGUMENT CONCLUSION: Establishing a $15 per ton carbon tax that increases by 3.5% each year until capped at $80 per ton
CON ARGUMENT CONCLUSION: The United States should change towards the use of alternative fuel and away from fossil fuels.
PRO ARGUMENT CONCLUSION: Nuclear power is beneficial for society
CON ARGUMENT CONCLUSION: fossil fuel
PRO ARGUMENT CONCLUSION: Steps to stop "Global Warming" will kill far more people then it will save
CON ARGUMENT CONCLUSION: Steps to stop "Global Warming" will kill far more people then it will save
CON ARGUMENT CONCLUSION: fossil fuel
PRO ARGUMENT CONCLUSION: The United States should change towards the use of alternative fuel and away from fossil fuels.
CON ARGUMENT CONCLUSION: There is currently an enviromental crisis in the United States
PRO ARGUMENT CONCLUSION: Nuclear energy is a crucial alternative energy source that is too valuable to be restricted.
CON ARGUMENT CONCLUSION: Wind power is unlikely to be economical within ten years
CON ARG

# Expand the original query using transformer-based language models, both causal and not

## Use a Cloze (Masked, BERT-like) Language Model to generate a list of keywords relevant to the original query and run those against the Elasticsearch index

In [34]:
tokenizer_mlm = AutoTokenizer.from_pretrained('bert-large-uncased')
model_mlm = BertForMaskedLM.from_pretrained('bert-large-uncased')

In [65]:
%%time
keywords = expand_mlm(model=model_mlm, tokenizer=tokenizer_mlm, query=query, k=5)
print(f"Augmenting the query with the following keywords:\n\n {keywords}\n\n")

Augmenting the query with the following keywords:

 ['diesel', 'cost', 'nuclear', 'consumption', 'hydrogen', 'technologies', 'energy', 'future', 'electricity', 'pregnancy', 'coal', 'alternative', 'migration', 'emissions', 'efficiency', 'economics', 'technology', 'growth', 'wartime', 'earthquakes', 'green', 'environmental', 'accidents', 'costs', 'renewable', 'winter', 'development', 'pollution', 'new', 'stress', 'water', 'oil', 'accident', 'death', 'health', 'warming', 'sustainability', 'accidental', 'fires', 'competition']


CPU times: user 39.8 s, sys: 469 ms, total: 40.2 s
Wall time: 2.97 s


In [66]:
%%time
s = Search(using=es, index="arguments_index")
s.query = Q("match", text=' '.join(keywords))
s = s[:10000]

response = s.execute()
mlm_set = set()
mlm_dic = {}
for hit in response:
    mlm_set.add(hit.meta.id)
    mlm_dic[hit.meta.id] =hit.meta.score

for hit in response[:20]:
    print(f"{hit.stance} ARGUMENT CONCLUSION: {hit.conclusion}")    

PRO ARGUMENT CONCLUSION: Resolved: hydraulic fracturing should continue to be used
PRO ARGUMENT CONCLUSION: Free Trade Benefits the United States
CON ARGUMENT CONCLUSION: The US Should Subsidize Nuclear Power
PRO ARGUMENT CONCLUSION: Nuclear power is the least cost and fastest way to substantially cut GHG emissions from electricity
CON ARGUMENT CONCLUSION: Nuclear Energy Is Beneficial to Society
CON ARGUMENT CONCLUSION: the government should interevene in the transition to alternative fuels
PRO ARGUMENT CONCLUSION: Resolved: On balance, the benefits of nuclear power outweigh the risks
CON ARGUMENT CONCLUSION: There is currently an enviromental crisis in the United States
CON ARGUMENT CONCLUSION: Nuclear Power
CON ARGUMENT CONCLUSION: America should expand nuclear energy production.
CON ARGUMENT CONCLUSION: fossil fuel
CON ARGUMENT CONCLUSION: Nuclear Power
PRO ARGUMENT CONCLUSION: The US should increase its use of nuclear energy
CON ARGUMENT CONCLUSION: Developing countries should prio

## Use a causal transformer language model (GPT2-like) to hallucinate new queries and run those against the Elasticsearch index

In [39]:
LM_MODEL_TO_USE = "gpt2"
tokenizer_lm = AutoTokenizer.from_pretrained(LM_MODEL_TO_USE)
model_lm = AutoModelWithLMHead.from_pretrained(LM_MODEL_TO_USE)

In [68]:
%%time
greedy, sampled = expand_lm(model=model_lm, tokenizer=tokenizer_lm, query=query, print_generated=True)


Greedily hallucinated for query 0:
  there is no evidence that alternative energy can be used to replace fossil fuels. However, it has been shown in a number of studies (e.g., Janssen et al.) and the results are consistent with what we have seen so far from conventional sources such as wind turbines or solar panels on our planet's surface."This article originally appeared at The Daily Caller News Foundation:





Greedily hallucinated for query 1:
  In fact, it may be possible to replace fossil fuels with renewable sources of energy such as wind and solar power in the near future."
"We believe that alternative energies can have a significant impact on our planet's climate system by reducing greenhouse gas emissions," he said at an event held earlier this month hosted jointly between UMass-Amherst (UMD) Chancellor Nicholas Dirksen &


Greedily hallucinated for query 2:
  there is no evidence that alternative energy will replace fossil fuels in the foreseeable future. However, it may be

 
Hallucinated 1 for query 6: . I'd prefer that it was renewable energy, like wind or solar and hydro instead (or something similar) because fossil fuels are not so toxic to humans as they would be if burned off in the atmosphere at such high temperatures of CO2 alone.[3][4]What's up with all these people saying we should have had more power back then!? Is there any reason why even those
 
 
Hallucinated 2 for query 6:  what I'm talking about.But that's probably the right answer... If this can be reversed, then we should definitely start working on alternatives to fossil fuels - which means removing emissions of carbon dioxide from our atmosphere and also reducing energy use (as long as they don't contribute significantly towards climate change)BENGHAZI/WASHINGTON—An Afghan judge has sentenced a Uttarak district court
 
 
Hallucinated 3 for query 6:  what to make of it yet but I don't know for how long.In my opinion, "Hinduism and Islam" are very similar concepts: they both share an ov

In [72]:
%%time
s = Search(using=es, index="arguments_index")
s.query = Q('bool', should=[Q("match", text=query) for query in chain(*[greedy, sampled])], minimum_should_match=int(len(list(chain(*[greedy, sampled])))/2))
s = s[:10000]
response = s.execute()
lm_set = set()
lm_dic = {}
for hit in response:
    lm_set.add(hit.meta.id)
    lm_dic[hit.meta.id] =hit.meta.score

for hit in response[:20]:
    print(f"{hit.stance} ARGUMENT CONCLUSION: {hit.conclusion}")    

CON ARGUMENT CONCLUSION: the government should interevene in the transition to alternative fuels
CON ARGUMENT CONCLUSION: fossil fuel
CON ARGUMENT CONCLUSION: The USFG ought to prioritize the development of clean energy over (continued...)
PRO ARGUMENT CONCLUSION: Resolved: On balance, the benefits of nuclear power outweigh the risks
PRO ARGUMENT CONCLUSION: The USFG ought to prioritize the development of clean energy over (continued...)
CON ARGUMENT CONCLUSION: Nuclear Power
CON ARGUMENT CONCLUSION: Nuclear Power
PRO ARGUMENT CONCLUSION: Nuclear Power
CON ARGUMENT CONCLUSION: There is currently an enviromental crisis in the United States
CON ARGUMENT CONCLUSION: fossil fuel
CON ARGUMENT CONCLUSION: Nuclear Power
PRO ARGUMENT CONCLUSION: fossil fuel
CON ARGUMENT CONCLUSION: Nuclear power.
CON ARGUMENT CONCLUSION: Establishing a $15 per ton carbon tax that increases by 3.5% each year until capped at $80 per ton
PRO ARGUMENT CONCLUSION: The US should encourage more nuclear energy use ove

In [54]:
#Interestingly, (lm_dic.keys() & mlm_dic.keys()) returns a set! Same as doing lm_set.intersect(mlm_set)
len(lm_dic.keys() & mlm_dic.keys()) == len(lm_set.intersection(mlm_set))

True

## Use a Bag-of-Words (arg_bow) enhanced GPT2 model (Uber's PPLM: Plug-and-Play Language Model) to hallucinate queries and run those against the Elasticsearch index

### The Bag of Words used attempts to steer the hallucinated text into sounding like an argument:

In [74]:
with open('arg_bow') as f:
    bow = f.read().splitlines()
print(bow)

['accept', 'according', 'accordingly', 'affirm', 'agree', 'argue', 'argument', 'argumentation', 'assert', 'assumption', 'attack', 'attest', 'backing', 'basis', 'because', 'belief', 'believe', 'bias', 'biased', 'challenge', 'cite', 'claim', 'clear', 'con', 'concede', 'conclude', 'conclusion', 'concur', 'consequence', 'consequently', 'considering', 'context', 'controversial', 'convince', 'corroborate', 'convincing', 'corroboration', 'credibility', 'credible', 'criteria', 'criterion', 'debatable', 'debate', 'deduce', 'definition', 'determine', 'disagree', 'disprove', 'ergo', 'evidence', 'example', 'facts', 'fallacy', 'fallible', 'faulty', 'general', 'hence', 'hypothetical', 'imply', 'inconsistent', 'infer', 'irrelevant', 'justify', 'knowledge', 'logical', 'naturally', 'objectively ', 'opinion', 'perspective', 'persuade', 'persuasive', 'point', 'position', 'precisely', 'premise', 'pro', 'probable', 'proof', 'prove', 'rational', 'reason', 'rebuttal', 'reiterate', 'relevant', 'rhetoric', 'rh

In [73]:
config = AutoConfig.from_pretrained("gpt2")
config.output_hidden_states = True
tokenizer_pplm = AutoTokenizer.from_pretrained("gpt2")
model_pplm = AutoModelWithLMHead.from_pretrained("gpt2", config=config)
model_pplm.eval()    
for param in model_pplm.parameters():
    param.requires_grad = False

In [75]:
%%time
pplm_hallucinated = expand_pplm(model=model_pplm, tokenizer=tokenizer_pplm, query=query, length=35, num_iterations=4)

, but it's not a good idea.<|endoftext|>The following is an excerpt from the book "A Brief History of American Politics" by John Dickson (University Press), which
, it does! The alternative energy industry has been working hard to convince the public that fossil fuels are not a viable option for our planet and we should be concerned about their impact
 Because of the fact that we are not using fossil fuels. We have to use alternative energy sources, and I believe it is important for us as a society in order be able
, it's a good idea to use alternative energy sources like wind and solar power as well because they are cheaper than fossil fuels due in part (and this has been proven)
, but it's not clear. It is possible to use alternative energy sources like solar and wind because they are cheaper than fossil fuels (and therefore more efficient). But there isn
, but I'm not convinced.

 (I don't know if it's because of the fact that there are no fossil fuels in this country or just due to

In [77]:
%%time
s = Search(using=es, index="arguments_index")
s.query = Q('bool', should=[Q("match", text=query) for query in pplm_hallucinated], minimum_should_match=int(len(pplm_hallucinated)/2))
s = s[:10000]
response = s.execute()
pplm_set = set()
pplm_dic = {}
for hit in response:
    pplm_set.add(hit.meta.id)
    pplm_dic[hit.meta.id] =hit.meta.score
for hit in response[:20]:
    print(f"{hit.stance} ARGUMENT CONCLUSION: {hit.conclusion}")

CON ARGUMENT CONCLUSION: the government should interevene in the transition to alternative fuels
CON ARGUMENT CONCLUSION: fossil fuel
PRO ARGUMENT CONCLUSION: The United States should change towards the use of alternative fuel and away from fossil fuels.
PRO ARGUMENT CONCLUSION: fossil fuel
CON ARGUMENT CONCLUSION: The USFG ought to prioritize the development of clean energy over (continued...)
CON ARGUMENT CONCLUSION: fossil fuel
CON ARGUMENT CONCLUSION: The United States should change towards the use of alternative fuel and away from fossil fuels.
CON ARGUMENT CONCLUSION: Nuclear Power
PRO ARGUMENT CONCLUSION: The US should encourage more nuclear energy use over current widely used energy forms.
PRO ARGUMENT CONCLUSION: Resolved: On balance, the benefits of nuclear power outweigh the risks
CON ARGUMENT CONCLUSION: Should the world stop using Oil/gas
CON ARGUMENT CONCLUSION: Are Renewable Energy Sources Better than Fossil Fuels
PRO ARGUMENT CONCLUSION: fossil fuel
CON ARGUMENT CONCLUS

# Get union and k-wise intersection of the resultsets of argument ids

The k-wise intersection takes all possible combinations of k out of kmax resultsets and produces the intersection. The result is the union of all these intersections

In [169]:
union = set.union(*[set_d_annoy, set_a_annoy, set_g_annoy, set_d_pq, set_a_pq, set_g_pq, lm_set, mlm_set, pplm_set])
len(union)

47571

The union seems to be noisy

In [174]:
return_args(union)['conclusion'].value_counts()[:20]

Abortion                              310
Single-payer universal health care    136
Animal testing                        134
Capitalism vs socialism               113
Death penalty                         111
Withdrawing from Iraq                 103
Legalization of Marijuana             103
Public health insurance option        101
Gay marriage                           97
Nuclear energy                         88
Medical marijuana dispensaries         86
Infant male circumcision               83
Progressive tax vs. flat tax           81
Vegetarianism                          80
Death Penalty                          78
Free trade                             76
God Exists                             76
Solar energy                           73
Bailout of US automakers               67
Law school                             66
Name: conclusion, dtype: int64

In [162]:
all_sets = [set_d_annoy, set_a_annoy, set_g_annoy, set_d_pq, set_a_pq, set_g_pq, lm_set, mlm_set, pplm_set]
L = len(all_sets)

cartesian_up_to_L = [combination for r in range(L-5,L) for combination in combinations(all_sets,r+1)] # this returns a list of combinations of L-5 elements up the intersection of all L sets. 5 is a hyperparameter
intersections = set.union(*[set.intersection(*thing) for thing in cartesian_up_to_L]) #This computes the intersection of every combination in that list then takes the union

tuple wise intersections seem to filter out the noise

In [166]:
return_args(intersections)['conclusion'].value_counts()[:50]

Nuclear energy                                                           67
Solar energy                                                             53
Hydrogen vehicles                                                        35
US offshore oil drilling                                                 35
Kyoto Protocol                                                           33
Abolition of nuclear weapons                                             31
Capitalism vs socialism                                                  31
Geoengineering                                                           29
Natural gas vehicles                                                     29
Algae biofuel                                                            28
Wind energy                                                              28
Single-payer universal health care                                       28
Bailout of US automakers                                                 27
Phasing out 

In [167]:
return_args(intersections)

Unnamed: 0,text,stance,id,conclusion,context.sourceId,context.previousArgumentInSourceId,context.acquisitionTime,context.discussionTitle,context.sourceTitle,context.sourceUrl,context.nextArgumentInSourceId
10266,Burning coal is much cheaper than renewable op...,CON,cf63c6bb-2019-04-18T16:09:29Z-00004-000,Renewable Resource Energy,cf63c6bb-2019-04-18T16:09:29Z,,2019-04-18T16:09:29Z,Renewable Resource Energy,Debate Topic: Renewable Resource Energy | Deba...,https://www.debate.org/debates/Renewable-Resou...,cf63c6bb-2019-04-18T16:09:29Z-00005-000
13373,should cloning of a extinct animals be allowed,CON,b129938-2019-04-18T17:27:14Z-00003-000,cloning animals,b129938-2019-04-18T17:27:14Z,b129938-2019-04-18T17:27:14Z-00002-000,2019-04-18T17:27:14Z,cloning animals,Debate Argument: cloning animals | Debate.org,https://www.debate.org/debates/cloning-animals/1/,b129938-2019-04-18T17:27:14Z-00000-000
13567,I would like to thank my opponent for the chal...,CON,bb8a50de-2019-04-18T18:52:28Z-00003-000,free trade should be valued above protectionism,bb8a50de-2019-04-18T18:52:28Z,bb8a50de-2019-04-18T18:52:28Z-00002-000,2019-04-18T18:52:28Z,free trade should be valued above protectionism,Debate Argument: free trade should be valued a...,https://www.debate.org/debates/free-trade-shou...,bb8a50de-2019-04-18T18:52:28Z-00004-000
16727,I'll go paragraph by paragraph: INHERENCY: Inh...,PRO,f1a21db5-2019-04-18T19:44:42Z-00003-000,Resolved: The USFG should substantially increa...,f1a21db5-2019-04-18T19:44:42Z,f1a21db5-2019-04-18T19:44:42Z-00002-000,2019-04-18T19:44:42Z,Resolved: The USFG should substantially increa...,Online Debate: Resolved: The USFG should subst...,https://www.debate.org/debates/Resolved-The-US...,f1a21db5-2019-04-18T19:44:42Z-00004-000
16899,I thank my opponent for accepting this debate....,CON,f8fc8c9e-2019-04-18T19:30:17Z-00003-000,government should mandate that by 2040 all new...,f8fc8c9e-2019-04-18T19:30:17Z,f8fc8c9e-2019-04-18T19:30:17Z-00002-000,2019-04-18T19:30:17Z,government should mandate that by 2040 all new...,Debate: government should mandate that by 2040...,https://www.debate.org/debates/government-shou...,f8fc8c9e-2019-04-18T19:30:17Z-00004-000
...,...,...,...,...,...,...,...,...,...,...,...
387374,Legalization would change drug consumption fro...,PRO,a7c47a5c-2019-04-17T11:47:49Z-00020-000,Legalization of drugs,a7c47a5c-2019-04-17T11:47:49Z,a7c47a5c-2019-04-17T11:47:49Z-00035-000,2019-04-17T11:47:49Z,Legalization of drugs,Debate: Legalization of drugs - Debatepedia,http://www.debatepedia.org/en/index.php/Debate...,a7c47a5c-2019-04-17T11:47:49Z-00005-000
387397,The legalization of drugs will increase consum...,CON,a7c47a5c-2019-04-17T11:47:49Z-00015-000,Legalization of drugs,a7c47a5c-2019-04-17T11:47:49Z,a7c47a5c-2019-04-17T11:47:49Z-00030-000,2019-04-17T11:47:49Z,Legalization of drugs,Debate: Legalization of drugs - Debatepedia,http://www.debatepedia.org/en/index.php/Debate...,a7c47a5c-2019-04-17T11:47:49Z-00000-000
387407,Lower prices due to legalization of drugs will...,CON,a7c47a5c-2019-04-17T11:47:49Z-00016-000,Legalization of drugs,a7c47a5c-2019-04-17T11:47:49Z,a7c47a5c-2019-04-17T11:47:49Z-00031-000,2019-04-17T11:47:49Z,Legalization of drugs,Debate: Legalization of drugs - Debatepedia,http://www.debatepedia.org/en/index.php/Debate...,a7c47a5c-2019-04-17T11:47:49Z-00001-000
387418,Legalizing drugs would eliminate an industry t...,PRO,a7c47a5c-2019-04-17T11:47:49Z-00002-000,Legalization of drugs,a7c47a5c-2019-04-17T11:47:49Z,a7c47a5c-2019-04-17T11:47:49Z-00017-000,2019-04-17T11:47:49Z,Legalization of drugs,Debate: Legalization of drugs - Debatepedia,http://www.debatepedia.org/en/index.php/Debate...,a7c47a5c-2019-04-17T11:47:49Z-00025-000


# Rerank a list of arguments using Distilbert and/or the Wide and Deep Network

## Load the Distilbert ranker

In [168]:
model_version = "0002"
model_name = f"distilbert-base-uncased_for_sequence_classification"
folder_name = "Ranker"
model_path = Path('.') / folder_name / model_name / model_version
db_rank = tf.saved_model.load(str(model_path))

## Rerank the intersection set with Distilbert

In [175]:
%%time
db_reranked = []
for i, arg in return_args(intersections).iterrows():
    tokenized = tokenizer.encode_plus(query, arg['text'], max_length=tokenizer.max_len, pad_to_max_length=True)
    i= tf.constant(tokenized['input_ids'], dtype=tf.int64)[None, :]
    m= tf.constant(tokenized['attention_mask'], dtype=tf.int64)[None, :]
    r,b = (np.array(db_rank([i,m], training=False)).squeeze())
    db_reranked.append({"id": arg['id'], 'relevance':r, 'binary':b})

CPU times: user 58min, sys: 7min 24s, total: 1h 5min 24s
Wall time: 4min


In [176]:
db_reranked = pd.DataFrame(db_reranked)
db_reranked

Unnamed: 0,id,relevance,binary
0,cf63c6bb-2019-04-18T16:09:29Z-00004-000,2.636328,0.988542
1,b129938-2019-04-18T17:27:14Z-00003-000,-0.981731,0.227259
2,bb8a50de-2019-04-18T18:52:28Z-00003-000,2.408162,0.986205
3,f1a21db5-2019-04-18T19:44:42Z-00003-000,2.642425,0.988530
4,f8fc8c9e-2019-04-18T19:30:17Z-00003-000,2.562882,0.991478
...,...,...,...
1911,a7c47a5c-2019-04-17T11:47:49Z-00020-000,-1.688570,0.076038
1912,a7c47a5c-2019-04-17T11:47:49Z-00015-000,-1.542947,0.120951
1913,a7c47a5c-2019-04-17T11:47:49Z-00016-000,2.334604,0.985321
1914,a7c47a5c-2019-04-17T11:47:49Z-00002-000,1.881718,0.963452


## Load the Wide and Deep ranker

In [177]:
model_version = "0001"
model_name = f"Wide_and_deep"
folder_name = "Ranker"
model_path = Path('.') / folder_name / model_name / model_version
wnd_rank = tf.saved_model.load(str(model_path))

## Load the QA version of the Universal Sentence Encoder

In [178]:
module = hub.load('https://tfhub.dev/google/universal-sentence-encoder-qa/3')
query_embedder = module.signatures['question_encoder']
arg_embedder = module.signatures['response_encoder']

INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-qa/3'.
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-qa/3: 170.04MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-qa/3: 340.04MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-qa/3: 500.04MB
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder-qa/3, Total size: 588.94MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-qa/3'.


In [182]:
%%time
use_reranked = []
for i, arg in return_args(intersections).iterrows():
    q = query_embedder(tf.constant([query]))['outputs'].numpy()
    a = arg_embedder(input=tf.constant([arg['text']]), context=tf.constant([arg['conclusion']]))['outputs'].numpy()
    dp = np.dot(q.squeeze(),a.squeeze())
    deep_input = np.hstack([q,a])
    s = Search(using=es, index='arg_index')
    s.query = Q("match", text=query) & Q("term", _id=arg['id'])
    response = s.execute()
    length = len(arg['text'])
    ds = 0
    if not response.hits.total.value == 0:
        ds = response[0].meta.score
    wide_input = np.hstack([dp, ds, length])
    r,b,aux = np.array(wnd_rank([tf.constant(wide_input, dtype=tf.float32)[None,:], tf.constant(deep_input, dtype=tf.float32)], training=False)).squeeze()
    use_reranked.append({"id": arg['id'], 'relevance':r, 'binary':b, 'aux':aux})

CPU times: user 15min 25s, sys: 2min 30s, total: 17min 55s
Wall time: 2min 59s


In [183]:
use_reranked = pd.DataFrame(use_reranked)
use_reranked

Unnamed: 0,id,relevance,binary,aux
0,cf63c6bb-2019-04-18T16:09:29Z-00004-000,2.267275,0.632976,0.992779
1,b129938-2019-04-18T17:27:14Z-00003-000,-1.504922,0.516634,0.165707
2,bb8a50de-2019-04-18T18:52:28Z-00003-000,1.375458,1.000000,0.277443
3,f1a21db5-2019-04-18T19:44:42Z-00003-000,2.949860,1.000000,0.924030
4,f8fc8c9e-2019-04-18T19:30:17Z-00003-000,2.467835,1.000000,0.870499
...,...,...,...,...
1911,a7c47a5c-2019-04-17T11:47:49Z-00020-000,-1.710224,0.542446,0.096710
1912,a7c47a5c-2019-04-17T11:47:49Z-00015-000,-0.724441,0.430645,0.734148
1913,a7c47a5c-2019-04-17T11:47:49Z-00016-000,-0.749373,0.499491,0.768408
1914,a7c47a5c-2019-04-17T11:47:49Z-00002-000,2.192319,0.545298,0.988674


# Combine the two rankings using the scaled scores

In [184]:
scaler = MinMaxScaler()
all_scores = np.hstack([scaler.fit_transform(use_reranked[['relevance', 'binary', 'aux']]), scaler.fit_transform(db_reranked[['relevance', 'binary']])])
final_ranks = pd.DataFrame({'arg_ids':use_reranked['id'], 'mean':pd.Series(np.mean(all_scores, axis=1)), 'median':pd.Series(np.median(all_scores, axis=1))})
final_ranks

Unnamed: 0,arg_ids,mean,median
0,cf63c6bb-2019-04-18T16:09:29Z-00004-000,0.829133,0.892579
1,b129938-2019-04-18T17:27:14Z-00003-000,0.214735,0.200753
2,bb8a50de-2019-04-18T18:52:28Z-00003-000,0.748457,0.851136
3,f1a21db5-2019-04-18T19:44:42Z-00003-000,0.940014,0.927205
4,f8fc8c9e-2019-04-18T19:30:17Z-00003-000,0.910836,0.879238
...,...,...,...
1911,a7c47a5c-2019-04-17T11:47:49Z-00020-000,0.144024,0.105948
1912,a7c47a5c-2019-04-17T11:47:49Z-00015-000,0.288707,0.212077
1913,a7c47a5c-2019-04-17T11:47:49Z-00016-000,0.634408,0.771009
1914,a7c47a5c-2019-04-17T11:47:49Z-00002-000,0.768899,0.759956


## Sort by mean

In [186]:
final_ranks.sort_values(by='mean', ascending=False)

Unnamed: 0,arg_ids,mean,median
597,e7056476-2019-04-19T12:47:46Z-00019-000,0.991811,0.997550
125,4c6aab06-2019-04-18T17:04:35Z-00000-000,0.982555,0.995139
126,4c6aab06-2019-04-18T17:04:35Z-00001-000,0.977574,0.993606
55,d926f811-2019-04-18T19:43:34Z-00004-000,0.976536,0.987507
47,4911d42e-2019-04-18T12:51:15Z-00000-000,0.975741,0.991463
...,...,...,...
755,9bc8d269-2019-04-17T11:47:38Z-00035-000,0.083325,0.053266
688,7785529c-2019-04-17T11:47:37Z-00029-000,0.077810,0.065996
1495,8b68ae4-2019-04-17T11:47:47Z-00001-000,0.073651,0.023386
705,7785529c-2019-04-17T11:47:37Z-00037-000,0.071746,0.033995


## Sort by median

In [187]:
final_ranks.sort_values(by='median', ascending=False)

Unnamed: 0,arg_ids,mean,median
144,b3cf5511-2019-04-18T18:44:32Z-00004-000,0.923510,0.998179
597,e7056476-2019-04-19T12:47:46Z-00019-000,0.991811,0.997550
29,996dd88-2019-04-18T16:58:08Z-00002-000,0.960776,0.997307
117,2b6bd5cb-2019-04-18T11:43:36Z-00000-000,0.958431,0.995921
538,a69e9d5b-2019-04-19T12:46:24Z-00016-000,0.946301,0.995918
...,...,...,...
1747,e3d235e2-2019-04-17T11:47:41Z-00031-000,0.150586,0.043986
705,7785529c-2019-04-17T11:47:37Z-00037-000,0.071746,0.033995
402,e4c52b1f-2019-04-15T20:22:58Z-00000-000,0.150817,0.029450
1495,8b68ae4-2019-04-17T11:47:47Z-00001-000,0.073651,0.023386
