In [1]:
# Widen the notebook container to 95% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [29]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Search, Q, SF
from tqdm.notebook import tqdm_notebook

In [3]:
def document_generator_from_dataframe(df, index, fields_to_index):
    """Yield one Elasticsearch bulk action per row of ``df``, in row order.

    Empty-string values in the indexed fields are replaced by the literal
    string ``'empty'``. The document ``_id`` is taken from the ``id`` column
    as-is (it is never rewritten).

    Args:
        df: DataFrame holding the documents. It must contain an ``id`` column
            and every column named in ``fields_to_index``.
        index: Name of the target index, copied into each action's ``_index``.
        fields_to_index: Iterable of column names to copy into ``_source``.

    Yields:
        dict: An action with the keys ``_index``, ``_id`` and ``_source``.

    Raises:
        KeyError: When iteration starts, if ``id`` or a requested field is
            not a column of ``df``.
    """
    fields = list(fields_to_index)

    # Do the empty-string replacement once per indexed column instead of once
    # per row: building and scanning a full row Series for every document is
    # what made the original implementation slow on large frames.
    columns = [df[field].replace('', 'empty') for field in fields]

    # zip(*[]) would yield nothing, so keep emitting one (empty) source per row
    # when no fields were requested.
    if columns:
        row_values = zip(*columns)
    else:
        row_values = (() for _ in range(len(df)))

    # Everything below is lazy, so documents are still produced one at a time.
    for doc_id, values in zip(df['id'], row_values):
        yield {
            "_index": index,
            "_id": doc_id,
            "_source": dict(zip(fields, values)),
        }

In [8]:
# Shared Elasticsearch client. HTTP compression helps with the large bulk-load
# requests sent later; maxsize caps the connections to the node at 1000.
es = Elasticsearch(
    http_compress=True,
    maxsize=1000,
)

In [9]:
# Connectivity check before doing any work; the displayed value shows whether
# the node answered (True in the recorded run).
es.ping()

True

In [30]:
%%time
# Load the corpus into `dataset`, one row per premise. The flattened frame is
# cached as a pickle so the JSON file only has to be parsed on the first run.
current_directory = Path('.')
pickle_path = current_directory / 'Data' / 'dataset.pkl'
json_path = current_directory / 'Data' / 'args-me.json'

if not pickle_path.exists():
    print("Pickled dataset doesn't already exists. Now reading JSON file.")

    # Read in JSON file if pickled dataframe doesn't already exist.
    # The encoding is explicit so the platform default cannot garble the text,
    # and the file is closed as soon as parsing is done.
    with open(json_path, encoding='utf-8') as f:
        d = json.load(f)['arguments']

    # The context sub-fields are discovered from the first argument, so an
    # empty corpus is reported clearly instead of failing with an IndexError.
    if not d:
        raise ValueError(f"No arguments found in {json_path}")

    # Flatten: each entry of an argument's 'premises' list becomes a row,
    # carrying the argument's id, conclusion and context.* values along.
    context_subfields = [['context', k] for k in d[0]['context'].keys()]
    dataset = pd.json_normalize(d, record_path='premises', meta=['id', 'conclusion', *context_subfields])
    print("Now pickling Pandas DataFrame into dataset.pkl.")
    dataset.to_pickle(pickle_path)
    print("DataFrame pickled.")
    print(" ")
else:
    # If pickle already exists, read it into dataframe.
    # NOTE: unpickling executes code from the file; only load a dataset.pkl
    # that this notebook produced itself.
    print("Pickled dataset already exists. Now loading dataset.pkl into Pandas DataFrame")
    dataset = pd.read_pickle(pickle_path)
    print(" ")

Pickled dataset already exists. Now loading dataset.pkl into Pandas DataFrame
 
CPU times: user 668 ms, sys: 380 ms, total: 1.05 s
Wall time: 1.05 s


In [31]:
# Preview the first two rows to check the flattened column layout.
dataset.head(2)

Unnamed: 0,text,stance,id,conclusion,context.sourceId,context.previousArgumentInSourceId,context.acquisitionTime,context.discussionTitle,context.sourceTitle,context.sourceUrl,context.nextArgumentInSourceId
0,My opponent forfeited every round. None of my ...,CON,c67482ba-2019-04-18T13:32:05Z-00000-000,Contraceptive Forms for High School Students,c67482ba-2019-04-18T13:32:05Z,,2019-04-18T13:32:05Z,Contraceptive Forms for High School Students,Debate Argument: Contraceptive Forms for High ...,https://www.debate.org/debates/Contraceptive-F...,c67482ba-2019-04-18T13:32:05Z-00001-000
1,How do you propose the school will fund your p...,CON,c67482ba-2019-04-18T13:32:05Z-00001-000,Contraceptive Forms for High School Students,c67482ba-2019-04-18T13:32:05Z,c67482ba-2019-04-18T13:32:05Z-00000-000,2019-04-18T13:32:05Z,Contraceptive Forms for High School Students,Debate Argument: Contraceptive Forms for High ...,https://www.debate.org/debates/Contraceptive-F...,c67482ba-2019-04-18T13:32:05Z-00002-000


In [13]:
# Lazy stream of bulk actions for the columns that should be searchable.
gen = document_generator_from_dataframe(
    df=dataset,
    index="arg_index",
    fields_to_index=['text', 'stance', 'context.sourceId', 'conclusion'],
)

In [14]:
# Index definition passed to index creation: a named similarity plus explicit
# field mappings.
body = {
    'settings': {
        # Named similarity that individual fields can opt into.
        'similarity': {'my_dirichlet': {'type': 'LMDirichlet'}},
    },
    'mappings': {
        'properties': {
            # Only the premise text is scored with the custom similarity.
            'text': {'type': 'text', 'similarity': 'my_dirichlet'},
            'stance': {'type': 'keyword'},
            'context.sourceId': {'type': 'keyword'},
            'conclusion': {'type': 'text'},
        },
    },
}

In [15]:
# Create 'arg_index' with the settings and mappings held in `body`.
# NOTE(review): re-running this cell while the index already exists presumably
# fails with an "already exists" error; confirm, and delete the index first
# when rebuilding.
es.indices.create(index='arg_index', body=body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'arg_index'}

In [16]:
%%time
# Send every action produced by `gen` to Elasticsearch with the bulk helper.
# NOTE: `gen` is a generator, so it is exhausted after this call; recreate it
# before running this cell again or nothing will be indexed.
helpers.bulk(es, gen)

CPU times: user 3min 7s, sys: 98.2 ms, total: 3min 7s
Wall time: 4min 31s


(387692, [])

In [17]:
# Inspect the mapping stored for the index to confirm the field types and the
# custom similarity were applied.
es.indices.get_mapping(index="arg_index")

{'arg_index': {'mappings': {'properties': {'conclusion': {'type': 'text'},
    'context': {'properties': {'sourceId': {'type': 'keyword'}}},
    'stance': {'type': 'keyword'},
    'text': {'type': 'text', 'similarity': 'my_dirichlet'}}}}}

In [42]:
%%time
# Example retrieval: full-text match of the query against the premise text,
# limited to the ten best-scoring hits.
query = "Computer Science is a bad university degree."
s = Search(using=es, index="arg_index").query("match", text=query)[:10]

# Run the search and list each hit's conclusion, document id and score.
response = s.execute()
for hit in response:
    print(f"CONCLUSION: {hit.conclusion} - ARGUMENT ID:{hit.meta.id} - SCORE: {hit.meta.score}")

CONCLUSION: A philosophy degree is useful - ARGUMENT ID:b4f62054-2019-04-18T17:17:42Z-00004-000 - SCORE: 9.3465395
CONCLUSION: Students should have possibility to select in which lessons they can attend. (17-18 years old) - ARGUMENT ID:fd5843b7-2019-04-18T17:47:44Z-00002-000 - SCORE: 8.223627
CONCLUSION: CO2 Emissions are good - ARGUMENT ID:c3104a40-2019-04-18T11:41:20Z-00002-000 - SCORE: 8.066863
CONCLUSION: Global Nuclear War is Good - ARGUMENT ID:19b45e6e-2019-04-18T11:23:40Z-00001-000 - SCORE: 8.056856
CONCLUSION: Free University Education - ARGUMENT ID:1b21a2f2-2019-04-18T15:02:36Z-00003-000 - SCORE: 8.053132
CONCLUSION: For Higher Learning, online courses are a more beneficial option than universities - ARGUMENT ID:8f7d5736-2019-04-18T18:41:18Z-00003-000 - SCORE: 7.721984
CONCLUSION: Science and Math Degrees Should be Respected More Than Liberal Arts Degrees - ARGUMENT ID:8885de0a-2019-04-18T18:31:51Z-00001-000 - SCORE: 7.595864
CONCLUSION: Free University Education - ARGUMENT ID

In [39]:
# Debug aid: list the attributes available on a hit's metadata.
# NOTE: relies on `hit`, the loop variable left over from the search cell, so
# that cell must have been run first.
dir(hit.meta)

['doc_type', 'id', 'index', 'score']