In [1]:
# Import from the public IPython.display module: importing `display` from
# IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML

# Widen the notebook container so wide DataFrames and long output lines fit.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Search, Q, SF
from tqdm.notebook import tqdm_notebook

In [3]:
def document_generator_from_dataframe(df, index, fields_to_index):
    """Lazily yield one bulk-index action per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``'id'`` column (used as the document ``_id``) and
        every column named in ``fields_to_index``.
    index : str
        Name of the target index, copied into each action's ``_index``.
    fields_to_index : iterable of str
        Columns copied into each action's ``_source``. Any iterable is
        accepted, including one-shot iterators.

    Yields
    ------
    dict
        ``{"_index": ..., "_id": ..., "_source": {...}}`` where empty-string
        values in ``_source`` are replaced by the literal ``'empty'``.

    Raises
    ------
    KeyError
        On first iteration, if ``'id'`` or a requested field is not a column
        of ``df``.
    """
    # Materialise the field names once: a one-shot iterator would otherwise be
    # exhausted after the first row, leaving every later `_source` empty.
    fields = list(fields_to_index)

    # Replace empty strings once for the selected columns instead of once per
    # row; `_id` is deliberately read from the untouched `id` column.
    source_columns = df[fields].replace('', 'empty')
    row_values = source_columns.itertuples(index=False, name=None)

    for doc_id, values in zip(df['id'], row_values):
        yield {
            "_index": index,
            "_id": doc_id,
            "_source": dict(zip(fields, values)),
        }

In [4]:
# Client for the Elasticsearch node: HTTP compression is switched on (useful
# for bulk loading) and the connection pool is capped at 1000 connections.
es = Elasticsearch(maxsize=1000, http_compress=True)

In [5]:
# Sanity check: display the result of pinging the cluster before any data is loaded.
es.ping()

True

In [6]:
%%time
current_directory = Path('.')
# Define each path once so the existence check, the write and the read can
# never point at different files.
json_path = current_directory / 'Data' / 'args-me.json'
pickle_path = current_directory / 'Data' / 'dataset.pkl'

if not pickle_path.exists():
    print("Pickled dataset doesn't already exists. Now reading JSON file.")

    # Read in JSON file if pickled dataframe doesn't already exist.
    # The encoding is explicit so decoding does not depend on the platform
    # default, and the file is closed as soon as parsing is done.
    with open(json_path, encoding='utf-8') as f:
        d = json.load(f)['arguments']

    # One output row per premise; the argument-level fields (id, conclusion and
    # every context sub-field of the first argument) are repeated as metadata.
    context_subfields = [['context', k] for k in d[0]['context'].keys()]
    dataset = pd.json_normalize(d, record_path='premises', meta=['id', 'conclusion', *context_subfields])

    print("Now pickling Pandas DataFrame into dataset.pkl.")
    dataset.to_pickle(pickle_path)
    print("DataFrame pickled.")
    print(" ")
else:
    # If pickle already exists, read it into dataframe.
    # NOTE: only load a pickle written by this notebook; unpickling an
    # untrusted file can execute arbitrary code.
    print("Pickled dataset already exists. Now loading dataset.pkl into Pandas DataFrame")
    dataset = pd.read_pickle(pickle_path)
    print(" ")

Pickled dataset already exists. Now loading dataset.pkl into Pandas DataFrame
 
CPU times: user 777 ms, sys: 595 ms, total: 1.37 s
Wall time: 1.81 s


In [7]:
# Preview the first two rows to check the flattened column layout.
dataset.head(2)

Unnamed: 0,text,stance,id,conclusion,context.sourceId,context.previousArgumentInSourceId,context.acquisitionTime,context.discussionTitle,context.sourceTitle,context.sourceUrl,context.nextArgumentInSourceId
0,My opponent forfeited every round. None of my ...,CON,c67482ba-2019-04-18T13:32:05Z-00000-000,Contraceptive Forms for High School Students,c67482ba-2019-04-18T13:32:05Z,,2019-04-18T13:32:05Z,Contraceptive Forms for High School Students,Debate Argument: Contraceptive Forms for High ...,https://www.debate.org/debates/Contraceptive-F...,c67482ba-2019-04-18T13:32:05Z-00001-000
1,How do you propose the school will fund your p...,CON,c67482ba-2019-04-18T13:32:05Z-00001-000,Contraceptive Forms for High School Students,c67482ba-2019-04-18T13:32:05Z,c67482ba-2019-04-18T13:32:05Z-00000-000,2019-04-18T13:32:05Z,Contraceptive Forms for High School Students,Debate Argument: Contraceptive Forms for High ...,https://www.debate.org/debates/Contraceptive-F...,c67482ba-2019-04-18T13:32:05Z-00002-000


In [42]:
# Lazily build the bulk actions for the index. Generators are single-use, so
# re-run this cell before running the bulk load again.
gen = document_generator_from_dataframe(
    df=dataset,
    index="arguments_index",
    fields_to_index=['text', 'stance', 'context.sourceId', 'conclusion'],
)

### Indexing the text field solely as an Elasticsearch 'text' field with a Dirichlet (LMDirichlet) similarity

In [None]:
# Index definition, variant 1: `text` is analysed full text scored with a
# custom LMDirichlet similarity; the remaining fields use default scoring.
body = {
    "settings": {
        # Named similarity referenced by the `text` mapping below.
        "similarity": {
            "my_dirichlet": {"type": "LMDirichlet"},
        },
    },
    "mappings": {
        "properties": {
            "text": {
                "type": "text",
                "similarity": "my_dirichlet",
            },
            "stance": {"type": "keyword"},
            "context.sourceId": {"type": "keyword"},
            "conclusion": {"type": "text"},
        },
    },
}

### Indexing the text field as above, and additionally as a normalized keyword sub-field (`text.kw`)

In [None]:
# Index definition, variant 2: same as variant 1, plus a `text.kw` keyword
# sub-field whose values go through a lowercase + asciifolding normalizer.
body = {
    "settings": {
        # Named similarity referenced by the `text` mapping below.
        "similarity": {
            "my_dirichlet": {"type": "LMDirichlet"},
        },
        "analysis": {
            "normalizer": {
                "my_normalizer": {
                    "type": "custom",
                    "filter": ["lowercase", "asciifolding"],
                },
            },
        },
    },
    "mappings": {
        "properties": {
            "text": {
                "type": "text",
                "similarity": "my_dirichlet",
                "fields": {
                    # NOTE(review): ignore_above is presumably a character
                    # count while the underlying term limit is in bytes --
                    # confirm 32766 is safe for non-ASCII-heavy texts.
                    "kw": {
                        "type": "keyword",
                        "normalizer": "my_normalizer",
                        "ignore_above": 32766,
                    },
                },
            },
            "stance": {"type": "keyword"},
            "context.sourceId": {"type": "keyword"},
            "conclusion": {"type": "text"},
        },
    },
}

In [52]:
# Create the index using whichever `body` definition was executed last.
es.indices.create(body=body, index="arguments_index")

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'arguments_index'}

In [53]:
%%time
# Stream every action from `gen` into the cluster; this consumes the generator.
helpers.bulk(client=es, actions=gen)

CPU times: user 3min 9s, sys: 76.3 ms, total: 3min 9s
Wall time: 5min 7s


(387692, [])

In [54]:
# Display the mapping stored for the index so it can be compared with the `body` that was submitted.
es.indices.get_mapping(index="arguments_index")

{'arguments_index': {'mappings': {'properties': {'conclusion': {'type': 'text'},
    'context': {'properties': {'sourceId': {'type': 'keyword'}}},
    'stance': {'type': 'keyword'},
    'text': {'type': 'text',
     'similarity': 'my_dirichlet',
     'fields': {'kw': {'type': 'keyword',
       'ignore_above': 32766,
       'normalizer': 'my_normalizer'}}}}}}}

In [106]:
%%time
query = "Computer Science is a bad university degree."

# Match the free-text query against the `text` field and keep the first 10 hits.
s = Search(using=es, index="arguments_index").query("match", text=query)[:10]

response = s.execute()
for hit in response:
    # end="\n\n" leaves a blank separator line after each hit.
    print(f"CONCLUSION: {hit.conclusion} - ARGUMENT ID:{hit.meta.id} - SCORE: {hit.meta.score}", end="\n\n")

CONCLUSION: A philosophy degree is useful - ARGUMENT ID:b4f62054-2019-04-18T17:17:42Z-00004-000 - SCORE: 9.3465395

CONCLUSION: Students should have possibility to select in which lessons they can attend. (17-18 years old) - ARGUMENT ID:fd5843b7-2019-04-18T17:47:44Z-00002-000 - SCORE: 8.223627

CONCLUSION: CO2 Emissions are good - ARGUMENT ID:c3104a40-2019-04-18T11:41:20Z-00002-000 - SCORE: 8.066863

CONCLUSION: Global Nuclear War is Good - ARGUMENT ID:19b45e6e-2019-04-18T11:23:40Z-00001-000 - SCORE: 8.056856

CONCLUSION: Free University Education - ARGUMENT ID:1b21a2f2-2019-04-18T15:02:36Z-00003-000 - SCORE: 8.053132

CONCLUSION: For Higher Learning, online courses are a more beneficial option than universities - ARGUMENT ID:8f7d5736-2019-04-18T18:41:18Z-00003-000 - SCORE: 7.721984

CONCLUSION: Science and Math Degrees Should be Respected More Than Liberal Arts Degrees - ARGUMENT ID:8885de0a-2019-04-18T18:31:51Z-00001-000 - SCORE: 7.595864

CONCLUSION: Free University Education - ARGU

In [111]:
%%time
list_of_keywords = [
    'abortion', 'prices', 'birth', 'nutrition',
    'infections', 'pills', 'age', 'surgery',
    'hormones', 'pill', 'therapy', 'pregnancy',
]

# The keywords are joined into one space-separated string and matched against
# the `text` field; only the first 10 hits are requested.
keyword_query = ' '.join(list_of_keywords)
s = Search(using=es, index="arguments_index").query("match", text=keyword_query)[:10]

response = s.execute()
for hit in response:
    # end="\n\n" leaves a blank separator line after each hit.
    print(f"CONCLUSION: {hit.conclusion} - ARGUMENT ID:{hit.meta.id} - SCORE: {hit.meta.score}", end="\n\n")

CONCLUSION: Contraceptives should be classified as preventative health care for insurance coverage - ARGUMENT ID:7d6799b0-2019-04-18T17:09:31Z-00005-000 - SCORE: 19.668785

CONCLUSION: Contraception is a right. - ARGUMENT ID:f3fff523-2019-04-18T13:04:43Z-00004-000 - SCORE: 19.101013

CONCLUSION: Beginners' Tournament: Abortion should be banned, even in cases of rape. - ARGUMENT ID:757df847-2019-04-18T14:47:42Z-00003-000 - SCORE: 18.951563

CONCLUSION: United States Citizens should not be allowed to undergo gender-transformation surgery - ARGUMENT ID:6204e9e2-2019-04-18T13:54:29Z-00002-000 - SCORE: 16.574152

CONCLUSION: Transwomen Should Be Eligible to Play Sports in the Women's League - ARGUMENT ID:50231541-2019-04-18T11:14:04Z-00003-000 - SCORE: 16.209927

CONCLUSION: Birth control - ARGUMENT ID:c64898ed-2019-04-18T18:17:24Z-00004-000 - SCORE: 16.070171

CONCLUSION: Birth control - ARGUMENT ID:c64898ed-2019-04-18T18:17:24Z-00002-000 - SCORE: 15.99925

CONCLUSION: Insurance Exclusions