In [1]:
# Widen the notebook container so wide DataFrame outputs stay readable.
# `IPython.display` is the public import location for these helpers; importing
# them from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import HTML, display

display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Search, Q, SF
from bs4 import BeautifulSoup

In [86]:
# Alternative client configuration (with `http_compress` and `maxsize` set), kept for the indexing run:
# es = Elasticsearch(http_compress=True, maxsize=1000) # Use this to index
# No hosts are passed here, so the client uses its default connection settings.
es = Elasticsearch()

In [87]:
# Connectivity check against the cluster; the recorded run returned True.
es.ping()

True

In [5]:
# Load both pickled corpora and keep an independent copy of just the columns
# that are needed for indexing.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
wiki_dataset = pd.read_pickle('../data/dbpedia_with_articles.pkl')
wiki_index = wiki_dataset.loc[:, ['philosopher_url', 'abstract', 'text']].copy()

plato_dataset = pd.read_pickle('../data/plato.pkl')
plato_index = plato_dataset.loc[:, ['title', 'abstract', 'full_article_with_tags']].copy()

In [6]:
# Build the Wikipedia frame to index directly from `wiki_dataset`, so this cell
# can be re-run safely (the previous version overwrote its own input and dropped
# `philosopher_url`, which made a second run fail with a KeyError).
wiki_index = (
    wiki_dataset[['philosopher_url', 'abstract', 'text']]
    .assign(
        # The URL slug uses underscores; turn it into a readable title.
        # `.str.replace` is vectorised and leaves missing values untouched.
        title=lambda df: df['philosopher_url'].str.replace('_', ' ', regex=False),
        # Prefix ids with 'W' so they cannot collide with the Plato ids.
        id=lambda df: 'W' + df.index.astype(str),
    )
    .loc[:, ['id', 'title', 'abstract', 'text']]
)
wiki_index.head(5)

Unnamed: 0,id,title,abstract,text
0,W0,Stephen Law,Stephen Law (born 1960) is an English philoso...,Stephen Law (born 1960) is an English philoso...
1,W1,Henry S. Richardson,Henry S. Richardson is an American philosopher...,Henry S. Richardson is an American philosopher...
2,W2,John Amos Comenius,John Amos Comenius (Czech: Jan Amos Komenský; ...,John Amos Comenius (Czech: Jan Amos Komenský; ...
3,W3,Javier Gomá,"Javier Gomá Lanzón (Bilbao, 24 May 1965) is a ...","Javier Gomá Lanzón (Bilbao, 24 May 1965) is a ..."
4,W4,Oskar Negt,Oskar Negt (German pronunciation: [ˈneːkt]; bo...,Oskar Negt (German pronunciation: [ˈneːkt]; bo...


In [7]:
%%time
plato_index['text'] = plato_index['full_article_with_tags'].apply(lambda x: BeautifulSoup(x).text)
plato_index['id'] = 'P' + plato_index.index.astype(str)
plato_index = plato_index[['id','title', 'abstract', 'text']]
plato_index.head(3)

CPU times: user 21 s, sys: 257 ms, total: 21.3 s
Wall time: 21.3 s


Unnamed: 0,id,title,abstract,text
0,P0,Abduction,"In the philosophical literature, the term “abd...",\n1. Abduction: The General Idea\n\nYou happen...
1,P1,Affirmative Action,“Affirmative action” means positive steps take...,"\n1. In the Beginning\n\n\nIn 1972, affirmativ..."
2,P2,Aesthetics of the Everyday,"In the history of Western aesthetics, the subj...",\n1. Recent History\n\nWith the establishment ...


In [54]:
# Stack the Plato entries on top of the Wikipedia entries. Each frame keeps its
# own row labels; the 'P'/'W' prefixed `id` column is what tells rows apart.
to_index = pd.concat(objs=[plato_index, wiki_index], axis=0)
to_index

Unnamed: 0,id,title,abstract,text
0,P0,Abduction,"In the philosophical literature, the term “abd...",\n1. Abduction: The General Idea\n\nYou happen...
1,P1,Affirmative Action,“Affirmative action” means positive steps take...,"\n1. In the Beginning\n\n\nIn 1972, affirmativ..."
2,P2,Aesthetics of the Everyday,"In the history of Western aesthetics, the subj...",\n1. Recent History\n\nWith the establishment ...
3,P3,Wittgenstein’s Aesthetics,Given the extreme importance that Wittgenstein...,\n1. The Critique of Traditional Aesthetics\n\...
4,P4,Schopenhauer’s Aesthetics,The focus of this entry is on Schopenhauer’s a...,"\n1. Brief Background\n\n\nBy the 1870s, Arthu..."
...,...,...,...,...
6084,W6084,Stanisław Krajewski,Stanisław Krajewski (born 1950) is a Polish ph...,Stanisław Krajewski (born 1950) is a Polish ph...
6085,W6085,Patrick Stokes (philosopher),Patrick Stokes (born 1978) is an Australian ph...,Patrick Stokes (born 1978) is an Australian ph...
6086,W6086,Ernst Mach,Ernst Waldfried Josef Wenzel Mach (; German: [...,Ernst Waldfried Josef Wenzel Mach (; German: [...
6087,W6087,Jessica Pierce,"Jessica Pierce (born October 21, 1965) is an A...","Jessica Pierce (born October 21, 1965) is an A..."


In [55]:
# Some rows carry -1 in `text` instead of an article body; turn that sentinel
# into NaN and leave every other value as it is.
to_index['text'] = to_index['text'].map(
    lambda value: np.nan if value == -1 else value
)

In [61]:
# Save the combined frame as CSV; `index=False` is not passed, so the row labels are written as the first column.
to_index.to_csv('../data/to_index.csv')

In [62]:
# Save the same frame as a pickle, using the default pickle protocol.
to_index.to_pickle('../data/to_index.pkl')

In [63]:
# Second pickle pinned to protocol 4 - presumably so older Python versions can read it; TODO confirm.
to_index.to_pickle('../data/to_index_p4.pkl', protocol=4)

In [67]:
# Explicit mapping for the `philosophy` index. The property names must match the
# keys of the documents that get indexed (`title`, `abstract`, `text`); the
# previous mapping declared an `article` field that no document contains.
body = {
    'mappings': {
        'properties': {
            'title': {'type': 'text'},
            'abstract': {'type': 'text'},
            'text': {'type': 'text'},
        }
    }
}

In [68]:
# Create the `philosophy` index with the mappings defined in `body`.
# NOTE(review): re-running this against a cluster where the index already exists will presumably fail - confirm, or delete the index first.
es.indices.create(index='philosophy', body=body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'philosophy'}

In [69]:
def document_generator_from_dataframe(df, index, fields=('title', 'abstract', 'text'), id_column='id'):
    """Lazily turn DataFrame rows into bulk-helper actions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one document per row. It must contain `id_column` and
        every column named in `fields`.
    index : str
        Name of the target index, copied into each action as ``_index``.
    fields : iterable of str, optional
        Columns copied into ``_source``. Defaults to the three columns this
        notebook indexes.
    id_column : str, optional
        Column whose value becomes the document ``_id``.

    Yields
    ------
    dict
        One action per row with the keys ``_index``, ``_id`` and ``_source``.

    Raises
    ------
    KeyError
        While iterating, if `id_column` or one of `fields` is not a column.

    Notes
    -----
    Values that are exactly the empty string are replaced by the placeholder
    ``'empty'`` in ``_source``. Missing values (NaN/None) are passed through
    unchanged, so fill them before calling if they must not reach the index.
    """
    fields = list(fields)
    # One pass over plain dict records avoids building a Series (and running a
    # Series.replace) for every single row, which is what iterrows() did.
    for record in df.to_dict('records'):
        yield {
            "_index": index,
            "_id": record[id_column],
            "_source": {
                field: 'empty' if isinstance(record[field], str) and record[field] == '' else record[field]
                for field in fields
            },
        }

In [70]:
# Skip rows where both `text` and `abstract` are missing, then put the 'empty'
# placeholder in place of any remaining NaN before building the action generator.
lacks_all_content = to_index[['text', 'abstract']].isna().all(axis=1)
indexable_docs = to_index.loc[~lacks_all_content].replace(np.nan, 'empty')
gen = document_generator_from_dataframe(indexable_docs, 'philosophy')

In [71]:
%%time
for success, info in helpers.parallel_bulk(es, gen, thread_count=2000,chunk_size=2500, queue_size=1000):
    if not success:
        print('A document failed:', info)

CPU times: user 18.2 s, sys: 1.07 s, total: 19.3 s
Wall time: 18.9 s


In [80]:
# Maps the leading letter of a document id ('P...' / 'W...') to the name of the
# collection the document came from.
lookup = dict(
    P="the Stanford Plato Encyclopedia",
    W="the Wikipedia Philosophers Collection",
)

In [84]:
%%time
query = "language games"

s = Search(using=es, index="philosophy")

# s.query = Q("match", title=query)
s.query = Q("multi_match", query=query, fields=['title', 'abstract', 'article'])
s = s[:20]

response = s.execute()
for hit in response:
    print(f"{hit.title} - ID: {hit.meta.id} from {lookup[hit.meta.id[0]]} SCORE: {hit.meta.score}")
    print("*************************************************************************************************************************")

Logic and Games - ID: P973 from the Stanford Plato Encyclopedia SCORE: 10.573457
*************************************************************************************************************************
Logics for Analyzing Games - ID: P974 from the Stanford Plato Encyclopedia SCORE: 9.614387
*************************************************************************************************************************
Logics for Analyzing Power in Normal Form Games - ID: P674 from the Stanford Plato Encyclopedia SCORE: 9.523955
*************************************************************************************************************************
Jason Silva - ID: W1401 from the Wikipedia Philosophers Collection SCORE: 8.676296
*************************************************************************************************************************
Byram D. Avari - ID: W3379 from the Wikipedia Philosophers Collection SCORE: 8.661753
***********************************************************