In [6]:
from IPython.core.display import HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [75]:
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Search, Q, SF
from bs4 import BeautifulSoup

In [103]:
es = Elasticsearch(http_compress=True, maxsize=1000)

In [104]:
es.ping()

True

In [95]:
wiki_dataset = pd.read_pickle('../data/dbpedia_with_articles.pkl')
plato_dataset = pd.read_pickle('../data/plato.pkl')
wiki_index = wiki_dataset[['philosopher_url', 'abstract', 'text']].copy()
plato_index = plato_dataset[['title', 'abstract', 'full_article_with_tags']].copy()

In [96]:
wiki_index['title'] = wiki_index['philosopher_url'].apply(lambda x: x.replace('_', ' '))
wiki_index['id'] = 'W' + wiki_index.index.astype(str)
wiki_index = wiki_index[['id','title', 'abstract', 'text']]
wiki_index.head(5)

Unnamed: 0,id,title,abstract,text
0,W0,Stephen Law,Stephen Law (born 1960) is an English philoso...,Stephen Law (born 1960) is an English philoso...
1,W1,Henry S. Richardson,Henry S. Richardson is an American philosopher...,Henry S. Richardson is an American philosopher...
2,W2,John Amos Comenius,John Amos Comenius (Czech: Jan Amos Komenský; ...,John Amos Comenius (Czech: Jan Amos Komenský; ...
3,W3,Javier Gomá,"Javier Gomá Lanzón (Bilbao, 24 May 1965) is a ...","Javier Gomá Lanzón (Bilbao, 24 May 1965) is a ..."
4,W4,Oskar Negt,Oskar Negt (German pronunciation: [ˈneːkt]; bo...,Oskar Negt (German pronunciation: [ˈneːkt]; bo...


In [97]:
%%time
plato_index['text'] = plato_index['full_article_with_tags'].apply(lambda x: BeautifulSoup(x).text)
plato_index['id'] = 'P' + plato_index.index.astype(str)
plato_index = plato_index[['id','title', 'abstract', 'text']]
plato_index.head(3)

CPU times: user 23.6 s, sys: 88.9 ms, total: 23.7 s
Wall time: 23.7 s


Unnamed: 0,id,title,abstract,text
0,P0,Abduction,"In the philosophical literature, the term “abd...",\n1. Abduction: The General Idea\n\nYou happen...
1,P1,Affirmative Action,“Affirmative action” means positive steps take...,"\n1. In the Beginning\n\n\nIn 1972, affirmativ..."
2,P2,Aesthetics of the Everyday,"In the history of Western aesthetics, the subj...",\n1. Recent History\n\nWith the establishment ...


In [100]:
to_index = pd.concat([plato_index, wiki_index])
to_index

Unnamed: 0,id,title,abstract,text
0,P0,Abduction,"In the philosophical literature, the term “abd...",\n1. Abduction: The General Idea\n\nYou happen...
1,P1,Affirmative Action,“Affirmative action” means positive steps take...,"\n1. In the Beginning\n\n\nIn 1972, affirmativ..."
2,P2,Aesthetics of the Everyday,"In the history of Western aesthetics, the subj...",\n1. Recent History\n\nWith the establishment ...
3,P3,Wittgenstein’s Aesthetics,Given the extreme importance that Wittgenstein...,\n1. The Critique of Traditional Aesthetics\n\...
4,P4,Schopenhauer’s Aesthetics,The focus of this entry is on Schopenhauer’s a...,"\n1. Brief Background\n\n\nBy the 1870s, Arthu..."
...,...,...,...,...
6084,W6084,Stanisław Krajewski,Stanisław Krajewski (born 1950) is a Polish ph...,Stanisław Krajewski (born 1950) is a Polish ph...
6085,W6085,Patrick Stokes (philosopher),Patrick Stokes (born 1978) is an Australian ph...,Patrick Stokes (born 1978) is an Australian ph...
6086,W6086,Ernst Mach,Ernst Waldfried Josef Wenzel Mach (; German: [...,Ernst Waldfried Josef Wenzel Mach (; German: [...
6087,W6087,Jessica Pierce,"Jessica Pierce (born October 21, 1965) is an A...","Jessica Pierce (born October 21, 1965) is an A..."


In [98]:
body = {
    'mappings': {  
        'properties' : {          
            'title': {'type': 'keyword'},
            'abstract': {'type': 'text'},
            'article': {'type': 'text'},
        }
    }
}

In [117]:
es.indices.create(index='philosophy', body=body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'philosophy'}

In [118]:
def document_generator_from_dataframe(df, index):
    for _, row in df.iterrows():
        row_as_dict = row.replace('', 'empty').to_dict()
        yield {
            "_index": index,
            "_id": row['id'],
            "_source": {k: row_as_dict[k] for k in ['title', 'abstract', 'text']}
        }

In [119]:
gen = document_generator_from_dataframe(to_index.replace(np.nan, 'empty'), 'philosophy')

In [120]:
%%time
for success, info in helpers.parallel_bulk(es, gen, thread_count=2000,chunk_size=2500, queue_size=1000):
    if not success:
        print('A document failed:', info)

CPU times: user 18.3 s, sys: 1.04 s, total: 19.3 s
Wall time: 18.7 s


In [128]:
%%time
query = "Ludwig Wittgenstein"

s = Search(using=es, index="philosophy")

s.query = Q("match", title=query)
s = s[:10]

response = s.execute()
for hit in response:
    print(f"TITLE: {hit.title} - SCORE: {hit.meta.id}")
    print("")

TITLE: Ludwig Wittgenstein - SCORE: W3694

TITLE: Ludwig Wittgenstein - SCORE: P31

CPU times: user 10.3 ms, sys: 92 µs, total: 10.4 ms
Wall time: 31.5 ms


In [18]:
!pip install pymediawiki

Collecting pymediawiki
  Downloading pymediawiki-0.7.0-py3-none-any.whl (23 kB)
Installing collected packages: pymediawiki
Successfully installed pymediawiki-0.7.0
