In [27]:
import re
import traceback
from pathlib import Path 

import spacy
import pandas as pd
from ebooklib import epub
from bs4 import BeautifulSoup, NavigableString

In [18]:
MODEL = 'en_core_web_sm'

In [19]:
def load_epub(path):
    e = epub.read_epub(path)
    contents = e.get_items()
    xmls = [x.get_content() for x in contents]
    texts = [BeautifulSoup(x.decode('utf-8', 'replace'), 'xml') for x in xmls]
    t = []
    for text in texts:
        temp = text.find_all('p')
        content = []
        for x in temp:
            try:
                if 'class' in x.attrs.keys():
                    if x.attrs['class'] in ['tx', 'cotx1', 'indent', 'nonindent'] and isinstance(x.contents[0], NavigableString):
                        content.append(x.contents[0])
                else:
                    if isinstance(x.contents[0], str) or isinstance(x.contents[0], NavigableString):
                        content.append(x.contents[0])

            except:
                traceback.print_exc()
        [t.append(x) for x in content]

    return ' '.join(t)


In [20]:
def get_places(src,
               model='en_core_web_sm',
               ):
    nlp = spacy.load(model)

    data = []
    text = load_epub(src)
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            datum = {'name': ent.text,
                     'entity': ent.label_,
                     'source': src}
            data.append(datum)
    df = pd.DataFrame(data)
    return df

In [24]:
def parse_places(df,
                 places_path):
    names = list(set(df['name'].values))
    places = pd.read_csv(places_path, index_col=0)

    data = []
    for name in names:
        locations = places[places['name'].str.contains(fr'^{name}\Z',
                                                       flags=re.IGNORECASE)]
        if locations.shape[0] > 0:
            lat = locations.iloc[0]['latitude']
            long = locations.iloc[0]['longitude']
            temp = df[df['name'] == name]
            datum = {'name': name,
                     'count': temp.shape[0],
                     'latitude': lat,
                     'longitude': long}
            data.append(datum)
    parsed = pd.DataFrame(data)
    return parsed

In [22]:
book_files = [x for x in Path('./sources').iterdir()]

dfs = []
for book_file in book_files:
    df = get_places(str(book_file),
                    model=MODEL)
    dfs.append(df)
df = pd.concat(dfs)
df.head()

Unnamed: 0,name,entity,source
0,Los Angeles,GPE,sources/omnibus.epub
1,Chicago,GPE,sources/omnibus.epub
2,England,GPE,sources/omnibus.epub
3,Raymond,GPE,sources/omnibus.epub
4,London,GPE,sources/omnibus.epub


In [23]:
df.to_csv('./data/corpus_places.csv')

In [28]:
df_parsed = parse_places(df,
                         './data/places.csv')
df_parsed.head()

Unnamed: 0,name,count,latitude,longitude
0,Beverly Hills,26,34.073619,-118.400356
1,Pasadena,23,34.147786,-118.144517
2,Los Angeles,51,34.052233,-118.243686
3,Descanso,2,32.87,-116.63
4,Santa Rosa,3,38.440467,-122.714431


In [29]:
df_parsed.to_csv('./data/marlowe.csv')