# Old stuff

In [5]:
import spacy
spacy.load('en_core_web_md')
from epitator.annotator import AnnoDoc
from epitator.geoname_annotator import GeonameAnnotator
from epitator.resolved_keyword_annotator import ResolvedKeywordAnnotator
from epitator.count_annotator import CountAnnotator
from epitator.date_annotator import DateAnnotator
from boilerpipe.extract import Extractor
from itertools import groupby
import datetime
import sys
import pandas as pd
import re
from tqdm import tqdm_notebook as tqdm
import numpy as np
import epitator

In [None]:
import requests
from bs4 import BeautifulSoup
from sys import stdout
from time import sleep
import pickle
import os

def get_links_by_year(list_of_years=None, proxies={'http': 'http://fw-bln.rki.local:8020'}):
    """Return the links to the annual archive pages of the WHO DONs.

    list_of_years -- a list of years (YYYY format) to keep; when it is None or
                     empty every annual link is returned (default None)
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    """
    response = requests.get('http://www.who.int/csr/don/archive/year/en/', proxies=proxies)
    archive_list = BeautifulSoup(response.content, 'html.parser').find('ul', attrs={'class': 'list'})
    anchors = archive_list.find_all('a')
    if list_of_years:
        # NOTE(review): `year in anchor` relies on the membership test of the
        # bs4 tag itself (not of its href) -- presumably it matches the
        # anchor's text node; confirm against the archive page.
        anchors = [anchor for anchor in anchors
                   if any(year in anchor for year in list_of_years)]
    return ['http://www.who.int' + anchor.get('href') for anchor in anchors]
    
def get_links_per_year(years_links, list_of_months=None, proxies={'http': 'http://fw-bln.rki.local:8020'}):
    '''Collect the DON links found on the given annual archive pages.

    years_links -- a list of links to annual archive pages that should be parsed
    list_of_months -- a list of months (MMM* format); when given, only links that
                      contain one of the lower-cased month strings are kept (default None)
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    '''
    don_links = []
    for archive_url in years_links:
        response = requests.get(archive_url, proxies=proxies)
        listing = BeautifulSoup(response.content, 'html.parser').find('ul', attrs={'class': 'auto_archive'})
        for anchor in listing.find_all('a'):
            don_links.append('http://www.who.int' + anchor.get('href'))

    if not list_of_months:
        return don_links
    # The month names are lower-cased before they are searched for in the URL.
    return [link for link in don_links
            if any(month.lower() in link for month in list_of_months)]
    
# Identification headers (contact name and e-mail) intended for the scraping requests.
# NOTE(review): none of the requests.get calls visible in this file pass these
# headers on, and scrape() shadows the name with its own `headers` parameter.
headers = {
    'User-Agent': 'Auss Abbood, www.rki.de',
    'From': 'abbooda@rki.de'
}

def scrape(years=None,
           months=None,
           num_last_reports=None,
           headers=None,
           proxies={'http': 'http://fw-bln.rki.local:8020'}):
    '''Scrape the WHO DON archive and return the links to the individual DONs.

    years -- a list of strings of years in the format YYYY to be scraped
    months -- a list of strings of months in the format MMM* to be scraped
    num_last_reports -- an integer meant to limit the result to the last reports;
                        accepted but currently not used by this function
    headers -- a header for scraping; accepted but currently not used by this function
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    '''
    # First resolve the annual archive pages, then collect the DON links on them.
    return get_links_per_year(
        get_links_by_year(list_of_years=years, proxies=proxies),
        list_of_months=months,
        proxies=proxies,
    )

In [None]:
# Scrape all the WHO DONs of the year 2018
all_links = scrape(years=['2018'],proxies=None)

In [None]:
# Extract the main text of the given links
from boilerpipe.extract import Extractor
def extract(list_of_links):
    '''Return the main text (str) of every page in a list of links.

    list_of_links -- a list containing URLs of webpages to get the main content from

    Each page is run through boilerpipe's ArticleExtractor and all newline
    characters are removed from the extracted text.
    '''
    texts = []
    for url in tqdm(list_of_links):
        article = Extractor(extractor='ArticleExtractor', url=url)
        texts.append(article.getText().replace('\n', ''))
    return texts

In [None]:
# Annotate all the scraped WHO DONs
def create_annotated_database(texts,raw=False):
    '''Given a list of texts (str) annotate and extract disease keywords, geonames, and dates and return
    a dictionary of the text and the annotations

    texts -- a list of texts (str)
    raw -- returns a not preprocessed annotation (Default False)

    Every list in the returned dictionary has one entry per text. When the
    annotation of a text raises a TypeError the error is printed and all
    annotation columns of that text are filled with NaN, so the columns stay
    aligned with "text" and the result can be turned into a DataFrame.
    '''
    annotation_columns = ("date", "confirmed_cases", "keyword", "geoname")
    database = {"text":texts,"date":[],"confirmed_cases":[],"keyword":[],"geoname":[]}
    for i,text in enumerate(tqdm(texts)):
        try:
            doc = annotate(text)
            # Build the complete row first; appending column by column would
            # leave the columns misaligned if a later extractor fails.
            row = {
                "date": get_date(doc,raw),
                "confirmed_cases": get_cases(doc,raw),
                "keyword": get_keywords(doc,raw),
                "geoname": get_geonames(doc,raw),
            }
        except TypeError as e:
            print("Type error in text({})".format(i) + ": " + str(e))
            row = dict.fromkeys(annotation_columns, np.nan)
        for column in annotation_columns:
            database[column].append(row[column])
    return database

In [None]:
def annotate(text):
    '''Return a document annotated for geonames, diseases, disease counts, and dates

    text -- a string to be annotated
    '''
    doc = AnnoDoc(text)
    # The tiers are added in this fixed order; each annotator is instantiated
    # right before it is applied.
    for annotator_class in (GeonameAnnotator,
                            ResolvedKeywordAnnotator,
                            CountAnnotator,
                            DateAnnotator):
        doc.add_tiers(annotator_class())
    return doc
def get_geonames(doc,raw=False):
    '''Return the most frequently mentioned geographical entities of an annotated document

    doc -- an annotated document with a "geonames" tier
    raw -- return the name of every geoname span instead (Default False)

    Without raw, a list is returned that holds every name sharing the highest
    number of mentions (alphabetically sorted); it is empty when the document
    has no geoname spans.
    '''
    names = [span.geoname["name"] for span in doc.tiers["geonames"].spans]
    if raw:
        return names
    # groupby needs sorted input to put equal names into one group
    mentions = [(name, sum(1 for _ in group)) for name, group in groupby(sorted(names))]
    if not mentions:
        return []
    highest = max(count for _, count in mentions)
    return [name for name, count in mentions if count == highest]
def get_keywords(doc,raw=False):
    '''Returns the most occurring disease entity in an annotated document

    doc -- an annotated document with a "resolved_keywords" tier
    raw -- returns the label of every disease keyword instead (Default False)

    Only the first resolution of every keyword span is considered and only if
    its entity type is 'disease'. Without raw, the label with the most mentions
    is returned (the alphabetically first one on a tie), or NaN if the document
    contains no disease keyword.
    '''
    disease_labels = []
    for span in doc.tiers["resolved_keywords"].spans:
        entity = span.metadata["resolutions"][0]['entity']
        if entity['type'] == 'disease':
            disease_labels.append(entity['label'])
    if raw:
        return disease_labels
    if not disease_labels:
        return np.nan
    # The resolution weights are deliberately ignored: only the number of
    # mentions decides. groupby needs sorted input to group equal labels.
    label_counts = [(label, len(list(group))) for label, group in groupby(sorted(disease_labels))]
    return max(label_counts, key=lambda pair: pair[1])[0]
def get_cases(doc,raw=False):
    '''Return the disease counts of an annotated document

    doc -- an annotated document with a "counts" tier
    raw -- return every count instead of only those whose attributes
           contain "confirmed" (Default False)
    '''
    count_spans = doc.tiers["counts"].spans
    if raw:
        return [span.metadata['count'] for span in count_spans]
    confirmed = []
    for span in count_spans:
        if "confirmed" in span.metadata['attributes']:
            confirmed.append(span.metadata['count'])
    return confirmed
def get_date(doc,raw=False):
    '''Return the most mentioned date of an annotated document

    doc -- an annotated document with a "dates" tier
    raw -- return every date instead (Default False)

    Each date span contributes the start of its datetime range, formatted as
    YYYY-MM-DD. Without raw, the date with the most mentions is returned (the
    earliest one on a tie), or NaN if the document has no date spans.
    '''
    dates = []
    for span in doc.tiers["dates"].spans:
        range_start = span.metadata["datetime_range"][0]
        dates.append(range_start.strftime("%Y-%m-%d"))
    if raw:
        return dates
    if not dates:
        return np.nan
    best_date, best_count = None, 0
    # Sorted input lets groupby collect equal dates; the strict comparison
    # keeps the first (earliest) date when several share the top count.
    for day, group in groupby(sorted(dates)):
        mentions = len(list(group))
        if mentions > best_count:
            best_date, best_count = day, mentions
    return best_date

In [None]:
extracted = extract(all_links)

In [None]:
parsed_whos_df = pd.DataFrame.from_dict(create_annotated_database(extracted))
parsed_whos_df.head()

In [16]:
ereignisdatenbank = pd.read_csv("Ereignisse_utf8.csv",sep=";")

In [17]:
ereignisdatenbank.columns = list(map(lambda x:x.strip(" "),ereignisdatenbank.columns))

In [18]:
countries_not_null = ereignisdatenbank[pd.notnull(ereignisdatenbank["Ausgangs- bzw. Ausbruchsland"])]

In [19]:
countries = countries_not_null["Ausgangs- bzw. Ausbruchsland"].copy(deep=True)
countries = list(map(lambda x:x.strip(" "),countries))

In [20]:
set(countries)

{'Afghanistan',
 'Afghanistan,\nDR Congo\nNigeria\nSomalia',
 'Algerien',
 'Angola',
 'Australien',
 'Bangladesch',
 'Benin',
 'Bolivien',
 'Brasilien',
 'Burundi',
 'China',
 'Costa Rica',
 'DRC',
 'DRCongo',
 'Demokratische Republik Kongo',
 'Deutschland',
 'El Salvador',
 'Fiji',
 'Frankreich',
 'Französich_Polynesien',
 'Französiche Guyana',
 'Französisch-Polynesien',
 'Ghana',
 'Haiti',
 'Indien',
 'Irak',
 'Iran',
 'Israel',
 'Italien',
 'Italien, Griechenland, Rumanien, Ungarn, Frankreich',
 'Italien, Griechenland, Ungarn, Rumänien',
 'Italien, Serbien, Griechenland, Rumänien, Ungarn, Frankreich, Kosovo, Albanien, Macedonien, Montenegro, Serbien, Türkei',
 'Kamerun',
 'Kanada',
 'Kenia',
 'Kolumbien',
 'Kongo',
 'Kroatien',
 'Kuwait',
 'La Reunion',
 'Liberia',
 'Madagaskar',
 'Malawi',
 'Mali',
 'Mauretanien',
 'Mosambik',
 'Myanmar',
 'Namibia',
 'Niger',
 'Nigeria',
 'Nordeuropa',
 'Oman',
 'Pakistan',
 'Papua-Neuguinea',
 'Peru',
 'Philippinen',
 'Polen',
 'Saudi-Arabien',
 

# Let the testing begin
## Wikipedia parser

In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
req = requests.get("https://de.wikipedia.org/wiki/Liste_der_Staaten_der_Erde")
soup = BeautifulSoup(req.content,"html.parser")

In [3]:
# Locate the table that lists all countries and take its rows
country_table = soup.find("table",class_="wikitable sortable zebra").find("tbody")
table_rows = country_table.find_all("tr")
amount_countries = len(table_rows)
# One list of <td> cells per table row
parsed_soup = [row.find_all('td') for row in table_rows]

In [6]:
wiki_dict = {"state_name_de":[],"full_state_name_de":[],"capital_de":[],"translation_state_name":[],"wiki_abbreviations":[]}
dash = u"\u2014"
regex = re.compile(r"\[\d*\]") # To remove footnotes in the names
for i in range(amount_countries):
    cells = parsed_soup[i]
    # Parse the complete row before touching wiki_dict: a row that fails
    # half-way must not leave the columns with different lengths.
    try:
        state_name_de = regex.sub("", cells[0].text.replace("\n", "")
                                  .replace("\xad", "")) # Remove soft hyphen used in Zentralafr. Rep.
        state_name_de = re.sub(r"((mit)|(ohne)).*", "", state_name_de) # Remove some additional information in name
        full_state_name_de = regex.sub("", cells[1].text).replace("\n", "")
        capital_de = regex.sub("", cells[2].text).replace("\n", "")
        translation_state_name = regex.sub("", cells[10].text).replace("\n", "")
        # Short and long abbreviation
        list_abbreviation = [cells[7].text.replace("\n", ""), cells[8].text.replace("\n", "")]
    except IndexError:
        # Header and footer are part of the table but lack the expected cells,
        # so indexing them fails. Only this expected failure is skipped; any
        # other error is no longer silenced.
        print(i)
        continue
    list_abbreviation = [abb for abb in list_abbreviation if abb not in ("", dash)] # Remove empty entries
    wiki_dict["state_name_de"].append(state_name_de)
    wiki_dict["full_state_name_de"].append(full_state_name_de)
    wiki_dict["capital_de"].append(capital_de)
    wiki_dict["translation_state_name"].append(translation_state_name)
    # NOTE(review): a row with exactly one abbreviation is stored as a dash as
    # well -- confirm whether `> 1` or `>= 1` is intended.
    if len(list_abbreviation) > 1:
        wiki_dict["wiki_abbreviations"].append(list_abbreviation)
    else:
        wiki_dict["wiki_abbreviations"].append(dash)

0
213


In [7]:
wikipedia_country_list = pd.DataFrame.from_dict(wiki_dict)
wikipedia_country_list.head()

Unnamed: 0,state_name_de,full_state_name_de,capital_de,translation_state_name,wiki_abbreviations
0,Erde,—,—,Earth,—
1,Europäische Union,—,Brüssel,European Union,—
2,Union Südamerikanischer Nationen,—,Quito,Union of South American Nations,—
3,Afrikanische Union,—,Addis Abeba,African Union,—
4,Verband Südostasiatischer Nationen,—,Jakarta,Association of Southeast Asian Nations,—


In [8]:
def abbreviate(country_name):
    """Build an abbreviation from the capital letters of a country name.

    country_name -- the (German or English) name of a state

    Content in parentheses is ignored. If the name contains a comma, the part
    after the first comma is moved to the front to yield a more common
    abbreviation (Korea, Nord --> Nord Korea --> NK). Returns the concatenated
    capital letters (A-Z and Ä, Ö, Ü) if there are at least two of them,
    otherwise None.
    """
    country_name = re.sub(r"\(.*\)", "", country_name) # Delete content in parenthesis since not relevant
    if "," in country_name:
        # partition also copes with heads that are not purely ASCII letters
        # (e.g. "St. Kitts, Nevis") and with commas that lack a following space
        head, _, tail = country_name.partition(",")
        country_name = tail.strip() + " " + head.strip()
    # No "|" inside the character class: there it would be matched literally
    capitals = re.findall(r"[A-ZÄÖÜ]", country_name)
    if len(capitals) > 1:
        return "".join(capitals)
    return None

In [9]:
# Search for names that might have abbreviations. If they consist of two or more words that start with a capital
# letter, make an abbreviation out of it
abb_state_de = [abbreviate(name) for name in wikipedia_country_list["state_name_de"].tolist()]
abb_full_state_de = [abbreviate(name) for name in wikipedia_country_list["full_state_name_de"].tolist()]
abb_state_trans = [abbreviate(name) for name in wikipedia_country_list["translation_state_name"].tolist()]

abbreviations = []
for candidates in zip(abb_state_de, abb_full_state_de, abb_state_trans):
    found = [candidate for candidate in candidates if candidate] # Removes Nones
    # Rows with fewer than two found abbreviations get a "-" placeholder,
    # the others are reduced to their distinct abbreviations
    abbreviations.append(list(set(found)) if len(found) > 1 else "-")
#abbreviations = list(map(", ".join,cleaned_abbreviations)) # Unpack list of abbreviations to string

In [10]:
wikipedia_country_list["inoff_abbreviations"] =  abbreviations
wikipedia_country_list.head()

Unnamed: 0,state_name_de,full_state_name_de,capital_de,translation_state_name,wiki_abbreviations,inoff_abbreviations
0,Erde,—,—,Earth,—,-
1,Europäische Union,—,Brüssel,European Union,—,[EU]
2,Union Südamerikanischer Nationen,—,Quito,Union of South American Nations,—,"[USN, USAN]"
3,Afrikanische Union,—,Addis Abeba,African Union,—,[AU]
4,Verband Südostasiatischer Nationen,—,Jakarta,Association of Southeast Asian Nations,—,"[VSN, ASAN]"


## Ontology/Comparison

In [None]:
from epitator.annotator import AnnoDoc
from epitator.geoname_annotator import location_contains
doc = AnnoDoc("Where is America?")
doc.add_tiers(GeonameAnnotator())
annotations = doc.tiers["geonames"]
geoname = annotations[0]
geoname.to_dict()

In [11]:
def clean_entries(countries):
    """Takes a list of country entries and returns a list of cleaned, unique entries.

    countries -- a list of strings; one string may name several countries
                 separated by commas or new lines

    Exact duplicates are dropped and the remaining entries keep the order of
    their first appearance. Content in parentheses is removed, "&" becomes
    "und" and "_" becomes a space. An entry naming several countries is
    returned as a nested list of cleaned names.
    """
    cleaned = []
    # dict.fromkeys de-duplicates like set() but keeps a reproducible order
    for entry in dict.fromkeys(countries):
        # Because someone used new lines in entries; this can double a comma
        text = entry.replace("\n", ", ").replace(",,", ",")
        # Drop parenthesised remarks one balanced group at a time. A greedy
        # r'\(.*\)' would also delete the countries between two remarks.
        previous = None
        while previous != text:
            previous = text
            text = re.sub(r"\([^()]*\)", "", text)
        text = text.strip(" ").replace("&", "und")
        if "," in text:
            # For entries with more than one country
            cleaned.append(clean_entries(text.split(",")))
        else:
            cleaned.append(text.replace("_", " "))
    # TODO: clean entries where an abbreviation is glued to a name, such as DRCongo or VAE Dubai
    return cleaned

In [31]:
#TEST
from deep_eq import deep_eq
clean_entries(countries)
example_countries_to_clean = [" Australien",
                              "Kongo \nUSA",
                              "Italien, Deutschland, Belgien ",
                              "Franz._Polynesien",
                              "Trinidad & Tobago"]
expected_cleaned_countries = ["Trinidad und Tobago","Australien",['Belgien', 'Deutschland', 'Italien'], ["USA", "Kongo"], "Franz. Polynesien"]

def canonical_order(entries):
    """Sort the entries (and every nested list) so that a comparison ignores their order"""
    return sorted((sorted(entry) if isinstance(entry, list) else entry for entry in entries), key=str)

# Compare order-insensitively: the order of the cleaned entries is not part of
# what is tested here and must not decide whether the check passes.
deep_eq(canonical_order(clean_entries(example_countries_to_clean)), canonical_order(expected_cleaned_countries))

True

In [None]:
# # FOR TESTING. RETURNS TUPLE WITH ABBREVIATION AND TRANSLATION
# # Takes a list of not matched/translated entries and tries to match them to the wikipedia table and find the full name
# countries_not_translated = [entry for entry in countries_unique \
#                             if entry not in wikipedia_country_list["state_name_de"].tolist()]
# def translate_abbreviation(to_translate):
#     abb_to_country = []
#     if type(to_translate) == str:
#         to_translate = [to_translate]
#     for column in ["wiki_abbreviations","inoff_abbreviations"]:
#         for potential_abbreviation in to_translate:
#             if type(potential_abbreviation) == str:
#                 for i, abbreviation in enumerate(wikipedia_country_list[column]):
#                     if potential_abbreviation in abbreviation:
#                         abb_to_country.append((potential_abbreviation,\
#                                                wikipedia_country_list["translation_state_name"].tolist()[i]))
#                         to_translate.remove(potential_abbreviation)
#             elif type(potential_abbreviation) == list:
#                 abb_to_country.append(translate_abbreviation(potential_abbreviation))
#     return(abb_to_country,to_translate)

# abbreviation_tuple, countries_not_translated = translate_abbreviation(countries_not_translated)

# #abbreviation_tuple
# #print("****************************************************************************************************************")
# #print(countries_not_translated)

In [None]:
def translate_abbreviation(to_translate):
    """Replace country abbreviations by the German state names of the Wikipedia table.

    to_translate -- a single entry (str) or a list of entries; an entry is a
                    string or a (nested) list of strings

    A string that contains at least two consecutive capital letters is treated
    as a potential abbreviation: its leading run of capitals (e.g. "DRC" for
    "DRCongo" or "DR Congo") is looked up in the "wiki_abbreviations" and
    "inoff_abbreviations" columns of wikipedia_country_list. If it is found,
    the entry is replaced in place by the matching state name(s); otherwise it
    is kept unchanged. A nested list is translated entry by entry and stays
    one (flattened) nested list.

    Returns a new list; the argument itself is not modified.
    """
    # If not list but single abbreviation, transform into list
    if isinstance(to_translate, str):
        to_translate = [to_translate]
    translated = []
    # Results go into a new list: removing from and appending to the list that
    # is being iterated skips entries and can remove the same entry twice.
    for entry in to_translate:
        if isinstance(entry, list):
            nested = [translate_abbreviation(nested_entry) for nested_entry in entry]
            translated.append([name for sublist in nested for name in sublist])
            continue
        if not (isinstance(entry, str) and re.search(r"[A-Z]{2,}", entry)):
            translated.append(entry)
            continue
        abbreviation = re.match(r"([A-Z]\W?)*", entry)[0].replace(" ", "")
        matches = []
        # An empty abbreviation (entry does not start with a capital letter)
        # would be "contained" in every placeholder string, so skip the lookup.
        if abbreviation:
            state_names = wikipedia_country_list["state_name_de"].tolist()
            for column in ["wiki_abbreviations", "inoff_abbreviations"]:
                for i, known_abbreviations in enumerate(wikipedia_country_list[column]):
                    if abbreviation in known_abbreviations and state_names[i] not in matches:
                        matches.append(state_names[i])
        translated.extend(matches if matches else [entry])
    return translated

In [None]:
# REMOVING WHILE LOOPING IS DIFFICULT
def how(to_translate):
    """Experimental variant of the abbreviation translation that defers the removal.

    to_translate -- a single entry (str) or a list of entries

    Every string with at least two consecutive capital letters is searched for
    in the abbreviation columns of wikipedia_country_list. For each hit the
    matching German state name is appended to to_translate (the list passed in
    is extended in place) and the entry is remembered; the remembered entries
    are filtered out of the returned list.
    """
    if type(to_translate) == str:
        to_translate = [to_translate]
    translated_away = []
    for candidate in to_translate:
        if type(candidate) != str or not re.findall(r"([A-Z]{2,})",candidate):
            continue
        for column in ["wiki_abbreviations","inoff_abbreviations"]:
            for row_number, known in enumerate(wikipedia_country_list[column]):
                if candidate in known:
                    to_translate.append(wikipedia_country_list["state_name_de"].tolist()[row_number])
                    translated_away.append(candidate)
    # Removal happens only here, after the loop is done
    return [entry for entry in to_translate if entry not in translated_away]


In [None]:
ab = ["USA","VAE",'asdas', "DR Cong"]
how(ab)


In [None]:
potential_abbreviation = re.match(r"([A-Z]\W?)*","DRCongo")
a = potential_abbreviation[0].replace(" ","")
a in "DRC a"

In [None]:
## SIMPLE TRANSLATION. FAST BUT DOES NOT TRANSLATE LISTS OF LISTS
# # Translate German entries of Ereignisdatenbank to English. Might be inefficient since I go through the wiki list
# # entirely which is longer then the list of countries to translate
# translated_ereignisdatenbank_countries = [(entry,wikipedia_country_list["translation_state_name"].tolist()[indx])\
#                                           for indx,entry \
#                                           in enumerate(wikipedia_country_list["state_name_de"].tolist())\
#                                           if entry in countries_unique]

In [None]:
# Translate German entries of Ereignisdatenbank to English. Might be inefficient since I go through the wiki list
# entirely which is longer then the list of countries to translate
def translate(countries_unique):
    """Translate German country entries to English using wikipedia_country_list.

    countries_unique -- a list of entries; an entry is a German state name (str)
                        or a list of such names

    Returns a list with one (entry, translation) tuple per entry found in the
    "state_name_de" column. A list entry is cleaned with clean_entries and
    translated recursively into a nested list. Entries that are neither found
    nor a list are left out of the result.
    """
    german_names = wikipedia_country_list["state_name_de"].tolist()
    english_names = wikipedia_country_list["translation_state_name"].tolist()

    results = []
    for entry in countries_unique:
        try:
            position = german_names.index(entry)
        except ValueError:
            # Not a known state name: only lists are processed any further
            # TODO: Abbreviations not found in other function shall now be translated
            if type(entry) == list:
                results.append(translate(clean_entries(entry)))
            continue
        # Tuple of entry and translation
        results.append((entry, english_names[position]))
    return results

In [None]:
translate(translate_abbreviation(countries_unique))

In [None]:
translate(["United States","Delaware"])

In [None]:
countries_not_translated = [entry for entry in countries_unique \
                            if entry not in wikipedia_country_list["state_name_de"].tolist()\
                           and type(entry) != list]
countries_not_translated

In [None]:
translate_abbreviation(["USA"])

In [None]:
a = search_abbreviation_in_wiki(countries_and_abb_not_found)
print(a["found"])

In [None]:
for not_found in countries_not_found:
    for poss_match in wikipedia_country_list["state_name_de"].tolist():
        if not_found in poss_match:
            print(not_found, poss_match)

# Understanding geoname annotator

In [None]:
"""Geoname Annotator"""

from geopy.distance import great_circle
from .maximum_weight_interval_set import Interval, find_maximum_weight_interval_set

# Containment levels indicate which properties must match when determing
# whether a geoname of a given containment level contains another geoname.
# The admin codes generally correspond to states, provinces and cities.
CONTAINMENT_LEVELS = [
    'country_code',
    'admin1_code',
    'admin2_code',
    'admin3_code',
    'admin4_code'
]

GEONAME_ATTRS = [
    'geonameid',
    'name',
    'feature_code',
    'country_code',
    'admin1_code',
    'admin2_code',
    'admin3_code',
    'admin4_code',
    'longitude',
    'latitude',
    'population',
    'asciiname',
    'names_used',
    'name_count']
def location_contains(loc_outer, loc_inner):
    """
    Check whether the first geoname contains the second one and return the
    level of containment as an integer.
    0 means no containment; sibling locations and identical locations also
    yield 0. The level is determined by the specificity of the outer location:
    1 for a feature code matching PCL*, 2 to 5 for ADM1 to ADM4, so e.g. USA
    would give a smaller number than Texas.
    Containment is only detected when the outer location has an ADM1-ADM4 or
    PCL* feature code, which is most countries, states, and districts.
    """
    # The country code must match for any level of containment, so it is
    # tested up front for efficiency.
    outer_country = loc_outer.country_code
    if outer_country != loc_inner.country_code or outer_country == '':
        return 0

    admin_levels = {'ADM1': 2, 'ADM2': 3, 'ADM3': 4, 'ADM4': 5}
    outer_code = loc_outer.feature_code
    if outer_code in admin_levels:
        level = admin_levels[outer_code]
    elif re.match("^PCL.", outer_code):
        level = 1
    else:
        return 0

    # Every admin code above the outer location's own level has to be set and
    # has to be shared by both locations.
    for prop in CONTAINMENT_LEVELS[1:level]:
        outer_value = loc_outer[prop]
        if outer_value == '' or outer_value != loc_inner[prop]:
            return 0

    # A location does not contain itself
    return 0 if loc_outer.geonameid == loc_inner.geonameid else level



GEONAME_ATTRS = [
    'geonameid',
    'name',
    'feature_code',
    'country_code',
    'admin1_code',
    'admin2_code',
    'admin3_code',
    'admin4_code',
    'longitude',
    'latitude',
    'population',
    'asciiname',
    'names_used',
    'name_count']


ADMINNAME_ATTRS = [
    'country_name',
    'admin1_name',
    'admin2_name',
    'admin3_name']


class GeonameRow(object):
    """
    A geoname database row with the additional attributes used for scoring.

    The values of the GEONAME_ATTRS columns are copied from the row passed to
    the constructor. Attributes can be read as `row.name` as well as
    `row['name']`.
    """
    __slots__ = GEONAME_ATTRS + ADMINNAME_ATTRS + [
        'alternate_locations',
        'spans',
        'parents',
        'score',
        'lat_long',
        'high_confidence']

    def __init__(self, sqlite3_row):
        """
        Copy the GEONAME_ATTRS columns of sqlite3_row (any object providing
        keys() and item access) and initialize the scoring attributes.
        """
        for key in sqlite3_row.keys():
            if key in GEONAME_ATTRS:
                setattr(self, key, sqlite3_row[key])
        self.lat_long = (self.latitude, self.longitude,)
        self.alternate_locations = set()
        self.spans = set()
        self.parents = set()
        self.score = None

    def __getitem__(self, key):
        """
        Return the attribute named key. to_dict and location_contains access
        rows with subscripts, which fails without this method.
        """
        return getattr(self, key)

    def to_dict(self):
        """
        Return the geoname attributes, the admin names that have been set, the
        parents (as dicts) and the score as a plain dict.
        """
        result = {key: self[key] for key in GEONAME_ATTRS}
        for key in ADMINNAME_ATTRS:
            # The admin names are optional slots that may never have been set
            if hasattr(self, key):
                result[key] = self[key]
        result['parents'] = [p.to_dict() for p in self.parents]
        result['score'] = self.score
        return result


class GeonameFeatures(object):
    """
    This represents the aspects of a candidate geoname that are used to
    determine whether it is being referenced.

    The feature values are stored in the list `_values`, ordered like
    `feature_names`. The constructor fills the features that can be computed
    from the geoname and its spans; `set_contextual_features` fills those
    that depend on `nearby_mentions`.
    """
    # The feature name array is used to maintain the order of the
    # values in the feature vector.
    feature_names = [
        'log_population',
        'name_count',
        'num_spans',
        'max_span_length',
        'cannonical_name_used',
        'loc_NE_portion',
        'other_NE_portion',
        'noun_portion',
        'other_pos_portion',
        'num_tokens',
        'ambiguity',
        'PPL_feature_code',
        'ADM_feature_code',
        'CONT_feature_code',
        'other_feature_code',
        'combined_span_parents',
        'close_locations',
        'very_close_locations',
        'containing_locations',
        'max_containment_level',
        # high_confidence indicates the base feature set received a high score.
        # It is an useful feature for preventing high confidence geonames
        # from receiving low final scores when they lack contextual cues -
        # for example, when they are the only location mentioned.
        'high_confidence',
    ]

    def __init__(self, geoname, spans_to_nes, span_to_tokens):
        """
        Compute the features that only depend on the geoname and its spans.

        geoname -- the candidate geoname; its population, name_count, spans,
                   name, asciiname, parents, alternate_locations and
                   feature_code are read
        spans_to_nes -- mapping from a span of the geoname to the named entity
                        spans associated with it (each is read for its label)
        span_to_tokens -- mapping from a span of the geoname to the token spans
                          associated with it (each is read for its token tag)
        """
        self.geoname = geoname
        # The set of geonames that are mentioned in proximity to the spans
        # corresponding to this feature.
        # This will be populated by the add_contextual_features function.
        self.nearby_mentions = set()
        d = {}
        # NOTE(review): `math` is not imported in the visible part of this
        # file -- confirm it is imported before this class is used.
        d['log_population'] = math.log(geoname.population + 1)
        # Geonames with lots of alternate names
        # tend to be the ones most commonly referred to.
        d['name_count'] = geoname.name_count
        d['num_spans'] = len(geoname.spans)
        # Length of the longest span text; max() raises if the geoname has no spans
        d['max_span_length'] = max([
            len(span.text) for span in geoname.spans])

        def cannonical_name_match(span, geoname):
            """
            Return the length of the span text relative to the length of the
            geoname's name if the text is contained in the name or the
            asciiname, otherwise 0. The text of the span's first leaf base
            span is used if there is one, else the text of the span itself.
            """
            first_leaf = next(span.iterate_leaf_base_spans(), None)
            if first_leaf:
                span_text = first_leaf.text
            else:
                span_text = span.text
            span_in_name = span_text in geoname.name or span_text in geoname.asciiname
            return (float(len(span_text)) if span_in_name else 0) / len(geoname.name)
        d['cannonical_name_used'] = max([
            cannonical_name_match(span, geoname)
            for span in geoname.spans
        ])
        # Count the named entities associated with the spans, split into
        # location entities (GPE/LOC) and all others, relative to the number of spans.
        loc_NEs_overlap = 0
        other_NEs_overlap = 0
        total_spans = len(geoname.spans)
        for span in geoname.spans:
            for ne_span in spans_to_nes[span]:
                if ne_span.label == 'GPE' or ne_span.label == 'LOC':
                    loc_NEs_overlap += 1
                else:
                    other_NEs_overlap += 1
        d['loc_NE_portion'] = float(loc_NEs_overlap) / total_spans
        d['other_NE_portion'] = float(other_NEs_overlap) / total_spans
        # Count the tokens of all spans, split into noun-like tokens (tags
        # starting with "NN", or "FW") and all others.
        noun_pos_tags = 0
        other_pos_tags = 0
        pos_tags = 0
        for span in geoname.spans:
            for token_span in span_to_tokens[span]:
                token = token_span.token
                pos_tags += 1
                if token.tag_.startswith("NN") or token.tag_ == "FW":
                    noun_pos_tags += 1
                else:
                    other_pos_tags += 1
        d['combined_span_parents'] = len(geoname.parents)
        # NOTE(review): these divisions raise ZeroDivisionError if no token
        # spans were counted -- confirm callers always provide tokens.
        d['noun_portion'] = float(noun_pos_tags) / pos_tags
        d['other_pos_portion'] = float(other_pos_tags) / pos_tags
        d['num_tokens'] = pos_tags
        d['ambiguity'] = len(geoname.alternate_locations)
        # Exactly one of the four feature code indicators is set to 1; the
        # others keep the initial 0 from _values.
        feature_code = geoname.feature_code
        if feature_code.startswith('PPL'):
            d['PPL_feature_code'] = 1
        elif feature_code.startswith('ADM'):
            d['ADM_feature_code'] = 1
        elif feature_code.startswith('CONT'):
            d['CONT_feature_code'] = 1
        else:
            d['other_feature_code'] = 1
        # Features that were not computed above (e.g. the contextual ones) start at 0
        self._values = [0] * len(self.feature_names)
        self.set_values(d)

    def set_value(self, feature_name, value):
        """Set one feature; raises ValueError if feature_name is not in feature_names."""
        self._values[self.feature_names.index(feature_name)] = value

    def set_values(self, value_dict):
        """Set every feature that has a key in value_dict; other keys are ignored."""
        for idx, name in enumerate(self.feature_names):
            if name in value_dict:
                self._values[idx] = value_dict[name]

    def set_contextual_features(self):
        """
        GeonameFeatures are initialized with only values that can be extracted
        from the geoname database and span. This extends the GeonameFeature
        with values that require information from nearby_mentions.
        """
        geoname = self.geoname
        close_locations = 0
        very_close_locations = 0
        containing_locations = 0
        max_containment_level = 0
        for recently_mentioned_geoname in self.nearby_mentions:
            if recently_mentioned_geoname == geoname:
                continue
            # Containment is checked in both directions; the larger level counts
            containment_level = max(
                location_contains(geoname, recently_mentioned_geoname),
                location_contains(recently_mentioned_geoname, geoname))
            if containment_level > 0:
                containing_locations += 1
            if containment_level > max_containment_level:
                max_containment_level = containment_level
            distance = great_circle(
                recently_mentioned_geoname.lat_long, geoname.lat_long
            ).kilometers
            # Thresholds in kilometers; a very close location also counts as close
            if distance < 400:
                close_locations += 1
            if distance < 100:
                very_close_locations += 1
        self.set_values(dict(
            close_locations=close_locations,
            very_close_locations=very_close_locations,
            containing_locations=containing_locations,
            max_containment_level=max_containment_level))


class GeonameAnnotator(Annotator):
    """
    Annotator that links location mentions in a document to geonames.

    Candidate geonames are fetched from the sqlite database by the
    document's ngrams, scored with a classifier, and the spans of the
    geonames that pass the score threshold are returned as the `geonames`
    tier.
    """

    def __init__(self, custom_classifier=None):
        """
        custom_classifier -- classifier to use instead of the module level
        `geoname_classifier`; any falsy value selects the default.
        """
        self.connection = get_database_connection()
        # Return rows as sqlite3.Row objects instead of plain tuples.
        self.connection.row_factory = sqlite3.Row
        self.geoname_classifier = custom_classifier or geoname_classifier

    def get_candidate_geonames(self, doc):
        """
        Returns a list of GeonameRow objects corresponding to locations that
        the document may refer to.
        Each returned geoname has at least one associated span in the
        document. The `ngrams` and `nes` tiers are added to the document
        when they are missing.
        """
        if 'ngrams' not in doc.tiers:
            doc.add_tiers(NgramAnnotator())
        logger.info('Ngrams annotated')
        if 'nes' not in doc.tiers:
            doc.add_tiers(NEAnnotator())
        logger.info('Named entities annotated')

        # Group the plausible ngram spans by their lower-cased text.
        # This is done up front so the keys can drive the database query and
        # the span information can be used in the scoring function.
        span_text_to_spans = defaultdict(list)
        for span in doc.tiers['ngrams'].spans:
            if is_possible_geoname(span.text):
                span_text_to_spans[span.text.lower()].append(span)
        all_ngrams = list(span_text_to_spans)

        # NOTE(review): one bound parameter is used per distinct ngram and
        # SQLite caps the number of parameters per statement, so very long
        # documents may need the query to be chunked -- confirm.
        placeholders = ','.join('?' for _ in all_ngrams)
        cursor = self.connection.cursor()
        # String literals are single-quoted: double-quoted literals are only
        # accepted by SQLite through a legacy fallback.
        geoname_results = list(cursor.execute("""
        SELECT
            geonames.*,
            count AS name_count,
            group_concat(alternatename, ';') AS names_used
        FROM geonames
        JOIN alternatename_counts USING ( geonameid )
        JOIN alternatenames USING ( geonameid )
        WHERE alternatename_lemmatized IN
        (""" + placeholders + """)
        GROUP BY geonameid""", all_ngrams))
        logger.info('%s geonames fetched', len(geoname_results))
        geoname_results = [GeonameRow(g) for g in geoname_results]

        # Associate spans with the geonames.
        candidate_geonames = []
        for geoname in geoname_results:
            geoname.add_spans(span_text_to_spans)
            # In rare cases geonames may have no matching spans because
            # sqlite unicode equivalency rules match geonames that use
            # different characters than the document spans used to query
            # them. These geonames are ignored.
            if len(geoname.spans) > 0:
                candidate_geonames.append(geoname)

        # Add combined spans to locations that are adjacent to a span linked
        # to an administrative division. e.g. Seattle, WA
        span_to_geonames = defaultdict(list)
        for geoname in candidate_geonames:
            for span in geoname.spans:
                span_to_geonames[span].append(geoname)
        geoname_spans = span_to_geonames.keys()
        combined_spans = AnnoTier(geoname_spans).chains(
            at_least=2, at_most=4, max_dist=4)
        for combined_span in combined_spans:
            leaf_spans = combined_span.iterate_leaf_base_spans()
            first_span = next(leaf_spans)
            # Maps each geoname of the first leaf span to the geonames of the
            # later leaf spans that contain it.
            potential_geonames = {geoname: set()
                                  for geoname in span_to_geonames[first_span]}
            for leaf_span in leaf_spans:
                leaf_span_geonames = span_to_geonames[leaf_span]
                # A geoname only survives this step if at least one geoname
                # of the current leaf span contains it.
                next_potential_geonames = defaultdict(set)
                for potential_geoname, prev_containing_geonames in potential_geonames.items():
                    containing_geonames = {
                        containing_geoname
                        for containing_geoname in leaf_span_geonames
                        if location_contains(containing_geoname, potential_geoname) > 0}
                    if containing_geonames:
                        next_potential_geonames[potential_geoname] |= (
                            prev_containing_geonames | containing_geonames)
                potential_geonames = next_potential_geonames
            for geoname, containing_geonames in potential_geonames.items():
                geoname.spans.add(combined_span)
                geoname.parents |= containing_geonames

        # Replace individual spans with combined spans.
        span_to_geonames = defaultdict(list)
        for geoname in candidate_geonames:
            geoname.spans = set(AnnoTier(geoname.spans).optimal_span_set().spans)
            for span in geoname.spans:
                span_to_geonames[span].append(geoname)

        # Find locations with overlapping spans.
        # Note that it is possible for two valid locations to have
        # overlapping names. For example, Harare Province has
        # Harare as an alternate name, so the city Harare is very
        # likely to be an alternate location that competes with it.
        for geonames in span_to_geonames.values():
            geoname_set = set(geonames)
            for geoname in geonames:
                geoname.alternate_locations |= geoname_set
        # A geoname is not an alternative to itself.
        for geoname in candidate_geonames:
            geoname.alternate_locations -= {geoname}
        logger.info('%s alternative locations found', sum(
            len(geoname.alternate_locations)
            for geoname in candidate_geonames))
        logger.info('%s candidate locations prepared',
                    len(candidate_geonames))
        return candidate_geonames

    def add_contextual_features(self, features):
        """
        Extend a list of features with values that are based on the geonames
        mentioned nearby.
        """
        logger.info('adding contextual features')
        span_to_features = defaultdict(list)
        for feature in features:
            for span in feature.geoname.spans:
                span_to_features[span].append(feature)
        # NOTE(review): this tier is built but never used, and nothing in
        # this method fills in nearby mentions or updates the feature
        # values, so the features are left unchanged. The rest of the
        # method looks like it was lost -- restore it from the original
        # implementation.
        geoname_span_tier = AnnoTier(list(span_to_features.keys()))

    def annotate(self, doc):
        """
        Find the geonames mentioned in `doc`.

        Returns a dict mapping 'geonames' to an AnnoTier. The tier holds one
        GeoSpan per span of every geoname whose score exceeds the
        classifier's GEONAME_SCORE_THRESHOLD, reduced to an optimal span set
        that prefers larger spans and then higher scores. When no features
        are extracted the tier is empty and is also stored on
        `doc.tiers['geonames']`.
        """
        logger.info('geoannotator started')
        candidate_geonames = self.get_candidate_geonames(doc)
        # NOTE(review): extract_features is not defined in this class body;
        # presumably it is inherited or was dropped from this copy -- verify.
        features = self.extract_features(candidate_geonames, doc)
        if len(features) == 0:
            # The tier is still registered on the document as before, but the
            # same mapping as the non-empty path is returned so callers
            # always receive one type.
            empty_tier = AnnoTier([])
            doc.tiers['geonames'] = empty_tier
            return {'geonames': empty_tier}

        # First pass: score without contextual information to find the
        # geonames that can be treated as high confidence.
        scores = self.geoname_classifier.predict_proba_base([
            list(f.values()) for f in features])
        confidence_threshold = self.geoname_classifier.HIGH_CONFIDENCE_THRESHOLD
        for geoname, feature, score in zip(candidate_geonames, features, scores):
            geoname.high_confidence = float(score[1]) > confidence_threshold
            feature.set_value('high_confidence', geoname.high_confidence)
        has_high_confidence_features = any(
            geoname.high_confidence for geoname in candidate_geonames)
        # Second pass: only rescore with the contextual classifier when
        # there is at least one high confidence geoname.
        if has_high_confidence_features:
            self.add_contextual_features(features)
            scores = self.geoname_classifier.predict_proba_contextual([
                list(f.values()) for f in features])
        for geoname, score in zip(candidate_geonames, scores):
            geoname.score = float(score[1])
        score_threshold = self.geoname_classifier.GEONAME_SCORE_THRESHOLD
        culled_geonames = [geoname
                           for geoname in candidate_geonames
                           if geoname.score > score_threshold]

        # Look up the names of the country and admin divisions of each
        # remaining geoname. The statement is constant, so it is built once.
        admin_names_query = """
                SELECT
                    cc.name,
                    a1.name,
                    a2.name,
                    a3.name
                FROM adminnames a3
                JOIN adminnames a2 ON (
                    a2.country_code = a3.country_code AND
                    a2.admin1_code = a3.admin1_code AND
                    a2.admin2_code = a3.admin2_code AND
                    a2.admin3_code = '' )
                JOIN adminnames a1 ON (
                    a1.country_code = a3.country_code AND
                    a1.admin1_code = a3.admin1_code AND
                    a1.admin2_code = '' AND
                    a1.admin3_code = '' )
                JOIN adminnames cc ON (
                    cc.country_code = a3.country_code AND
                    cc.admin1_code = '00' AND
                    cc.admin2_code = '' AND
                    cc.admin3_code = '' )
                WHERE (a3.country_code = ? AND a3.admin1_code = ? AND a3.admin2_code = ? AND a3.admin3_code = ?)
                """
        admin_attrs = ['country_name', 'admin1_name', 'admin2_name', 'admin3_name']
        cursor = self.connection.cursor()
        for geoname in culled_geonames:
            geoname_results = list(cursor.execute(admin_names_query, (
                geoname.country_code or '',
                geoname.admin1_code or '',
                geoname.admin2_code or '',
                geoname.admin3_code or '',)))
            for result in geoname_results:
                prev_val = None
                for idx, attr in enumerate(admin_attrs):
                    val = result[idx]
                    if val == prev_val:
                        # Names are repeated for admin levels beyond that of
                        # the geoname.
                        break
                    setattr(geoname, attr, val)
                    prev_val = val
        logger.info('admin names added')

        geo_spans = [
            GeoSpan(span.start, span.end, doc, geoname)
            for geoname in culled_geonames
            for span in geoname.spans]
        culled_geospans = AnnoTier(geo_spans).optimal_span_set(
            prefer=lambda x: (x.size(), x.geoname.score,))
        logger.info('overlapping geospans removed')
        return {'geonames': culled_geospans}

In [None]:
==abE