In [20]:
%load_ext autoreload
%autoreload 2

import re
import pandas as pd
import pickle
import os

from nlp_surveillance.who_scraper import get_annotated_2018_whos, scrape
from nlp_surveillance.annotator import *
from nlp_surveillance.edb_clean import get_cleaned_edb
from nlp_surveillance.wiki_country_parser import get_wiki_countries_df

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Load the event database via the project's edb_clean helper; the cells below
# all work on this frame. (Presumably this is the cleaned/translated version of
# data/edb.csv -- confirm against nlp_surveillance.edb_clean.)
edb = get_cleaned_edb()


## Countries

In [7]:
# How many edb rows carry a country of origin/outbreak, and how many distinct
# countries are there?
country_column = 'Ausgangs- bzw. Ausbruchsland'

print(len(edb)," entries", "in edb")

countries_not_null = edb[edb[country_column].notnull()]
print(len(countries_not_null)," are not empty")

# A cell may hold a list of countries; in that case only its first element is
# counted, otherwise the cell value itself is used.
unique = {
    country[0] if type(country) == list else country
    for country in countries_not_null[country_column].tolist()
}
print(len(unique), " are unique")

483  entries in edb
414  are not empty
104  are unique


In [8]:
wiki_countries_df = get_wiki_countries_df()

# German names (short or full form) that are spelled exactly like their
# translation; for these an unchanged value does not mean "not translated".
same_word_translated = []
for _, row in wiki_countries_df.iterrows():
    translation = row['translation_state_name']
    # The short name takes precedence; the full name is only checked if the
    # short one differs from the translation.
    for german_column in ('state_name_de', 'full_state_name_de'):
        if row[german_column] == translation:
            same_word_translated.append(row[german_column])
            break

# Raw, unprocessed edb for comparison (note the trailing space in the raw
# column name).
edb_path = os.path.join('nlp_surveillance', 'data', 'edb.csv')
unprocesed_edb = pd.read_csv(edb_path, sep=";")
raw_country_names = unprocesed_edb['Ausgangs- bzw. Ausbruchsland '].tolist()
translated_country_names = wiki_countries_df["translation_state_name"].tolist()

# Keep a country name only if it
#   1. is unchanged compared to the unprocessed version,
#   2. is not a word that is identical before and after translation, and
#   3. was not already English in the first place.
unchanged_country_names = [
    name
    for name in unique
    if name in raw_country_names
    and name not in same_word_translated
    and name not in translated_country_names
]

print("{} countries were not translated:".format(len(unchanged_country_names)))
print(unchanged_country_names)
print("Taiwan is listed here because it is written the same in English and in German. So, it got translated")

12 countries were not translated:
['Typhus', 'Nordeuropa', 'VAE Dubai', 'DRCongo', 'Wales', 'Französisch-Polynesien', 'Französiche Guyana', 'Americas', 'Frankreich: Neukaledonien', 'Hong Kong', 'La Reunion', 'Taiwan']
Taiwan is listed here because it is written the same in English and in German. So, it got translated


In [9]:
# Reverse slice: start at the last column and step backwards three columns at
# a time, stopping before position 10.
# NOTE(review): this assumes the link columns sit at exactly those positions --
# confirm against the edb layout.
links = edb.columns[:10:-3]

# Rows in which every selected link column is empty are dropped.
edb_with_any_link = edb.loc[:, links].dropna(axis='index', how='all')

n_without_links = len(edb) - len(edb_with_any_link)
print(n_without_links, 'edb entires without any links')

245 edb entires without any links


## Sources/Links

In [16]:
from nlp_surveillance.optimize_date_and_count import (
    _get_edb_with_combined_link_columns,
    _get_optimization_edb,
)

# Count how many edb entries the project's helpers accept as training data for
# the date optimisation.
print('edb has', len(edb), 'entries')

edb_with_combined_links = _get_edb_with_combined_link_columns(edb)
valid_date_optimization_entries = _get_optimization_edb(
    edb_with_combined_links,
    to_optimize='date',
)
print(len(valid_date_optimization_entries), 'are valid training entries to optimize date search')

edb has 483 entries
163 are valid training entries to optimize date search


In [4]:
from nlp_surveillance.optimize_date_and_count import get_date_optimization_edb

# Tolerance added on both sides of the [from, to] window; '0days' means the
# reference date has to lie inside the window itself.
some_day = '0days'
tolerance = pd.Timedelta(some_day)

# Deliberately bound to its own name: rebinding `edb` here would make every
# later cell that filters `edb` run on the optimisation frame instead of the
# cleaned edb when the notebook is executed top to bottom.
date_optimization_edb = get_date_optimization_edb()

# Date the reported case counts refer to.
# NOTE(review): 'from'/'to' are presumably the bounds of the date range found
# for each example -- confirm against get_date_optimization_edb.
data_date = date_optimization_edb['Datenstand für Fallzahlen gesamt*']

# True where the reference date lies within [from - tolerance, to + tolerance].
mask = (
    ((date_optimization_edb['to'] + tolerance) >= data_date)
    & ((date_optimization_edb['from'] - tolerance) <= data_date)
)
print(mask.sum(), 'examples can be used for learning')

1049 examples can be used for learning


# Present EpiTator Capabilities

## Get WHO articles

In [17]:
# Fetch the WHO articles via the project's who_scraper helper.
# NOTE(review): judging by the function name these are the 2018 articles,
# already annotated -- confirm against nlp_surveillance.who_scraper.
parsed_whos_df = get_annotated_2018_whos()

## Filter the edb for rows that mention 'who' or 'don'



In [18]:
# Source columns: every column from position 10 onwards whose name contains 'Link'.
links = [column for column in edb.columns[10:] if 'Link' in column]

# Copy of edb in which each link column is replaced by a boolean flag.
mask = edb.copy(deep=True)
for column in links:
    # A single regex alternation tests both keywords. Two separate assignments
    # would let the 'don' result overwrite the 'who' result, so that only 'don'
    # would effectively be checked. Missing values count as "no match".
    mask[column] = edb[column].str.contains('who|don', na=False)

# A row is kept if at least one of its link columns mentions "who" or "don".
rows_to_keep = mask[links].any(axis=1)

# Positions of the rows that are filtered out (kept for reference).
indices_to_drop = [i for i, keep in enumerate(rows_to_keep) if not keep]

# Boolean row selection instead of edb.drop(<positions>): drop() interprets its
# argument as index labels, which only matches positions for a default index.
sources_filtered = edb[rows_to_keep]

In [21]:
def _prettify_link(link):
    """Turn the slug between 'don/' and '/en' of a link into readable text.

    The first two hyphens become spaces, the next two become ', ' and all
    remaining hyphens become spaces; e.g. a slug 'a-b-c-d-e-f-g' turns into
    'a b c, d, e f g'.
    """
    slug = re.search(r'don/(.*)/en', link)[1]
    spaced = slug.replace('-', ' ', 2)
    comma_separated = spaced.replace('-', ', ', 2)
    return comma_separated.replace('-', ' ')


# Selection of four columns for the filtered rows. This is not the last
# expression of the cell, so nothing is rendered for it.
edb.iloc[sources_filtered.index.tolist(), [3, 6, 7, 9]]

# Prettify the link description
all_links = scrape(years=['2018'], proxies=None)
link_description = [_prettify_link(all_links[i]) for i in range(len(all_links))]

In [22]:
# Comparison table: three columns of the parsed WHO articles (positions 1, 3
# and 4) plus the readable link description, aligned row by row.
compared_columns = [1, 3, 4]
compare = parsed_whos_df.iloc[:, compared_columns].copy()

description_column = pd.Series(link_description, index=compare.index)
compare['link_description'] = description_column

## Very left column is the target and left the outcome of epitator

In [None]:
# Show the first rows of the comparison table.
compare.head()