In [2]:
import spacy
spacy.load('en_core_web_sm')
from epitator.annotator import AnnoDoc
from epitator.geoname_annotator import GeonameAnnotator
from epitator.resolved_keyword_annotator import ResolvedKeywordAnnotator
from epitator.count_annotator import CountAnnotator
from epitator.date_annotator import DateAnnotator
from boilerpipe.extract import Extractor
from itertools import groupby
import datetime
import sys
import pandas as pd
import re
from tqdm import tqdm_notebook as tqdm
import numpy as np
import epitator

# Scrape

In [3]:
import requests
from bs4 import BeautifulSoup
from sys import stdout
from time import sleep
import pickle
import os

def get_links_by_year(list_of_years=None, proxies={'http': 'http://fw-bln.rki.local:8020'}):
    """Returns (all) the anual links of the WHO DONs 
    
    list_of_years -- a list of years (YYYY format) you want to parse (default None)
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    """
    page = requests.get('http://www.who.int/csr/don/archive/year/en/',proxies=proxies)
    soup = BeautifulSoup(page.content,'html.parser')
    archiv_years = soup.find('ul',attrs={'class':'list'})
    years_links_html = archiv_years.find_all('a')
    if list_of_years:
        return ['http://www.who.int' + link.get('href') for link in years_links_html if any(year in link for year in list_of_years)]
    else:
        return ['http://www.who.int' + link.get('href') for link in years_links_html]
    
def get_links_per_year(years_links, list_of_months=None, proxies={'http': 'http://fw-bln.rki.local:8020'}):
    """Take a list of links to the annual archive and return a list of DON links of these years
    
    years_links -- a list of links of the anual archive to parse 
    list_of_months -- a list of months (MMM* format) you want to parse (default None)
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    """
    all_links = []
    
    for year_link in years_links:
        page_year = requests.get(year_link,proxies=proxies)
        soup_year = BeautifulSoup(page_year.content,'html.parser')
        archive_year = soup_year.find('ul',attrs={'class':'auto_archive'})
        daily_links = ['http://www.who.int' + link.get('href') for link in archive_year.find_all('a')]
        all_links.extend(daily_links)
    
    if list_of_months:
        all_links = [link for link in all_links if any(month in link for month in map(lambda s:s.lower(),list_of_months))]
    return all_links
    
headers = {
    'User-Agent': 'Auss Abbood, www.rki.de',
    'From': 'abbooda@rki.de'
}

def scrape(years=None,
           months=None,
           num_last_reports=None,
           headers=None,
           proxies={'http': 'http://fw-bln.rki.local:8020'}):
    """Scrapes the WHO DONs using the WHO DON scraping functions and returns the links to these DONs
    
    years -- a list of strings of years in the format YYYY to be scraped
    months -- a list of strings of months in the format MMM* to be scraped
    num_list_reports -- an integer to specify how many of the last reports should be scraped. 
    can be combined with the specification of year and/or month
    headers -- use a header for scraping
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    """
    years = get_links_by_year(list_of_years=years,proxies=proxies)
    all_links = get_links_per_year(years,list_of_months=months,proxies=proxies)
    return all_links

In [19]:
# Scrape all the WHO DONs of the year 2018
all_links = scrape(years=['2018'],proxies=None)

In [131]:
# Extract the main text of the given links
from boilerpipe.extract import Extractor
def extract(list_of_links):
    """Extracts the main content from a list of links and returns a list of texts (str)

    list_of_links -- a list containing URLs of webpages to get the main content from
    """
    if type(list_of_links) == str:
        list_of_links = [list_of_links]
    return[Extractor(extractor='ArticleExtractor', url=url).getText().replace('\n','') \
         for url in tqdm(list_of_links)]

# Annotation
### Annotate

In [5]:
def annotate(text):
    """ Returns an document annotated for dates, disease counts, diseases, and geoneames
    
    text -- a string to be annotated
    """
    doc = AnnoDoc(text)
    doc.add_tiers(GeonameAnnotator())
    doc.add_tiers(ResolvedKeywordAnnotator())
    doc.add_tiers(CountAnnotator())
    doc.add_tiers(DateAnnotator())
    return doc

### Geonames

In [35]:
def get_geonames(doc,raw=False):
    """Returns the most occuring geographical entity in a annotated document
    
    doc -- an annotated string
    raw -- returns a not preprocessed annotation (Default False)
    """
    if raw:
        return [doc.tiers["geonames"].spans[i].geoname["name"] for i in range(len(doc.tiers["geonames"].spans))]
    else:
        geonames = [doc.tiers["geonames"].spans[i].geoname["name"] for i in range(len(doc.tiers["geonames"].spans))]
        geoname_counts = [(key,len(list(group))) for key, group in groupby(sorted(geonames))]
        geoname_ranking = sorted(geoname_counts,key=lambda x:x[1],reverse=True)
        geoname_most_occure = [geoname[0] for geoname in geoname_ranking if geoname[1] == geoname_ranking[0][1]]
        return geoname_most_occure

### Keywords

In [36]:
def get_keywords(doc,raw=False):
    """Returns the most occuring disease entity in a annotated document
    
    doc -- an annotated string
    raw -- returns a not preprocessed annotation (Default False)
    """
    if raw:
        return [doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]['entity']['label'] \
                for i in range(len(doc.tiers["resolved_keywords"].spans)) \
               if doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]['entity']['type'] \
                    == 'disease']
                     
    else:
        keywords = [(doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]['entity']['label'] \
                     ,doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]["weight"]) \
                    for i in range(len(doc.tiers["resolved_keywords"].spans)) \
                    if doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]['entity']['type'] \
                    == 'disease']

        # Here I ignore the included weights and only considere the most occuring disease name
        keywords_without_weight = [disease[0] for disease in keywords]
        keyword_counts = [(key,len(list(group))) for key, group in groupby(sorted(keywords_without_weight))]
        try:
            keyword = max(keyword_counts,key=lambda x:x[1])
        except ValueError:
            keyword = np.nan
        if type(keyword) is float:
            return keyword
        else:
            return keyword[0] # Only returns the keyword, not the weight

### Counts

In [37]:
def get_cases(doc,raw=False):
    """Returns the disease counts with the attribute "confirmed" in a annotated document
    
    doc -- an annotated string
    raw -- returns a not preprocessed annotation (Default False)
    """
    if raw:
        return [doc.tiers["counts"].spans[i].metadata['count'] for i in range(len(doc.tiers["counts"].spans))]
    else:
        return [doc.tiers["counts"].spans[i].metadata['count'] \
                for i in range(len(doc.tiers["counts"].spans)) \
                if "confirmed" in doc.tiers["counts"].spans[i].metadata['attributes']]

### Dates

In [38]:
def get_date(doc,raw=False):
    """Returns most mentioned date in a annotated document
    
    doc -- an annotated string
    raw -- returns a not preprocessed annotation (Default False)
    """
    dates = [doc.tiers["dates"].spans[i].metadata["datetime_range"][0].strftime("%Y-%m-%d") \
             for i in range(len(doc.tiers["dates"].spans))]
    if raw:
        return dates
    else:
        date_count_tuple = [(key,len(list(group))) for key, group in groupby(sorted(dates))]
        try:
            date = max(date_count_tuple,key=lambda x:x[1])
        except ValueError:
            date = np.nan
        if type(date) is float:
            return date
        else:
            return date[0]

In [39]:
# Run this shit (a.k.a annotate all the scraped WHO DONs)
def create_annotated_database(texts,raw=False):
    #TODO create a dict, to specifically set raw for different annotators
    """Given a list of texts (str) annotate and extract disease keywords, geonames, and dates and return
    a dictonary of the text and the annotations
    
    texts -- a list of texts (str)
    raw -- returns a not preprocessed annotation (Default False)
    """
    database = {"text":texts,"date":[],"confirmed_cases":[],"keyword":[],"geoname":[]}
    for i,text in enumerate(tqdm(texts)):
        try:
            doc = annotate(text)
            database["date"].append(get_date(doc,raw))
            database["confirmed_cases"].append(get_cases(doc,raw))
            database["keyword"].append(get_keywords(doc,raw))
            database["geoname"].append(get_geonames(doc,raw))
        except TypeError as e:
            print("Type error in text({})".format(i) + ": " + str(e))
    return database

In [20]:
parsed_whos_df = pd.DataFrame.from_dict(create_annotated_database(extract(all_links)))
parsed_whos_df.head()

HBox(children=(IntProgress(value=0, max=81), HTML(value='')))




HBox(children=(IntProgress(value=0, max=81), HTML(value='')))




Unnamed: 0,text,date,confirmed_cases,keyword,geoname
0,Ebola virus disease – Democratic Republic of t...,2018-11-13,[],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
1,Ebola virus disease – Democratic Republic of t...,2018-11-06,[],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
2,Middle East respiratory syndrome coronavirus (...,2018-09-17,[2005],Middle East respiratory syndrome,"[Kingdom of Saudi Arabia, Middle East]"
3,Ebola virus disease – Democratic Republic of t...,2018-10-30,"[1, 1]",Ebola hemorrhagic fever,[Democratic Republic of the Congo]
4,Circulating vaccine-derived poliovirus type 2 ...,2018-01-01,[],poliomyelitis,"[Federal Republic of Nigeria, Republic of Niger]"


In [21]:
pickle.dump( parsed_whos_df, open( "parsed_whos_df.p", "wb" ) )

In [22]:
parsed_whos_df = pickle.load( open( "parsed_whos_df.p", "rb" ) )

In [23]:
parsed_whos_df.head()

Unnamed: 0,text,date,confirmed_cases,keyword,geoname
0,Ebola virus disease – Democratic Republic of t...,2018-11-13,[],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
1,Ebola virus disease – Democratic Republic of t...,2018-11-06,[],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
2,Middle East respiratory syndrome coronavirus (...,2018-09-17,[2005],Middle East respiratory syndrome,"[Kingdom of Saudi Arabia, Middle East]"
3,Ebola virus disease – Democratic Republic of t...,2018-10-30,"[1, 1]",Ebola hemorrhagic fever,[Democratic Republic of the Congo]
4,Circulating vaccine-derived poliovirus type 2 ...,2018-01-01,[],poliomyelitis,"[Federal Republic of Nigeria, Republic of Niger]"


## Compare with Ereignisdatenbank (incident report). From here not put into .py

In [333]:
# Read in with columns with sources only
ereignisdatenbank = pd.read_csv("Ereignisse_utf8.csv",sep=";")

In [334]:
sources = ereignisdatenbank.iloc[:,15:26] # Get only the columns mentioning sources
sources = sources.dropna(how="all").reset_index(drop=True) # Drop empty rows at the end
sources.head()

Unnamed: 0,Quelle 1*,Datum der Veröffentlichung der Quelle1,Link zur Quelle 1,Quelle 2,Datum der Veröffentlichung der Quelle 2,Link zur Quelle 2,Quelle 3,Datum der Veröffentlichung der Quelle 3,Link zur Quelle 3,Quelle 4,Datum der Veröffentlichung der Quelle 4
0,WHO AFRO Bericht,12.03.2018,<http://apps.who.int/iris/bitstream/10665/2604...,NCDC,18.03.2018,,,,,,
1,WHO AFRO Bericht,12.03.2018,<http://apps.who.int/iris/bitstream/10665/2604...,,,,,,,,
2,WHO AFRO Bericht,21.03.2018,<http://apps.who.int/iris/bitstream/10665/2604...,,,,,,,,
3,WHO AFRO Bericht,12.03.2018,<http://apps.who.int/iris/bitstream/10665/2604...,,,,,,,,
4,ProMED-mail,21.03.2018,,MoH Fiji,,,,,,,


In [335]:
mask = sources.copy(deep=True) # Create a mask for filtering
for column in sources.columns:
    mask[column] = sources[column].str.contains('who',na=False) # Extract all the entries that have the word "who"
    mask[column] = sources[column].str.contains('don',na=False) # and "don"
indices = [i for i in range(len(mask)) if not mask.iloc[i].any()] 
sources_filtered = sources.drop(np.reshape(indices,(len(indices),))) # Drop all rows that don't mention "who" or "don"

In [424]:
# TO .PY
# Transform time to timestamp
def ereignisdatenbank_to_timestamp(dataframe):
    """Transforms an unconverted string to timestamp"""
    if type(dataframe) == pd.core.frame.DataFrame:
        for column in dataframe:
            dataframe[column] = dataframe[column].astype(str)
            dataframe[column] = dataframe[column].str.replace('.', ' ')
            dataframe[column] = \
            dataframe[column].apply(lambda x: datetime.datetime.strptime(x,'%d %m %Y').strftime("%Y-%m-%d")\
                                    if re.match(r"\d\d.\d\d.\d\d\d\d",x) and "-" not in x
                                    else x)
        return dataframe
    elif type(dataframe) == pd.core.series.Series:
        dataframe = dataframe.astype(str)
        dataframe = dataframe.str.replace('.', ' ')
        dataframe = \
        dataframe.apply(lambda x: datetime.datetime.strptime(x,'%d %m %Y').strftime("%Y-%m-%d")\
                        if re.match(r"\d\d.\d\d.\d\d\d\d",x) and "-" not in x
                        else x)
        return dataframe

In [328]:
sources_filtered = ereignisdatenbank_to_timestamp(sources_filtered)

In [330]:
sources_filtered

Unnamed: 0,Quelle 1*,Datum der Veröffentlichung der Quelle1,Link zur Quelle 1,Quelle 2,Datum der Veröffentlichung der Quelle 2,Link zur Quelle 2,Quelle 3,Datum der Veröffentlichung der Quelle 3,Link zur Quelle 3,Quelle 4,Datum der Veröffentlichung der Quelle 4
25,WHO,2018-03-15,http://www who int/csr/don/15-march-2018-mers-...,,,,,,,,
52,EAR report,2018-04-13,http://www who int/csr/don/09-april-2018-liste...,,,,,,,,
70,NCDC SitRep,2018-04-15,http://ncdc gov ng/themes/common/files/sitreps...,WHO DON,2018-04-20 00:00:00,http://www who int/csr/don/20-april-2018-lassa...,CIDRAP,2018-04-18 00:00:00,http://www cidrap umn edu/news-perspective/201...,,
107,WHO,2018-05-11,http://www who int/news-room/detail/11-05-2018...,WHO DON,2018-05-10 00:00:00,http://www who int/csr/don/10-may-2018-ebola-d...,,,,,
112,WHO GOARN TK,2018-05-15,,WHO DON,2018-05-14 00:00:00,http://www who int/csr/don/14-may-2018-ebola-d...,,,,,
113,WHO GOARN TK,2018-05-17,,WHO DON,2018-05-17 00:00:00,http://www who int/csr/don/17-may-2018-ebola-d...,,,,,
120,WHO News Release,2018-05-22,http://www who int/news-room/detail/21-05-2018...,WHO DON,2018-05-21 00:00:00,http://www who int/csr/don/21-may-2018-ebola-d...,,,,,
140,WHO DON,2018-05-31,http://www who int/csr/don/31-may-2018-nipah-v...,,,,,,,,
145,ProMED-mail,2018-06-05,http://www promedmail org/post/5838919,WHO DON,2018-06-05 00:00:00,http://www who int/csr/don/05-june-2018-monkey...,,,,,
149,WHO DON,2018-06-12,http://www who int/csr/don/11-june-2018-measle...,,,,,,,,


In [307]:
# Idea which is not correct and not complete
date_matches = {}
for column in sources_filtered.columns[::-3]: # Use only the columns mentioning dates
    date_matches[column] = [i for i in range(len(parsed_whos_df))\
                            if list(map(lambda x: str(x)[:-3],parsed_whos_df["date"].tolist()))[i] \
                            in list(map(lambda x: str(x)[:-12],sources_filtered[column].tolist()))]

In [308]:
import itertools
indices_that_matched = list(set(itertools.chain(*date_matches.values())))

In [309]:
ereignisdatenbank.iloc[sources_filtered.index.tolist(),[3,6,7,9]]

Unnamed: 0,Ausgangs- bzw. Ausbruchsland,Krankheitsbild(er),Frühestbekannter Ereignisbeginn,Fälle gesamt*
25,Oman,MERS,,
52,Australien,Listeriose,17.01.2018,20.0
70,Nigeria,Lassafieber,01.01.2018,
107,Demokratische Republik Kongo,Ebola,04.04.2018,34.0
112,Demokratische Republik Kongo,Ebola,04.04.2018,41.0
113,Demokratische Republik Kongo,Ebola,04.04.2018,44.0
120,Demokratische Republik Kongo,Ebola,04.04.2018,46.0
140,Indien,,,
145,Kamerun,Affenpocken,30.04.1018,16.0
149,Brasilien,Masern,,995.0


In [27]:
parsed_whos_df.iloc[indices_that_matched,1:].sort_values("date")

Unnamed: 0,date,confirmed_cases,keyword,geoname
64,2018-03-01,[],influenza,[Kingdom of the Netherlands]
60,2018-03-08,[],listeriosis,[Commonwealth of Australia]
25,2018-04-01,[1],yellow fever,[Guyane]
35,2018-04-01,"[1, 1]",Ebola hemorrhagic fever,[Democratic Republic of the Congo]
49,2018-04-04,[],Ebola hemorrhagic fever,[Mbandaka]
37,2018-04-04,"[1, 4]",Ebola hemorrhagic fever,[Democratic Republic of the Congo]
42,2018-04-04,[1],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
45,2018-04-04,[],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
36,2018-04-24,[],poliomyelitis,[Independent State of Papua New Guinea]
47,2018-04-30,[1],monkeypox,[Republic of Cameroon]


In [28]:
# Prettify the link description
link_description = [re.search(r'don/(.*)/en',all_links[i])[1]\
                    .replace('-', ' ',2).replace('-',', ',2).replace('-',' ')\
                    for i in range(len(all_links))]

In [29]:
# Extract the most important columns
compare = parsed_whos_df.iloc[:,[1,3,4]].copy()
compare['link_description'] = pd.Series(link_description,index=compare.index)

# To present

In [380]:
compare["date"].iloc[0]

'2018-11-13'

In [31]:
# These are the links that were faulty during annotation
to_check = compare[compare["keyword"].isnull()==True].index.values
links_to_check = np.asarray(all_links)[to_check] # Get the links that caused the bad annotations

In [32]:
annotated_faulty_text = create_annotated_database(extract(links_to_check),raw=True)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [33]:
faulty_df = pd.DataFrame.from_dict(annotated_faulty_text).iloc[:,1:]
faulty_df

Unnamed: 0,date,confirmed_cases,keyword,geoname
0,"[2018-08-01, 2018-07-17, 2018-06-01, 2018-07-3...","[19, 17, 18, 1, 2, 3, 3, 4, 1, 2, 2600, 5000, ...",[],"[State of Kerala, State of Kerala, Kozhikode, ..."


In [34]:
# Entries that had the occurence of a geographical entity more than once
parsed_whos_df[parsed_whos_df["geoname"].str.len()>1]

Unnamed: 0,text,date,confirmed_cases,keyword,geoname
2,Middle East respiratory syndrome coronavirus (...,2018-09-17,[2005],Middle East respiratory syndrome,"[Kingdom of Saudi Arabia, Middle East]"
4,Circulating vaccine-derived poliovirus type 2 ...,2018-01-01,[],poliomyelitis,"[Federal Republic of Nigeria, Republic of Niger]"
30,Circulating vaccine-derived poliovirus type 2 ...,2016-01-01,[],poliomyelitis,"[Borno State, Gombe State, Jigawa State, Sokot..."
40,Middle East respiratory syndrome coronavirus (...,2018-02-01,"[75, 75, 2220, 790]",Middle East respiratory syndrome,"[Kingdom of Saudi Arabia, Middle East]"
52,Ebola virus disease – Democratic Republic of t...,2015-01-01,[],Ebola hemorrhagic fever,"[Democratic Republic of the Congo, Republic of..."
53,Circulating vaccine-derived polioviruses – Hor...,2018-05-01,[],poliomyelitis,"[Horn of Africa, Somalia]"
67,Circulating vaccine-derived poliovirus type 2 ...,2017-01-01,[],poliomyelitis,"[Mogadishu, Somalia]"


# Geo Tests

In [78]:
import pycountry

In [92]:
country_names = [list(pycountry.countries)[i].name for i in range(len(pycountry.countries))]

In [969]:
sorted(country_names)

In [101]:
from googletrans import Translator

In [99]:
import geograpy
url = 'http://www.bbc.com/news/world-europe-26919928'
places = geograpy.get_place_context(url=url)

ModuleNotFoundError: No module named 'geograpy'

# Goodnes Test

In [310]:
sources_filtered.head()

Unnamed: 0,Quelle 1*,Datum der Veröffentlichung der Quelle1,Link zur Quelle 1,Quelle 2,Datum der Veröffentlichung der Quelle 2,Link zur Quelle 2,Quelle 3,Datum der Veröffentlichung der Quelle 3,Link zur Quelle 3,Quelle 4,Datum der Veröffentlichung der Quelle 4
25,WHO,2018-03-15,http://www who int/csr/don/15-march-2018-mers-...,,NaT,,,NaT,,,
52,EAR report,2018-04-13,http://www who int/csr/don/09-april-2018-liste...,,NaT,,,NaT,,,
70,NCDC SitRep,2018-04-15,http://ncdc gov ng/themes/common/files/sitreps...,WHO DON,2018-04-20,http://www who int/csr/don/20-april-2018-lassa...,CIDRAP,2018-04-18,http://www cidrap umn edu/news-perspective/201...,,
107,WHO,2018-05-11,http://www who int/news-room/detail/11-05-2018...,WHO DON,2018-05-10,http://www who int/csr/don/10-may-2018-ebola-d...,,NaT,,,
112,WHO GOARN TK,2018-05-15,,WHO DON,2018-05-14,http://www who int/csr/don/14-may-2018-ebola-d...,,NaT,,,


In [311]:
sources_filtered = sources_filtered.fillna("nan")

In [248]:
links_from_sources_filtered = [sources_filtered["Link zur Quelle 1"].iloc[i] 
                               if ("pdf" not in sources_filtered["Link zur Quelle 1"].iloc[i]) 
                               and (sources_filtered["Link zur Quelle 1"].iloc[i] != "nan") 
                               else sources_filtered["Link zur Quelle 2"].iloc[i] 
                               for i in range(len(sources_filtered))]

In [249]:
links_from_sources_filtered

['http://www.who.int/csr/don/15-march-2018-mers-oman/en/',
 'http://www.who.int/csr/don/09-april-2018-listeriosis-australia/en/',
 'http://www.who.int/csr/don/20-april-2018-lassa-fever-nigeria/en/',
 'http://www.who.int/news-room/detail/11-05-2018-who-and-partners-working-with-national-health-authorities-to-contain-new-ebola-outbreak-in-the-democratic-republic-of-the-congo',
 'http://www.who.int/csr/don/14-may-2018-ebola-drc/en/',
 'http://www.who.int/csr/don/17-may-2018-ebola-drc/en/',
 'http://www.who.int/news-room/detail/21-05-2018-who-supports-ebola-vaccination-of-high-risk-populations-in-the-democratic-republic-of-the-congo',
 'http://www.who.int/csr/don/31-may-2018-nipah-virus-india/en/',
 'http://www.promedmail.org/post/5838919',
 'http://www.who.int/csr/don/11-june-2018-measles-brazil/en/',
 'http://www.who.int/csr/don/18-june-2018-rift-valley-fever-kenya/en/',
 'http://www.promedmail.org/post/5936610',
 'http://www.cidrap.umn.edu/news-perspective/2018/08/news-scan-aug-07-2018'

In [121]:
extracted = extract(links_from_sources_filtered)

HBox(children=(IntProgress(value=0, max=17), HTML(value='')))

In [138]:
parsed_links_ereignisdatenbank = create_annotated_database(extracted,raw=True)

HBox(children=(IntProgress(value=0, max=17), HTML(value='')))

In [139]:
df = pd.DataFrame.from_dict(parsed_links_ereignisdatenbank)
df = df.iloc[:,1:]
df["links"] = links
df

Unnamed: 0,date,confirmed_cases,keyword,geoname,links
0,[],[1],"[Middle East respiratory syndrome, Middle East...",[Middle East],http://www.who.int/csr/don/15-march-2018-mers-...
1,"[2018-04-01, 2018-03-02, 2018-01-17, 2018-04-0...","[20, 19, 1, 7, 1, 27, 8, 30]","[listeriosis, listeriosis, listeriosis, lister...","[Commonwealth of Australia, Hong Kong Special ...",http://www.who.int/csr/don/09-april-2018-liste...
2,"[2018-04-01, 2018-01-01, 1849-01-01, 2018-04-1...","[21, 413, 9, 1422, 5, 413, 9, 114, 27, 7, 8, 1...","[Lassa fever, Lassa fever, Lassa fever, Lassa ...","[Anambra State, Bauchi, Benue State, Federal C...",http://www.who.int/csr/don/20-april-2018-lassa...
3,"[2018-05-11, 2018-05-10, 2018-05-09, 2018-11-1...","[34, 2, 18, 14, 5, 5, 2, 1, 1, 3, 2, 15, 4, 5]","[Ebola hemorrhagic fever, Ebola hemorrhagic fe...","[Democratic Republic of the Congo, Province de...",http://www.who.int/news-room/detail/11-05-2018...
4,"[2018-05-10, 2018-04-04, 2018-05-13, 2018-05-1...","[7, 39, 19, 3, 2, 20, 7, 3, 5, 2, 393, 1200000...","[Ebola hemorrhagic fever, Ebola hemorrhagic fe...","[Province de l’Équateur, Mbandaka, Mbandaka, M...",http://www.who.int/csr/don/14-may-2018-ebola-d...
5,"[2018-05-14, 2018-04-04, 2018-04-01, 2018-05-1...","[5, 1, 1, 1, 3, 1500000.0, 44, 23, 3, 3, 20, 2...","[Ebola hemorrhagic fever, Ebola hemorrhagic fe...","[Mbandaka, Mbandaka, Mbandaka, Mbandaka, Provi...",http://www.who.int/csr/don/17-may-2018-ebola-d...
6,"[2018-11-19, 2018-05-18, 2015-01-01]","[7500, 46, 26, 4, 1000000, 1, 1, 600, 5837, 1]","[Ebola hemorrhagic fever, Ebola hemorrhagic fe...","[Democratic Republic of the Congo, Democratic ...",http://www.who.int/news-room/detail/21-05-2018...
7,"[2018-05-19, 2018-05-28, 2018-05-01, 1998-01-0...","[31, 3, 3, 1, 1, 4, 3, 4, 1, 15, 15, 2, 13, 28...",[viral infectious disease],"[Kozhikode, State of Kerala, Republic of India...",http://www.who.int/csr/don/31-may-2018-nipah-v...
8,[],[20012008],[],[],http://www.promedmail.org/post/5838919
9,"[2018-06-01, 2018-01-01, 2018-01-01, 2018-02-0...","[995, 114, 30, 84, 2, 80, 3, 798, 611, 30, 63,...","[measles, measles, measles, viral infectious d...","[Federative Republic of Brazil, Amazonas, Amaz...",http://www.who.int/csr/don/11-june-2018-measle...


In [100]:
ereignisdatenbank.columns

Index(['Zeilen-ID', 'Ereignis-ID', 'Kontinent',
       'Ausgangs- bzw. Ausbruchsland ', 'Sekundär betroffene Länder*',
       'Erreger', 'Krankheitsbild(er)', 'Frühestbekannter Ereignisbeginn',
       'Erstveröffentlichung', 'Fälle gesamt*  ',
       'Datenstand für Fallzahlen gesamt*', 'Fälle bestätigt ',
       'Warcheinlische Fälle', 'Verdachtsfälle', 'Todesfälle ', 'Quelle 1*',
       'Datum der Veröffentlichung der Quelle1', 'Link zur Quelle 1',
       'Quelle 2', 'Datum der Veröffentlichung der Quelle 2',
       'Link zur Quelle 2', 'Quelle 3',
       'Datum der Veröffentlichung der Quelle 3', 'Link zur Quelle 3',
       'Quelle 4', 'Datum der Veröffentlichung der Quelle 4',
       'Link zur Quelle 4', 'Rationale für Monitoring*',
       'Rationale für Monitoring* Kommentar', 'Monitoring-Frequenz',
       'Verbindung zu Ereignis-ID*',
       'RKI-Berichtsformat* (z.B. Wochenbericht, EpiLag, etc)',
       'RKI-Berichtsformat Kategorie (z.B. Kurznachricht, Fortschreibung, etc.) ',


In [264]:
to_compare_from_ereignisdatenbank = ereignisdatenbank.iloc[sources_filtered.index.tolist()\
                                                           ,[3,6,7,8,10,11,12,13,14]].reset_index().drop("index",axis=1)

In [494]:
# WHY DOES THIS NOT WORK
to_compare_from_ereignisdatenbank["Warcheinlische Fälle"].apply((lambda x: int(x) if not np.isnan(x) else x))

0      NaN
1      1.0
2      9.0
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
8      NaN
9      NaN
10     NaN
11     NaN
12     NaN
13    27.0
14     NaN
15     NaN
16     NaN
Name: Warcheinlische Fälle, dtype: float64

In [483]:
int(to_compare_from_ereignisdatenbank["Warcheinlische Fälle"].iloc[2])

9

In [251]:
links_from_sources_filtered

['http://www.who.int/csr/don/15-march-2018-mers-oman/en/',
 'http://www.who.int/csr/don/09-april-2018-listeriosis-australia/en/',
 'http://www.who.int/csr/don/20-april-2018-lassa-fever-nigeria/en/',
 'http://www.who.int/news-room/detail/11-05-2018-who-and-partners-working-with-national-health-authorities-to-contain-new-ebola-outbreak-in-the-democratic-republic-of-the-congo',
 'http://www.who.int/csr/don/14-may-2018-ebola-drc/en/',
 'http://www.who.int/csr/don/17-may-2018-ebola-drc/en/',
 'http://www.who.int/news-room/detail/21-05-2018-who-supports-ebola-vaccination-of-high-risk-populations-in-the-democratic-republic-of-the-congo',
 'http://www.who.int/csr/don/31-may-2018-nipah-virus-india/en/',
 'http://www.promedmail.org/post/5838919',
 'http://www.who.int/csr/don/11-june-2018-measles-brazil/en/',
 'http://www.who.int/csr/don/18-june-2018-rift-valley-fever-kenya/en/',
 'http://www.promedmail.org/post/5936610',
 'http://www.cidrap.umn.edu/news-perspective/2018/08/news-scan-aug-07-2018'

In [137]:
extract("http://www.promedmail.org/post/5838919")

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

['©2001,2008 International Society for Infectious Diseases All Rights Reserved.Read our privacy guidelines.  Use of this web site and related services is governed by the Terms of Service.']

In [521]:
comparison_df = pd.concat([df, to_compare_from_ereignisdatenbank], axis=1, sort=False)

In [522]:
comparison_df["date"] = comparison_df["date"].astype(object) # To restore lists

In [523]:
comparison_df.columns

Index(['date', 'confirmed_cases', 'keyword', 'geoname', 'links',
       'Ausgangs- bzw. Ausbruchsland ', 'Krankheitsbild(er)',
       'Frühestbekannter Ereignisbeginn', 'Erstveröffentlichung',
       'Datenstand für Fallzahlen gesamt*', 'Fälle bestätigt ',
       'Warcheinlische Fälle', 'Verdachtsfälle', 'Todesfälle '],
      dtype='object')

## Date comparison

In [518]:
comparison_df = ereignisdatenbank_to_timestamp(comparison_df)
comparison_df["combined_dates"] = comparison_df.iloc[:,7:9].values.tolist()

In [524]:
comparison_df["combined_counts"] = comparison_df.iloc[:,[10,12]].values.tolist()

In [527]:
comparison_df = comparison_df.iloc[:,[0,1,2,3,5,13,14]]

In [528]:
comparison_df.head()

Unnamed: 0,date,confirmed_cases,keyword,geoname,Ausgangs- bzw. Ausbruchsland,Todesfälle,combined_counts
0,[],[1],"[Middle East respiratory syndrome, Middle East...",[Middle East],Oman,,"[nan, nan]"
1,"[2018-04-01, 2018-03-02, 2018-01-17, 2018-04-0...","[20, 19, 1, 7, 1, 27, 8, 30]","[listeriosis, listeriosis, listeriosis, lister...","[Commonwealth of Australia, Hong Kong Special ...",Australien,7.0,"[19, nan]"
2,"[2018-04-01, 2018-01-01, 1849-01-01, 2018-04-1...","[21, 413, 9, 1422, 5, 413, 9, 114, 27, 7, 8, 1...","[Lassa fever, Lassa fever, Lassa fever, Lassa ...","[Anambra State, Bauchi, Benue State, Federal C...",Nigeria,105.0,"[413, 1849]"
3,"[2018-05-11, 2018-05-10, 2018-05-09, 2018-11-1...","[34, 2, 18, 14, 5, 5, 2, 1, 1, 3, 2, 15, 4, 5]","[Ebola hemorrhagic fever, Ebola hemorrhagic fe...","[Democratic Republic of the Congo, Province de...",Demokratische Republik Kongo,18.0,"[2, nan]"
4,"[2018-05-10, 2018-04-04, 2018-05-13, 2018-05-1...","[7, 39, 19, 3, 2, 20, 7, 3, 5, 2, 393, 1200000...","[Ebola hemorrhagic fever, Ebola hemorrhagic fe...","[Province de l’Équateur, Mbandaka, Mbandaka, M...",Demokratische Republik Kongo,20.0,"[2, nan]"


In [502]:
matches = {"date":[],"cases":[]}
for index, row in comparison_df.iterrows():
    print(row['combined_counts'])
    matches["date"].append([any(date in epi_date  for epi_date in row["date"]) for date in row['combined_dates']])
    matches["cases"].append([any(count in epi_case for epi_case in row["confirmed_cases"]) for count in row['combined_counts']])

['nan', 'nan']
['19', '1 0']
['413', '9 0']
['2', 'nan']
['2', 'nan']
['3', 'nan']
['nan', 'nan']
['15', 'nan']
['1', 'nan']
['114', 'nan']
['nan', 'nan']
['nan', 'nan']
['nan', 'nan']
['16', '27 0']
['nan', 'nan']
['nan', 'nan']
['nan', 'nan']


In [471]:
matches["cases"]

[[False, False, False],
 [False, False, False],
 [False, False, False],
 [True, False, False],
 [True, False, False],
 [True, False, False],
 [False, False, False],
 [False, False, False],
 [True, False, False],
 [False, False, False],
 [False, False, False],
 [False, False, False],
 [False, False, False],
 [False, False, False],
 [False, False, False],
 [False, False, False],
 [False, False, False]]

In [461]:
epi = ['2018-04-01', '2018-03-02', '2018-01-17', '2018-04-06', '2018-01-17', '2018-03-02', '2018-03-03', '2018-03-07', '2018-03-08', '2018-03-08', '2018-03-09', '2018-03-08', '2018-04-04', '2018-04-02']

In [462]:
erg = ['2018-01-17', 'nan']

In [463]:
[any(date in epi_date for epi_date in epi) for date in erg]

[True, False]