In [2]:
import spacy
spacy.load('en_core_web_sm')
from epitator.annotator import AnnoDoc
from epitator.geoname_annotator import GeonameAnnotator
from epitator.resolved_keyword_annotator import ResolvedKeywordAnnotator
from epitator.count_annotator import CountAnnotator
from epitator.date_annotator import DateAnnotator
from boilerpipe.extract import Extractor
from itertools import groupby
import datetime
import sys
import pandas as pd
import re
from tqdm import tqdm_notebook as tqdm
import numpy as np
import epitator

# Scrape

In [3]:
import requests
from bs4 import BeautifulSoup
from sys import stdout
from time import sleep
import pickle
import os

def get_links_by_year(list_of_years=None, proxies={'http': 'http://fw-bln.rki.local:8020'}):
    """Returns (all) the anual links of the WHO DONs 
    
    list_of_years -- a list of years (YYYY format) you want to parse (default None)
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    """
    page = requests.get('http://www.who.int/csr/don/archive/year/en/',proxies=proxies)
    soup = BeautifulSoup(page.content,'html.parser')
    archiv_years = soup.find('ul',attrs={'class':'list'})
    years_links_html = archiv_years.find_all('a')
    if list_of_years:
        return ['http://www.who.int' + link.get('href') for link in years_links_html if any(year in link for year in list_of_years)]
    else:
        return ['http://www.who.int' + link.get('href') for link in years_links_html]
    
def get_links_per_year(years_links, list_of_months=None, proxies={'http': 'http://fw-bln.rki.local:8020'}):
    """Take a list of links to the annual archive and return a list of DON links of these years
    
    years_links -- a list of links of the anual archive to parse 
    list_of_months -- a list of months (MMM* format) you want to parse (default None)
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    """
    all_links = []
    
    for year_link in years_links:
        page_year = requests.get(year_link,proxies=proxies)
        soup_year = BeautifulSoup(page_year.content,'html.parser')
        archive_year = soup_year.find('ul',attrs={'class':'auto_archive'})
        daily_links = ['http://www.who.int' + link.get('href') for link in archive_year.find_all('a')]
        all_links.extend(daily_links)
    
    if list_of_months:
        all_links = [link for link in all_links if any(month in link for month in map(lambda s:s.lower(),list_of_months))]
    return all_links
    
headers = {
    'User-Agent': 'Auss Abbood, www.rki.de',
    'From': 'abbooda@rki.de'
}

def scrape(years=None,
           months=None,
           num_last_reports=None,
           headers=None,
           proxies={'http': 'http://fw-bln.rki.local:8020'}):
    """Scrapes the WHO DONs using the WHO DON scraping functions and returns the links to these DONs
    
    years -- a list of strings of years in the format YYYY to be scraped
    months -- a list of strings of months in the format MMM* to be scraped
    num_list_reports -- an integer to specify how many of the last reports should be scraped. 
    can be combined with the specification of year and/or month
    headers -- use a header for scraping
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    """
    years = get_links_by_year(list_of_years=years,proxies=proxies)
    all_links = get_links_per_year(years,list_of_months=months,proxies=proxies)
    return all_links

In [19]:
# Scrape all the WHO DONs of the year 2018
all_links = scrape(years=['2018'],proxies=None)

In [40]:
# Extract the main text of the given links
from boilerpipe.extract import Extractor
def extract(list_of_links):
    """Extracts the main content from a list of links and returns a list of texts (str)

    list_of_links -- a list containing URLs of webpages to get the main content from
    """
    return[Extractor(extractor='ArticleExtractor', url=url).getText().replace('\n','') \
         for url in tqdm(list_of_links)]

# Annotation
### Annotate

In [5]:
def annotate(text):
    """ Returns an document annotated for dates, disease counts, diseases, and geoneames
    
    text -- a string to be annotated
    """
    doc = AnnoDoc(text)
    doc.add_tiers(GeonameAnnotator())
    doc.add_tiers(ResolvedKeywordAnnotator())
    doc.add_tiers(CountAnnotator())
    doc.add_tiers(DateAnnotator())
    return doc

### Geonames

In [35]:
def get_geonames(doc,raw=False):
    """Returns the most occuring geographical entity in a annotated document
    
    doc -- an annotated string
    raw -- returns a not preprocessed annotation (Default False)
    """
    if raw:
        return [doc.tiers["geonames"].spans[i].geoname["name"] for i in range(len(doc.tiers["geonames"].spans))]
    else:
        geonames = [doc.tiers["geonames"].spans[i].geoname["name"] for i in range(len(doc.tiers["geonames"].spans))]
        geoname_counts = [(key,len(list(group))) for key, group in groupby(sorted(geonames))]
        geoname_ranking = sorted(geoname_counts,key=lambda x:x[1],reverse=True)
        geoname_most_occure = [geoname[0] for geoname in geoname_ranking if geoname[1] == geoname_ranking[0][1]]
        return geoname_most_occure

### Keywords

In [36]:
def get_keywords(doc,raw=False):
    """Returns the most occuring disease entity in a annotated document
    
    doc -- an annotated string
    raw -- returns a not preprocessed annotation (Default False)
    """
    if raw:
        return [doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]['entity']['label'] \
                for i in range(len(doc.tiers["resolved_keywords"].spans)) \
               if doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]['entity']['type'] \
                    == 'disease']
                     
    else:
        keywords = [(doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]['entity']['label'] \
                     ,doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]["weight"]) \
                    for i in range(len(doc.tiers["resolved_keywords"].spans)) \
                    if doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]['entity']['type'] \
                    == 'disease']

        # Here I ignore the included weights and only considere the most occuring disease name
        keywords_without_weight = [disease[0] for disease in keywords]
        keyword_counts = [(key,len(list(group))) for key, group in groupby(sorted(keywords_without_weight))]
        try:
            keyword = max(keyword_counts,key=lambda x:x[1])
        except ValueError:
            keyword = np.nan
        if type(keyword) is float:
            return keyword
        else:
            return keyword[0] # Only returns the keyword, not the weight

### Counts

In [37]:
def get_cases(doc,raw=False):
    """Returns the disease counts with the attribute "confirmed" in a annotated document
    
    doc -- an annotated string
    raw -- returns a not preprocessed annotation (Default False)
    """
    if raw:
        return [doc.tiers["counts"].spans[i].metadata['count'] for i in range(len(doc.tiers["counts"].spans))]
    else:
        return [doc.tiers["counts"].spans[i].metadata['count'] \
                for i in range(len(doc.tiers["counts"].spans)) \
                if "confirmed" in doc.tiers["counts"].spans[i].metadata['attributes']]


### Dates

In [38]:
def get_date(doc,raw=False):
    """Returns most mentioned date in a annotated document
    
    doc -- an annotated string
    raw -- returns a not preprocessed annotation (Default False)
    """
    dates = [doc.tiers["dates"].spans[i].metadata["datetime_range"][0].strftime("%Y-%m-%d") \
             for i in range(len(doc.tiers["dates"].spans))]
    if raw:
        return dates
    else:
        date_count_tuple = [(key,len(list(group))) for key, group in groupby(sorted(dates))]
        try:
            date = max(date_count_tuple,key=lambda x:x[1])
        except ValueError:
            date = np.nan
        if type(date) is float:
            return date
        else:
            return date[0]

In [39]:
# Run this shit (a.k.a annotate all the scraped WHO DONs)
def create_annotated_database(texts,raw=False):
    """Given a list of texts (str) annotate and extract disease keywords, geonames, and dates and return
    a dictonary of the text and the annotations
    
    texts -- a list of texts (str)
    raw -- returns a not preprocessed annotation (Default False)
    """
    database = {"text":texts,"date":[],"confirmed_cases":[],"keyword":[],"geoname":[]}
    for i,text in enumerate(tqdm(texts)):
        try:
            doc = annotate(text)
            database["date"].append(get_date(doc,raw))
            database["confirmed_cases"].append(get_cases(doc,raw))
            database["keyword"].append(get_keywords(doc,raw))
            database["geoname"].append(get_geonames(doc,raw))
        except TypeError as e:
            print("Type error in text({})".format(i) + ": " + str(e))
    return database

In [20]:
parsed_whos_df = pd.DataFrame.from_dict(create_annotated_database(extract(all_links)))
parsed_whos_df.head()

HBox(children=(IntProgress(value=0, max=81), HTML(value='')))




HBox(children=(IntProgress(value=0, max=81), HTML(value='')))




Unnamed: 0,text,date,confirmed_cases,keyword,geoname
0,Ebola virus disease – Democratic Republic of t...,2018-11-13,[],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
1,Ebola virus disease – Democratic Republic of t...,2018-11-06,[],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
2,Middle East respiratory syndrome coronavirus (...,2018-09-17,[2005],Middle East respiratory syndrome,"[Kingdom of Saudi Arabia, Middle East]"
3,Ebola virus disease – Democratic Republic of t...,2018-10-30,"[1, 1]",Ebola hemorrhagic fever,[Democratic Republic of the Congo]
4,Circulating vaccine-derived poliovirus type 2 ...,2018-01-01,[],poliomyelitis,"[Federal Republic of Nigeria, Republic of Niger]"


In [21]:
pickle.dump( parsed_whos_df, open( "parsed_whos_df.p", "wb" ) )

In [22]:
parsed_whos_df = pickle.load( open( "parsed_whos_df.p", "rb" ) )

In [23]:
parsed_whos_df.head()

Unnamed: 0,text,date,confirmed_cases,keyword,geoname
0,Ebola virus disease – Democratic Republic of t...,2018-11-13,[],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
1,Ebola virus disease – Democratic Republic of t...,2018-11-06,[],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
2,Middle East respiratory syndrome coronavirus (...,2018-09-17,[2005],Middle East respiratory syndrome,"[Kingdom of Saudi Arabia, Middle East]"
3,Ebola virus disease – Democratic Republic of t...,2018-10-30,"[1, 1]",Ebola hemorrhagic fever,[Democratic Republic of the Congo]
4,Circulating vaccine-derived poliovirus type 2 ...,2018-01-01,[],poliomyelitis,"[Federal Republic of Nigeria, Republic of Niger]"


## Compare with Ereignisdatenbank (incident report)

In [12]:
# Read in with columns with sources only
ereignisdatenbank = pd.read_csv("Ereignisse_utf8.csv",sep=";")

In [13]:
sources = ereignisdatenbank.iloc[:,15:26] # Get only the columns mentioning sources
sources = sources.dropna(how="all").reset_index(drop=True) # Drop empty rows at the end
sources.head()

Unnamed: 0,Quelle 1*,Datum der Veröffentlichung der Quelle1,Link zur Quelle 1,Quelle 2,Datum der Veröffentlichung der Quelle 2,Link zur Quelle 2,Quelle 3,Datum der Veröffentlichung der Quelle 3,Link zur Quelle 3,Quelle 4,Datum der Veröffentlichung der Quelle 4
0,WHO AFRO Bericht,12.03.2018,<http://apps.who.int/iris/bitstream/10665/2604...,NCDC,18.03.2018,,,,,,
1,WHO AFRO Bericht,12.03.2018,<http://apps.who.int/iris/bitstream/10665/2604...,,,,,,,,
2,WHO AFRO Bericht,21.03.2018,<http://apps.who.int/iris/bitstream/10665/2604...,,,,,,,,
3,WHO AFRO Bericht,12.03.2018,<http://apps.who.int/iris/bitstream/10665/2604...,,,,,,,,
4,ProMED-mail,21.03.2018,,MoH Fiji,,,,,,,


In [14]:
mask = sources.copy(deep=True) # Create a mask for filtering
for column in sources.columns:
    mask[column] = sources[column].str.contains('who',na=False) # Extract all the entries that have the word "who"
    mask[column] = sources[column].str.contains('don',na=False) # and "don"
indices = [i for i in range(len(mask)) if not mask.iloc[i].any()] 
sources_filtered = sources.drop(np.reshape(indices,(len(indices),))) # Drop all rows that don't mention "who" or "don"

In [15]:
#ereignisdatenbank.iloc[indices]
sources_filtered.head()

Unnamed: 0,Quelle 1*,Datum der Veröffentlichung der Quelle1,Link zur Quelle 1,Quelle 2,Datum der Veröffentlichung der Quelle 2,Link zur Quelle 2,Quelle 3,Datum der Veröffentlichung der Quelle 3,Link zur Quelle 3,Quelle 4,Datum der Veröffentlichung der Quelle 4
25,WHO,15.03.2018,http://www.who.int/csr/don/15-march-2018-mers-...,,,,,,,,
52,EAR report,13.04.2018,http://www.who.int/csr/don/09-april-2018-liste...,,,,,,,,
70,NCDC SitRep,15.04.2018,http://ncdc.gov.ng/themes/common/files/sitreps...,WHO DON,20.04.2018,http://www.who.int/csr/don/20-april-2018-lassa...,CIDRAP,18.04.2018,http://www.cidrap.umn.edu/news-perspective/201...,,
107,WHO,11.05.2018,http://www.who.int/news-room/detail/11-05-2018...,WHO DON,10.05.2018,http://www.who.int/csr/don/10-may-2018-ebola-d...,,,,,
112,WHO GOARN TK,15.05.2018,,WHO DON,14.05.2018,http://www.who.int/csr/don/14-may-2018-ebola-d...,,,,,


In [16]:
# Transform time to timestamp
sources_filtered["Datum der Veröffentlichung der Quelle1"] = \
sources_filtered["Datum der Veröffentlichung der Quelle1"].str.replace('.', ' ')

sources_filtered["Datum der Veröffentlichung der Quelle1"] = \
sources_filtered["Datum der Veröffentlichung der Quelle1"].apply(lambda x:datetime.datetime.strptime(x,'%d %m %Y'))

In [24]:
date_matches = {}
for column in sources_filtered.columns[::-3]: # Use only the columns mentioning dates
    date_matches[column] = [i for i in range(len(parsed_whos_df))\
                            if list(map(lambda x:str(x)[:-3],parsed_whos_df["date"].tolist()))[i] \
                            in list(map(lambda x:str(x)[:-12],sources_filtered[column].tolist()))]

In [25]:
import itertools
indices_that_matched = list(set(itertools.chain(*date_matches.values())))

In [26]:
ereignisdatenbank.iloc[sources_filtered.index.tolist(),[3,6,7,9]]

Unnamed: 0,Ausgangs- bzw. Ausbruchsland,Krankheitsbild(er),Frühestbekannter Ereignisbeginn,Fälle gesamt*
25,Oman,MERS,,
52,Australien,Listeriose,17.01.2018,20.0
70,Nigeria,Lassafieber,01.01.2018,
107,Demokratische Republik Kongo,Ebola,04.04.2018,34.0
112,Demokratische Republik Kongo,Ebola,04.04.2018,41.0
113,Demokratische Republik Kongo,Ebola,04.04.2018,44.0
120,Demokratische Republik Kongo,Ebola,04.04.2018,46.0
140,Indien,,,
145,Kamerun,Affenpocken,30.04.1018,16.0
149,Brasilien,Masern,,995.0


In [27]:
parsed_whos_df.iloc[indices_that_matched,1:].sort_values("date")

Unnamed: 0,date,confirmed_cases,keyword,geoname
64,2018-03-01,[],influenza,[Kingdom of the Netherlands]
60,2018-03-08,[],listeriosis,[Commonwealth of Australia]
25,2018-04-01,[1],yellow fever,[Guyane]
35,2018-04-01,"[1, 1]",Ebola hemorrhagic fever,[Democratic Republic of the Congo]
49,2018-04-04,[],Ebola hemorrhagic fever,[Mbandaka]
37,2018-04-04,"[1, 4]",Ebola hemorrhagic fever,[Democratic Republic of the Congo]
42,2018-04-04,[1],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
45,2018-04-04,[],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
36,2018-04-24,[],poliomyelitis,[Independent State of Papua New Guinea]
47,2018-04-30,[1],monkeypox,[Republic of Cameroon]


In [28]:
# Prettify the link description
link_description = [re.search(r'don/(.*)/en',all_links[i])[1]\
                    .replace('-', ' ',2).replace('-',', ',2).replace('-',' ')\
                    for i in range(len(all_links))]

In [29]:
# Extract the most important columns
compare = parsed_whos_df.iloc[:,[1,3,4]].copy()
compare['link_description'] = pd.Series(link_description,index=compare.index)

# To present

In [30]:
compare

Unnamed: 0,date,keyword,geoname,link_description
0,2018-11-13,Ebola hemorrhagic fever,[Democratic Republic of the Congo],"15 november 2018, ebola, drc"
1,2018-11-06,Ebola hemorrhagic fever,[Democratic Republic of the Congo],"08 november 2018, ebola, drc"
2,2018-09-17,Middle East respiratory syndrome,"[Kingdom of Saudi Arabia, Middle East]","01 november 2018, mers, saudi arabia"
3,2018-10-30,Ebola hemorrhagic fever,[Democratic Republic of the Congo],"01 november 2018, ebola, drc"
4,2018-01-01,poliomyelitis,"[Federal Republic of Nigeria, Republic of Niger]","30 october 2018, polio, niger"
5,2018-10-23,Ebola hemorrhagic fever,[Democratic Republic of the Congo],"25 october 2018, ebola, drc"
6,2018-10-16,Ebola hemorrhagic fever,[Democratic Republic of the Congo],"18 october 2018, ebola, drc"
7,2018-05-31,chikungunya,[Kassala State],"15 october 2018, chikungunya, sudan"
8,2018-10-09,Ebola hemorrhagic fever,[Democratic Republic of the Congo],"11 october 2018, ebola, drc"
9,2018-09-04,cholera,[Harare],"05 october 2018, cholera, zimbabwe"


In [31]:
# These are the links that were faulty during annotation
to_check = compare[compare["keyword"].isnull()==True].index.values
links_to_check = np.asarray(all_links)[to_check] # Get the links that caused the bad annotations

In [32]:
annotated_faulty_text = create_annotated_database(extract(links_to_check),raw=True)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [33]:
faulty_df = pd.DataFrame.from_dict(annotated_faulty_text).iloc[:,1:]
faulty_df

Unnamed: 0,date,confirmed_cases,keyword,geoname
0,"[2018-08-01, 2018-07-17, 2018-06-01, 2018-07-3...","[19, 17, 18, 1, 2, 3, 3, 4, 1, 2, 2600, 5000, ...",[],"[State of Kerala, State of Kerala, Kozhikode, ..."


In [34]:
# Entries that had the occurence of a geographical entity more than once
parsed_whos_df[parsed_whos_df["geoname"].str.len()>1]

Unnamed: 0,text,date,confirmed_cases,keyword,geoname
2,Middle East respiratory syndrome coronavirus (...,2018-09-17,[2005],Middle East respiratory syndrome,"[Kingdom of Saudi Arabia, Middle East]"
4,Circulating vaccine-derived poliovirus type 2 ...,2018-01-01,[],poliomyelitis,"[Federal Republic of Nigeria, Republic of Niger]"
30,Circulating vaccine-derived poliovirus type 2 ...,2016-01-01,[],poliomyelitis,"[Borno State, Gombe State, Jigawa State, Sokot..."
40,Middle East respiratory syndrome coronavirus (...,2018-02-01,"[75, 75, 2220, 790]",Middle East respiratory syndrome,"[Kingdom of Saudi Arabia, Middle East]"
52,Ebola virus disease – Democratic Republic of t...,2015-01-01,[],Ebola hemorrhagic fever,"[Democratic Republic of the Congo, Republic of..."
53,Circulating vaccine-derived polioviruses – Hor...,2018-05-01,[],poliomyelitis,"[Horn of Africa, Somalia]"
67,Circulating vaccine-derived poliovirus type 2 ...,2017-01-01,[],poliomyelitis,"[Mogadishu, Somalia]"


# Geo Tests

In [78]:
import pycountry

In [92]:
country_names = [list(pycountry.countries)[i].name for i in range(len(pycountry.countries))]

In [969]:
sorted(country_names)

In [101]:
from googletrans import Translator

In [99]:
import geograpy
url = 'http://www.bbc.com/news/world-europe-26919928'
places = geograpy.get_place_context(url=url)

ModuleNotFoundError: No module named 'geograpy'

# Goodnes Test

In [48]:
sources_filtered.head()

Unnamed: 0,Quelle 1*,Datum der Veröffentlichung der Quelle1,Link zur Quelle 1,Quelle 2,Datum der Veröffentlichung der Quelle 2,Link zur Quelle 2,Quelle 3,Datum der Veröffentlichung der Quelle 3,Link zur Quelle 3,Quelle 4,Datum der Veröffentlichung der Quelle 4
25,WHO,2018-03-15,http://www.who.int/csr/don/15-march-2018-mers-...,,,,,,,,
52,EAR report,2018-04-13,http://www.who.int/csr/don/09-april-2018-liste...,,,,,,,,
70,NCDC SitRep,2018-04-15,http://ncdc.gov.ng/themes/common/files/sitreps...,WHO DON,20.04.2018,http://www.who.int/csr/don/20-april-2018-lassa...,CIDRAP,18.04.2018,http://www.cidrap.umn.edu/news-perspective/201...,,
107,WHO,2018-05-11,http://www.who.int/news-room/detail/11-05-2018...,WHO DON,10.05.2018,http://www.who.int/csr/don/10-may-2018-ebola-d...,,,,,
112,WHO GOARN TK,2018-05-15,,WHO DON,14.05.2018,http://www.who.int/csr/don/14-may-2018-ebola-d...,,,,,


In [63]:
sources_filtered = sources_filtered.fillna("nan")

In [83]:
links = [link for link in sources_filtered["Link zur Quelle 1"] if (link != "nan") and ("pdf" not in link) ]

In [84]:
links

['http://www.who.int/csr/don/15-march-2018-mers-oman/en/',
 'http://www.who.int/csr/don/09-april-2018-listeriosis-australia/en/',
 'http://www.who.int/news-room/detail/11-05-2018-who-and-partners-working-with-national-health-authorities-to-contain-new-ebola-outbreak-in-the-democratic-republic-of-the-congo',
 'http://www.who.int/news-room/detail/21-05-2018-who-supports-ebola-vaccination-of-high-risk-populations-in-the-democratic-republic-of-the-congo',
 'http://www.who.int/csr/don/31-may-2018-nipah-virus-india/en/',
 'http://www.promedmail.org/post/5838919',
 'http://www.who.int/csr/don/11-june-2018-measles-brazil/en/',
 'http://www.who.int/csr/don/18-june-2018-rift-valley-fever-kenya/en/',
 'http://www.promedmail.org/post/5936610',
 'http://www.cidrap.umn.edu/news-perspective/2018/08/news-scan-aug-07-2018',
 'http://www.who.int/csr/don/07-august-2018-nipah-virus-india/en/',
 'http://www.who.int/csr/don/02-July-2018-polio-png/en/',
 'http://www.who.int/csr/don/24-august-2018-yellow-feve

In [85]:
links.extend([link for link in sources_filtered["Link zur Quelle 2"] if (link != "nan") and ("pdf" not in link)])

In [86]:
extracted = extract(links)

HBox(children=(IntProgress(value=0, max=22), HTML(value='')))

In [89]:
parsed_links_ereignisdatenbank = create_annotated_database(extracted)

HBox(children=(IntProgress(value=0, max=22), HTML(value='')))

In [90]:
pd.DataFrame.from_dict(parsed_links_ereignisdatenbank)

Unnamed: 0,text,date,confirmed_cases,keyword,geoname
0,"xls, 229kbPublic health responseIdentification...",,[],Middle East respiratory syndrome,[Middle East]
1,Disease outbreak news9 April 2018On 2 March 20...,2018-03-08,[],listeriosis,[Commonwealth of Australia]
2,Detail /WHO and partners working with national...,2018-11-17,[],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
3,WHO supports Ebola vaccination of high risk po...,2015-01-01,"[4, 1]",Ebola hemorrhagic fever,[Democratic Republic of the Congo]
4,"31 May 2018Event summaryOn 19 May 2018, three ...",2001-01-01,"[15, 1]",viral infectious disease,[Republic of India]
5,"©2001,2008 International Society for Infectiou...",,[],,[]
6,"Disease outbreak news11 June 2018In Brazil, th...",2018-01-01,"[84, 34, 2017]",measles,[Federative Republic of Brazil]
7,Disease outbreak news18 June 2018On 8 June 201...,2018-02-01,[7],Rift Valley fever,[Rift Valley Province]
8,"©2001,2008 International Society for Infectiou...",,[],,[]
9,"News Scan for Aug 07, 2018Role of cattle in ro...",2018-11-18,[1],poliomyelitis,[Republic of Guinea]


In [100]:
ereignisdatenbank.columns

Index(['Zeilen-ID', 'Ereignis-ID', 'Kontinent',
       'Ausgangs- bzw. Ausbruchsland ', 'Sekundär betroffene Länder*',
       'Erreger', 'Krankheitsbild(er)', 'Frühestbekannter Ereignisbeginn',
       'Erstveröffentlichung', 'Fälle gesamt*  ',
       'Datenstand für Fallzahlen gesamt*', 'Fälle bestätigt ',
       'Warcheinlische Fälle', 'Verdachtsfälle', 'Todesfälle ', 'Quelle 1*',
       'Datum der Veröffentlichung der Quelle1', 'Link zur Quelle 1',
       'Quelle 2', 'Datum der Veröffentlichung der Quelle 2',
       'Link zur Quelle 2', 'Quelle 3',
       'Datum der Veröffentlichung der Quelle 3', 'Link zur Quelle 3',
       'Quelle 4', 'Datum der Veröffentlichung der Quelle 4',
       'Link zur Quelle 4', 'Rationale für Monitoring*',
       'Rationale für Monitoring* Kommentar', 'Monitoring-Frequenz',
       'Verbindung zu Ereignis-ID*',
       'RKI-Berichtsformat* (z.B. Wochenbericht, EpiLag, etc)',
       'RKI-Berichtsformat Kategorie (z.B. Kurznachricht, Fortschreibung, etc.) ',


In [105]:
ereignisdatenbank.iloc[sources_filtered.index.tolist(),[3,6,7,8,10,11,12,13,14]]

Unnamed: 0,Ausgangs- bzw. Ausbruchsland,Krankheitsbild(er),Frühestbekannter Ereignisbeginn,Erstveröffentlichung,Datenstand für Fallzahlen gesamt*,Fälle bestätigt,Warcheinlische Fälle,Verdachtsfälle,Todesfälle
25,Oman,MERS,,,,,,,
52,Australien,Listeriose,17.01.2018,,02.03.2018,19.0,1.0,,7.0
70,Nigeria,Lassafieber,01.01.2018,,15.04.2018,413.0,9.0,1849.0,105.0
107,Demokratische Republik Kongo,Ebola,04.04.2018,,,2.0,,,18.0
112,Demokratische Republik Kongo,Ebola,04.04.2018,,,2.0,,,20.0
113,Demokratische Republik Kongo,Ebola,04.04.2018,,,3.0,,,23.0
120,Demokratische Republik Kongo,Ebola,04.04.2018,,,,,,24.0
140,Indien,,,,28.05.2018,15.0,,16.0,13.0
145,Kamerun,Affenpocken,30.04.1018,,30.05.2018,1.0,,15.0,
149,Brasilien,Masern,,,,114.0,,,2.0
