# Old stuff

In [4]:
import spacy
spacy.load('en_core_web_md')
from epitator.annotator import AnnoDoc
from epitator.geoname_annotator import GeonameAnnotator
from epitator.resolved_keyword_annotator import ResolvedKeywordAnnotator
from epitator.count_annotator import CountAnnotator
from epitator.date_annotator import DateAnnotator
from boilerpipe.extract import Extractor
from itertools import groupby
import datetime
import sys
import pandas as pd
import re
from tqdm import tqdm_notebook as tqdm
import numpy as np
import epitator

In [4]:
import requests
from bs4 import BeautifulSoup
from sys import stdout
from time import sleep
import pickle
import os

def get_links_by_year(list_of_years=None, proxies={'http': 'http://fw-bln.rki.local:8020'}):
    """Returns (all) the anual links of the WHO DONs 
    
    list_of_years -- a list of years (YYYY format) you want to parse (default None)
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    """
    page = requests.get('http://www.who.int/csr/don/archive/year/en/',proxies=proxies)
    soup = BeautifulSoup(page.content,'html.parser')
    archiv_years = soup.find('ul',attrs={'class':'list'})
    years_links_html = archiv_years.find_all('a')
    if list_of_years:
        return ['http://www.who.int' + link.get('href') for link in years_links_html if any(year in link for year in list_of_years)]
    else:
        return ['http://www.who.int' + link.get('href') for link in years_links_html]
    
def get_links_per_year(years_links, list_of_months=None, proxies={'http': 'http://fw-bln.rki.local:8020'}):
    '''Take a list of links to the annual archive and return a list of DON links of these years
    
    years_links -- a list of links of the anual archive to parse 
    list_of_months -- a list of months (MMM* format) you want to parse (default None)
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    '''
    all_links = []
    
    for year_link in years_links:
        page_year = requests.get(year_link,proxies=proxies)
        soup_year = BeautifulSoup(page_year.content,'html.parser')
        archive_year = soup_year.find('ul',attrs={'class':'auto_archive'})
        daily_links = ['http://www.who.int' + link.get('href') for link in archive_year.find_all('a')]
        all_links.extend(daily_links)
    
    if list_of_months:
        all_links = [link for link in all_links if any(month in link for month in map(lambda s:s.lower(),list_of_months))]
    return all_links
    
headers = {
    'User-Agent': 'Auss Abbood, www.rki.de',
    'From': 'abbooda@rki.de'
}

def scrape(years=None,
           months=None,
           num_last_reports=None,
           headers=None,
           proxies={'http': 'http://fw-bln.rki.local:8020'}):
    '''Scrapes the WHO DONs using the WHO DON scraping functions and returns the links to these DONs
    
    years -- a list of strings of years in the format YYYY to be scraped
    months -- a list of strings of months in the format MMM* to be scraped
    num_list_reports -- an integer to specify how many of the last reports should be scraped. 
    can be combined with the specification of year and/or month
    headers -- use a header for scraping
    proxies -- the proxy to use while scraping (default {'http': 'http://fw-bln.rki.local:8020'})
    '''
    years = get_links_by_year(list_of_years=years,proxies=proxies)
    all_links = get_links_per_year(years,list_of_months=months,proxies=proxies)
    return all_links

In [5]:
# Scrape all the WHO DONs of the year 2018
all_links = scrape(years=['2018'],proxies=None)

In [6]:
# Extract the main text of the given links
from boilerpipe.extract import Extractor
def extract(list_of_links):
    '''Extracts the main content from a list of links and returns a list of texts (str)

    list_of_links -- a list containing URLs of webpages to get the main content from
    '''
    return[Extractor(extractor='ArticleExtractor', url=url).getText().replace('\n','') \
         for url in tqdm(list_of_links)]

In [7]:
# Run this shit (a.k.a annotate all the scraped WHO DONs)
def create_annotated_database(texts,raw=False):
    '''Given a list of texts (str) annotate and extract disease keywords, geonames, and dates and return
    a dictonary of the text and the annotations
    
    texts -- a list of texts (str)
    raw -- returns a not preprocessed annotation (Default False)
    '''
    database = {"text":texts,"date":[],"confirmed_cases":[],"keyword":[],"geoname":[]}
    for i,text in enumerate(tqdm(texts)):
        try:
            doc = annotate(text)
            database["date"].append(get_date(doc,raw))
            database["confirmed_cases"].append(get_cases(doc,raw))
            database["keyword"].append(get_keywords(doc,raw))
            database["geoname"].append(get_geonames(doc,raw))
        except TypeError as e:
            print("Type error in text({})".format(i) + ": " + str(e))
    return database

In [15]:
def annotate(text):
    ''' Returns an document annotated for dates, disease counts, diseases, and geoneames
    
    text -- a string to be annotated
    '''
    doc = AnnoDoc(text)
    doc.add_tiers(GeonameAnnotator())
    doc.add_tiers(ResolvedKeywordAnnotator())
    doc.add_tiers(CountAnnotator())
    doc.add_tiers(DateAnnotator())
    return doc
def get_geonames(doc,raw=False):
    '''Returns the most occuring geographical entity in a annotated document
    
    doc -- an annotated string
    raw -- returns a not preprocessed annotation (Default False)
    '''
    if raw:
        return [doc.tiers["geonames"].spans[i].geoname["name"] for i in range(len(doc.tiers["geonames"].spans))]
    else:
        geonames = [doc.tiers["geonames"].spans[i].geoname["name"] for i in range(len(doc.tiers["geonames"].spans))]
        geoname_counts = [(key,len(list(group))) for key, group in groupby(sorted(geonames))]
        geoname_ranking = sorted(geoname_counts,key=lambda x:x[1],reverse=True)
        geoname_most_occure = [geoname[0] for geoname in geoname_ranking if geoname[1] == geoname_ranking[0][1]]
        return geoname_most_occure
def get_keywords(doc,raw=False):
    '''Returns the most occuring disease entity in a annotated document
    
    doc -- an annotated string
    raw -- returns a not preprocessed annotation (Default False)
    '''
    if raw:
        return [doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]['entity']['label'] \
                for i in range(len(doc.tiers["resolved_keywords"].spans)) \
               if doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]['entity']['type'] \
                    == 'disease']
                     
    else:
        keywords = [(doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]['entity']['label'] \
                     ,doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]["weight"]) \
                    for i in range(len(doc.tiers["resolved_keywords"].spans)) \
                    if doc.tiers["resolved_keywords"].spans[i].metadata["resolutions"][0]['entity']['type'] \
                    == 'disease']

        # Here I ignore the included weights and only considere the most occuring disease name
        keywords_without_weight = [disease[0] for disease in keywords]
        keyword_counts = [(key,len(list(group))) for key, group in groupby(sorted(keywords_without_weight))]
        try:
            keyword = max(keyword_counts,key=lambda x:x[1])
        except ValueError:
            keyword = np.nan
        if type(keyword) is float:
            return keyword
        else:
            return keyword[0] # Only returns the keyword, not the weight
def get_cases(doc,raw=False):
    '''Returns the disease counts with the attribute "confirmed" in a annotated document
    
    doc -- an annotated string
    raw -- returns a not preprocessed annotation (Default False)
    '''
    if raw:
        return [doc.tiers["counts"].spans[i].metadata['count'] for i in range(len(doc.tiers["counts"].spans))]
    else:
        return [doc.tiers["counts"].spans[i].metadata['count'] \
                for i in range(len(doc.tiers["counts"].spans)) \
                if "confirmed" in doc.tiers["counts"].spans[i].metadata['attributes']]
def get_date(doc,raw=False):
    '''Returns most mentioned date in a annotated document
    
    doc -- an annotated string
    raw -- returns a not preprocessed annotation (Default False)
    '''
    dates = [doc.tiers["dates"].spans[i].metadata["datetime_range"][0].strftime("%Y-%m-%d") \
             for i in range(len(doc.tiers["dates"].spans))]
    if raw:
        return dates
    else:
        date_count_tuple = [(key,len(list(group))) for key, group in groupby(sorted(dates))]
        try:
            date = max(date_count_tuple,key=lambda x:x[1])
        except ValueError:
            date = np.nan
        if type(date) is float:
            return date
        else:
            return date[0]

In [13]:
extracted = extract(all_links)

HBox(children=(IntProgress(value=0, max=80), HTML(value='')))




In [16]:
parsed_whos_df = pd.DataFrame.from_dict(create_annotated_database(extracted))
parsed_whos_df.head()

HBox(children=(IntProgress(value=0, max=80), HTML(value='')))




Unnamed: 0,text,date,confirmed_cases,keyword,geoname
0,Ebola virus disease – Democratic Republic of t...,2018-11-06,[],Ebola hemorrhagic fever,[Democratic Republic of the Congo]
1,Middle East respiratory syndrome coronavirus (...,2018-09-17,[2005],Middle East respiratory syndrome,"[Kingdom of Saudi Arabia, Middle East]"
2,Ebola virus disease – Democratic Republic of t...,2018-10-30,"[1, 1]",Ebola hemorrhagic fever,[Democratic Republic of the Congo]
3,Circulating vaccine-derived poliovirus type 2 ...,2018-01-01,[],poliomyelitis,"[Federal Republic of Nigeria, Republic of Niger]"
4,Ebola virus disease – Democratic Republic of t...,2018-10-23,"[27, 2]",Ebola hemorrhagic fever,[Democratic Republic of the Congo]


In [5]:
ereignisdatenbank = pd.read_csv("Ereignisse_utf8.csv",sep=";")

In [6]:
countries_not_null = ereignisdatenbank[pd.notnull(ereignisdatenbank["Ausgangs- bzw. Ausbruchsland "])]

In [7]:
countries = countries_not_null["Ausgangs- bzw. Ausbruchsland "].copy(deep=True)
countries.head()

0    Nigeria
1      Benin
2    Liberia
3      Ghana
4       Fiji
Name: Ausgangs- bzw. Ausbruchsland , dtype: object

# Let the testing begin

In [21]:
from translate import Translator
translator= Translator(to_lang="english")
translation = translator.translate("VAE")
translation

'VAE'

In [18]:
translated_pair = []
for country in range(len(set(countries))):
    

93

In [20]:
set(countries)

{' Äthiopien (AWD)',
 'Afghanistan',
 'Afghanistan,\nDR Congo\nNigeria\nSomalia',
 'Algerien',
 'Angola',
 'Australien',
 'Bangladesch',
 'Benin',
 'Bolivien',
 'Brasilien',
 'Burundi',
 'China',
 'Costa Rica',
 'DRC',
 'DRCongo',
 'Demokratische Republik Kongo',
 'Deutschland',
 'El Salvador',
 'Fiji',
 'Frankreich',
 'Französich_Polynesien',
 'Französiche Guyana',
 'Französisch-Polynesien',
 'Ghana',
 'Haiti',
 'Indien',
 'Irak',
 'Iran',
 'Israel',
 'Italien',
 'Italien, Griechenland, Rumanien, Ungarn, Frankreich',
 'Italien, Griechenland, Ungarn, Rumänien',
 'Italien, Serbien, Griechenland, Rumänien, Ungarn, Frankreich, Kosovo, Albanien, Macedonien, Montenegro, Serbien, Türkei',
 'Kamerun',
 'Kanada',
 'Kenia',
 'Kolumbien',
 'Kongo',
 'Kroatien',
 'Kuwait',
 'La Reunion',
 'Liberia',
 'Madagaskar',
 'Malawi',
 'Mali',
 'Mauretanien',
 'Mosambik',
 'Myanmar',
 'Namibia',
 'Niger',
 'Nigeria',
 'Nordeuropa',
 'Oman ',
 'Pakistan',
 'Papua-Neuguinea',
 'Peru',
 'Peru ',
 'Philippinen

In [282]:
from bs4 import BeautifulSoup
import requests

In [308]:
req = requests.get("https://de.wikipedia.org/wiki/Liste_der_Staaten_der_Erde")
soup = BeautifulSoup(req.content,"html.parser")

In [309]:
parsed_soup = soup.find("table",class_="wikitable sortable zebra").find("tbody") # Find table with of all countries
parsed_soup = parsed_soup.find_all("tr") # Get entries of countries form table
amount_countries = len(parsed_soup)
parsed_soup = [parsed_soup[i].find_all('td') \
               for i in range(amount_countries)] # Extract table entries from country entry

In [332]:
translation = {"state_name_de":[],"full_state_name_de":[],"capital_de":[],"translation_state_name":[]}
regex = re.compile(r"\[\d*\]") # To remove footnotes in the names
for i in range(amount_countries):
    try:
        translation["state_name_de"].append(list(parsed_soup[i][0].children)[0].string.replace("\n","")) # slicing the last part because of \n
        translation["full_state_name_de"].append(regex.sub("",parsed_soup[i][1].text[:-1])) # slicing the last part because of \n
        translation["capital_de"].append(regex.sub("",parsed_soup[i][2].text[:-1])) # slicing the last part because of \n
        translation["translation_state_name"].append(regex.sub("",parsed_soup[i][10].text[:-1])) # slicing the last part because of \n     
    except: # because header and footer are part of the table
        print(i)

0
213


In [333]:
pd.DataFrame.from_dict(translation)[40:]

Unnamed: 0,state_name_de,full_state_name_de,capital_de,translation_state_name
40,Cookinseln,Cookinseln,Avarua,Cook Islands
41,Costa Rica,Republik Costa Rica,San José,Costa Rica
42,Dänemark,Königreich Dänemark,Kopenhagen,Denmark
43,Deutschland,Bundesrepublik Deutschland,Berlin,Germany
44,Dominica,Commonwealth Dominica,Roseau,Dominica
45,Dominikanische Republik,Dominikanische Republik,Santo Domingo,Dominican Republic
46,Dschibuti,Republik Dschibuti,Dschibuti-Stadt,Djibouti
47,Ecuador,Republik Ecuador,Quito,Ecuador
48,El Salvador,Republik El Salvador,San Salvador,El Salvador
49,Elfenbeinküste,Republik Côte d’Ivoire,Yamoussoukro,Ivory Coast


In [57]:
translated_name = [soup.find("table",class_="wikitable sortable zebra").find("tbody")\
                   .find_all("tr")[0].find_all('td')[10] for i in range(len(soup.find("table",class_="wikitable sortable zebra").find("tbody")\
                                                                            .find_all("tr")[0]))]

IndexError: list index out of range

In [59]:
len(soup.find("table",class_="wikitable sortable zebra").find("tbody").find_all("tr"))

214