In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import numpy as np
import pandas as pd

from nlp_surveillance.event_db_preprocessing import event_db


In [3]:
# Load both versions of the event database: `edb` via `_read_unprocessed()`
# and `cleaned_edb` via `read_cleaned()`. The cells below compare the two.
# NOTE(review): `_read_unprocessed` is underscore-prefixed, i.e. private by
# convention; confirm it is meant to be called from outside the module.
edb = event_db._read_unprocessed()
cleaned_edb = event_db.read_cleaned()

# Cleaning Statistics
## Date

In [4]:
# Take the 'Datenstand für Fallzahlen gesamt*' column, map the placeholders
# 'nan', '-' and NaN to None, and keep only the entries that are strings.
raw_dates = edb['Datenstand für Fallzahlen gesamt*']
ts = raw_dates.replace(['nan', '-', np.nan], [None] * 3).tolist()
valid_ts = [entry for entry in ts if isinstance(entry, str)]

In [5]:
# Report the table size and the number of non-empty date strings.
# NOTE(review): the total is taken from `cleaned_edb`, whereas `valid_ts` was
# built from the unprocessed `edb`; confirm both tables have the same rows
# before reading "of them" as a share of the first number.
print('There are', len(cleaned_edb), 'entries in the cleaned event db')
print(len(valid_ts), 'of them are non-empty dates.')

There are 557 entries in the cleaned event db
187 of them are non-empty dates.


In [6]:
def is_date(x):
    """Return a match object if ``x`` starts with a date-like pattern, else None.

    The pattern is: 1-2 digits, one non-digit, 1-2 digits, one non-digit,
    4 digits. Only the start of the string is anchored (``re.match``), so
    any text after the four-digit group is accepted.
    """
    return re.match(r"(\d{1,2})\D(\d{1,2})\D(\d{4})", x)


valid_dates = [entry for entry in valid_ts if is_date(entry)]
print(len(valid_dates), 'of all entries are valid dates.')

168 of all entries are valid dates.


In [7]:
# List every non-empty date string that the date pattern rejects.
print('The following are non-valid dates:')
print([entry for entry in valid_ts if not is_date(entry)])

The following are non-valid dates:
['?', 'Ende Mai', '43329', '43332', '43332', '43335', '43336', '43335', '43335', '43340', '43340', 'Mitte Sept.', '13.10.218', 'September 2018', '2017', '2018', 'Jan-4.Nov 2018', 'July 2017 -22.10.2018', 'Juni-Nov 2018']


## Count

In [8]:
# Report the table size and how many rows carry a (non-NA) case count.
print('There are', len(cleaned_edb), 'entries in the cleaned event db')
print(cleaned_edb['count_edb'].count(), 'of them are non-empty case numbers')

There are 557 entries in the cleaned event db
394 of them are non-empty case numbers


In [9]:
# Raw case-count entries with missing values removed.
valid_counts = edb['Fälle gesamt*'].dropna()

In [10]:
def is_int(string):
    """Return True if ``int(string)`` succeeds, otherwise False.

    Args:
        string: Value to test, e.g. a raw cell value (str or number).

    Returns:
        bool: True when ``int()`` accepts the value. Finite floats are
        accepted because ``int()`` truncates them; strings containing
        thousands separators, units or other text are rejected.
    """
    try:
        int(string)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other objects int() cannot convert.
        # OverflowError: float infinity. ValueError: unparsable text or NaN.
        return False
    return True

In [11]:
# List every non-missing raw count that int() cannot parse.
print('These are the invalid count entries:')
print([entry for entry in valid_counts if not is_int(entry)])

These are the invalid count entries:
['1,078,997', '40 abzgl. 19 non-cases', 'mind 18', '368 Fälle', '6,382', '0 bei Menschen.\nMehr als 50 Todesfälle bei Kühem. Schafen und Pferden', '4 cVDPV2\n2 WPV1 in Afghanistan', '>1000', '>3.300', '13 430', '15,944', '1,207,596', '650,000', '3,057', '446.150', 'erhöhte Fallzahlen seit 2013', '25 (im Jahr 2018)', '10.604']


## Country

In [12]:
# Country values found only in the raw column, then those found only in the
# cleaned column.
before = set(edb['Ausgangs- bzw. Ausbruchsland'])
after = set(cleaned_edb['country_edb'])
print(before.difference(after))
print('\n is cleaned to: \n')
print(after.difference(before))

{nan, 'Peru ', 'USA, Delaware', 'Französich_Polynesien', 'Afghanistan,\nDR Congo\nNigeria\nSomalia', 'Serbien, Italien, Griechenland, Ungarn,  Rumänien', 'Italien, Serbien, Griechenland, Rumänien, Ungarn, Frankreich, Kosovo, Albanien, Macedonien, Montenegro, Serbien, Türkei', 'Trinidad & Tobago', 'DRC, Nord Kivu', 'Namibia, Opuwo District, Kunene Region', 'Italien, Griechenland, Rumanien, Ungarn, Frankreich', 'Nigeria, Edo State', 'Venezuela ', 'Italien, Griechenland, Ungarn, Rumänien', ' Äthiopien (AWD)', 'VAE ', 'Oman ', 'Indien, Jaipur', 'Saudi-Arabien '}

 is cleaned to: 

{'VAE', 'Kunene Region', None, 'Französich Polynesien', 'Kosovo', 'Oman', 'Jaipur', 'Edo State', 'Rumänien', 'Ungarn', 'Montenegro', 'Nord Kivu', 'Trinidad und Tobago', 'Opuwo District', 'Griechenland', 'Delaware', 'Macedonien', 'Albanien', 'Rumanien', 'DR Congo', 'Serbien'}


In [13]:
# Some stripped values are missing from the last set printed above even
# though they were processed correctly; check that they are present in `after`.
all(country in after for country in ('Peru', 'Saudi-Arabien', 'Oman', 'Äthiopien'))

True

## Disease

In [14]:
# Disease entries that appear in the raw column but not in the cleaned one.
before_disease = set(edb['Krankheitsbild(er)'])
after_disease = set(cleaned_edb['disease_edb'])
before_disease.difference(after_disease)

{' ',
 '?',
 'AFP, AFM',
 'Camel Prion Disease ',
 'Diarrhoe, Überkeit, Erbrechen',
 'FSME ',
 'Gesichtslähmung, Nervensystem betroffen.',
 'Gonorrhö, multiresistent',
 'Husten, Fieber',
 'Lassafieber ',
 'Leptospirose ',
 'Nierenversagen; v.a. HUS, Leptospirose',
 'Tick-borne relapsing fever ',
 'Tularämie ',
 'keine',
 nan}

## URLs

In [15]:
# Distinct source-link values before cleaning (raw 'Link zur Quelle 1' column)
# and after cleaning (cleaned 'URL' column).
before_URLs =  set(edb['Link zur Quelle 1'])
after_URLs = set(cleaned_edb['URL'])

In [16]:
# Raw link values that do not survive cleaning unchanged. The problems are:
# URLs wrapped in angle brackets, several comma-separated URLs in one cell,
# additional whitespace, and entries that are not valid URLs.
before_URLs - after_URLs

{'<http://apps.who.int/iris/bitstream/10665/260468/1/OEW10-39032018.pdf>',
 '<http://apps.who.int/iris/bitstream/10665/260468/1/OEW10-39032018.pdf>, <http://www.ncdc.gov.ng/diseases/sitreps>',
 '<http://apps.who.int/iris/bitstream/10665/260468/1/OEW10-39032018.pdf>, <http://www.nicd.ac.za/wp-content/uploads/2018/03/Listeria-Sitrep-08Mar2018.pdf>',
 'ECDC RRA 10 Aug 2018 Early occurrence of a large number of WNV infection.._',
 'Siehe Email',
 'http://apps.who.int/iris/bitstream/handle/10665/272360/OEW15-071342018.pdf ',
 'http://apps.who.int/iris/bitstream/handle/10665/272431/OEW17-2127042018.pdf ',
 'http://apps.who.int/iris/bitstream/handle/10665/274791/OEW38-1521092018.pdf ',
 'http://crofsblogs.typepad.com/h5n1/2018/03/brazil-moh-updates-yellow-fever-cases.html, http://portalarquivos2.saude.gov.br/images/pdf/2018/marco/07/Informe-FA-16-7mar18.pdf   ECDC RRA vom 14.03.2018',
 'http://portalarquivos2.saude.gov.br/images/pdf/2018/abril/26/Informe-FA-23-25abr18.pdf ',
 'http://promedma

# Pipeline

In [17]:
from nlp_surveillance.pipeline import *

In [18]:
# Preview the first rows returned by CleanEventDB's data_output().
CleanEventDB().data_output().head()

Unnamed: 0,date_of_data,count_edb,country_edb,disease_edb,URL
0,2018-03-18,1121.0,Nigeria,Lassafieber,http://apps.who.int/iris/bitstream/10665/26046...
1,2018-03-18,1121.0,Nigeria,Lassafieber,http://www.ncdc.gov.ng/diseases/sitreps
2,2018-03-05,24.0,Benin,Lassafieber,http://apps.who.int/iris/bitstream/10665/26046...
3,2018-03-16,42.0,Liberia,Lassafieber,http://apps.who.int/iris/bitstream/10665/26046...
4,2018-03-02,1.0,Ghana,Lassafieber,http://apps.who.int/iris/bitstream/10665/26046...


In [19]:
# Preview the disease-name table; the stored output shows German and English
# label columns (itemLabel_DE, itemLabel_EN).
RequestDiseaseNamesFromWikiData().data_output().head()

Unnamed: 0,itemLabel_DE,itemLabel_EN
0,Masern,measles
1,oromandibuläre Dystonie,oromandibular dystonia
2,Lemierre-Syndrom,Lemierre's syndrome
3,Frambösie,yaws
4,Ovarialkarzinom,ovarian cancer


In [20]:
# Preview the country-name table; the stored output shows German names, an
# English translation column and ISO abbreviation columns.
ScrapeCountryNamesFromWikipedia().data_output().head()

Unnamed: 0,state_name_de,full_state_name_de,translation_state_name,iso_three_abbreviation,iso_two_abbreviation
0,Erde,—,Earth,—,—
1,Europäische Union,—,European Union,—,EU
2,Union Südamerikanischer Nationen,—,Union of South American Nations,—,UNASUL
3,Afrikanische Union,—,African Union,—,—
4,Verband Südost­asiatischer Nationen,—,Association of Southeast Asian Nations,—,—


In [21]:
# Display the whole country lookup (no .head() here); the stored output is a
# dict from abbreviation to English country name.
CleanCountryLookUpAndAddAbbreviations().data_output()

{'RA': 'Republic of Artsakh',
 'IRA': 'Afghanistan',
 'ARÄ': 'Egypt',
 'DVA': 'Algeria',
 'FA': 'Andorra',
 'AB': 'Antigua and Barbuda',
 'RÄ': 'Equatorial Guinea',
 'AR': 'Argentina',
 'DBÄ': 'Ethiopia',
 'CB': 'Bahamas',
 'KB': 'Bhutan',
 'VB': 'Bangladesh',
 'RB': 'Belarus',
 'PSB': 'Bolivia',
 'BH': 'Bahrain',
 'FRB': 'Brazil',
 'BD': 'Bangladesh',
 'BF': 'Burkina Faso',
 'RC': 'Republic of Congo',
 'VC': 'Saint Vincent and the Grenadines',
 'RCR': 'Costa Rica',
 'KD': 'Denmark',
 'CD': 'Democratic Republic of the Congo',
 'DR': 'Dominican Republic',
 'RD': 'Djibouti',
 'RE': 'Estonia',
 'RES': 'El Salvador',
 'RCI': 'Ivory Coast',
 'SE': 'Sweden',
 'RF': 'Russia oder Russian Federation',
 'FR': 'France',
 'GR': 'Greece',
 'RG': 'Guinea',
 'HR': 'Croatia',
 'RGB': 'Guinea-Bissau',
 'KRG': 'Guyana',
 'RH': 'Honduras',
 'RI': 'Iceland',
 'IRI': 'Iran',
 'SI': 'Slovenia',
 'IR': 'Iran',
 'RJ': 'Yemen',
 'HKJ': 'Jordan',
 'KK': 'Cambodia',
 'RK': 'Republic of Korea',
 'RCV': 'Cape Verd

In [22]:
# Display the whole disease lookup (no .head() here); the stored output is a
# dict from three-letter code to English disease name.
MergeDiseaseNameLookupWithAbbreviationsOfRKI().data_output()

{'MSV': 'measles',
 'MPV': 'mumps',
 'SPY': 'scarlet fever',
 'STY': 'typhoid fever',
 'NEG': 'gonorrhea',
 'BAN': 'anthrax',
 'YPS': 'plague',
 'COR': 'diphtheria',
 'GIL': 'giardiasis',
 'HEV': 'hepatitis E',
 'SAL': 'salmonellosis',
 'TRI': 'trichinosis',
 'HDV': 'hepatitis D',
 'BPS': 'pertussis',
 'CLO': 'botulism',
 'HCV': 'hepatitis C',
 'FRT': 'tularemia',
 'GFV': 'yellow fever',
 'BRU': 'brucellosis',
 'RUV': 'rubella',
 'HAV': 'hepatitis A',
 'LIS': 'listeriosis',
 'RIC': 'spotted fever',
 'CHL': 'ornithosis',
 'COX': 'Q fever',
 'LEP': 'leptospirosis',
 'LEG': "legionnaires' disease",
 'MBV': 'Marburg virus disease',
 'GBR': 'gas gangrene',
 'HFM': 'hand, foot and mouth disease',
 'LSV': 'Lassa fever',
 'SPA': 'paratyphoid fever',
 'BOB': 'borreliosis',
 'PVB': 'erythema infectiosum',
 'ECH': 'echinococcosis',
 'ECC': 'echinococcosis',
 'YEN': 'yersiniosis',
 'INV': 'influenza',
 'HBV': 'hepatitis B',
 'VCH': 'cholera',
 'PLA': 'malaria',
 'PLM': 'malaria',
 'POV': 'poliomye

In [23]:
# Preview the event DB after this step. In the stored output, disease_edb
# reads 'Lassa fever' where the CleanEventDB preview showed 'Lassafieber'.
ApplyControlledVocabularyToEventDB().data_output().head()

Unnamed: 0,date_of_data,count_edb,country_edb,disease_edb,URL
0,2018-03-18,1121.0,Nigeria,Lassa fever,http://apps.who.int/iris/bitstream/10665/26046...
1,2018-03-18,1121.0,Nigeria,Lassa fever,http://www.ncdc.gov.ng/diseases/sitreps
2,2018-03-05,24.0,Benin,Lassa fever,http://apps.who.int/iris/bitstream/10665/26046...
3,2018-03-16,42.0,Liberia,Lassa fever,http://apps.who.int/iris/bitstream/10665/26046...
4,2018-03-02,1.0,Ghana,Lassa fever,http://apps.who.int/iris/bitstream/10665/26046...


In [24]:
# Preview the ProMED result for '2018'; the stored output has a single URL column.
ScrapePromed('2018').data_output().head()

Unnamed: 0,URL
0,https://www.promedmail.org/post/6237302
1,https://www.promedmail.org/post/6236998
2,https://www.promedmail.org/post/6236997
3,https://www.promedmail.org/post/6236702
4,https://www.promedmail.org/post/6235503


In [25]:
# Preview the WHO result for '2018'; the stored output has a single URL column.
ScrapeWHO('2018').data_output().head()

Unnamed: 0,URL
0,http://www.who.int/csr/don/28-december-2018-eb...
1,http://www.who.int/csr/don/28-december-2018-me...
2,http://www.who.int/csr/don/27-december-2018-ty...
3,http://www.who.int/csr/don/20-december-2018-eb...
4,http://www.who.int/csr/don/18-December-2018-ye...


In [26]:
# Preview the event DB with an added extracted_text column. In the stored
# output that column is empty for the apps.who.int links.
ScrapeFromURLsAndExtractText('event_db').data_output().head()

Unnamed: 0,date_of_data,count_edb,country_edb,disease_edb,URL,extracted_text
0,2018-03-18,1121.0,Nigeria,Lassa fever,http://apps.who.int/iris/bitstream/10665/26046...,
1,2018-03-18,1121.0,Nigeria,Lassa fever,http://www.ncdc.gov.ng/diseases/sitreps,Subscribe to NewsletterMandate of NCDCThe Nige...
2,2018-03-05,24.0,Benin,Lassa fever,http://apps.who.int/iris/bitstream/10665/26046...,
3,2018-03-16,42.0,Liberia,Lassa fever,http://apps.who.int/iris/bitstream/10665/26046...,
4,2018-03-02,1.0,Ghana,Lassa fever,http://apps.who.int/iris/bitstream/10665/26046...,


In [27]:
# Preview URL and extracted_text for the 'who' source.
ScrapeFromURLsAndExtractText('who').data_output().head()

Unnamed: 0,URL,extracted_text
0,http://www.who.int/csr/don/28-december-2018-eb...,Ebola virus disease – Democratic Republic of t...
1,http://www.who.int/csr/don/28-december-2018-me...,Middle East respiratory syndrome coronavirus (...
2,http://www.who.int/csr/don/27-december-2018-ty...,Typhoid fever – Islamic Republic of PakistanDi...
3,http://www.who.int/csr/don/20-december-2018-eb...,Ebola virus disease – Democratic Republic of t...
4,http://www.who.int/csr/don/18-December-2018-ye...,Yellow Fever – Kingdom of the NetherlandsDisea...


In [28]:
# Preview URL and extracted_text for the 'promed' source.
ScrapeFromURLsAndExtractText('promed').data_output().head()

Unnamed: 0,URL,extracted_text
0,https://www.promedmail.org/post/6237302,"©2001,2008 International Society for Infectiou..."
1,https://www.promedmail.org/post/6236998,"©2001,2008 International Society for Infectiou..."
2,https://www.promedmail.org/post/6236997,"©2001,2008 International Society for Infectiou..."
3,https://www.promedmail.org/post/6236702,"©2001,2008 International Society for Infectiou..."
4,https://www.promedmail.org/post/6235503,"©2001,2008 International Society for Infectiou..."


In [29]:
# Preview the event DB with an `annotated` column. The stored output shows an
# epitator AnnoDoc object in one row and blanks in the others.
AnnotateDoc('event_db').data_output().head()

Unnamed: 0,date_of_data,count_edb,country_edb,disease_edb,URL,annotated
0,2018-03-18,1121.0,Nigeria,Lassa fever,http://apps.who.int/iris/bitstream/10665/26046...,
1,2018-03-18,1121.0,Nigeria,Lassa fever,http://www.ncdc.gov.ng/diseases/sitreps,<epitator.annodoc.AnnoDoc object at 0x7f54d690...
2,2018-03-05,24.0,Benin,Lassa fever,http://apps.who.int/iris/bitstream/10665/26046...,
3,2018-03-16,42.0,Liberia,Lassa fever,http://apps.who.int/iris/bitstream/10665/26046...,
4,2018-03-02,1.0,Ghana,Lassa fever,http://apps.who.int/iris/bitstream/10665/26046...,


In [30]:
# Preview the labelled sentences for 'dates' (columns: label, sentence).
ExtractSentencesAndLabel('dates').data_output().head()

Unnamed: 0,label,sentence
2,False,Excerpt from the Google translation:The Minist...
5,False,Thus the period for the analysis considers fro...
10,True,Disease outbreak news9 April 2018On 2 March 20...
11,False,Disease outbreak news9 April 2018On 2 March 20...
12,False,Disease outbreak news9 April 2018On 2 March 20...


In [31]:
# Preview the labelled sentences for 'counts' (columns: label, sentence).
ExtractSentencesAndLabel('counts').data_output().head()

Unnamed: 0,label,sentence
0,False,Subscribe to NewsletterMandate of NCDCThe Nige...
1,False,Subscribe to NewsletterMandate of NCDCThe Nige...
2,False,Subscribe to NewsletterMandate of NCDCThe Nige...
3,False,"Abuja, 25 January 2019 – In an historic event ..."
4,False,"Abuja, 25 January 2019 – In an historic event ..."


In [57]:
# Preview the recommender labels (columns: annotated, label).
# NOTE(review): this cell's stored execution count (57) is out of sequence
# with its neighbours; re-run on a fresh kernel to confirm it works in order.
RecommenderLabeling().data_output().head()

Unnamed: 0,annotated,label
0,<epitator.annodoc.AnnoDoc object at 0x7f54d484...,False
1,<epitator.annodoc.AnnoDoc object at 0x7f54d484...,False
2,<epitator.annodoc.AnnoDoc object at 0x7f54d484...,False
3,<epitator.annodoc.AnnoDoc object at 0x7f54d484...,False
4,<epitator.annodoc.AnnoDoc object at 0x7f54d484...,False


# Recommender System

In [33]:
# Number of rows in the WHO result for '2018'.
len(ScrapeWHO('2018').data_output())

91

In [34]:
# Number of rows in the ProMED result for '2018'.
len(ScrapePromed('2018').data_output())

3141

In [35]:
# Stack the WHO and ProMED results into one frame. `ignore_index` is not
# passed, so each part keeps its own index labels.
scraped = pd.concat([ScrapeWHO('2018').data_output(),
                    ScrapePromed('2018').data_output()])

In [36]:
# Count the event-DB URLs that point at a WHO DON or ProMED article.
urls = CleanEventDB().data_output()['URL']
urls_not_na = urls.dropna()
print(f'There are {len(urls_not_na)} usable URLs')
who_promed_urls = [
    url for url in urls_not_na
    if any(marker in url for marker in ('/don/', 'promed'))
]
# Drop links that are exactly the bare ProMED address below.
# NOTE(review): this is an exact string match, so variants such as
# 'www.promedmail.org' are kept; confirm that is intended.
urls_that_have_an_article = [
    url for url in who_promed_urls if url != 'https://www.promedmail.org/'
]
print(f'{len(urls_that_have_an_article)} of them are usable for the recommender system')

There are 557 usable URLs
174 of them are usable for the recommender system


In [37]:
# Event-DB article URLs that are absent from the combined WHO/ProMED scrape.
print('Following URLs of the event db could not be matched to all the Promed and WHO DON articles of 2018')
set(urls_that_have_an_article).difference(scraped['URL'])

Following URLs of the event db could not be matched to all the Promed and WHO DON articles of 2018


{'https://www.promedmail.org/ Active number 20180428.5771404',
 'https://www.promedmail.org/5770330',
 'https://www.promedmail.org/5928463',
 'https://www.promedmail.org/6059979',
 'https://www.promedmail.org/post/201808305997572',
 'https://www.promedmail.org/post/201809016002180',
 'https://www.promedmail.org/post/5717098',
 'https://www.promedmail.org/post/5793837',
 'https://www.promedmail.org/post/5824986',
 'www.promedmail.org'}