In [1]:
# Load IPython's autoreload extension; mode 2 re-imports edited modules
# automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import re
import numpy as np
import pandas as pd

from nlp_surveillance.event_db_preprocessing import event_db


In [3]:
# Load two versions of the event DB so the cells below can compare them:
# `edb` through the private reader `_read_unprocessed`, `cleaned_edb` through
# `read_cleaned`.
# NOTE(review): presumably raw vs. cleaned data — confirm against event_db.
edb = event_db._read_unprocessed()
cleaned_edb = event_db.read_cleaned()

# Number of URLs

In [312]:
# Count the rows of the cleaned event DB whose 'URL' entry is not missing.
print('There are', int(cleaned_edb['URL'].notna().sum()), 'usable URLs')

There are 466 usable URLs


# Cleaning Statistics
## Date

In [7]:
# Replace the placeholder values 'nan', '-' and NaN with None, then keep only
# the entries that are still strings.
ts = (
    edb['Datenstand für Fallzahlen gesamt*']
    .replace(['nan', '-', np.nan], [None] * 3)
    .tolist()
)
valid_ts = [entry for entry in ts if isinstance(entry, str)]

In [8]:
# Report the size of the cleaned event DB and the number of string timestamps.
# NOTE(review): `valid_ts` is built from the unprocessed `edb`, while the total
# printed first is len(cleaned_edb) — confirm both frames have the same rows,
# otherwise "of them" is misleading.
print('There are', len(cleaned_edb), 'entries in the cleaned event db')
print(len(valid_ts), 'of them are non-empty dates.')

There are 557 entries in the cleaned event db
187 of them are non-empty dates.


In [9]:
def is_date(x):
    """Check whether a string starts with a day/month/year-like pattern.

    The pattern is one or two digits, one non-digit separator, one or two
    digits, one non-digit separator, and four digits. Matching is anchored at
    the start of the string only (``re.match``), so trailing text is allowed.

    Args:
        x: The string to test.

    Returns:
        The ``re.Match`` object if the pattern matches (truthy), else ``None``.
    """
    return re.match(r"(\d{1,2})\D(\d{1,2})\D(\d{4})", x)
# Keep only the timestamp strings that `is_date` accepts.
valid_dates = [entry for entry in valid_ts if is_date(entry)]
print(len(valid_dates), 'of all entries are valid dates.')

168 of all entries are valid dates.


In [10]:
# List every timestamp string that `is_date` rejects.
print('The following are non-valid dates:')
print([entry for entry in valid_ts if not is_date(entry)])

The following are non-valid dates:
['?', 'Ende Mai', '43329', '43332', '43332', '43335', '43336', '43335', '43335', '43340', '43340', 'Mitte Sept.', '13.10.218', 'September 2018', '2017', '2018', 'Jan-4.Nov 2018', 'July 2017 -22.10.2018', 'Juni-Nov 2018']


## Count

In [11]:
# Series.count() returns the number of non-missing values in 'count_edb'.
print('There are', len(cleaned_edb), 'entries in the cleaned event db')
print(int(cleaned_edb['count_edb'].count()), 'of them are non-empty case numbers')

There are 557 entries in the cleaned event db
394 of them are non-empty case numbers


In [12]:
# Case-number entries of the unprocessed event DB with missing values removed.
valid_counts = edb['Fälle gesamt*'].dropna()

In [13]:
def is_int(string):
    """Return whether ``int(string)`` succeeds.

    Args:
        string: The value to test; typically a string, but any object that
            ``int()`` accepts (e.g. a number) also counts as convertible.

    Returns:
        True if ``int(string)`` works, False otherwise.
    """
    try:
        int(string)
    except (TypeError, ValueError, OverflowError):
        # ValueError: malformed text such as '1,078,997' or NaN.
        # TypeError: unsupported types such as None.
        # OverflowError: infinite floats.
        return False
    return True

In [14]:
# List every non-missing case-number entry that `is_int` rejects.
print('These are the invalid count entries:')
print([value for value in valid_counts if not is_int(value)])

These are the invalid count entries:
['1,078,997', '40 abzgl. 19 non-cases', 'mind 18', '368 Fälle', '6,382', '0 bei Menschen.\nMehr als 50 Todesfälle bei Kühem. Schafen und Pferden', '4 cVDPV2\n2 WPV1 in Afghanistan', '>1000', '>3.300', '13 430', '15,944', '1,207,596', '650,000', '3,057', '446.150', 'erhöhte Fallzahlen seit 2013', '25 (im Jahr 2018)', '10.604']


## Country

In [15]:
# Distinct country values in the unprocessed and in the cleaned event DB.
before = set(edb['Ausgangs- bzw. Ausbruchsland'])
after = set(cleaned_edb['country_edb'])

# Values found only in the unprocessed column ...
print(before.difference(after))
print('\n is cleaned to: \n')
# ... and values found only in the cleaned column.
print(after.difference(before))

{nan, 'Nigeria, Edo State', 'Indien, Jaipur', 'Italien, Griechenland, Ungarn, Rumänien', 'Afghanistan,\nDR Congo\nNigeria\nSomalia', 'Oman ', 'Trinidad & Tobago', 'Serbien, Italien, Griechenland, Ungarn,  Rumänien', 'Venezuela ', 'Französich_Polynesien', 'Saudi-Arabien ', ' Äthiopien (AWD)', 'USA, Delaware', 'VAE ', 'Peru ', 'Italien, Serbien, Griechenland, Rumänien, Ungarn, Frankreich, Kosovo, Albanien, Macedonien, Montenegro, Serbien, Türkei', 'DRC, Nord Kivu', 'Namibia, Opuwo District, Kunene Region', 'Italien, Griechenland, Rumanien, Ungarn, Frankreich'}

 is cleaned to: 

{None, 'Französich Polynesien', 'Jaipur', 'VAE', 'Rumanien', 'Edo State', 'Ungarn', 'Opuwo District', 'Macedonien', 'Nord Kivu', 'Serbien', 'Delaware', 'Rumänien', 'Trinidad und Tobago', 'Oman', 'Kunene Region', 'Griechenland', 'DR Congo', 'Kosovo', 'Albanien', 'Montenegro'}


In [16]:
# Names that only had surrounding whitespace stripped are missing from the
# last printed set, so check directly that they are present after cleaning.
all(country in after for country in ('Peru', 'Saudi-Arabien', 'Oman', 'Äthiopien'))

True

## Disease

In [17]:
# Disease entries of the unprocessed event DB that do not appear verbatim in
# the cleaned 'disease_edb' column.
before_disease = set(edb['Krankheitsbild(er)'])
after_disease = set(cleaned_edb['disease_edb'])
before_disease.difference(after_disease)

{' ',
 '?',
 'AFP, AFM',
 'Camel Prion Disease ',
 'Diarrhoe, Überkeit, Erbrechen',
 'FSME ',
 'Gesichtslähmung, Nervensystem betroffen.',
 'Gonorrhö, multiresistent',
 'Husten, Fieber',
 'Lassafieber ',
 'Leptospirose ',
 'Nierenversagen; v.a. HUS, Leptospirose',
 'Tick-borne relapsing fever ',
 'Tularämie ',
 'keine',
 nan}

## URLs

In [19]:
# Distinct source-link values in the unprocessed event DB and distinct 'URL'
# values in the cleaned event DB, compared in the next cell.
before_URLs =  set(edb['Link zur Quelle 1'])
after_URLs = set(cleaned_edb['URL'])

In [20]:
# Raw links that do not appear verbatim among the cleaned URLs. The causes are
# links wrapped in angle brackets, several comma-separated links in one cell,
# surplus whitespace, and entries that are not valid URLs.
before_URLs.difference(after_URLs)

{'<http://apps.who.int/iris/bitstream/10665/260468/1/OEW10-39032018.pdf>',
 '<http://apps.who.int/iris/bitstream/10665/260468/1/OEW10-39032018.pdf>, <http://www.ncdc.gov.ng/diseases/sitreps>',
 '<http://apps.who.int/iris/bitstream/10665/260468/1/OEW10-39032018.pdf>, <http://www.nicd.ac.za/wp-content/uploads/2018/03/Listeria-Sitrep-08Mar2018.pdf>',
 'ECDC RRA 10 Aug 2018 Early occurrence of a large number of WNV infection.._',
 'Siehe Email',
 'http://apps.who.int/iris/bitstream/handle/10665/272360/OEW15-071342018.pdf ',
 'http://apps.who.int/iris/bitstream/handle/10665/272431/OEW17-2127042018.pdf ',
 'http://apps.who.int/iris/bitstream/handle/10665/274791/OEW38-1521092018.pdf ',
 'http://crofsblogs.typepad.com/h5n1/2018/03/brazil-moh-updates-yellow-fever-cases.html, http://portalarquivos2.saude.gov.br/images/pdf/2018/marco/07/Informe-FA-16-7mar18.pdf   ECDC RRA vom 14.03.2018',
 'http://portalarquivos2.saude.gov.br/images/pdf/2018/abril/26/Informe-FA-23-25abr18.pdf ',
 'http://promedma

# Pipeline

In [21]:
from nlp_surveillance.pipeline import *

In [24]:
# Preview the first rows of what CleanEventDB().data_output() returns
# (presumably a DataFrame — the table below shows date, count, country,
# disease and URL columns).
CleanEventDB().data_output().head()

Unnamed: 0,date_of_data,count_edb,country_edb,disease_edb,URL
0,2018-03-18,1121.0,Nigeria,Lassafieber,http://apps.who.int/iris/bitstream/10665/26046...
1,2018-03-18,1121.0,Nigeria,Lassafieber,http://www.ncdc.gov.ng/diseases/sitreps
2,2018-03-05,24.0,Benin,Lassafieber,http://apps.who.int/iris/bitstream/10665/26046...
3,2018-03-16,42.0,Liberia,Lassafieber,http://apps.who.int/iris/bitstream/10665/26046...
4,2018-03-02,1.0,Ghana,Lassafieber,http://apps.who.int/iris/bitstream/10665/26046...


In [25]:
# Preview the first rows of the disease-name table; the output below pairs a
# German label (itemLabel_DE) with an English label (itemLabel_EN).
RequestDiseaseNamesFromWikiData().data_output().head()

Unnamed: 0,itemLabel_DE,itemLabel_EN
0,Masern,measles
1,oromandibuläre Dystonie,oromandibular dystonia
2,Lemierre-Syndrom,Lemierre's syndrome
3,Frambösie,yaws
4,Akne,acne


In [35]:
# Display the complete country lookup; the output below is a mapping from
# abbreviations to English country names.
# NOTE(review): this dumps the whole mapping — consider showing only a sample.
CleanCountryLookUpAndAddAbbreviations().data_output()

{'RA': 'Republic of Artsakh',
 'IRA': 'Afghanistan',
 'ARÄ': 'Egypt',
 'DVA': 'Algeria',
 'FA': 'Andorra',
 'AB': 'Antigua and Barbuda',
 'RÄ': 'Equatorial Guinea',
 'AR': 'Argentina',
 'DBÄ': 'Ethiopia',
 'CB': 'Bahamas',
 'KB': 'Bhutan',
 'VB': 'Bangladesh',
 'RB': 'Belarus',
 'PSB': 'Bolivia',
 'BH': 'Bahrain',
 'FRB': 'Brazil',
 'BD': 'Bangladesh',
 'BF': 'Burkina Faso',
 'RC': 'Republic of Congo',
 'VC': 'Saint Vincent and the Grenadines',
 'RCR': 'Costa Rica',
 'KD': 'Denmark',
 'CD': 'Democratic Republic of the Congo',
 'DR': 'Dominican Republic',
 'RD': 'Djibouti',
 'RE': 'Estonia',
 'RES': 'El Salvador',
 'RCI': 'Ivory Coast',
 'SE': 'Sweden',
 'RF': 'Russia oder Russian Federation',
 'FR': 'France',
 'GR': 'Greece',
 'RG': 'Guinea',
 'HR': 'Croatia',
 'RGB': 'Guinea-Bissau',
 'KRG': 'Guyana',
 'RH': 'Honduras',
 'RI': 'Iceland',
 'IRI': 'Iran',
 'SI': 'Slovenia',
 'IR': 'Iran',
 'RJ': 'Yemen',
 'HKJ': 'Jordan',
 'KK': 'Cambodia',
 'RK': 'Republic of Korea',
 'RCV': 'Cape Verd

In [36]:
# Display the complete disease lookup; the output below is a mapping from
# three-letter abbreviations to English disease names.
# NOTE(review): this dumps the whole mapping — consider showing only a sample.
MergeDiseaseNameLookupWithAbbreviationsOfRKI().data_output()

{'MSV': 'measles',
 'STY': 'typhoid fever',
 'NEG': 'gonorrhea',
 'MPV': 'mumps',
 'BAN': 'anthrax',
 'SPY': 'scarlet fever',
 'HEV': 'hepatitis E',
 'SAL': 'salmonellosis',
 'GIL': 'giardiasis',
 'HDV': 'hepatitis D',
 'YPS': 'plague',
 'COR': 'diphtheria',
 'BPS': 'pertussis',
 'TRI': 'trichinosis',
 'FRT': 'tularemia',
 'CLO': 'botulism',
 'HCV': 'hepatitis C',
 'LEP': 'leptospirosis',
 'LEG': "legionnaires' disease",
 'GFV': 'yellow fever',
 'BRU': 'brucellosis',
 'RUV': 'rubella',
 'HAV': 'hepatitis A',
 'YEN': 'yersiniosis',
 'RIC': 'spotted fever',
 'CHL': 'ornithosis',
 'COX': 'Q fever',
 'LIS': 'listeriosis',
 'MBV': 'Marburg virus disease',
 'GBR': 'gas gangrene',
 'BOB': 'borreliosis',
 'HFM': 'hand, foot and mouth disease',
 'LSV': 'Lassa fever',
 'SPA': 'paratyphoid fever',
 'PVB': 'erythema infectiosum',
 'ECH': 'echinococcosis',
 'ECC': 'echinococcosis',
 'INV': 'influenza',
 'HBV': 'hepatitis B',
 'POV': 'poliomyelitis',
 'VCH': 'cholera',
 'PLA': 'malaria',
 'PLM': 'ma

In [53]:
# Preview the first rows of the scraping step for 'event_db'; the output below
# has an `extracted_text` column, which is empty for several rows.
ScrapeFromURLsAndExtractText('event_db').data_output().head()

Unnamed: 0,date_of_data,count_edb,country_edb,disease_edb,extracted_text
0,2018-03-18,1121.0,Nigeria,Lassa fever,
1,2018-03-18,1121.0,Nigeria,Lassa fever,Subscribe to NewsletterMandate of NCDCThe Nige...
2,2018-03-05,24.0,Benin,Lassa fever,
3,2018-03-16,42.0,Liberia,Lassa fever,
4,2018-03-02,1.0,Ghana,Lassa fever,


In [38]:
# Preview the first rows of the annotation step for 'event_db'; the output
# below has an `annotated` column holding epitator AnnoDoc objects where
# present.
AnnotateDoc('event_db').data_output().head()

Unnamed: 0,date_of_data,count_edb,country_edb,disease_edb,annotated
0,2018-03-18,1121.0,Nigeria,Lassa fever,
1,2018-03-18,1121.0,Nigeria,Lassa fever,<epitator.annodoc.AnnoDoc object at 0x7f59f3a9...
2,2018-03-05,24.0,Benin,Lassa fever,
3,2018-03-16,42.0,Liberia,Lassa fever,
4,2018-03-02,1.0,Ghana,Lassa fever,


In [313]:
# Preview the first rows of the sentence extraction for 'dates'; the output
# below has `sentences` and `dates` columns next to the annotated documents.
ExtractSentencesAndLabel('dates').data_output().head()

Unnamed: 0,date_of_data,annotated,sentences,dates
1,2018-03-18,<epitator.annodoc.AnnoDoc object at 0x7f59f0ff...,[Subscribe to NewsletterMandate of NCDCThe Nig...,"[[2011-01-01 00:00:00, 2012-01-01 00:00:00]]"
29,2018-03-19,<epitator.annodoc.AnnoDoc object at 0x7f59f0bc...,[ International Society for Infectious Disease...,"[[2010-01-01 00:00:00, 2011-01-01 00:00:00]]"
40,2018-03-14,<epitator.annodoc.AnnoDoc object at 0x7f59f0bc...,[Excerpt from the Google translation:The Minis...,"[[2018-01-24 00:00:00, 2018-01-25 00:00:00], [..."
43,2018-03-20,<epitator.annodoc.AnnoDoc object at 0x7f59f0bc...,[ International Society for Infectious Disease...,"[[2010-01-01 00:00:00, 2011-01-01 00:00:00]]"
44,2018-03-20,<epitator.annodoc.AnnoDoc object at 0x7f59f0bc...,[ International Society for Infectious Disease...,"[[2010-01-01 00:00:00, 2011-01-01 00:00:00]]"
