In [1]:
# Auto-reload imported modules before each cell runs, so that edits to the
# project code take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [62]:
import re
import numpy as np
import pandas as pd

from nlp_surveillance.event_db_preprocessing import event_db


In [176]:
# `edb` holds whatever the private `_read_unprocessed` reader returns
# (presumably the raw event database); `cleaned_edb` is the result of
# `read_cleaned`. The cells below compare columns of the two frames.
edb = event_db._read_unprocessed()
cleaned_edb = event_db.read_cleaned()

# Number of URLs

In [65]:
# Count the rows of the cleaned event db whose first URL column is populated.
url_count = cleaned_edb['URL_1'].notna().sum()
print('There are', url_count, 'URLs')

There are 376 URLs


# Cleaning Statistics
## Date

In [66]:
# Normalise the placeholder values ('nan', '-', NaN) of the raw date column to
# None, then keep only the entries that are strings.
date_column = edb['Datenstand für Fallzahlen gesamt*']
ts = date_column.replace(['nan', '-', np.nan], [None] * 3).tolist()
valid_ts = [entry for entry in ts if isinstance(entry, str)]

In [67]:
# NOTE(review): the total is taken from `cleaned_edb`, whereas `valid_ts` was
# built from the raw `edb` — confirm both frames contain the same rows.
n_entries = len(cleaned_edb)
n_non_empty_dates = len(valid_ts)
print('There are', n_entries, 'entries in the cleaned event db')
print(n_non_empty_dates, 'of them are non-empty dates.')

There are 468 entries in the cleaned event db
187 of them are non-empty dates.


In [68]:
def is_date(x):
    """Match ``x`` against a ``d(d)<sep>d(d)<sep>dddd`` pattern at its start.

    The pattern is one or two digits, one non-digit, one or two digits, one
    non-digit, then four digits. ``re.match`` anchors only at the beginning
    of the string, so any text following the four digits is accepted.

    Returns the ``re.Match`` object (truthy) on success, otherwise ``None``.
    """
    date_pattern = r"(\d{1,2})\D(\d{1,2})\D(\d{4})"
    return re.match(date_pattern, x)
# Keep the string entries that `is_date` accepts.
valid_dates = [entry for entry in valid_ts if is_date(entry)]
print(len(valid_dates), 'of all entries are valid dates.')

168 of all entries are valid dates.


In [69]:
# List the string entries that `is_date` rejects.
print('The following are non-valid dates:')
invalid_dates = [entry for entry in valid_ts if not is_date(entry)]
print(invalid_dates)

The following are non-valid dates:
['?', 'Ende Mai', '43329', '43332', '43332', '43335', '43336', '43335', '43335', '43340', '43340', 'Mitte Sept.', '13.10.218', 'September 2018', '2017', '2018', 'Jan-4.Nov 2018', 'July 2017 -22.10.2018', 'Juni-Nov 2018']


## Count

In [70]:
# Compare the total number of cleaned rows with those that carry a case count.
n_entries = len(cleaned_edb)
n_case_numbers = cleaned_edb['count_edb'].notna().sum()
print('There are', n_entries, 'entries in the cleaned event db')
print(n_case_numbers, 'of them are non-empty case numbers')

There are 468 entries in the cleaned event db
330 of them are non-empty case numbers


In [71]:
# Non-missing entries of the raw total-case-count column.
valid_counts = edb['Fälle gesamt*'].dropna()

In [72]:
def is_int(string):
    """Return True if ``int(string)`` succeeds, otherwise False.

    Args:
        string: The value to probe. Despite the name it may be any object;
            anything ``int()`` cannot convert is reported as not-an-int.

    Returns:
        bool: True when the conversion succeeds, False when it fails.
    """
    try:
        int(string)
    except (ValueError, TypeError, OverflowError):
        # ValueError: unparsable text or NaN; TypeError: None and other
        # non-numeric objects; OverflowError: infinite floats.
        return False
    return True

In [73]:
# List the non-missing count entries that `is_int` rejects.
print('These are the invalid count entries:')
invalid_counts = [entry for entry in valid_counts if not is_int(entry)]
print(invalid_counts)

These are the invalid count entries:
['1,078,997', '40 abzgl. 19 non-cases', 'mind 18', '368 Fälle', '6,382', '0 bei Menschen.\nMehr als 50 Todesfälle bei Kühem. Schafen und Pferden', '4 cVDPV2\n2 WPV1 in Afghanistan', '>1000', '>3.300', '13 430', '15,944', '1,207,596', '650,000', '3,057', '446.150', 'erhöhte Fallzahlen seit 2013', '25 (im Jahr 2018)', '10.604']


## Country

In [None]:
# Distinct country values before and after cleaning; show which raw values
# disappeared and which new values replaced them.
before = set(edb['Ausgangs- bzw. Ausbruchsland'])
after = set(cleaned_edb['country_edb'])
print(before.difference(after))
print('\n is cleaned to: \n')
print(after.difference(before))

In [None]:
# Values that were only stripped of whitespace do not show up in the last set
# difference, so check explicitly that they are present after cleaning.
all(country in after for country in ('Peru', 'Saudi-Arabien', 'Oman', 'Äthiopien'))

## Disease

In [154]:
# Raw disease values that no longer occur after cleaning.
before_disease = set(edb['Krankheitsbild(er)'])
after_disease = set(cleaned_edb['disease_edb'])
before_disease.difference(after_disease)

{' ',
 '?',
 'AFP, AFM',
 'Camel Prion Disease ',
 'Diarrhoe, Überkeit, Erbrechen',
 'FSME ',
 'Gesichtslähmung, Nervensystem betroffen.',
 'Gonorrhö, multiresistent',
 'Husten, Fieber',
 'Lassafieber ',
 'Leptospirose ',
 'Nierenversagen; v.a. HUS, Leptospirose',
 'Tick-borne relapsing fever ',
 'Tularämie ',
 'keine',
 nan}

## URLs

In [183]:
# Distinct values of the first source-link column before cleaning (raw `edb`)
# and after cleaning (`cleaned_edb`).
before_URLs =  set(edb['Link zur Quelle 1'])
after_URLs = set(cleaned_edb['URL_1'])

In [184]:
# Raw URL entries that do not survive cleaning unchanged. The causes are
# enclosing angle brackets (guillemets), several comma-separated URLs in one
# field, additional whitespace, and entries that are not valid URLs.
before_URLs - after_URLs

{'<http://apps.who.int/iris/bitstream/10665/260468/1/OEW10-39032018.pdf>',
 '<http://apps.who.int/iris/bitstream/10665/260468/1/OEW10-39032018.pdf>, <http://www.ncdc.gov.ng/diseases/sitreps>',
 '<http://apps.who.int/iris/bitstream/10665/260468/1/OEW10-39032018.pdf>, <http://www.nicd.ac.za/wp-content/uploads/2018/03/Listeria-Sitrep-08Mar2018.pdf>',
 'ECDC RRA 10 Aug 2018 Early occurrence of a large number of WNV infection.._',
 'Siehe Email',
 'http://apps.who.int/iris/bitstream/handle/10665/272360/OEW15-071342018.pdf ',
 'http://apps.who.int/iris/bitstream/handle/10665/272431/OEW17-2127042018.pdf ',
 'http://apps.who.int/iris/bitstream/handle/10665/274791/OEW38-1521092018.pdf ',
 'http://crofsblogs.typepad.com/h5n1/2018/03/brazil-moh-updates-yellow-fever-cases.html, http://portalarquivos2.saude.gov.br/images/pdf/2018/marco/07/Informe-FA-16-7mar18.pdf   ECDC RRA vom 14.03.2018',
 'http://portalarquivos2.saude.gov.br/images/pdf/2018/abril/26/Informe-FA-23-25abr18.pdf ',
 'http://promedma