In [1]:
import sys
sys.path.append("../../eventepi")

from collections import Counter

import pandas as pd
from epitator.annotator import AnnoDoc
from epitator.count_annotator import CountAnnotator
from epitator.date_annotator import DateAnnotator

from eventepi.corpus_reader import PickledCorpusReader
from eventepi.idb import IDB
from eventepi.summarize import Summarizer

  from pandas import Panel


In [2]:
idb = IDB()
idb.preprocess()
preprocessed_idb = idb.df_processed

In [3]:
preprocessed_idb.head()

Unnamed: 0,url_idb,date_cases_idb,case_counts_idb,country_idb,disease_idb,fileid
0,http://www.who.int/csr/don/15-march-2018-mers-...,,,Oman,Middle east respiratory syndrome,who_dons/15-march-2018-mers-oman.html
1,http://www.who.int/csr/don/09-april-2018-liste...,2018-03-02,20.0,Australia,Listeriosis,who_dons/09-april-2018-listeriosis-australia.html
2,https://promedmail.org/promed-post/?id=&id=575...,,,Bolivia,Plague,promed/2018-04-17_id5752357.html
3,https://promedmail.org/promed-post/?id=&id=573...,,,Yemen,Cholera,promed/2018-04-07_id5732129.html
4,https://promedmail.org/promed-post/?id=&id=574...,,65.0,La Reunion,Leptospirosis,promed/2018-04-14_id5746842.html


In [4]:
summarizer = Summarizer()
corpus = PickledCorpusReader()

In [5]:
texts_of_preprocessed_idb = list(corpus.docs(preprocessed_idb.fileid.str.replace(".html", ".pickle")))

In [6]:
summarized = summarizer.summarize(texts_of_preprocessed_idb)

In [7]:
summarized[0]

{'disease': 'Middle East respiratory syndrome', 'geoname': 'Sultanate of Oman'}

In [8]:
# If this script is run the first time, uncomment and save the extraction performance CSVs
# Otherwise, load the manually scored performance of the key entity extraction using the most frequent approach

# pd.DataFrame(
#     {
#         "summarized": [i["disease"] for i in summarized], 
#         "idb": preprocessed_idb["disease_idb"],
#         "correct": [0] * len(preprocessed_idb)
#     }
# ).to_csv("data/disease_extraction_performance.csv")

# pd.DataFrame(
#     {
#         "summarized": [i["geoname"] for i in summarized], 
#         "idb": preprocessed_idb["country_idb"],
#         "correct": [0] * len(preprocessed_idb)
#     }
# ).to_csv("data/country_extraction_performance.csv")

# After manually scoring

In [8]:
diseases = pd.read_csv("data/disease_extraction_performance.csv")
print(diseases["correct"].mean())
print(diseases["correct"].value_counts())

0.9411764705882353
1.0    112
0.0      7
Name: correct, dtype: int64


In [9]:
sum(diseases.summarized.isna() & diseases.idb.notna())  # Entries where EpiTator failed to detect the disease

8

In [10]:
countries = pd.read_csv("data/country_extraction_performance.csv")
print(countries["correct"].mean())
print(countries["correct"].value_counts())

0.8545454545454545
1.0    141
0.0     24
Name: correct, dtype: int64


In [11]:
sum(countries.summarized.isna() & countries.idb.notna())  # Entries where EpiTator failed to detect the country

1

### Check most-frequent for dates and counts

In [12]:
def most_frequent_count(doc):
    try:
        counts = [i.metadata["count"] for i in doc.tiers["counts"].spans]
        return Counter(counts).most_common(1)[0][0]
    except Exception:
        return ""

def most_frequent_date(doc):
    try:
        dates = [i.metadata["dates"].metadata["datetime_range"][0] for i in doc.tiers["dates"].spans]
        return Counter(dates).most_common(1)[0][0]
    except Exception:
        return ""
    
def annotate(text):
    doc = AnnoDoc(text)
    doc.add_tiers(CountAnnotator())
    doc.add_tiers(DateAnnotator())
    return doc
def is_in_date_range(found, target):
    return ((target - pd.Timedelta("3days")) >= found) & ((target + pd.Timedelta("3days")) <= target)

In [13]:
most_frequents = {"count": [], "date": []}
for text in texts_of_preprocessed_idb:
    doc = annotate(text)
    most_frequents["count"].append(most_frequent_count(doc))
    most_frequents["date"].append(most_frequent_date(doc))

In [14]:
sum(preprocessed_idb.case_counts_idb == most_frequents["count"])

  res_values = method(rvalues)


0

In [15]:
matches = []
for i, target in enumerate(preprocessed_idb.date_cases_idb.values):
    matches.append(is_in_date_range(pd.to_datetime(most_frequents["date"][i]), pd.to_datetime(target)))

In [16]:
sum(matches)

0