In [1]:
import sys
sys.path.append("../../eventepi")
from eventepi.corpus_reader import HTMLCorpusReader, PickledCorpusReader

# This notebook demonstrates the functionality of the corpus readers

### HTMLCorpusReader is a corpus reader for the raw, scraped ProMED Mail and WHO DONs HTML files 

In [2]:
# Instantiate the HTML corpus reader with its default constructor arguments
html_reader = HTMLCorpusReader()

In [3]:
# Resolves the available documents and returns a list of their file names based on categories or file ids
# (only the first five entries are displayed here)
html_reader.resolve(categories="who_dons")[:5]

['who_dons/01-february-2015-avian-influenza.html',
 'who_dons/01-july-2016-ah7n9-china.html',
 'who_dons/01-june-2015-mers-korea.html',
 'who_dons/01-june-2015-mers-saudi-arabia.html',
 'who_dons/01-march-2018-lassa-fever-nigeria.html']

In [4]:
# Returns the raw docs given categories or fileids.
# next() takes only the first document; its first 600 characters are printed.
print(next(html_reader.docs(categories="who_dons"))[:600])

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>WHO |  Human infection with avian influenza A(H7N9) virus – Canada</title>
<meta name="DC.title" content="WHO |  Human infection with avian influenza A(H7N9) virus – Canada">
<meta name="DC.keywords" content=" avian influenza [subject], bird flu, avian flu, fowl pla


In [5]:
# Returns a readable version of the HTML given categories or fileids.
# next() takes only the first document; its first 600 characters are printed.
print(next(html_reader.html(categories="who_dons"))[:600])

<html><body><div><div id="primary">


    <h1 class="headline"> Human infection with avian influenza A(H7N9) virus – Canada</h1>



 


















		<p>
  		
  		<span>On 27 January 2015, the IHR National Focal Point of Canada notified WHO of 1 laboratory-confirmed case of human infection with avian influenza A(H7N9) virus. On January 30, 2015 a second individual, travelling through China with the index case, was laboratory confirmed to also have influenza A(H7N9) infection.</span></p>
		<p>
  		
  		<span>The two individuals flew from Hong Kong, SAR China to British Columbia, Canada afte


In [6]:
# Returns paragraphs given categories or fileids. A paragraph is based on the prespecified HTML tags
# (listed in html_reader.tags). next() takes only the first paragraph.
print("These tags are considered paragraphs: ", html_reader.tags)
print(next(html_reader.paras(categories="who_dons")))

These tags are considered paragraphs:  ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']
 Human infection with avian influenza A(H7N9) virus – Canada


In [7]:
# Returns sentences given categories or fileids; next() takes only the first sentence.
print(next(html_reader.sents(categories="who_dons")))

 Human infection with avian influenza A(H7N9) virus – Canada


In [8]:
# Returns words given categories or fileids; next() takes only the first word.
print(next(html_reader.words(categories="who_dons")))

Human


In [9]:
# Returns POS-tagged tokens given categories or fileids.
# next() takes only the first item, printed below as a list of sentences,
# each sentence being a list of (token, tag) tuples.
print(next(html_reader.tokenize(categories="who_dons")))

[[('Human', 'JJ'), ('infection', 'NN'), ('with', 'IN'), ('avian', 'JJ'), ('influenza', 'NN'), ('A', 'NNP'), ('(', '('), ('H7N9', 'NNP'), (')', ')'), ('virus', 'NN'), ('–', 'CD'), ('Canada', 'NNP')]]


In [10]:
# Returns the file sizes of the documents given categories or fileids;
# next() takes only the first one.
print(next(html_reader.sizes(categories="who_dons")))

67KB


In [11]:
# The corpus reader has a describe method that iterates over the selected files
# and returns basic statistics about them.
sample_fileids = [
    'who_dons/01-february-2015-avian-influenza.html',
    'who_dons/01-july-2016-ah7n9-china.html',
    'who_dons/01-june-2015-mers-korea.html',
    'who_dons/01-june-2015-mers-saudi-arabia.html',
    'who_dons/01-march-2018-lassa-fever-nigeria.html',
]
corpus_stats = html_reader.describe(fileids=sample_fileids)
print(corpus_stats)

{'files': 5, 'num_categories': 1, 'paragraphs': 87, 'sentences': 157, 'words': 3389, 'vocabulary': 795, 'lexical_diversity': 4.262893081761006, 'paras_per_doc': 17.4, 'sents_per_para': 1.8045977011494252, 'secs': 0.21319365501403809}


### PickledCorpusReader is a corpus reader for the preprocessed, pickled versions of the ProMED Mail and WHO DONs HTML files

In [12]:
# Instantiate the pickled corpus reader with its default constructor arguments
pickle_reader = PickledCorpusReader()

In [13]:
# Resolves the available documents and returns a list of their file names based on categories or file ids
# (only the first five entries are displayed here)
pickle_reader.resolve(categories="who_dons")[:5]

['who_dons/01-february-2015-avian-influenza.pickle',
 'who_dons/01-july-2016-ah7n9-china.pickle',
 'who_dons/01-june-2015-mers-korea.pickle',
 'who_dons/01-june-2015-mers-saudi-arabia.pickle',
 'who_dons/01-march-2018-lassa-fever-nigeria.pickle']

In [14]:
# Returns the preprocessed docs given categories or fileids.
# next() takes only the first document and [0] selects its first element
# (presumably the first paragraph: a list of sentences of (token, tag) tuples — TODO confirm).
print(next(pickle_reader.docs(categories="who_dons"))[0])

[[('Human', 'JJ'), ('infection', 'NN'), ('with', 'IN'), ('avian', 'JJ'), ('influenza', 'NN'), ('A', 'NNP'), ('(', '('), ('H7N9', 'NNP'), (')', ')'), ('virus', 'NN'), ('–', 'CD'), ('Canada', 'NNP')]]


In [15]:
# Returns paragraphs given categories or fileids; next() takes only the first paragraph,
# printed below as a list of sentences of (token, tag) tuples.
print(next(pickle_reader.paras(categories="who_dons")))

[[('Human', 'JJ'), ('infection', 'NN'), ('with', 'IN'), ('avian', 'JJ'), ('influenza', 'NN'), ('A', 'NNP'), ('(', '('), ('H7N9', 'NNP'), (')', ')'), ('virus', 'NN'), ('–', 'CD'), ('Canada', 'NNP')]]


In [16]:
# Returns sentences given categories or fileids; next() takes only the first sentence,
# printed below as a list of (token, tag) tuples.
print(next(pickle_reader.sents(categories="who_dons")))

[('Human', 'JJ'), ('infection', 'NN'), ('with', 'IN'), ('avian', 'JJ'), ('influenza', 'NN'), ('A', 'NNP'), ('(', '('), ('H7N9', 'NNP'), (')', ')'), ('virus', 'NN'), ('–', 'CD'), ('Canada', 'NNP')]


In [17]:
# Returns (token, POS-tag) tuples given categories or fileids; next() takes only the first one.
print(next(pickle_reader.tagged(categories="who_dons")))

('Human', 'JJ')


In [18]:
# Returns words given categories or fileids; next() takes only the first word.
print(next(pickle_reader.words(categories="who_dons")))

Human
