# Preamble

In [1]:
import urllib.request
from bs4 import BeautifulSoup
import re
import os

import pandas as pd
import numpy as np
from itertools import cycle

def shift(l, n):
    """Return the items of ``l`` from position ``n`` onwards, followed by its first ``n`` items.

    For ``0 <= n <= len(l)`` this is a left rotation by ``n`` positions.
    ``l`` must support slicing and ``+`` (e.g. a list or a string).
    """
    head = l[:n]
    tail = l[n:]
    return tail + head

# List of HTML files downloaded manually from the [OIE website](https://www.oie.int/wahis_2/public/wahid.php/Diseaseinformation/reportarchive)

In [2]:
# List the manually downloaded archive pages. The IPython `!` shell escape
# captures the output of `ls` as a list with one file name per entry.
htmlFiles = !ls ../../data/htmls/
print(htmlFiles)

['200502.html', '200503.html', '200504.html', '200505.html', '200506.html', '200507.html', '200508.html', '200509.html', '200510.html', '200511.html', '200512.html', '200601.html', '200602.html', '200603.html', '200604.html', '200605.html', '200606.html', '200607.html', '200608.html', '200609.html', '200610.html', '200611.html', '200612.html', '200701.html', '200702.html', '200703.html', '200704.html', '200705.html', '200706.html', '200707.html', '200708.html', '200709.html', '200710.html', '200711.html', '200712.html', '200801.html', '200802.html', '200803.html', '200804.html', '200805.html', '200806.html', '200807.html', '200808.html', '200809.html', '200810.html', '200811.html', '200812.html', '200901.html', '200902.html', '200903.html', '200904.html', '200905.html', '200906.html', '200907.html', '200908.html', '200909.html', '200910.html', '200911.html', '200912.html', '201001.html', '201002.html', '201003.html', '201004.html', '201005.html', '201006.html', '201007.html', '201008.h

# Retrieving the list of links to the PDF files stored on the OIE website

In [3]:
# Collect the PDF links of the African swine fever reports for China from the
# archive pages dated August 2018 or later.
links = []
for htmlFile in htmlFiles:
    # The first four characters of the file name are read as the year and the
    # next two as the month (e.g. '201808.html').
    if(int(htmlFile[:4])>2018 or (int(htmlFile[:4])==2018 and int(htmlFile[4:6])>=8)):
        # The context manager closes the file as soon as it has been read,
        # so no handle is leaked for each processed page.
        with open("../../data/htmls/"+htmlFile,"r", encoding = "ISO-8859-1") as infile:
            contents = infile.read()
        soup = BeautifulSoup(contents,'html.parser')
        # Anchors whose href contains "pdf"
        cells = soup.find_all("a",href=re.compile("pdf"))
        # Keep only the anchors whose text mentions both the disease and the country.
        # Alternative condition kept from the original: or ("Hong Kong" in a.get_text())
        links += [a['href'] for a in cells
                  if ("African swine fever" in a.get_text()) and ("China" in a.get_text())]
links[1:5]

['https://www.oie.int/wahis_2/temp/reports/en_imm_0000027636_20180823_183422.pdf',
 'https://www.oie.int/wahis_2/temp/reports/en_imm_0000027598_20180820_154827.pdf',
 'https://www.oie.int/wahis_2/temp/reports/en_fup_0000027570_20180816_175933.pdf',
 'https://www.oie.int/wahis_2/temp/reports/en_imm_0000027568_20180816_165812.pdf']

# Downloading pdf-files

In [4]:
# Directory in which the downloaded PDF reports are stored
pdfsDir = "../../data/ASF_pdfs"
# Uncomment the next line to delete the directory and everything in it
# !rm -rf {pdfsDir}
# Create the directory (and missing parents); `-p` makes this a no-op if it already exists
!mkdir -p {pdfsDir}

In [5]:
# Download every linked report that is not yet present in pdfsDir.
# `up_to_date` stays True only if no download was attempted.
up_to_date = True
for link in links:
    # The local file name is the last path component of the URL
    if not os.path.exists(pdfsDir+"/"+link.split("/")[-1]):
        # Shell escape: fetch the file with wget into pdfsDir
        !wget --random-wait -r -p -nd -e robots=off --directory-prefix={pdfsDir} {link}
        up_to_date = False
if (up_to_date):
    print("The dataset is up to date")

--2019-11-23 21:10:49--  https://www.oie.int/wahis_2/temp/reports/en_fup_0000031892_20190924_175316.pdf
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving www.oie.int (www.oie.int)... 51.254.148.117
Connecting to www.oie.int (www.oie.int)|51.254.148.117|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 58667 (57K) [application/pdf]
Saving to: ‘../../data/ASF_pdfs/en_fup_0000031892_20190924_175316.pdf’


2019-11-23 21:10:51 (210 KB/s) - ‘../../data/ASF_pdfs/en_fup_0000031892_20190924_175316.pdf’ saved [58667/58667]

FINISHED --2019-11-23 21:10:51--
Total wall clock time: 2.2s
Downloaded: 1 files, 57K in 0.3s (210 KB/s)
--2019-11-23 21:10:51--  https://www.oie.int/wahis_2/temp/reports/en_fup_0000031697_20190912_111711.pdf
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving www.oie.int (www.oie.int)... 51.254.148.117
Connecting to www.oie.int (www.oie.int)|51.254.148.117|:443... connected.
HTTP request sent, awaiting response... 2

In [6]:
# Remove files matching *.pdf.* from pdfsDir; error messages (e.g. when nothing
# matches) are discarded. NOTE(review): presumably these are duplicate downloads
# or leftover conversion files - confirm.
!rm {pdfsDir}/*.pdf.* 2> /dev/null
# Capture the list of PDF paths as a list of strings
pdfReports = !ls {pdfsDir}/*.pdf
# Preview: entries 1 to 9 (the entry at index 0 is not shown)
pdfReports[1:10]

['../../data/ASF_pdfs/en_fup_0000027533_20180813_175309.pdf',
 '../../data/ASF_pdfs/en_fup_0000027570_20180816_175933.pdf',
 '../../data/ASF_pdfs/en_fup_0000027688_20180907_184405.pdf',
 '../../data/ASF_pdfs/en_fup_0000027689_20180907_184909.pdf',
 '../../data/ASF_pdfs/en_fup_0000027690_20180907_184548.pdf',
 '../../data/ASF_pdfs/en_fup_0000027691_20180907_184715.pdf',
 '../../data/ASF_pdfs/en_fup_0000027809_20180904_134034.pdf',
 '../../data/ASF_pdfs/en_fup_0000027839_20180906_133151.pdf',
 '../../data/ASF_pdfs/en_fup_0000027845_20180907_105504.pdf']

# Converting PDF files to XML format

In [7]:
# Directory in which the XML versions of the reports are stored
xmlsDir = "../../data/ASF_xmls"
# Uncomment the next line to delete the directory and everything in it
# !rm -rf {xmlsDir}
# Create the directory (and missing parents); `-p` makes this a no-op if it already exists
!mkdir -p {xmlsDir}

In [8]:
# This will take some time: there are many African swine fever reports from other
# countries that have to be processed here even though they are discarded later.
for report in pdfReports:
    # Convert only the reports that have no XML file in xmlsDir yet
    if not os.path.exists(xmlsDir+"/"+report.split("/")[-1]+".xml"):
        # Shell escape: convert the PDF to XML with pdftohtml (-i, -xml, -q flags)
        !pdftohtml -i -xml -q {report} {report}.xml 
        # Print the path of the report that was just converted
        !echo {report}
        # Delete, in place, the lines containing 'Printed' or 'OIE Ref:' and the
        # lines matching 'Page <digit>.../<digit>'
        !sed -i '/Printed/d; /OIE Ref:/d; /Page [0-9].*\/[0-9]/d' {report}.xml
        # Move the resulting XML file into xmlsDir
        !mv {report}.xml {xmlsDir}

../../data/ASF_pdfs/en_fup_0000030204_20191107_171529.pdf
../../data/ASF_pdfs/en_fup_0000030205_20191115_123657.pdf
../../data/ASF_pdfs/en_fup_0000030215_20191107_173737.pdf
../../data/ASF_pdfs/en_fup_0000031697_20190912_111711.pdf
../../data/ASF_pdfs/en_fup_0000031745_20191107_175059.pdf
../../data/ASF_pdfs/en_fup_0000031746_20191024_173421.pdf
../../data/ASF_pdfs/en_fup_0000031747_20191024_173539.pdf
../../data/ASF_pdfs/en_fup_0000031748_20191024_173629.pdf
../../data/ASF_pdfs/en_fup_0000031749_20191024_172832.pdf
../../data/ASF_pdfs/en_fup_0000031750_20191024_173711.pdf
../../data/ASF_pdfs/en_fup_0000031751_20191107_172324.pdf
../../data/ASF_pdfs/en_fup_0000031752_20191107_170414.pdf
../../data/ASF_pdfs/en_fup_0000031892_20190924_175316.pdf
../../data/ASF_pdfs/en_fup_0000032148_20191021_155119.pdf
../../data/ASF_pdfs/en_fup_0000032233_20191028_131553.pdf
../../data/ASF_pdfs/en_fup_0000032332_20191115_122844.pdf
../../data/ASF_pdfs/en_fup_0000032362_20191115_115516.pdf


In [9]:
# Capture the paths of all XML reports as a list of strings
xmlReports = !ls {xmlsDir}/*xml
print("Number of reports: %d"%len(xmlReports))
# Preview of the first ten paths
xmlReports[0:10]

Number of reports: 168


['../../data/ASF_xmls/en_fup_0000027458_20180806_155743.pdf.xml',
 '../../data/ASF_xmls/en_fup_0000027533_20180813_175309.pdf.xml',
 '../../data/ASF_xmls/en_fup_0000027570_20180816_175933.pdf.xml',
 '../../data/ASF_xmls/en_fup_0000027688_20180907_184405.pdf.xml',
 '../../data/ASF_xmls/en_fup_0000027689_20180907_184909.pdf.xml',
 '../../data/ASF_xmls/en_fup_0000027690_20180907_184548.pdf.xml',
 '../../data/ASF_xmls/en_fup_0000027691_20180907_184715.pdf.xml',
 '../../data/ASF_xmls/en_fup_0000027809_20180904_134034.pdf.xml',
 '../../data/ASF_xmls/en_fup_0000027839_20180906_133151.pdf.xml',
 '../../data/ASF_xmls/en_fup_0000027845_20180907_105504.pdf.xml']

# Example: making a *soup* for one particular XML file

In [10]:
# Parse one example report and extract the text of every <text> element.
indexReport = 6
xmlReport = xmlReports[indexReport]
print(xmlReport)
# The context manager closes the file once it has been read (the handle was
# previously left open).
with open(xmlReport,"r") as infile:
    contents = infile.read()
# NOTE(review): no parser is named here, so BeautifulSoup picks one itself;
# left unchanged because the extracted cells may depend on that choice.
soup = BeautifulSoup(contents)
cells = soup.find_all('text')
# Keep only the text content of each element
cells = [cell.get_text() for cell in cells]
cells[0:80]

../../data/ASF_xmls/en_fup_0000027691_20180907_184715.pdf.xml


['Follow-up report No.1',
 "Report reference:  , Reference OIE : 27691, Report Date : 07/09/2018, Country : China (People's Rep. of)",
 'Report Summary',
 'Name of sender of the report',
 'Dr Zhang Zhongqui',
 'Telephone',
 '(86-10) 591 928 28',
 'Position',
 'Director General',
 'Fax',
 '(86-10) 591 928 45',
 'Address',
 'No.11, Nongzhanguan Nanli ',
 'Chaoyang District ',
 'Beijing, 100125 ',
 ' Beijing',
 'Email',
 'chinadelegate@agri.gov.cn',
 'Date submitted to OIE',
 '07/09/2018',
 ' ',
 'Animal type',
 'Terrestrial',
 'Date of report',
 '07/09/2018',
 'Disease',
 'African swine fever',
 'Date of start of the event',
 '17/08/2018',
 'Causal Agent',
 'African swine fever virus',
 'Date of confirmation of the event',
 '22/08/2018',
 'Reason',
 'First occurrence of a listed disease',
 'Diagnosis',
 'Clinical, Laboratory (advanced)',
 'Country or zone',
 'a zone or compartment',
 'Clinical signs',
 'Yes',
 'Number of reported outbreaks',
 'submitted= 1, Draft= 0',
 'Outbreak details'

In [11]:
# Remove every contiguous run of cells that is exactly the table-header
# sequence `a` below; the result is a NumPy array of the remaining cells.
a = ['Province', 'Number of outbreaks', 'City', 'County', 'Unit Type', 'Location', 'Latitude',
     'Longitude', 'Start Date', 'End Date:', 'Species', 'Measuring units', 'Susceptible', 'Cases', 'Deaths',
     'Killed and disposed of', 'Slaughtered']
header_len = len(a)
# One range of positions per place where the complete header sequence occurs
header_spans = [range(start, start + header_len)
                for start in range(len(cells))
                if cells[start:start + header_len] == a]
cells = np.delete(cells, header_spans, axis=0)
cells[:10]

array(['Follow-up report No.1',
       "Report reference:  , Reference OIE : 27691, Report Date : 07/09/2018, Country : China (People's Rep. of)",
       'Report Summary', 'Name of sender of the report',
       'Dr Zhang Zhongqui', 'Telephone', '(86-10) 591 928 28', 'Position',
       'Director General', 'Fax'], dtype='<U280')

In [12]:
# Section boundaries: the first boundary lies two cells before the first
# 'Latitude' cell; the following ones are the cells containing
# "Affected Population" at or after that first boundary.
latitude_positions = [pos for pos, text in enumerate(cells) if text=='Latitude']
outbreak_indexes = [latitude_positions[0]-2]
population_positions = [pos for pos, text in enumerate(cells)
                        if (pos>=outbreak_indexes[0] and "Affected Population" in text)]
outbreak_indexes = outbreak_indexes + population_positions
str(outbreak_indexes)

'[48, 78]'

In [13]:
# Parse the outbreak sections of the example report into one record per section
# and display the records as a DataFrame.
df = pd.DataFrame()  # NOTE(review): not used in this cell
entity = []
# Positions of the report-level labels; each value is read from the cell right after its label
zz1 = [i for i,cell in enumerate(cells) if cell=='Date submitted to OIE'][0] # for Date submitted to OIE
zz2 = [i for i,cell in enumerate(cells) if cell=='Date of report'][0] # Date of report
zz3 = [i for i,cell in enumerate(cells) if cell=='Date of confirmation of the event'][0] #Date of confirmation of the event
# Pair every boundary with the next one; the wrap-around pair (last, first)
# fails the idx0<idx1 test and is skipped.
for idx0,idx1 in zip(outbreak_indexes,shift(outbreak_indexes,1)):
    # Sections spanning 10 cells or fewer are ignored
    if idx0<idx1 and (idx1-idx0)>10:
        cls = cells[idx0:idx1]
        # xx: position of the first number-like cell located more than 7 cells
        # after the 'Latitude' label. "Number-like" means all digits, or the part
        # after the first '.'/',' is all digits.
        xx = [i for i,cell in enumerate(cls) if cell=='Latitude'][0]
        xx = [i for i,cell in enumerate(cls)
              if i>xx+7 and #+7 here is a small technical adjustment
                  (cell.isdigit() or
                   (("." in cell or "," in cell) and cell.replace(",",".").split(".")[1].isdigit()))][0]
        # delta is 1 when the cell two positions before the last 'Species' cell
        # contains "/" (presumably a date, i.e. an end date is given).
        # A section without a 'Species' cell falls back to delta = 0, as in the
        # loop over all reports; the lookup used to be repeated outside the
        # try block, which made this fallback unreachable.
        try:
            yy = [i for i,cell in enumerate(cls) if cell=='Species'][-1]
            delta = (1 if "/" in cls[yy-2] else 0)
        except IndexError:
            delta = 0
        zz = [i for i,cell in enumerate(cls) if cell=='End Date:'][0]
        # Advance to the first '-' cell after 'End Date:', stopping at xx at the
        # latest; the bound is tested first so no cell past xx is ever read.
        delta_z = 1
        while zz+delta_z<xx and cls[zz+delta_z]!='-':
            delta_z = delta_z+1
        entity.append({
            'No': indexReport, # for reference
            'date_report': cells[zz2+1],
            'date_submission': cells[zz1+1],
            'date_confirmation': cells[zz3+1],
            'province': cls[zz+1].split("-")[0],
            'location': "_".join(cls[(zz+delta_z+1):xx]).replace(","," "),
            'lat': cls[xx].replace(",",".").split()[-1], #latitude (split() is used because of inconsistencies in a few xmls)
            'long': cls[xx+1].replace(",","."), #Longitude
            'start': cls[xx+2], #Start date
            'end': (cls[xx+3] if delta else np.nan), #End date
            'species': (cls[-8].split("(")[0] if cls[-7][-1]==')' else cls[-7]), #Species
            # The five counts are read from the last five cells of the section
            'susceptible': (int(cls[-5]) if cls[-5].isdigit() else np.nan), #Susceptibles
            'cases': (int(cls[-4]) if cls[-4].isdigit() else np.nan), #Cases
            'deaths': (int(cls[-3]) if cls[-3].isdigit() else np.nan), #Deaths
            'destroyed': (int(cls[-2]) if cls[-2].isdigit() else np.nan), #Destroyed
            'slaughtered': (int(cls[-1]) if cls[-1].isdigit() else np.nan), #Slaughtered
            'report_name': xmlReport.split("/")[-1].split(".")[0]
        })

# Display the records with the columns in the order of the dict keys
(pd.DataFrame.from_dict(entity)[list(entity[0].keys())])

Unnamed: 0,No,date_report,date_submission,date_confirmation,province,location,lat,long,start,end,species,susceptible,cases,deaths,destroyed,slaughtered,report_name
0,6,07/09/2018,07/09/2018,22/08/2018,Zhejiang,Wenzhou_Yueqing_Farm_Zhang ao Village,28.2,120.94,17/08/2018,,Swine,1864,430,340,1524,0,en_fup_0000027691_20180907_184715


# Accumulating the final dataset from all retrieved reports

In [14]:
# Write an index file that maps every report number to its report name.
entity = []
for indexReport, xmlReport in enumerate(xmlReports):
    file_name = xmlReport.split("/")[-1]
    # Drop the last 8 characters (the length of ".pdf.xml")
    entity.append({'No': indexReport, 'report': file_name[:-8]})
index_columns = list(entity[0].keys())
index_frame = pd.DataFrame.from_dict(entity)[index_columns]
index_frame.to_csv("../../data/ASF_reports_indexing.csv",index=False,sep=",")

In [15]:
# Parse every report, accumulate one record per outbreak section and write the
# resulting dataset to ../../data/ASF_data.csv.
entity = []
# Table-header sequence; contiguous runs of exactly these cells are removed from
# each report before parsing (loop-invariant, so defined once).
a = ['Province', 'Number of outbreaks', 'City', 'County', 'Unit Type', 'Location', 'Latitude',
     'Longitude', 'Start Date', 'End Date:', 'Species', 'Measuring units', 'Susceptible', 'Cases', 'Deaths',
     'Killed and disposed of', 'Slaughtered']
for indexReport, xmlReport in enumerate(xmlReports):
    print ([indexReport,xmlReport])
    # The context manager closes each file after reading; previously one file
    # handle per report was left open.
    with open(xmlReport,"r") as infile:
        contents = infile.read()
    # NOTE(review): no parser is named here, so BeautifulSoup picks one itself;
    # left unchanged because the extracted cells may depend on that choice.
    soup = BeautifulSoup(contents)
    cells = soup.find_all('text')
    cells = [cell.get_text() for cell in cells]

    cells = np.delete(cells, [range(i,(i+len(a))) for i in range(len(cells)) if (cells[i:(i+len(a))]==a)], axis=0)

    # Section boundaries: the first 'Latitude' cell, then every cell containing
    # "Affected Population" at or after it.
    outbreak_indexes = [[i for i,s in enumerate(cells) if s=='Latitude'][0]]
    outbreak_indexes = outbreak_indexes + [i for i,s in enumerate(cells) if (i>=outbreak_indexes[0] and "Affected Population" in s)]

    # Positions of the report-level labels; each value is read from the cell right after its label
    zz1 = [i for i,cell in enumerate(cells) if cell=='Date submitted to OIE'][0] # for Date submitted to OIE
    zz2 = [i for i,cell in enumerate(cells) if cell=='Date of report'][0] # Date of report
    zz3 = [i for i,cell in enumerate(cells) if cell=='Date of confirmation of the event'][0] #Date of confirmation of the event

    # Running number of the outbreak within the current report
    order = 1
    # Pair every boundary with the next one; the wrap-around pair (last, first)
    # fails the idx0<idx1 test and is skipped.
    for idx0,idx1 in zip(outbreak_indexes,shift(outbreak_indexes,1)):
        # Sections spanning 10 cells or fewer are ignored
        if idx0<idx1 and (idx1-idx0)>10:
            cls = cells[idx0:idx1]
            # xx: position of the first number-like cell after the 'Latitude'
            # label (all digits, or the part after the first '.'/',' is all digits).
            # No extra offset is applied here.
            xx = [i for i,cell in enumerate(cls) if cell=='Latitude'][0]
            xx = [i for i,cell in enumerate(cls)
                  if i>xx and
                      (cell.isdigit() or
                       (("." in cell or "," in cell) and cell.replace(",",".").split(".")[1].isdigit()))][0]
            # delta is 1 when the cell two positions before the last 'Species'
            # cell contains "/" (presumably a date, i.e. an end date is given);
            # a section without a 'Species' cell gets delta = 0.
            try:
                yy = [i for i,cell in enumerate(cls) if cell=='Species'][-1]
                delta = (1 if "/" in cls[yy-2] else 0)
            except IndexError:
                delta = 0
            zz = [i for i,cell in enumerate(cls) if cell=='End Date:'][0]
            # Advance to the first '-' cell after 'End Date:', stopping at xx at
            # the latest; the bound is tested first so no cell past xx is read.
            delta_z = 1
            while zz+delta_z<xx and cls[zz+delta_z]!='-':
                delta_z = delta_z+1
            entity.append({
                'No': indexReport, # for reference
                'report': xmlReport.split("/")[-1].split(".")[0],
                'report_info': cls[zz+1].split("-")[1],
                'order': order,
                'date_report': cells[zz2+1],
                'date_submission': cells[zz1+1],
                'date_confirmation': cells[zz3+1],
                'province': cls[zz+1].split("-")[0],
                'location': "_".join(cls[(zz+delta_z+1):xx]).replace(","," "),
                'lat': cls[xx].replace(",",".").split()[-1], #latitude (split() is used because of inconsistencies in a few xmls)
                'long': cls[xx+1].replace(",","."), #Longitude
                'start': cls[xx+2], #Start date
                'end': (cls[xx+3] if delta else np.nan), #End date
                'species': (cls[-8].split("(")[0] if cls[-7][-1]==')' else cls[-7]), #Species
                # The five counts are read from the last five cells of the section
                'susceptible': (int(cls[-5]) if cls[-5].isdigit() else np.nan), #Susceptibles
                'cases': (int(cls[-4]) if cls[-4].isdigit() else np.nan), #Cases
                'deaths': (int(cls[-3]) if cls[-3].isdigit() else np.nan), #Deaths
                'destroyed': (int(cls[-2]) if cls[-2].isdigit() else np.nan), #Destroyed
                'slaughtered': (int(cls[-1]) if cls[-1].isdigit() else np.nan) #Slaughtered
            })
            order = order + 1

# Write the records with the columns in the order of the dict keys; missing values become 'NA'
pd.DataFrame.from_dict(entity)[list(entity[0].keys())].to_csv("../../data/ASF_data.csv",
                                                              na_rep = 'NA',
                                                              index=False,
                                                              sep=",")

[0, '../../data/ASF_xmls/en_fup_0000027458_20180806_155743.pdf.xml']
[1, '../../data/ASF_xmls/en_fup_0000027533_20180813_175309.pdf.xml']
[2, '../../data/ASF_xmls/en_fup_0000027570_20180816_175933.pdf.xml']
[3, '../../data/ASF_xmls/en_fup_0000027688_20180907_184405.pdf.xml']
[4, '../../data/ASF_xmls/en_fup_0000027689_20180907_184909.pdf.xml']
[5, '../../data/ASF_xmls/en_fup_0000027690_20180907_184548.pdf.xml']
[6, '../../data/ASF_xmls/en_fup_0000027691_20180907_184715.pdf.xml']
[7, '../../data/ASF_xmls/en_fup_0000027809_20180904_134034.pdf.xml']
[8, '../../data/ASF_xmls/en_fup_0000027839_20180906_133151.pdf.xml']
[9, '../../data/ASF_xmls/en_fup_0000027845_20180907_105504.pdf.xml']
[10, '../../data/ASF_xmls/en_fup_0000027846_20180907_110550.pdf.xml']
[11, '../../data/ASF_xmls/en_fup_0000027856_20180910_192244.pdf.xml']
[12, '../../data/ASF_xmls/en_fup_0000027857_20180910_190506.pdf.xml']
[13, '../../data/ASF_xmls/en_fup_0000027897_20180911_160445.pdf.xml']
[14, '../../data/ASF_xmls/en_f

In [16]:
# Marker printed once all previous cells have run
print("Haha, finished!")

Haha, finished!
