## DSWD DROMIC Coronavirus Disease (COVID-19) Situation Reports

Situation reports published at https://dromic.dswd.gov.ph/coronavirus-disease-covid-19-31-dec-2019/, converted to tabular (CSV) format.


In [32]:
import pandas as pd
import requests
import os
import glob
import re
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from decimal import Decimal
from docx import Document
import logging

In [34]:
# papermill parameters
# Folder into which `process` writes one CSV file per situation report.
# NOTE(review): presumably overridden by papermill at execution time, since
# this cell is labelled as the parameters cell — confirm the cell is tagged.
output_folder = '../output/'

In [35]:
# DSWD office locations; coordinates were taken from Google Maps.
# Each record is [name, alias, latitude, longitude]. The alias is the key that
# `process` joins on to attach coordinates to the rows of a situation report.
dswd_offices = [
    ['DSWD Central Office',                     'Central Office', 14.696294999999996, 121.09674800000005],
    ['DSWD National Resource Operation Center', 'NRLMB - NROC',   14.513659,          121.0036236],
    ['DSWD Visayas Disaster Response Center',   'NRLMB - VDRC',   10.3093678,         123.9049462],
    ['FO NCR Manila',                           'NCR',            14.598674999999998, 120.990366],
    ['FO 2 Tuguegarao, Cagayan',                'II',             17.655575000000006, 121.74608899999998],
    ['FO 1 San Fernando, La Union',             'I',              16.610573000000013, 120.317046],
    ['FO 3 San Fernando, Pampanga',             'III',            15.029500000000027, 120.68910000000005],
    ['FO 4A CALABARZON',                        'CALABARZON',     14.425525999999985, 121.03137700000002],
    ['FO 4B MIMAROPA',                          'MIMAROPA',       14.576845000000002, 120.99156300000004],
    ['FO 5 Legaspi City',                       'V',              13.13603,           123.73976500000003],
    ['FO 6 Iloilo City',                        'VI',             10.698014999999998, 122.54788800000006],
    ['FO 7 Cebu City',                          'VII',            10.308428000000026, 123.90742999999998],
    ['FO 8 Tacloban City',                      'VIII',           11.250660000000002, 125.00533799999994],
    ['FO 9 Zamboanga City',                     'IX',             6.903621000000008,  122.08027400000003],
    ['FO 10 Cagayan de Oro',                    'X',              8.44645800000001,   124.62133199999994],
    ['FO 11 Davao City',                        'XI',             7.073695000000023,  125.61824999999999],
    ['FO 12 Koronadal',                         'XII',            6.4894340000000295, 124.85539199999994],
    ['FO CARAGA, Butuan City',                  'CARAGA',         8.94300700000001,   125.53416300000004],
    ['FO CAR Baguio City',                      'CAR',            16.408525000000004, 120.60078599999997],
]

# Tabular view of the records above, one row per office.
dswd_offices_df = pd.DataFrame(
    data=dswd_offices,
    columns=['name', 'alias', 'latitude', 'longitude'],
)

In [36]:
# When the kernel's working directory ends with "notebooks", project-relative
# paths have to climb one level; otherwise they are used unchanged.
root_path = '../' if os.getcwd().endswith('notebooks') else ''


def path_of(path):
    """Return `path` prefixed with the module-level `root_path`."""
    return ''.join((root_path, path))

In [131]:
def _find_standby_funds_table(tables):
    """Return the first table whose header cell reads "STANDBY FUNDS", else None.

    The header cell is the second cell of the table's first row; the comparison
    ignores case and surrounding whitespace.
    """
    for tbl in tables:
        try:
            header = tbl.rows[0].cells[1].paragraphs[0].text
        except IndexError:
            # No rows, or fewer than two columns: cannot be the table we want.
            continue
        if header.upper().strip() == 'STANDBY FUNDS':
            return tbl
    return None


def _parse_amount(text):
    """Convert the text of a numeric cell to a Decimal, or None when it is blank.

    A cell holding only '-' or whitespace is treated as missing. Every character
    other than digits and '.' (thousands separators, currency signs, spaces) is
    removed before conversion.
    """
    if text.strip() in ('-', ''):
        return None
    num_val = re.sub(r'[^0-9.]+', '', text)
    try:
        return Decimal(num_val)
    except (ArithmeticError, ValueError) as exc:
        # decimal.InvalidOperation is an ArithmeticError; keep the original
        # error type and message, but chain the cause for easier debugging.
        raise RuntimeError('Unable to parse Decimal "%s"' % num_val) from exc


def process(doc, sitrep_no, timestamp):
    """Extract the standby-funds table of one situation report and save it as CSV.

    The document is opened with `Document`, the table headed "STANDBY FUNDS" is
    located, and every row from the fourth one onwards is turned into a record
    prefixed with `sitrep_no` and `timestamp`. Only the first paragraph of each
    cell is read. Office coordinates are attached by a left join of the office
    cell against the `alias` column of `dswd_offices_df`, and the result is
    written to `output_folder`.

    Parameters
    ----------
    doc : the .docx source passed straight to `Document`.
    sitrep_no : int, the report number; also used (zero-padded to 3 digits) in
        the output file name.
    timestamp : value stored unchanged in the `timestamp` column.

    Raises
    ------
    RuntimeError
        If the document has no "STANDBY FUNDS" table, or a numeric cell cannot
        be converted to a Decimal.
    """
    document = Document(doc)
    table = _find_standby_funds_table(document.tables)
    if table is None:
        raise RuntimeError('No "STANDBY FUNDS" table found in %s' % doc)

    # Available Standby Funds and Stockpiles
    data_start_row = 3  # rows before this index are skipped (presumably header rows)
    record_columns = ['sitrep', 'timestamp', 'office', 'standby_funds', 'ffp_quantity',
                      'ffp_total_cost', 'ofi_total_cost', 'nfi_total_cost', 'total']
    data = []
    for row in list(table.rows)[data_start_row:]:
        cur_row = [sitrep_no, timestamp]
        for j, cell in enumerate(row.cells):
            text = cell.paragraphs[0].text  # first paragraph of the cell only
            if j == 0:
                # Office name: strip stray whitespace so the join on `alias` matches.
                cur_row.append(text.strip())
            else:
                cur_row.append(_parse_amount(text))

        if len(cur_row) > len(record_columns):
            # Some report layouts carry an extra column (logged as a subtotal)
            # at index 8; drop it so the row matches `record_columns`.
            logging.warning('Row has more than 9 columns removing subtotal column')
            cur_row.pop(8)
        data.append(cur_row)

    data_df = pd.DataFrame(data, columns=record_columns)
    data_df = pd.merge(data_df, dswd_offices_df, how='left', left_on='office', right_on='alias')
    # Selecting the output columns also discards the helper `alias`/`name` columns.
    data_df = data_df[['timestamp', 'sitrep', 'office', 'latitude', 'longitude',
                       'standby_funds', 'ffp_quantity', 'ffp_total_cost',
                       'ofi_total_cost', 'nfi_total_cost', 'total']]
    out_name = 'dromic_covid19_sitreps_fas_sr{:0>3d}_fas.csv'.format(sitrep_no)
    # os.path.join avoids a doubled separator when `output_folder` ends with '/'.
    data_df.to_csv(os.path.join(output_folder, out_name), index=False)


In [132]:
def _parse_sitrep_filename(input_file):
    """Return `[sitrep_no, timestamp, input_file]` parsed from a sitrep file name.

    The base name (without extension) is split on '-'. Token 3 is the report
    number. The day, month name, year and hour tokens start at index 12 when
    token 8 is 'COVID', and at index 11 otherwise.

    The hour token may have one or two digits followed by AM/PM, or by NN
    (noon) / MN (midnight), which are mapped to PM / AM. Trailing characters
    after the marker are ignored.

    Raises
    ------
    ValueError
        If the file name does not have the expected tokens or the date cannot
        be parsed.
    """
    fparts = os.path.splitext(os.path.basename(input_file))[0].split('-')
    try:
        sitrep_no = int(fparts[3])
        date_idx = 12 if fparts[8] == 'COVID' else 11
        dom = fparts[date_idx][:2]
        mon = fparts[date_idx + 1]
        yr = fparts[date_idx + 2][:4]
        hr_token = fparts[date_idx + 3]
    except (IndexError, ValueError) as exc:
        raise ValueError('Unexpected sitrep file name: %s' % input_file) from exc

    hr_match = re.match(r'(\d{1,2})(AM|PM|NN|MN)', hr_token, re.IGNORECASE)
    if hr_match:
        marker = hr_match.group(2).upper()
        marker = {'NN': 'PM', 'MN': 'AM'}.get(marker, marker)
        hr = hr_match.group(1) + marker
    else:
        # Unknown shape: keep the historical behaviour of using the first 3 characters.
        hr = hr_token[:3]

    datetime_string = '%s-%s-%s-%s' % (yr, mon, dom, hr)
    timestamp = datetime.strptime(datetime_string, '%Y-%B-%d-%I%p')
    return [sitrep_no, timestamp, input_file]


# Collect every downloaded situation report, ordered by report number.
inputs = [
    _parse_sitrep_filename(input_file)
    for input_file in glob.glob(path_of('datasets/sitreps/*.docx'))
]
inputs.sort(key=lambda r: r[0])  # reverse=True for testing to get most recent file

min_sitrep = 36 # Process only sitreps from this value for incremental processing

for sitrep_no, timestamp, doc in inputs:
    if sitrep_no >= min_sitrep:
        logging.info('Proecssing SR%s t=%s %s', sitrep_no, timestamp, doc)
        process(doc, sitrep_no, timestamp)


Proecssing SR1 t=2020-03-20 20:00:00 ../datasets/sitreps/DSWD-DROMIC-Report-1-on-the-Coronavirus-Disease-COVID-19-as-of-20-March-2020-8PM-2.docx
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has more than 9 columns removing subtotal column
Row has