# Ministry of Health PDF parser

A simple script that gathers PDF report files from the Ministry of Health (MOH) and parses each one into a pandas DataFrame.

פרסור דו"חות תחלואה לפי ערים של משרד הבריאות

## Imports and setup

In [1]:
from __future__ import print_function

import pandas as pd
import tabula
import os
import numpy as np
from pathlib import Path

from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.core.display import display, HTML
import matplotlib.pyplot as plt
import plotly.express as px
import folium
import plotly.graph_objects as go
import seaborn as sns
import ipywidgets as widgets
# %pwd

Base paths (local directory holding the raw PDF reports):

In [14]:
# Directory, resolved against the current working directory, that is searched
# for the raw MOH report PDFs ("*.pdf").
report_dir = "./MOHReports_raw"

## Individual PDF file parser function

In [63]:
def _merge_split_city_rows(data):
    """Rejoin city names that the PDF extraction spread over several rows.

    ``data`` must have a default 0..n-1 index and is modified in place.
    Rows that only carried a fragment of a name are blanked (set to all-NaN)
    so the caller can drop them afterwards.

    Two layouts are handled for every row that contains a missing value:

    * two rows: this row and the next one both hold part of the name; the
      combined name goes to the next row and this row is blanked;
    * three rows: the rows before and after have no population value; their
      name fragments are combined into this row and both neighbours are
      blanked.
    """
    city = 'City'
    population = '2018 Population'
    last_row = len(data) - 1
    incomplete_rows = data.isna().any(axis=1).to_numpy().nonzero()[0]

    for ind in map(int, incomplete_rows):
        if ind >= last_row:
            # Both layouts need a following row; nothing to merge at the end.
            continue

        here = data.at[ind, city]
        after = data.at[ind + 1, city]
        if pd.notna(here) and pd.notna(after):
            # Part of the name is here, the data is on the next row.
            data.at[ind + 1, city] = here + " " + after
            data.loc[ind, :] = np.nan
            continue

        if ind == 0:
            # The three-row layout needs a preceding row.
            continue

        before = data.at[ind - 1, city]
        neighbours_lack_data = (pd.isna(data.at[ind - 1, population])
                                and pd.isna(data.at[ind + 1, population]))
        # Require both fragments to be present so that rows blanked earlier in
        # this loop cannot trigger a NaN + str concatenation.
        if neighbours_lack_data and pd.notna(before) and pd.notna(after):
            data.at[ind, city] = before + " " + after
            data.loc[[ind - 1, ind + 1], :] = np.nan
    # TODO: make more robust


def _strip_commas_to_numeric(col):
    """Remove thousands separators from ``col`` and convert it to numbers.

    Missing values stay missing instead of becoming the string ``'nan'``.
    If the column still cannot be converted, the comma-stripped text column is
    returned (the explicit equivalent of the deprecated ``errors='ignore'``).
    """
    cleaned = col.astype(str).str.replace(',', '', regex=False).where(col.notna())
    try:
        return pd.to_numeric(cleaned)
    except (ValueError, TypeError):
        return cleaned


def parse_pdf_report(report_url):
    """Parse one MOH per-city report PDF into a DataFrame.

    Parameters
    ----------
    report_url
        Location of the PDF; passed unchanged to ``tabula.read_pdf``.

    Returns
    -------
    pandas.DataFrame
        One row per city with the columns "Rate per 100000", "Isolations",
        "Cured", "Confirmed", "Tested", "2018 Population" and "City". Every
        column except "City" is converted to numbers when all of its values
        allow it.
    """
    # Get data.
    # NOTE(review): everything below treats the result as ONE DataFrame
    # (``.columns`` / ``.iloc``). Confirm that the installed tabula-py version
    # returns a single frame rather than a list of per-page tables.
    raw_tables = tabula.read_pdf(report_url, pages="all", stream=False, silent=True)

    # Target names, applied to the extracted columns in the order they appear.
    cols = ["Rate per 100000", "Isolations", "Cured", "Confirmed", "Tested", "2018 Population", "City"]
    new_cols = dict(zip(raw_tables.columns, cols))

    # Skip the first four rows and the last two rows (the last two belong to a
    # different table). NOTE(review): the first four are presumably header
    # rows - confirm against a sample report.
    data = raw_tables.iloc[4:-2].rename(columns=new_cols)

    # Drop empty rows BEFORE renumbering, so that the positions computed in
    # the merge step are also valid index labels.
    data.dropna(axis=0, how='all', inplace=True)
    data.reset_index(drop=True, inplace=True)

    # Deal with NaNs left by city names that were split across rows.
    _merge_split_city_rows(data)
    data.dropna(axis=0, how='all', inplace=True)

    # Remove carriage returns in city names. A literal match is requested
    # explicitly because the default of ``regex`` differs between pandas
    # versions.
    data['City'] = data['City'].str.replace('\r', ' ', regex=False)

    # Convert relevant columns to numbers.
    numeric_cols = data.columns.drop('City')
    data_numeric = data.copy()
    data_numeric[numeric_cols] = data[numeric_cols].apply(_strip_commas_to_numeric)
    return data_numeric

# parse_pdf_report('./MOHReports_raw/20200412.pdf')

Gather files and parse:

In [64]:
# Parse every PDF found in the report directory and write each parsed report
# to its own sheet of a single Excel workbook.
path = Path.cwd() / Path(report_dir)
# glob() yields files in arbitrary order. Sorting makes the sheet order
# reproducible and decides deterministically which report is left in
# `parsed_data` once the loop finishes (the last one in name order).
gfiles = sorted(path.glob('*.pdf'))
with pd.ExcelWriter('data_parsed.xlsx') as writer:
    for file in gfiles:
        url = str(file)
        parsed_data = parse_pdf_report(url)
        # Export to the Excel file; the sheet is named after the PDF file
        # (file name without its extension).
        result_fn = file.stem
        print(result_fn)
        parsed_data.to_excel(writer, sheet_name=result_fn)

  Rate per 100000 Isolations Cured Confirmed Tested 2018 Population       City
0             654          9    18        44    213           6,724   כפר חב"ד
1             639         35     4        55    281           8,602  כוכב יעקב
2             599         11    27        64    632          10,676       אפרת
3             518         71    11       242  1,785          46,760       אלעד
4             372         53    23        97    848          26,058  מגדל העמק
20200412
  Rate per 100000 Isolations Cured Confirmed Tested 2018 Population      City
0          966.73        337   112     1,888  8,964         195,298   בני ברק
1             NaN        NaN   NaN       NaN    NaN             NaN      קרית
2          682.65         15    21        39    409           5,713       NaN
3             NaN        NaN   NaN       NaN    NaN             NaN     יערים
4          654.37          9    20        44    215           6,724  כפר חב"ד
20200413
  Rate per 100000 Isolations Cured Confi

index              object
Rate per 100000    object
Isolations         object
Cured              object
Confirmed          object
Tested             object
2018 Population    object
City               object
2                  object
10                 object
13                 object
17                 object
24                 object
27                 object
32                 object
dtype: object

Display interactively:

In [None]:
def bubble_chart(n):
    """Show a bubble chart for the first ``n`` rows of the global ``parsed_data``.

    Every city becomes one bubble; both its vertical position and its size
    follow the "Rate per 100000" column, and its colour is keyed by city.
    """
    leading_rows = parsed_data.head(n)
    chart = px.scatter(
        leading_rows,
        x="City",
        y="Rate per 100000",
        size="Rate per 100000",
        color="City",
        hover_name="City",
        size_max=60,
    )

    # Hebrew labels: title "<n> most affected cities", x axis "cities",
    # y axis "rate per 100,000".
    layout_options = {
        "title": str(n) + " ערים הנפגעות ביותר",
        "xaxis_title": "ערים",
        "yaxis_title": "שיעור ל100,000",
        "width": 700,
    }
    chart.update_layout(**layout_options)
    chart.show()

# Bound the slider explicitly. The `n=10` shorthand builds an integer slider
# running from -10 to 30, and a negative `n` makes `head(n)` return everything
# except the last rows instead of the first ones. The default (10) and the
# upper bound (30) are the same as before.
interact(bubble_chart, n=widgets.IntSlider(value=10, min=1, max=30))
plt.show()

# ipywLayout = widgets.Layout(border='solid 2px green')
# ipywLayout.display='none'
# widgets.VBox([fig], layout=ipywLayout)