# Ministry of Health PDF parser

A simple script that gathers PDF report files from the Ministry of Health (MOH) and parses them into pandas DataFrames.

פרסור דו"חות תחלואה לפי ערים של משרד הבריאות

## Imports and setup

In [1]:
from __future__ import print_function

import pandas as pd
import tabula
import os
import numpy as np
from pathlib import Path

from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.core.display import display, HTML
import matplotlib.pyplot as plt
import plotly.express as px
import folium
import plotly.graph_objects as go
import seaborn as sns
import ipywidgets as widgets
from pyproj import CRS, Transformer
# %pwd

Base paths (local directory that holds the raw report PDFs):

In [2]:
# Directory, relative to the current working directory, that is searched for the raw report PDFs.
report_dir = "./MOHReports_raw"

## Individual PDF file parser function

In [71]:
def _merge_split_city_rows(data):
    """Re-join city names that the PDF extraction spread over several rows.

    Two layouts are repaired:

    * the first part of the name sits on a row of its own and the row below
      carries the rest of the name (the upper row is blanked and the full
      name is stored on the lower row);
    * the name is split above and below a row that carries the numbers but no
      name (both neighbours are blanked and the full name is stored on the
      middle row).

    Returns a new frame without the rows that became completely empty.
    """
    # Drop empty rows first and only then renumber, so that the positions
    # computed below are also valid index labels for .at / .loc.
    data = data.dropna(axis=0, how='all').reset_index(drop=True)
    last = len(data) - 1
    incomplete_rows = data.isnull().any(axis=1).to_numpy().nonzero()[0]

    for ind in incomplete_rows:
        has_prev = ind > 0
        has_next = ind < last
        if (has_next
                and pd.notna(data.at[ind, 'City'])
                and pd.notna(data.at[ind + 1, 'City'])):
            # Part of the name is here, the data is on the next row.
            data.at[ind + 1, 'City'] = data.at[ind, 'City'] + " " + data.at[ind + 1, 'City']
            data.loc[ind, :] = np.nan
        elif (has_prev and has_next
              and pd.isna(data.at[ind - 1, '2018 Population'])
              and pd.isna(data.at[ind + 1, '2018 Population'])
              and pd.notna(data.at[ind - 1, 'City'])
              and pd.notna(data.at[ind + 1, 'City'])):
            # Name is split over the rows above and below: merge three lines.
            data.at[ind, 'City'] = data.at[ind - 1, 'City'] + " " + data.at[ind + 1, 'City']
            data.loc[[ind - 1, ind + 1], :] = np.nan
    # TODO: make more robust
    return data.dropna(axis=0, how='all')


def _to_numeric_if_possible(column):
    """Return ``column`` converted with ``pd.to_numeric``, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def parse_pdf_report(report_url):
    """Parse one per-city report PDF into a DataFrame.

    Parameters
    ----------
    report_url : str
        Path or URL of the PDF, handed to ``tabula.read_pdf``.

    Returns
    -------
    pandas.DataFrame
        One row per city with the columns "Rate per 100000", "Isolations",
        "Cured", "Confirmed", "Tested", "2018 Population" and "City".
        Every column except "City" is converted to numbers when all of its
        values can be parsed.

    Notes
    -----
    The code reads ``.columns`` from the ``tabula.read_pdf`` result, i.e. it
    expects a single DataFrame.
    NOTE(review): some tabula-py releases return a list of DataFrames by
    default -- confirm against the installed version.
    """
    # Get data
    raw_tables = tabula.read_pdf(report_url, pages="all", stream=False, silent=True)

    # Give the extracted columns their names, in the order they are extracted
    cols = ["Rate per 100000", "Isolations", "Cured", "Confirmed", "Tested", "2018 Population", "City"]
    new_cols = dict(zip(raw_tables.columns, cols))
    data = raw_tables.rename(columns=new_cols)

    # The table body starts at the first row mentioning "בני ברק"; the last two
    # rows are cut off as well. idxmax() yields a label that is used as a
    # position here -- assumes the default 0..n-1 index of the extracted frame.
    idx = data["City"].fillna(" ").str.contains("בני ברק").idxmax()
    data = data.iloc[idx:-2]

    # Deal with NaNs caused by city names wrapped over several rows
    data_numeric = _merge_split_city_rows(data).copy()

    # Remove carriage returns in city names. A literal, non-regex match behaves
    # the same regardless of the pandas default for ``regex``.
    data_numeric['City'] = data_numeric['City'].str.replace('\r', ' ', regex=False)

    # Convert relevant columns to numbers (thousands separators removed first)
    numeric_cols = data_numeric.columns.drop('City')
    data_numeric[numeric_cols] = data_numeric[numeric_cols].apply(
        lambda col: col.astype(str).str.replace(',', '', regex=False))
    data_numeric[numeric_cols] = data_numeric[numeric_cols].apply(_to_numeric_if_possible)
    return data_numeric

# report_url='./MOHReports_raw/20200413.pdf'
# raw_tables = tabula.read_pdf(report_url, pages="all", stream=False, silent=True)
# cols =  ["Rate per 100000", "Isolations", "Cured", "Confirmed", "Tested", "2018 Population", "City"]
# new_cols = {x: y for x, y in zip(raw_tables.columns, cols)}
# data = raw_tables.rename(columns = new_cols)
# idx = data["City"].fillna(" ").str.contains("בני ברק").idxmax()
# print(idx)
# data.head(10)

Gather files and parse:

In [72]:
from datetime import datetime

# Parse every PDF in report_dir, write each parsed table to its own sheet of
# data_parsed.xlsx and collect the "Confirmed" column of every report into
# confirmed_cases (one column per report date, joined on "City").
path = Path.cwd() / Path(report_dir)
# glob() gives no ordering guarantee. The file stems are parsed as %Y%m%d
# below, so sorting by name processes the reports chronologically: the date
# columns come out in order and parsed_data ends up holding the latest report.
gfiles = sorted(path.glob('*.pdf'))
confirmed_cases = pd.DataFrame(columns = ["City"])
with pd.ExcelWriter('data_parsed.xlsx') as writer: 
    for file in gfiles:
        url = str(file)
        parsed_data = parse_pdf_report(url)
        # The file stem doubles as the sheet name and, reformatted, as the column label
        result_fn = file.stem
        date = datetime.strptime(result_fn, '%Y%m%d').strftime('%Y-%m-%d')
        print(date)
        tmp = parsed_data.loc[:, ["City", "Confirmed"]].rename(columns={"Confirmed" : date})
        # Outer join keeps cities that appear in only some of the reports
        confirmed_cases = confirmed_cases.merge(tmp, on="City", how="outer")
        #Export to excel file
        parsed_data.to_excel(writer, sheet_name=result_fn)

confirmed_cases.head(10)

2020-04-12
2020-04-13
2020-04-14
2020-04-16


Unnamed: 0,City,2020-04-12,2020-04-13,2020-04-14,2020-04-16
0,בני ברק,1806.0,1888.0,2053.0,2150.0
1,קרית יערים,39.0,39.0,39.0,39.0
2,"כפר חב""ד",44.0,44.0,44.0,44.0
3,כוכב יעקב,55.0,56.0,57.0,58.0
4,אפרת,64.0,64.0,64.0,64.0
5,אלעד,242.0,253.0,278.0,301.0
6,מגדל העמק,97.0,99.0,99.0,99.0
7,אור יהודה,113.0,117.0,120.0,122.0
8,טבריה,130.0,133.0,134.0,144.0
9,מודיעין עילית,206.0,218.0,239.0,261.0


In [5]:
# parsed_data.head(10)

Display interactively:

In [73]:
def bubble_chart(n):
    """Show a bubble chart for the first ``n`` rows of the global ``parsed_data``.

    Each city is one bubble: its height and size both follow the
    "Rate per 100000" column and its colour is keyed by the city name.
    """
    top_rows = parsed_data.head(n)
    layout_options = {
        "title": str(n) +" ערים הנפגעות ביותר",
        "xaxis_title": "ערים",
        "yaxis_title": "שיעור ל100,000",
        "width": 700,
    }
    fig = px.scatter(
        top_rows,
        x="City",
        y="Rate per 100000",
        size="Rate per 100000",
        color="City",
        hover_name="City",
        size_max=60,
    )
    fig.update_layout(**layout_options)
    fig.show()

# Hook bubble_chart up to an ipywidgets control for n, starting at n=10.
interact(bubble_chart, n=10)
# NOTE(review): no matplotlib figure is created in the visible cells, so this call looks redundant -- confirm before removing.
plt.show()

# ipywLayout = widgets.Layout(border='solid 2px green')
# ipywLayout.display='none'
# widgets.VBox([fig], layout=ipywLayout)

Geolocate cities:

In [47]:
# itm_crs = CRS.from_proj4("+proj=tmerc +lat_0=31.7343936111111 +lon_0=35.2045169444445 +k=1.0000067 +x_0=219529.584 +y_0=626907.39 +ellps=GRS80 +towgs84=-24.002400,-17.103200,-17.844400,-0.33077,-1.852690,1.669690,5.424800 +units=m +no_defs")
# itm_crs = CRS.from_proj4("+proj=tmerc +lat_0=31.73439361111111 +lon_0=35.20451694444445 +k=1.0000067 +x_0=219529.584 +y_0=626907.39 +ellps=GRS80 +units=m +no_defs")
# Source CRS is built from an explicit PROJ string (transverse Mercator, metres);
# the target is EPSG:4326, so transform() returns (lat, lon) pairs.
itm_crs = CRS.from_proj4("+proj=tmerc +lat_0=31.7343936111111 +lon_0=35.2045169444445 +k=1.0000067 +x_0=219529.584 +y_0=626907.39 +ellps=GRS80 +towgs84=-24.002400,-17.103200,-17.844400,-0.33077,-1.852690,1.669690,5.424800 +units=m +no_defs")
wgs84_crs = CRS.from_epsg(4326)
transformer = Transformer.from_crs(itm_crs, wgs84_crs)
# Sample value noted by the original author: Bnei Brak = 18455 66599
# (i.e. transformer.transform(184550, 665990))


cities_df = pd.read_excel("bycode2018.xlsx", usecols = ["שם יישוב", "קואורדינטות", 'תעתיק'])
cities_df.dropna(axis=0, how='any', inplace=True)
# The coordinate cell packs both grid values into one number: the first five
# digits are x, the remaining digits are y. The digits are kept in a separate
# string Series rather than written back into the numeric column with
# .loc[:, col] = ..., which is an incompatible-dtype assignment in recent pandas.
coord_text = cities_df["קואורדינטות"].round(0).astype(np.int64).astype(str)
grid_x = coord_text.str[0:5].astype(int).to_numpy()
grid_y = coord_text.str[5:].astype(int).to_numpy()
# Both values are scaled by 10 before the transform (see the sample above).
lats, lons = transformer.transform(grid_x*10, grid_y*10)
cities_df['lat'] = lats
cities_df['lon'] = lons
# Select and rename in one non-inplace step, so no in-place rename runs on a column slice.
cities_df = cities_df[['שם יישוב', 'lat', 'lon', 'תעתיק']].rename(
    columns = {"שם יישוב":"City", "תעתיק":"CityEng"})
cities_df.head(20)

Unnamed: 0,City,lat,lon,CityEng
0,אבו ג'ווייעד (שבט),31.230437,35.042204,ABU JUWEI'ID
1,אבו גוש,31.805999,35.110062,ABU GHOSH
2,אבו סנאן,32.960558,35.168216,ABU SINAN
3,אבו סריחאן (שבט),31.261641,34.85841,ABU SUREIHAN
4,אבו עבדון (שבט),31.302184,34.842506,ABU ABDUN
5,אבו עמאר (שבט),31.194207,34.947838,ABU AMMAR
6,אבו עמרה (שבט),31.230031,34.842782,ABU AMRE
7,אבו קורינאת (יישוב),31.14057,34.962243,ABU QUREINAT
8,אבו קורינאת (שבט),31.131112,34.968978,ABU QUREINAT
9,אבו רובייעה (שבט),31.239469,35.052686,ABU RUBEI'A


Merge:

In [74]:
# print(confirmed_cases.index)
# Attach lat/lon and the transliterated name to every city, keep only rows
# that have no missing value in any column, and renumber the rows from 0.
confirmed_df = (
    confirmed_cases
    .merge(cities_df, how='left', on="City")
    .dropna(axis=0, how='any')
    .reset_index(drop=True)  #TODO: correct and search
)
# confirmed_df.head(20)
# print(confirmed_df.loc[confirmed_df.City.str.contains("בני ברק"),:])
#print(str(confirmed_df.iloc[10,1]))
# confirmed_df.head(20)


In [77]:
# Map with one circle per city; circle size and tooltip both show the confirmed
# count of the most recent report.
world_map = folium.Map(location=[31.4,35], tiles="cartodbpositron", zoom_start=8, max_zoom = 15, min_zoom = 8)

# Every column that is not city metadata is a report date ("YYYY-MM-DD"), so
# the lexically largest label is the most recent report. Looking it up here
# keeps radius and tooltip on the same column and avoids a hard-coded date.
date_columns = sorted(c for c in confirmed_df.columns if c not in ("City", "lat", "lon", "CityEng"))
latest_date = date_columns[-1]

for _, city in confirmed_df.iterrows():
    confirmed = city[latest_date]
    tooltip_html = (
        "<meta http-equiv='content-type' content='text/html; charset=UTF-8' /><div style='margin: 0; background-color: black; color: white;'>"
        # The heading is closed with </h5> so the tag matches the opening <h5>.
        "<h5 style='text-align:center;font-weight: bold'>" + str(city['CityEng']) + "</h5>"
        "<hr style='margin:10px;color: white;'>"
        "<ul style='color: white;;list-style-type:circle;align-item:left;padding-left:20px;padding-right:20px'>"
        "<li>Confirmed: " + str(confirmed) + "</li>"
        "</ul></div>"
    )
    folium.Circle(
        location=[city['lat'], city['lon']],
        fill=True,
        radius=int(confirmed),
        color='red',
        fill_color='indigo',
        tooltip=tooltip_html,
    ).add_to(world_map)

world_map

With time evolution!


In [100]:
import vincent
import json
# from bokeh.plotting import figure, output_file, show
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.embed import file_html

# plot = figure()
# plot.circle([1,2], [3,4])

# html = file_html(plot, CDN, "my plot")


# scatter_json = line.to_json()

# # Let's convert it to dict.
# scatter_dict = json.loads(scatter_json)

# Map with one circle per city; clicking a circle opens a popup with a line
# chart of that city's confirmed counts over all parsed report dates.
world_map = folium.Map(location=[31.4,35], tiles="cartodbpositron", zoom_start=8, max_zoom = 15, min_zoom = 8)

# Every column that is not city metadata is a report date ("YYYY-MM-DD");
# sorting the labels puts them in chronological order. Deriving the list here
# means newly parsed reports show up without editing a hard-coded date list.
date_columns = sorted(c for c in confirmed_df.columns if c not in ("City", "lat", "lon", "CityEng"))
latest_date = date_columns[-1]

for _, city in confirmed_df.iterrows():
    line_chart = vincent.Line(city[date_columns].to_list())
    line_chart.width = 350
    line_chart.height = 175
    popup = folium.Popup(max_width=400)
    folium.Vega(line_chart, height=200, width=400).add_to(popup)
    folium.Circle(
        location=[city['lat'], city['lon']],
        fill=True,
        # Circle size follows the most recent report.
        radius=int(city[latest_date]),
        color='red',
        fill_color='indigo',
        popup=popup,
    ).add_to(world_map)

world_map
# world_map.save('data.html')