### Import the necessary modules

In [1]:
from bs4 import BeautifulSoup
from io import BytesIO
import pandas as pd
import requests
import PyPDF2
import re

### Functions

In [2]:
def read_page(url, page):
    """Download a PDF report and return the extracted text of one page.

    Parameters
    ----------
    url : str
        Site-relative path of the PDF; it is appended to
        ``https://covid19.saglik.gov.tr`` to build the request URL.
    page : int
        Page number handed to ``PdfFileReader.getPage``.

    Returns
    -------
    str
        Whatever ``extractText()`` yields for the requested page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Build the request from the `url` argument. The previous body read the
    # global name `link`, so the function only worked when it was called from
    # a loop whose variable happened to be called `link`.
    response = requests.get('https://covid19.saglik.gov.tr' + url, timeout=30)
    # Fail here with a clear HTTP error instead of handing an error page to
    # the PDF parser.
    response.raise_for_status()

    pdf_stream = BytesIO(response.content)
    pdf_reader = PyPDF2.PdfFileReader(pdf_stream)
    page_obj = pdf_reader.getPage(page)

    return page_obj.extractText()

In [3]:
def parse_data(data):
    """Extract (label, number) pairs from the text between two markers.

    Only the text from the first "Istanbul" up to (not including) the next
    "Turkey" is scanned. ``str.index`` raises ``ValueError`` when either
    marker is missing.

    Returns the list of ``(label, number)`` string tuples produced by
    ``re.findall``; labels keep any trailing space matched by the pattern.
    """
    start = data.index("Istanbul")
    # Look for the end marker only after the start marker.
    stop = data.index("Turkey", start)

    pair_pattern = re.compile(r"([a-zA-Z ]+)[\n -]*([0-9,.]+)[\n -]*")
    return pair_pattern.findall(data[start:stop])

In [4]:
def get_row(parsed, dt, key, value):
    """Build a one-row DataFrame from parsed (label, number, ...) tuples.

    Parameters
    ----------
    parsed : iterable of tuple of str
        Regex matches; each tuple supplies one column.
    dt : str
        Date in ``%d%m%Y`` form (e.g. ``'23112020'``). If it cannot be parsed
        it is stored unchanged in the ``Date`` column.
    key : int
        Tuple position holding the column label; newlines are stripped from it.
    value : int
        Tuple position holding the cell value (kept as the original string).

    Returns
    -------
    pandas.DataFrame
        A single row with a ``Date`` column followed by one column per tuple.
    """
    # `errors='ignore'` is deprecated in pandas; an explicit fallback keeps
    # the same "return the input on failure" behaviour without relying on it.
    try:
        date = pd.to_datetime(dt, format='%d%m%Y')
    except (ValueError, TypeError):
        date = dt

    df_data = {"Date": date}
    for pair in parsed:
        df_data[pair[key].replace('\n', '')] = [pair[value]]
    return pd.DataFrame(df_data)

### Program

In [5]:
# Index page that lists the daily report links.
saglik_url = 'https://covid19.saglik.gov.tr/TR-68444/gunluk-rapor--daily-report.html'

# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status stops here instead of letting an HTTP error page be parsed
# as if it were the report index.
response = requests.get(saglik_url, timeout=30)
response.raise_for_status()
content = response.content

In [6]:
# Parse the index page and keep only the links to the English reports.
soup = BeautifulSoup(content, 'html.parser')

hrefs = [anchor.get('href') for anchor in soup.select('tbody a')]

# The dot is escaped so only a literal ".html" is accepted.
eng = re.compile(r'.*eng\.html')

# Anchors without an href give None, which would make `eng.match` raise
# TypeError, so they are dropped before matching.
links = [href for href in hrefs if href and eng.match(href)]

In [7]:
def _regional_section(text):
    """Return the part of `text` from "Istanbul" up to (not including) "Turkey"."""
    section = text[text.index("Istanbul"):]
    return section[:section.index("Turkey")]


# Raw strings avoid the invalid-escape warning that "\d" triggers in a
# normal string literal; the patterns themselves are unchanged.
date_in_link = re.compile(r"-(\d*)-eng.html")
hospital_row = re.compile(r"([A-Z][a-zA-Z \n]*[a-z])[\n -]+([0-9.]+)[\n -]+.*[\n -]+([0-9.]+)")
patient_row = re.compile(r"([A-Z][a-zA-Z \n]*[a-z])[\n -]+([0-9.]+)")

# Collect the one-row frames and concatenate once at the end; calling
# pd.concat inside the loop copies every earlier row on each iteration.
patient_rows = []
hospitalization_rows = []
hospital_discharge_rows = []

for link in links:
    date_match = date_in_link.search(link)
    if date_match is None:
        # Without a date the row cannot be labelled; skip instead of crashing.
        print('skipping link without a date:', link)
        continue
    date = date_match.group(1)

    if date == '14082020': # The link is broken
        continue

    # Page 4: the second captured number feeds the hospitalization frame,
    # the third the discharge frame.
    parsed = hospital_row.findall(_regional_section(read_page(link, page=4)))
    hospitalization_rows.append(get_row(parsed, date, key=0, value=1))
    hospital_discharge_rows.append(get_row(parsed, date, key=0, value=2))

    # Page 2: a single number per label, used for the patient frame.
    parsed = patient_row.findall(_regional_section(read_page(link, page=2)))
    patient_rows.append(get_row(parsed, date, key=0, value=1))

    print('record date:', date)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_patient = pd.concat(patient_rows) if patient_rows else pd.DataFrame()
df_hospitalization = pd.concat(hospitalization_rows) if hospitalization_rows else pd.DataFrame()
df_hospital_discharge = pd.concat(hospital_discharge_rows) if hospital_discharge_rows else pd.DataFrame()

record date: 23112020
record date: 22112020
record date: 21112020
record date: 20112020
record date: 19112020
record date: 18112020
record date: 17112020
record date: 16112020
record date: 15112020
record date: 14112020
record date: 13112020
record date: 12112020
record date: 11112020
record date: 10112020
record date: 09112020
record date: 08112020
record date: 07112020
record date: 06112020
record date: 05112020
record date: 04112020
record date: 03112020
record date: 02112020
record date: 01112020
record date: 31102020
record date: 30102020
record date: 29102020
record date: 28102020
record date: 27102020
record date: 26102020
record date: 25102020
record date: 24102020
record date: 23102020
record date: 22102020
record date: 21102020
record date: 20102020
record date: 19102020
record date: 18102020
record date: 17102020
record date: 16102020
record date: 15102020
record date: 14102020
record date: 13102020
record date: 12102020
record date: 11102020
record date: 10102020
record dat

In [8]:
df_patient.head()

Unnamed: 0,Date,Istanbul,Western Marmara,Aegean,Eastern Marmara,Western Anatolia,Mediterranean,Central Anatolia,Western Blacksea,Eastern Blacksea,Northeastern Anatolia,Mideastern Anatolia,Southeastern Anatolia
0,2020-11-23,1.557,234,893,899,533,728,294,551,226,137,193,468
0,2020-11-22,1.361,262,902,733,433,711,232,504,179,110,187,402
0,2020-11-21,1.404,234,810,792,367,564,199,368,167,101,137,389
0,2020-11-20,1.289,236,695,697,368,515,213,350,156,99,135,350
0,2020-11-19,1.161,171,603,696,347,430,175,320,134,86,129,289


In [9]:
df_hospitalization.head()

Unnamed: 0,Date,Istanbul,Western Marmara,Aegean,Eastern Marmara,Western Anatolia,Mediterranean,Central Anatolia,Western Blacksea,Eastern Blacksea,Northeastern Anatolia,Mideastern Anatolia,Southeastern Anatolia
0,2020-11-23,159,32,93,124,81,92,39,64,24,21,22,60
0,2020-11-22,149,36,101,113,61,65,42,62,24,19,24,58
0,2020-11-21,165,31,95,119,72,81,42,62,21,26,25,58
0,2020-11-20,159,34,103,131,68,91,45,55,21,22,21,57
0,2020-11-19,165,34,106,123,76,83,38,61,22,21,22,62


In [10]:
df_hospital_discharge.head()

Unnamed: 0,Date,Istanbul,Western Marmara,Aegean,Eastern Marmara,Western Anatolia,Mediterranean,Central Anatolia,Western Blacksea,Eastern Blacksea,Northeastern Anatolia,Mideastern Anatolia,Southeastern Anatolia
0,2020-11-23,137,24,66,99,52,59,27,37,14,16,23,59
0,2020-11-22,117,21,59,97,49,52,29,42,13,22,24,50
0,2020-11-21,133,25,68,109,57,61,34,43,16,21,24,53
0,2020-11-20,133,25,74,106,52,62,29,41,13,17,26,60
0,2020-11-19,131,23,71,94,62,53,34,50,15,18,23,54
