### PyPDF2 installation

In [1]:
!pip install PyPDF2

Collecting PyPDF2
[?25l  Downloading https://files.pythonhosted.org/packages/b4/01/68fcc0d43daf4c6bdbc6b33cc3f77bda531c86b174cac56ef0ffdb96faab/PyPDF2-1.26.0.tar.gz (77kB)
[K     |████▎                           | 10kB 12.2MB/s eta 0:00:01[K     |████████▌                       | 20kB 16.4MB/s eta 0:00:01[K     |████████████▊                   | 30kB 15.0MB/s eta 0:00:01[K     |█████████████████               | 40kB 11.4MB/s eta 0:00:01[K     |█████████████████████▏          | 51kB 9.9MB/s eta 0:00:01[K     |█████████████████████████▍      | 61kB 9.1MB/s eta 0:00:01[K     |█████████████████████████████▋  | 71kB 9.0MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 5.2MB/s 
[?25hBuilding wheels for collected packages: PyPDF2
  Building wheel for PyPDF2 (setup.py) ... [?25l[?25hdone
  Created wheel for PyPDF2: filename=PyPDF2-1.26.0-cp37-none-any.whl size=61102 sha256=f199fcbbadac8e9fac70d3df00d9e4472d742cfa4ecd5c729e42ea9654ca860c
  Stored in directory: /

In [2]:
from bs4 import BeautifulSoup
from io import BytesIO
import pandas as pd
import requests
import PyPDF2
import re

### Functions

In [3]:
def get_row(parsed, dt, key, value):
    """Build one table row (a ``pd.Series``) from the regex matches of a single report.

    Parameters
    ----------
    parsed : iterable of indexable
        Match tuples; ``pair[key]`` is used as the column label and
        ``pair[value]`` as the cell content.
    dt : str
        Report date in ``ddmmYYYY`` form (e.g. ``'23112020'``).
    key : int
        Position of the label inside each match tuple.
    value : int
        Position of the value inside each match tuple.

    Returns
    -------
    pd.Series
        One entry per match, labelled with the newline-stripped ``pair[key]``,
        plus a ``'Date'`` entry. Values are stored exactly as captured, with no
        numeric conversion.
        NOTE(review): figures such as '1.557' in the recorded outputs look like
        '.'-grouped thousands kept as text; confirm before using them as numbers.
    """
    # Labels can be split across lines in the extracted text, hence the newline removal.
    df_data = {pair[key].replace('\n', ''): pair[value] for pair in parsed}

    # Same contract as the former errors='ignore' (an unparseable date is kept
    # unchanged), without relying on that deprecated option.
    try:
        df_data['Date'] = pd.to_datetime(dt, format='%d%m%Y')
    except (ValueError, TypeError):
        df_data['Date'] = dt

    return pd.Series(df_data)

### Data scraping

In [4]:
saglik_url = 'https://covid19.saglik.gov.tr/TR-68444/gunluk-rapor--daily-report.html'

# Page the report links are scraped from. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() stops here on an HTTP
# error instead of handing an error page to the HTML parser.
response = requests.get(saglik_url, timeout=60)
response.raise_for_status()
content = response.content

In [5]:
soup = BeautifulSoup(content, 'html.parser')

# Pattern selecting the English-language report pages among the table links.
eng = re.compile('.*eng.html')

# .get('href') returns None for an anchor without an href, and re.match raises a
# TypeError on None, so empty values are dropped before the pattern is applied.
hrefs = [anchor.get('href') for anchor in soup.select('tbody a')]
links = [href for href in hrefs if href and eng.match(href)]

In [6]:
# Rows are collected in plain lists and converted to DataFrames once after the loop:
# DataFrame.append copied the whole frame on every call and is gone in pandas 2.x.
patient_rows = []
hospitalization_rows = []
hospital_discharge_rows = []

# Raw string so that \d reaches the regex engine instead of being an invalid str escape.
date_pattern = re.compile(r"-(\d*)-eng.html")

# Label, first number, anything, last number: feeds df_hospitalization (first number)
# and df_hospital_discharge (last number).
hospital_pattern = re.compile(r"([A-Z][a-zA-Z \n]*[a-z])[\n -]+([0-9.]+)[\n -]+.*[\n -]+([0-9.]+)")
# Label followed by a single number: feeds df_patient.
patient_pattern = re.compile(r"([A-Z][a-zA-Z \n]*[a-z])[\n -]+([0-9.]+)")


def _region_section(page_text):
    """Return the part of page_text from the first "Istanbul" up to the following "Turkey".

    Raises ValueError (from str.index) when either marker is missing.
    """
    section = page_text[page_text.index("Istanbul"):]
    return section[:section.index("Turkey")]


# NOTE: PdfFileReader / getPage / extractText are PyPDF2 1.x names; the install log
# recorded in this notebook shows PyPDF2 1.26.0.
for link in links:
    date = date_pattern.search(link).group(1)

    if date == '14082020': # The link is broken
        continue

    # Fail on an HTTP error or a stalled download rather than feeding a non-PDF body
    # to the PDF reader or waiting forever.
    response = requests.get('https://covid19.saglik.gov.tr' + link, timeout=60)
    response.raise_for_status()
    pdfFileObj = BytesIO(response.content)
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # Page index 4: two figures per label.
    parsed = hospital_pattern.findall(_region_section(pdfReader.getPage(4).extractText()))
    hospitalization_rows.append(get_row(parsed, date, key=0, value=1))
    hospital_discharge_rows.append(get_row(parsed, date, key=0, value=2))

    # Page index 2: one figure per label.
    parsed = patient_pattern.findall(_region_section(pdfReader.getPage(2).extractText()))
    patient_rows.append(get_row(parsed, date, key=0, value=1))

    print('record date:', date)

# One row per processed report, in loop order with a fresh 0..n-1 index.
# sort_index(axis=1) keeps the alphabetical column order seen in the recorded outputs.
df_patient = pd.DataFrame(patient_rows).sort_index(axis=1)
df_hospitalization = pd.DataFrame(hospitalization_rows).sort_index(axis=1)
df_hospital_discharge = pd.DataFrame(hospital_discharge_rows).sort_index(axis=1)

record date: 23112020
record date: 22112020
record date: 21112020
record date: 20112020
record date: 19112020
record date: 18112020
record date: 17112020
record date: 16112020
record date: 15112020
record date: 14112020
record date: 13112020
record date: 12112020
record date: 11112020
record date: 10112020
record date: 09112020
record date: 08112020
record date: 07112020
record date: 06112020
record date: 05112020
record date: 04112020
record date: 03112020
record date: 02112020
record date: 01112020
record date: 31102020
record date: 30102020
record date: 29102020
record date: 28102020
record date: 27102020
record date: 26102020
record date: 25102020
record date: 24102020
record date: 23102020
record date: 22102020
record date: 21102020
record date: 20102020
record date: 19102020
record date: 18102020
record date: 17102020
record date: 16102020
record date: 15102020
record date: 14102020
record date: 13102020
record date: 12102020
record date: 11102020
record date: 10102020
record dat

### Data transformation

In [7]:
def _indexed_by_date(frame):
    """Return frame indexed by its 'Date' column; a frame already indexed by 'Date' is returned as is.

    The guard makes this cell safe to re-run: calling set_index(['Date']) a second
    time would raise KeyError because 'Date' is no longer a column. A frame that has
    neither a 'Date' index nor a 'Date' column still raises KeyError.
    """
    if frame.index.name == 'Date':
        return frame
    return frame.set_index(['Date'])


df_patient = _indexed_by_date(df_patient)
df_hospitalization = _indexed_by_date(df_hospitalization)
df_hospital_discharge = _indexed_by_date(df_hospital_discharge)

In [8]:
# Preview the first three rows of the patient table.
# NOTE(review): Istanbul values in the recorded output (e.g. 1.557) look like
# '.'-grouped thousands kept as text - confirm before treating them as numbers.
df_patient.head(3)

Unnamed: 0_level_0,Aegean,Central Anatolia,Eastern Blacksea,Eastern Marmara,Istanbul,Mediterranean,Mideastern Anatolia,Northeastern Anatolia,Southeastern Anatolia,Western Anatolia,Western Blacksea,Western Marmara
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-11-23,893,294,226,899,1.557,728,193,137,468,533,551,234
2020-11-22,902,232,179,733,1.361,711,187,110,402,433,504,262
2020-11-21,810,199,167,792,1.404,564,137,101,389,367,368,234


In [9]:
# Preview the first three rows of the hospitalization table.
df_hospitalization.head(3)

Unnamed: 0_level_0,Aegean,Central Anatolia,Eastern Blacksea,Eastern Marmara,Istanbul,Mediterranean,Mideastern Anatolia,Northeastern Anatolia,Southeastern Anatolia,Western Anatolia,Western Blacksea,Western Marmara
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-11-23,93,39,24,124,159,92,22,21,60,81,64,32
2020-11-22,101,42,24,113,149,65,24,19,58,61,62,36
2020-11-21,95,42,21,119,165,81,25,26,58,72,62,31


In [10]:
# Preview the first three rows of the hospital discharge table.
df_hospital_discharge.head(3)

Unnamed: 0_level_0,Aegean,Central Anatolia,Eastern Blacksea,Eastern Marmara,Istanbul,Mediterranean,Mideastern Anatolia,Northeastern Anatolia,Southeastern Anatolia,Western Anatolia,Western Blacksea,Western Marmara
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-11-23,66,27,14,99,137,59,23,16,59,52,37,24
2020-11-22,59,29,13,97,117,52,24,22,50,49,42,21
2020-11-21,68,34,16,109,133,61,24,21,53,57,43,25


### Download

In [11]:
from google.colab import files

# Output file name -> table written to it (insertion order is the export order).
exports = {
    'patient.csv': df_patient,
    'hospitalization.csv': df_hospitalization,
    'hospital_discharge.csv': df_hospital_discharge,
}

# Write every CSV first ...
for csv_name, frame in exports.items():
    frame.to_csv(csv_name)

# ... then hand each file to the Colab download helper, in the same order.
for csv_name in exports:
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>