# Setup variables and functions

In [2]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import re
import sys
from tqdm import tqdm

In [3]:
month_translation = {"January": "januari",
                    "February": "februari",
                    "March": "mac",
                    "April": "april",
                    "May": "mei",
                    "June": "jun",
                    "July": "julai",
                    "August": "ogos",
                    "September": "september",
                    "October": "oktober",
                    "November": "november",
                    "December": "disember"}

In [4]:
def create_datetime(day, month, year):
    data_date = '-'.join([str(day).zfill(2), str(month).zfill(2), str(year)])
    data_datetime = datetime.strptime(data_date, '%d-%m-%Y')
    return data_datetime

In [5]:
def create_date_dict(dt):
    month_full = month_translation[dt.strftime('%B')]
    date_dict = {'format1': dt.strftime('%Y/%m/%d'), 'format2': f'{dt.day}-{month_full}-{dt.year}'}
    return date_dict

In [6]:
def create_datetime_and_dict(day, month, year):
    data_datetime = create_datetime(day, month, year)
    date_dict = create_date_dict(data_datetime)
    return data_datetime, date_dict

In [7]:
cases_to_extract_old = ["pulih", "telah discaj", "JUMLAH KESELURUHAN", 
                        "jumlah kes positif", "Unit Rawatan Rapi", 
                        "bantuan pernafasan", "kes kematian", "kumulatif kes kematian"]

In [8]:
cases_to_extract_new = ["Kes sembuh", "kumulatif_0", "JUMLAH KESELURUHAN", 
                        "kumulatif_1", "Kes import", "Kes tempatan", 
                        "Kes aktif", "Unit Rawatan Rapi", 
                        "bantuan pernafasan", "Kes kematian", "kumulatif_2"]

In [9]:
column_names = ["Date", "Recovered", "Cumulative Recovered", "Imported Case", 
                "Local Case", "Active Case", "New Case", 
                "Cumulative Case", "ICU", "Ventilator", 
                "Death", "Cumulative Death", "URL"]

In [10]:
case_name_mapping = {"pulih": "Recovered", "telah discaj": "Cumulative Recovered",
                     "JUMLAH KESELURUHAN": "New Case", "jumlah kes positif": "Cumulative Case", 
                     "Unit Rawatan Rapi": "ICU", "bantuan pernafasan": "Ventilator", 
                     "kes kematian": "Death", "kumulatif kes kematian": "Cumulative Death",
                     "jumlah kumulatif kes positif": "Cumulative Case",
                     "Jumlah kes positif": "Cumulative Case",
                     # new text format mapping
                     "Kes sembuh": "Recovered", "kumulatif_0": "Cumulative Recovered", 
                     "kumulatif_1": "Cumulative Case", "Kes import": "Imported Case", 
                     "Kes tempatan": "Local Case", "Kes aktif": "Active Case",
                     "kumulatif_2": "Cumulative Death"}

In [11]:
def get_matched_number(txt_found, numbers_found, 
                       text_pos='first', number_pos='first', 
                       verbose=0):
    
    # text_pos & number_pos are the locating positions to find the nearest digits
    assert text_pos in ('first', 'end')
    assert number_pos in ('first', 'end')
    
    distance_list = []
    short_dist_found = False
    
    text_idx = 0 if text_pos == 'first' else 1
    number_idx = 0 if number_pos == 'first' else 1

    for number in numbers_found:
        distance = abs(txt_found.span()[text_idx] - number.span()[number_idx])

        # Stop the loop if already found shortest distance
        if distance < 200:
            short_dist_found = True
        if distance > 400 and short_dist_found:
            break

        distance_list.append(distance)

    min_dist = min(distance_list)
    min_index = distance_list.index(min_dist)
    matched_number = numbers_found[min_index].group()
    
    if verbose:
        print(f'Numbers found so far: \
              {[number.group() for number in numbers_found[:len(distance_list)]]}')
        print(f'Distance list: {distance_list}\n')
    
    return matched_number

In [21]:
def scrape_data_old(current_url, verbose=0):
    r = requests.get(current_url)
    if r.status_code == 404:
        raise Exception("Error accessing page!!")
    soup = BeautifulSoup(r.content)
    all_text = soup.get_text()
    # Remove all COVID-19 words to avoid getting number 19 accidentally
    all_text = all_text.replace('COVID-19', '')\
                        .replace('covid19', '')\
                        .replace(',', '')
    numbers_found = list(re.finditer('\d+', all_text))
    data_dict = {}
    txt_to_skip = None
    
    if 'tiada kes kematian berkaitan' in all_text or current_url == problem_url_1:
        data_dict['Death'] = 0
        data_dict['Cumulative Death'] = np.nan
        txt_to_skip = ('kes kematian', 'kumulatif kes kematian')
        
    cases_to_extract = cases_to_extract_old.copy()
    
#     if current_url == problem_url:
#         cases_to_extract.remove("jumlah kes positif")
#         cases_to_extract.append("jumlah kumulatif kes positif")
    
#     if current_url == problem_url_2:
#         cases_to_extract.remove("jumlah kes positif")
#         cases_to_extract.append("Jumlah kes positif")
    
    for txt in cases_to_extract:
        if txt_to_skip:
            if txt in txt_to_skip:
                continue
        
        try:
            if verbose:
                print(f"[INFO] Finding {txt} ...")

            if txt == 'pulih':
                try:
                    txt_found = list(re.finditer(txt, all_text))[0]
                except:
                    # not the same text anymore, proceed to new format
                    return None
            elif txt == 'jumlah kes positif':
                txt_found_list = list(re.finditer(txt, all_text))
                if len(txt_found_list) == 0:
                    for i in ("jumlah kumulatif kes positif", "Jumlah kes positif"):
                        print(f"[INFO] Trying {i}")
                        txt_found_list = list(re.finditer(i, all_text))
                        # print(len(txt_found_list))
                        if len(txt_found_list) > 0:
                            break  
                txt_found = txt_found_list[0]
            else:
                txt_found = list(re.finditer(txt, all_text))[0]
        except Exception as e:
            print(f"Error obtaining {txt} !!")
            raise Exception(f"{e.__class__} occurred.")
        
        if txt == "JUMLAH KESELURUHAN":
            text_pos = 'end'
        else:
            text_pos = 'first'
        
        if verbose:
            print(f"Text found: {txt_found}\n")

        matched_number = get_matched_number(txt_found, numbers_found, 
                                            verbose=verbose, text_pos=text_pos)
        
        correct_col_name = case_name_mapping[txt]
        data_dict[correct_col_name] = matched_number
        
    for col_name in ("Imported Case", "Local Case", "Active Case",):
        data_dict[col_name] = np.nan
    
      
    return data_dict

In [13]:
def find_text(txt, all_text):
    sentence_list = list(re.finditer(rf"([^.\n]*?{txt}[^.]*\.)", all_text))
    if not sentence_list:
        raise Exception(f"[ERROR] {txt} not found!")
    else:
        return sentence_list[0]

In [22]:
def scrape_data(current_url, verbose=0):
    r = requests.get(current_url)
    if r.status_code == 404:
        raise Exception("Error accessing page!!")
    soup = BeautifulSoup(r.content)
    all_text = soup.get_text()
    # Remove all COVID-19 words to avoid getting number 19 accidentally
    all_text = all_text.replace('COVID-19', '')\
                        .replace('covid19', '')\
                        .replace(',', '')
    numbers_found = list(re.finditer('\d+', all_text))
    data_dict = {}
    txt_to_skip = None
    
    # if 'tiada kes kematian berkaitan' in all_text or current_url == problem_url_1:
    #     data_dict['Death'] = 0
    #     data_dict['Cumulative Death'] = np.nan
    #     txt_to_skip = ('kes kematian', 'kumulatif kes kematian')
        
    cases_to_extract = cases_to_extract_old.copy()
    
    for txt in cases_to_extract:
        if txt_to_skip:
            if txt in txt_to_skip:
                continue
        
        try:
            if verbose:
                print(f"[INFO] Finding {txt} ...")

            if txt == 'pulih':
                try:
                    txt_found = find_text(txt, all_text)
                except:
                    print('[ERROR] "pulih" not found ...')
                    # not the same text anymore, proceed to new format
                    return None
            elif txt == 'jumlah kes positif':
                txt_to_search = "'(jumlah kes positif|jumlah kumulatif kes positif|Jumlah kes positif)'"
                txt_found = find_text(txt_to_search, all_text)
            else:
                txt_found = find_text(txt, all_text)
        except Exception as e:
            print(f"Error obtaining {txt} !!")
            raise Exception(f"{e.__class__} occurred.")
        
        if txt == "JUMLAH KESELURUHAN":
            text_pos = 'end'
        else:
            text_pos = 'first'
        
        if verbose:
            print(f"Text found: {txt_found}\n")

        matched_number = get_matched_number(txt_found, numbers_found, 
                                            verbose=verbose, text_pos=text_pos)
        
        correct_col_name = case_name_mapping[txt]
        data_dict[correct_col_name] = matched_number
        
    for col_name in ("Imported Case", "Local Case", "Active Case",):
        data_dict[col_name] = np.nan
    
      
    return data_dict

# Test scraping new format

In [15]:
URL = "https://kpkesihatan.com/2021/04/17/kenyataan-akhbar-kpk-17-april-2021-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"

In [16]:
r = requests.get(URL)
soup = BeautifulSoup(r.content)

all_text = soup.get_text().lower()
# Remove all COVID-19 words to avoid getting number 19 accidentally
all_text = all_text.replace('covid-19', '').replace('covid19', '').replace(',', '')

In [17]:
txt_found = list(re.finditer('kes sembuh', all_text))[0]
txt_found

<re.Match object; span=(825, 835), match='kes sembuh'>

In [18]:
numbers_found = list(re.finditer('\d+', all_text))

In [19]:
def scrape_data_2(current_url, verbose=0):
    r = requests.get(current_url)
    if r.status_code == 404:
        raise Exception("Error 404 accessing page!!")
    soup = BeautifulSoup(r.content)
    
    all_text = soup.get_text()
    # Remove all COVID-19 words to avoid getting number 19 accidentally
    # Removing commas to obtain the full digits
    all_text = all_text.replace('COVID-19', '')\
                        .replace('covid19', '')\
                        .replace(',', '')
    numbers_found = list(re.finditer('\d+', all_text))
    data_dict = {}
    
    cases_to_extract = cases_to_extract_new.copy()
    
    for txt in cases_to_extract:
        if verbose:
            print(f"[INFO] Finding {txt} ...")
            
        if 'kumulatif' in txt:
            text_pos = 'first'
            number_pos = 'end'
            # get the specific position of the text for cumulative case
            text_idx = int(txt.split("_")[1])
            cumulative_text = 'kes kumulatif'
            try:
                txt_found = list(re.finditer(cumulative_text, all_text))[text_idx]
            except Exception as e:
                print(f"Error obtaining {cumulative_text} !!")
                raise Exception(f"{e.__class__} occurred.")
        else:
            text_pos = 'end'
            number_pos = 'first'
            try:
                txt_found = list(re.finditer(txt, all_text))[0]
            except Exception as e:
                print(f"Error obtaining {txt} !!")
                raise Exception(f"{e.__class__} occurred.")
                
        if txt in ("JUMLAH KESELURUHAN", "kumulatif kes kematian"):
            text_pos = 'end'
        else:
            text_pos = 'first'
        
        if verbose:
            print(f"Text found: {txt_found}\n")

        matched_number = get_matched_number(txt_found, numbers_found, 
                                            verbose=verbose, text_pos=text_pos)
        correct_col_name = case_name_mapping[txt]
        data_dict[correct_col_name] = matched_number
    
    return data_dict

In [21]:
scrape_data_2(URL)

NameError: name 'cases_to_extract_new' is not defined

# Scraping more days

In [7]:
URL = "https://kpkesihatan.com/2021/04/17/kenyataan-akhbar-kpk-17-april-2021-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"

In [13]:
# df = pd.DataFrame(columns=["Recovered Case", "Imported Case", "Local Case", "Active Case", "ICU", "Ventilator Support", "Death"])
df = pd.DataFrame(columns=column_names)
display(df.head())
df.columns

Unnamed: 0,Date,Recovered,Cumulative Recovered,Imported Case,Local Case,Active Case,New Case,Cumulative Case,ICU,Ventilator,Death,Cumulative Death,URL


Index(['Date', 'Recovered', 'Cumulative Recovered', 'Imported Case',
       'Local Case', 'Active Case', 'New Case', 'Cumulative Case', 'ICU',
       'Ventilator', 'Death', 'Cumulative Death', 'URL'],
      dtype='object')

In [14]:
start_date_dt, start_date_dict = create_datetime_and_dict(day=27, month=3, year=2020)
# start_date_dt, start_date_dict = create_datetime_and_dict(day=10, month=6, year=2020)
end_date_dt, end_date_dict = create_datetime_and_dict(day=13, month=5, year=2020)
total_days = (end_date_dt - start_date_dt).days + 1  # inclusive of final date

In [15]:
start_date_dt, start_date_dict, total_days

(datetime.datetime(2020, 3, 27, 0, 0),
 {'format1': '2020/03/27', 'format2': '27-mac-2020'},
 48)

In [16]:
special_dates = ['16-Apr-2020', '13-May-2020', '28-May-2020', '18-Jun-2020']
special_dt = [datetime.strptime(i, '%d-%b-%Y') for i in special_dates]
special_urls = ["https://kpkesihatan.com/2020/04/16/kenyataan-akhbar-16-april-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/",
               "https://kpkesihatan.com/2020/05/13/kenyataan-akhbar-kpk-13-may-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/",
               "https://kpkesihatan.com/2020/05/28/situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/",
               "https://kpkesihatan.com/2020/06/18/kenyataan-akhbar-kpk-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"]

In [17]:
problem_url_1 = "https://kpkesihatan.com/2020/04/20/kenyataan-akhbar-kpk-20-april-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"

In [18]:
problem_url = "https://kpkesihatan.com/2020/05/12/kenyataan-akhbar-kpk-12-mei-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"
problem_url_2 = "https://kpkesihatan.com/2020/06/04/kenyataan-akhbar-kpk-4-jun-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"
prob_3 = "https://kpkesihatan.com/2020/06/10/kenyataan-akhbar-kpk-10-jun-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"

In [21]:
default_url = "https://kpkesihatan.com/{format1}/kenyataan-akhbar-kpk-{format2}-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"

In [620]:
current_date, current_date_dict = start_date_dt, start_date_dict
new_format_flag = False

for day_number in tqdm(range(total_days), desc=f"Scraping data..."):
    # print(f"[INFO] Scraping data for {current_date.date()} ...")
    current_url = default_url.format(**current_date_dict)
    # print(current_url)
    if current_date in special_dt:
        current_url = special_urls[special_dt.index(current_date)]
    try:
        if not new_format_flag: 
            # still in old text format, scrape using old method
            data_dict = scrape_data(current_url)

        if not data_dict:
            # using new text format method
            new_format_flag = True
            print(f"[INFO] NEW text format on {current_date.date()}.")
            data_dict = scrape_data_2(current_url)

        # print(data_dict)

        # df.loc[current_date] = data_dict
        data_dict["Date"] = current_date
        data_dict["URL"] = current_url
        df = df.append(data_dict, ignore_index=True)        

        current_date += timedelta(days=1)
        current_date_dict = create_date_dict(current_date)
    except:
        print("[ERROR] Problem with", current_url)
        raise Exception(f"Error on {current_date.date()}")
    # print()
    # break

Scraping data...:  98%|██████████████████████████████████████████████████████████████▋ | 47/48 [00:31<00:00,  1.59it/s]

[INFO] Trying jumlah kumulatif kes positif


Scraping data...: 100%|████████████████████████████████████████████████████████████████| 48/48 [00:32<00:00,  1.48it/s]


In [621]:
df

Unnamed: 0,Date,Recovered,Cumulative Recovered,Imported Case,Local Case,Active Case,New Case,Cumulative Case,ICU,Ventilator,Death,Cumulative Death,URL
0,2020-03-27,44,259,,,,130,2161,34,34,3,26.0,https://kpkesihatan.com/2020/03/27/kenyataan-a...
1,2020-03-28,61,320,,,,159,2320,73,54,1,27.0,https://kpkesihatan.com/2020/03/28/kenyataan-a...
2,2020-03-29,68,388,,,,150,2470,73,52,7,7.0,https://kpkesihatan.com/2020/03/29/kenyataan-a...
3,2020-03-30,91,479,,,,156,2626,94,62,3,37.0,https://kpkesihatan.com/2020/03/30/kenyataan-a...
4,2020-03-31,58,537,,,,140,2766,94,60,6,6.0,https://kpkesihatan.com/2020/03/31/kenyataan-a...
5,2020-04-01,108,645,,,,2,2908,102,66,2,2.0,https://kpkesihatan.com/2020/04/01/kenyataan-a...
6,2020-04-02,122,767,,,,208,3116,105,54,5,5.0,https://kpkesihatan.com/2020/04/02/kenyataan-a...
7,2020-04-03,60,827,,,,217,3333,108,54,3,3.0,https://kpkesihatan.com/2020/04/03/kenyataan-a...
8,2020-04-04,88,915,,,,150,3483,99,50,4,4.0,https://kpkesihatan.com/2020/04/04/kenyataan-a...
9,2020-04-05,90,1005,,,,179,3662,99,48,4,4.0,https://kpkesihatan.com/2020/04/05/kenyataan-a...


In [622]:
data_dict

{'Recovered': '58',
 'Cumulative Recovered': '5281',
 'New Case': '37',
 'Cumulative Case': '33',
 'ICU': '16',
 'Ventilator': '4',
 'Death': '2',
 'Cumulative Death': '2',
 'Imported Case': nan,
 'Local Case': nan,
 'Active Case': nan,
 'URL': 'https://kpkesihatan.com/2020/05/13/kenyataan-akhbar-kpk-13-may-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/',
 'Date': datetime.datetime(2020, 5, 13, 0, 0)}

## Test for debugging

In [15]:
case_name_mapping

{'pulih': 'Recovered',
 'telah discaj': 'Cumulative Recovered',
 'JUMLAH KESELURUHAN': 'New Case',
 'jumlah kes positif': 'Cumulative Case',
 'Unit Rawatan Rapi': 'ICU',
 'bantuan pernafasan': 'Ventilator',
 'kes kematian': 'Death',
 'kumulatif kes kematian': 'Cumulative Death',
 'jumlah kumulatif kes positif': 'Cumulative Case',
 'Jumlah kes positif': 'Cumulative Case',
 'Kes sembuh': 'Recovered',
 'kumulatif_0': 'Cumulative Recovered',
 'kumulatif_1': 'Cumulative Case',
 'Kes import': 'Imported Case',
 'Kes tempatan': 'Local Case',
 'Kes aktif': 'Active Case',
 'kumulatif_2': 'Cumulative Death'}

In [16]:
test_date_dt, test_date_dict = create_datetime_and_dict(day=1, month=4, year=2020)

In [17]:
default_url = "https://kpkesihatan.com/{format1}/kenyataan-akhbar-kpk-{format2}-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"

In [18]:
current_url = default_url.format(**test_date_dict)
current_url

'https://kpkesihatan.com/2020/04/01/kenyataan-akhbar-kpk-1-april-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/'

In [23]:
scrape_data(current_url, verbose=1)

[INFO] Finding pulih ...
Text found: <re.Match object; span=(651, 783), match='Kementerian Kesihatan Malaysia (KKM) ingin memakl>

Numbers found so far:               ['1', '2020', '2019', '1', '2020', '2019', '1', '2020', '108', '645', '22', '2', '1', '2020', '12', '00', '142']
Distance list: [625, 617, 565, 161, 153, 101, 55, 53, 71, 249, 258, 261, 346, 354, 365, 368, 392]

[INFO] Finding telah discaj ...
Text found: <re.Match object; span=(783, 912), match=' Ini menjadikan jumlah kumulatif kes yang telah p>

Numbers found so far:               ['1', '2020', '2019', '1', '2020', '2019', '1', '2020', '108', '645', '22', '2', '1', '2020', '12', '00', '142', '2908', '102']
Distance list: [757, 749, 697, 293, 285, 233, 187, 185, 61, 117, 126, 129, 214, 222, 233, 236, 260, 361, 393]

[INFO] Finding JUMLAH KESELURUHAN ...
Text found: <re.Match object; span=(3566, 3722), match='JUMLAH KESELURUHAN\n2\n15\n21\n27\n65\n\n\n\nJadu>

Numbers found so far:               ['1', '2020', '2019', '1',

Exception: <class 'Exception'> occurred.

In [24]:
r = requests.get(current_url)
if r.status_code == 404:
    raise Exception("Error 404 accessing page!!")
soup = BeautifulSoup(r.content)

all_text = soup.get_text()
all_text = all_text.replace('COVID-19', '')\
                    .replace('covid19', '')\
                    .replace(',', '')

In [30]:
soup.find_all("tr", string='JUMLAH KESELURUHAN')

[]

In [33]:
tag_list = [tags for tags in soup.find_all("tr") if "JUMLAH KESELURUHAN" in tags.text][-1]

In [35]:
tag_list

<tr>
<td width="302"><strong>JUMLAH KESELURUHAN</strong></td>
<td width="151"><strong>142</strong></td>
<td width="142"><strong>2,908</strong></td>
</tr>

In [36]:
list(re.finditer('Jumlah kes positif', all_text))

[]

In [43]:
sentence_list = [sentence + '.' for sentence in all_text.split('.') if 'kes yang telah pulih' in sentence]
sentence_list

['\n\n\n\uf400\n\n\n\n\n\n\n\n\nCommunicable disease\nKenyataan Akhbar KPK 10 Jun 2020 -Situasi Semasa Jangkitan Penyakit Coronavirus 2019 () di Malaysia \nBy DG of Health on June 10 2020 \n\nSTATUS TERKINI KES DISAHKAN  YANG TELAH PULIH\nKementerian Kesihatan Malaysia (KKM) ingin memaklumkan bahawa terdapat 39 kes yang telah pulih dan dibenarkan discaj pada hari ini.',
 ' Jumlah kumulatif kes yang telah pulih sepenuhnya dari  dan telah discaj daripada wad adalah sebanyak 7014 kes (84.']

In [54]:
x = 'pulih'
txt_x = rf"haha {x} 123."
txt_x

'haha pulih 123.'

In [117]:
txt = 'kematian'
sentence_list = list(re.finditer(rf"([^.\n]*?{txt}[^.]*\.)", all_text))
sentence_list

[<re.Match object; span=(1571, 1756), match='Sukacita dimaklumkan daripada maklumat terkini ya>,
 <re.Match object; span=(1756, 1839), match=' Oleh itu sehingga pukul 12 tengah hari 20 April >,
 <re.Match object; span=(3530, 3740), match=' Terdapat lima (5) kes kematian  dari kawasan ini>]

In [40]:
sentence_list[0]

'\n\n\n\uf400\n\n\n\n\n\n\n\n\nCommunicable disease\nKenyataan Akhbar KPK 10 Jun 2020 -Situasi Semasa Jangkitan Penyakit Coronavirus 2019 () di Malaysia \nBy DG of Health on June 10 2020 \n\nSTATUS TERKINI KES DISAHKAN  YANG TELAH PULIH\nKementerian Kesihatan Malaysia (KKM) ingin memaklumkan bahawa terdapat 39 kes yang telah pulih dan dibenarkan discaj pada hari ini.'

In [433]:
df.loc[test_date_dt] = scrape_data(current_url)
df

Unnamed: 0,Recovered,Cumulative Recovered,Imported Case,Local Case,Active Case,New Case,Cumulative Case,ICU,Ventilator,Death,Cumulative Death
2020-04-20,98,3295,,,,36,5425,45,28,0,


In [379]:
cases_to_extract_old

['pulih',
 'telah discaj',
 'JUMLAH KESELURUHAN',
 'jumlah kes positif',
 'Unit Rawatan Rapi',
 'bantuan pernafasan',
 'kematian',
 'kumulatif kes kematian']

In [396]:
r = requests.get(current_url)
soup = BeautifulSoup(r.content)
all_text = soup.get_text()

'\n\n\n\n\nKenyataan Akhbar KPK 20 April 2020 – Situasi Semasa Jangkitan Penyakit Coronavirus 2019 (COVID-19) di Malaysia – From the Desk of the Director-General of Health Malaysia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nMenu\nHome\nAbout\nPress\nPress response\nBlog\nSpeech\n \n\n\n\n\n\n \n\n\t\t\t\t\t\tFrom the Desk of the Director-General of Health Malaysia\t\t\t\t\t\n\n\n\t\t\t\t\tA Nation working together for better Health\t\t\t\t\n\n\n\n\nSearch for...\n\n\n\uf400\n\n\n\n\n\n\n\n\nCommunicable disease\nKenyataan Akhbar KPK 20 April 2020 – Situasi Semasa Jangkitan Penyakit Coronavirus 2019 (COVID-19) di Malaysia \nBy DG of Health on April 20, 2020 \n\nSTATUS TERKINI KES DISAHKAN COVID-19 YANG TELAH PULIH\nKementerian Kesihatan Malaysia (KKM) ingin memaklumkan bahawa terdapat 98 kes yang telah pulih dan dibenarkan discaj pada hari ini. Ini menjadikan jumlah kumulatif kes yang tel

# Archive

In [222]:
# def scrape_data_2(soup):
#     data_list = []
    
#     for txt in cases_to_extract_old:
#         # print(f"[INFO] Finding {txt} ...")
#         txt_found = soup.find(string=re.compile(txt))
#         int_found = np.nan
#         print(txt_found)

#         if txt_found:
#             txt_split = txt_found.split()
#             if 'COVID-19' in txt_split:
#                 txt_split.remove('COVID-19')
                
#             txt_found = ' '.join(txt_split)
#             int_found = re.findall(r'[\d]+', txt_found)
#             if len(int_found) == 1:
#                 int_found = int(int_found[0])
#             elif len(int_found) > 1:
#                 distance_list = []
#                 for number in int_found:
#                     distance = abs(re.search(number, txt_found).span()[0]\
#                                    - re.search(txt, txt_found).span()[0])
#                     distance_list.append(distance)
#                 min_dist = min(distance_list)
#                 min_index = distance_list.index(min_dist)
#                 int_found = int_found[min_index]
#             else:
#                 int_found = np.nan
#             # print(int_found)
#         else:
#             int_found = np.nan
#         data_list.append(int_found)
#         # break
#         # print()
#     return data_list

# Scrape table for the data of each state

In [2]:
# current_url = "https://kpkesihatan.com/2020/07/08/kenyataan-akhbar-kpk-8-julai-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"
current_url = "https://kpkesihatan.com/2020/12/20/kenyataan-akhbar-kpk-20-disember-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"
# current_url = "https://kpkesihatan.com/2020/12/21/kenyataan-akhbar-kpk-21-disember-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"
# current_url = "https://kpkesihatan.com/2021/02/10/kenyataan-akhbar-kpk-10-februari-2021-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"

r = requests.get(current_url)
if r.status_code == 404:
    raise Exception("Error 404 accessing page!!")

df = pd.read_html(r.content, match='JUMLAH KESELURUHAN', header=0)[-1]
df

Unnamed: 0,NEGERI,BILANGAN KES BAHARU *( ),BILANGAN KES KUMULATIF
0,SABAH,196,34478
1,SELANGOR,441 (1),24465
2,WP KUALA LUMPUR,284,10188
3,NEGERI SEMBILAN,169,7045
4,JOHOR,156,3143
5,PULAU PINANG,30,2960
6,KEDAH,6,2888
7,PERAK,10,2867
8,WP LABUAN,27,1541
9,SARAWAK,3 (2),1090


In [3]:
current_date = datetime.today()
current_date

datetime.datetime(2021, 4, 23, 13, 19, 11, 168722)

In [29]:
df2 = df.copy()
df2 = df2.set_index('NEGERI')
df2 = df2.T
df2['Date'] = current_date
df2

NEGERI,SABAH,SELANGOR,WP KUALA LUMPUR,NEGERI SEMBILAN,JOHOR,PULAU PINANG,KEDAH,PERAK,WP LABUAN,SARAWAK,PAHANG,MELAKA,KELANTAN,TERENGGANU,WP PUTRAJAYA,PERLIS,JUMLAH KESELURUHAN,Date
BILANGAN KES BAHARU *( ),196,441 (1),284,169,156,30,6,10,27,3 (2),12,1,1,2,2,0,"1,340 (3)",2021-04-23 13:19:11.168722
BILANGAN KES KUMULATIF,34478,24465,10188,7045,3143,2960,2888,2867,1541,1090,889,695,494,287,234,45,93309,2021-04-23 13:19:11.168722


In [30]:
df2.set_index('Date', inplace=True)
df2

NEGERI,SABAH,SELANGOR,WP KUALA LUMPUR,NEGERI SEMBILAN,JOHOR,PULAU PINANG,KEDAH,PERAK,WP LABUAN,SARAWAK,PAHANG,MELAKA,KELANTAN,TERENGGANU,WP PUTRAJAYA,PERLIS,JUMLAH KESELURUHAN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-04-23 13:19:11.168722,196,441 (1),284,169,156,30,6,10,27,3 (2),12,1,1,2,2,0,"1,340 (3)"
2021-04-23 13:19:11.168722,34478,24465,10188,7045,3143,2960,2888,2867,1541,1090,889,695,494,287,234,45,93309


- Before     --  'W.P. KUALA LUMPUR'
- 20-12-2020 --  'WP\xa0KUALA LUMPUR'
- 21-12-2020 --  'WP KUALA LUMPUR', 'WP\xa0PUTRAJAYA'
- 10-02-2021 --  'WP KUALA LUMPUR'

In [31]:
df2.columns.values

array(['SABAH', 'SELANGOR', 'WP\xa0KUALA LUMPUR', 'NEGERI SEMBILAN',
       'JOHOR', 'PULAU PINANG', 'KEDAH', 'PERAK', 'WP\xa0LABUAN',
       'SARAWAK', 'PAHANG', 'MELAKA', 'KELANTAN', 'TERENGGANU',
       'WP\xa0PUTRAJAYA', 'PERLIS', 'JUMLAH KESELURUHAN'], dtype=object)

In [32]:
# def replace_unk_name(matchObj):
#     new_name = ' '.join(['WP', matchObj.group(3)]).strip()
#     # display(new_name)
#     return new_name

# new_col_names = []
# # to fixed weird names containing \xa0 
# #  or extra spaces in between WP and state name
# for col_name in df2.columns:
#     new_col_names.append(re.sub(r'(WP|W.P.)(\xa0|[\s]+)(\w+)', replace_unk_name, col_name))

# new_col_names

In [34]:
# df2.columns = new_col_names

In [35]:
# simpler method
df2.columns = df2.columns.str.replace(
                '\xa0', ' ').str.replace('.', '', regex=False)
df2.columns.values

array(['SABAH', 'SELANGOR', 'WP KUALA LUMPUR', 'NEGERI SEMBILAN', 'JOHOR',
       'PULAU PINANG', 'KEDAH', 'PERAK', 'WP LABUAN', 'SARAWAK', 'PAHANG',
       'MELAKA', 'KELANTAN', 'TERENGGANU', 'WP PUTRAJAYA', 'PERLIS',
       'JUMLAH KESELURUHAN'], dtype=object)

In [36]:
df2.columns.values

array(['SABAH', 'SELANGOR', 'WP KUALA LUMPUR', 'NEGERI SEMBILAN', 'JOHOR',
       'PULAU PINANG', 'KEDAH', 'PERAK', 'WP LABUAN', 'SARAWAK', 'PAHANG',
       'MELAKA', 'KELANTAN', 'TERENGGANU', 'WP PUTRAJAYA', 'PERLIS',
       'JUMLAH KESELURUHAN'], dtype=object)

In [116]:
col_names = np.array(['KEDAH', 'PULAU PINANG', 'PERAK', 'SELANGOR',
       'NEGERI SEMBILAN', 'MELAKA', 'JOHOR', 'PAHANG', 'TERENGGANU',
       'KELANTAN', 'SABAH', 'SARAWAK', 'W.P. KUALA LUMPUR',
       'W.P. PUTRAJAYA', 'W.P. LABUAN', 'JUMLAH KESELURUHAN', 'PERLIS'])

In [117]:
df2[col_names]

Unnamed: 0_level_0,KEDAH,PULAU PINANG,PERAK,SELANGOR,NEGERI SEMBILAN,MELAKA,JOHOR,PAHANG,TERENGGANU,KELANTAN,SABAH,SARAWAK,W.P. KUALA LUMPUR,W.P. PUTRAJAYA,W.P. LABUAN,JUMLAH KESELURUHAN,PERLIS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-04-22 18:30:55.264805,6,30,10,441 (1),169,1,156,12,2,1,196,3 (2),284,2,27,"1,340 (3)",0
2021-04-22 18:30:55.264805,2888,2960,2867,24465,7045,695,3143,889,287,494,34478,1090,10188,234,1541,93309,45


In [207]:
df_new_case = df2.iloc[[0], :]
# df_new_case.index = current_date
df_cumul_case = df2.iloc[[1], :]
# df_cumul_case.index = current_date
display(df_new_case) 
display(df_cumul_case)

NEGERI,PERLIS,KEDAH,PULAU PINANG,PERAK,SELANGOR,NEGERI SEMBILAN,MELAKA,JOHOR,PAHANG,TERENGGANU,KELANTAN,SABAH,SARAWAK,W.P. KUALA LUMPUR,W.P. PUTRAJAYA,W.P. LABUAN,JUMLAH KESELURUHAN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-04-22 16:57:02.380137,0,1,6,8,36,6,0,20,4,2,4,10,15,16,2,0,130


NEGERI,PERLIS,KEDAH,PULAU PINANG,PERAK,SELANGOR,NEGERI SEMBILAN,MELAKA,JOHOR,PAHANG,TERENGGANU,KELANTAN,SABAH,SARAWAK,W.P. KUALA LUMPUR,W.P. PUTRAJAYA,W.P. LABUAN,JUMLAH KESELURUHAN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-04-22 16:57:02.380137,10,73,80,159,546,138,33,259,70,41,98,182,110,337,20,5,2161


In [208]:
pd.concat([df_new_case, df_new_case])

NEGERI,PERLIS,KEDAH,PULAU PINANG,PERAK,SELANGOR,NEGERI SEMBILAN,MELAKA,JOHOR,PAHANG,TERENGGANU,KELANTAN,SABAH,SARAWAK,W.P. KUALA LUMPUR,W.P. PUTRAJAYA,W.P. LABUAN,JUMLAH KESELURUHAN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-04-22 16:57:02.380137,0,1,6,8,36,6,0,20,4,2,4,10,15,16,2,0,130
2021-04-22 16:57:02.380137,0,1,6,8,36,6,0,20,4,2,4,10,15,16,2,0,130


In [209]:
df_new_case.append(df_new_case)

NEGERI,PERLIS,KEDAH,PULAU PINANG,PERAK,SELANGOR,NEGERI SEMBILAN,MELAKA,JOHOR,PAHANG,TERENGGANU,KELANTAN,SABAH,SARAWAK,W.P. KUALA LUMPUR,W.P. PUTRAJAYA,W.P. LABUAN,JUMLAH KESELURUHAN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-04-22 16:57:02.380137,0,1,6,8,36,6,0,20,4,2,4,10,15,16,2,0,130
2021-04-22 16:57:02.380137,0,1,6,8,36,6,0,20,4,2,4,10,15,16,2,0,130


In [160]:
# To join multindex columns
# https://stackoverflow.com/questions/24290297/pandas-dataframe-with-multiindex-column-merge-levels
# df2.columns.map(lambda x: '_'.join([str(i) for i in x]))

Index(['PERLIS_BILANGAN KES BAHARU', 'PERLIS_BILANGAN KES KUMULATIF',
       'KEDAH_BILANGAN KES BAHARU', 'KEDAH_BILANGAN KES KUMULATIF',
       'PULAU PINANG_BILANGAN KES BAHARU',
       'PULAU PINANG_BILANGAN KES KUMULATIF', 'PERAK_BILANGAN KES BAHARU',
       'PERAK_BILANGAN KES KUMULATIF', 'SELANGOR_BILANGAN KES BAHARU',
       'SELANGOR_BILANGAN KES KUMULATIF',
       'NEGERI SEMBILAN_BILANGAN KES BAHARU',
       'NEGERI SEMBILAN_BILANGAN KES KUMULATIF', 'MELAKA_BILANGAN KES BAHARU',
       'MELAKA_BILANGAN KES KUMULATIF', 'JOHOR_BILANGAN KES BAHARU',
       'JOHOR_BILANGAN KES KUMULATIF', 'PAHANG_BILANGAN KES BAHARU',
       'PAHANG_BILANGAN KES KUMULATIF', 'TERENGGANU_BILANGAN KES BAHARU',
       'TERENGGANU_BILANGAN KES KUMULATIF', 'KELANTAN_BILANGAN KES BAHARU',
       'KELANTAN_BILANGAN KES KUMULATIF', 'SABAH_BILANGAN KES BAHARU',
       'SABAH_BILANGAN KES KUMULATIF', 'SARAWAK_BILANGAN KES BAHARU',
       'SARAWAK_BILANGAN KES KUMULATIF',
       'W.P. KUALA LUMPUR_BILANGAN K

## Second method: Concat first before processing

In [44]:
current_url = "https://kpkesihatan.com/2020/12/20/kenyataan-akhbar-kpk-20-disember-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"
second_url = "https://kpkesihatan.com/2020/12/21/kenyataan-akhbar-kpk-21-disember-2020-situasi-semasa-jangkitan-penyakit-coronavirus-2019-covid-19-di-malaysia/"

r = requests.get(current_url)
if r.status_code == 404:
    raise Exception("Error 404 accessing page!!")

df = pd.read_html(r.content, match='JUMLAH KESELURUHAN', header=0)[-1]
df

Unnamed: 0,NEGERI,BILANGAN KES BAHARU *( ),BILANGAN KES KUMULATIF
0,SABAH,196,34478
1,SELANGOR,441 (1),24465
2,WP KUALA LUMPUR,284,10188
3,NEGERI SEMBILAN,169,7045
4,JOHOR,156,3143
5,PULAU PINANG,30,2960
6,KEDAH,6,2888
7,PERAK,10,2867
8,WP LABUAN,27,1541
9,SARAWAK,3 (2),1090


In [45]:
r = requests.get(second_url)
if r.status_code == 404:
    raise Exception("Error 404 accessing page!!")

df2 = pd.read_html(r.content, match='JUMLAH KESELURUHAN', header=0)[-1]
df2

Unnamed: 0,NEGERI,BILANGAN KES BAHARU *( ),BILANGAN KES KUMULATIF
0,SABAH,247,34725
1,SELANGOR,"1,204 (1)",25669
2,WP KUALA LUMPUR,127 (4),10315
3,NEGERI SEMBILAN,58,7103
4,JOHOR,278 (1),3421
5,PULAU PINANG,18,2978
6,KEDAH,3,2891
7,PERAK,21,2888
8,WP LABUAN,29 (1),1570
9,SARAWAK,0,1090


In [46]:
from datetime import timedelta
current_date = datetime.today()
next_date = datetime.today() + timedelta(days=1)

In [47]:
df.columns = ['State', 'New Case', 'Cumulative Case']
df['Date'] = current_date
df['URL'] = current_url
df2.columns = ['State', 'New Case', 'Cumulative Case']
df2['Date'] = next_date
df2['URL'] = second_url
df2.columns

Index(['State', 'New Case', 'Cumulative Case', 'Date', 'URL'], dtype='object')

In [80]:
combined_df = df.append(df2)
combined_df.head()

Unnamed: 0,State,New Case,Cumulative Case,Date,URL
0,SABAH,196,34478,2021-04-23 13:25:35.996025,https://kpkesihatan.com/2020/12/20/kenyataan-a...
1,SELANGOR,441 (1),24465,2021-04-23 13:25:35.996025,https://kpkesihatan.com/2020/12/20/kenyataan-a...
2,WP KUALA LUMPUR,284,10188,2021-04-23 13:25:35.996025,https://kpkesihatan.com/2020/12/20/kenyataan-a...
3,NEGERI SEMBILAN,169,7045,2021-04-23 13:25:35.996025,https://kpkesihatan.com/2020/12/20/kenyataan-a...
4,JOHOR,156,3143,2021-04-23 13:25:35.996025,https://kpkesihatan.com/2020/12/20/kenyataan-a...


In [81]:
url_df = combined_df[['Date', 'URL']].drop_duplicates()
url_df.head()

Unnamed: 0,Date,URL
0,2021-04-23 13:25:35.996025,https://kpkesihatan.com/2020/12/20/kenyataan-a...
0,2021-04-24 13:25:35.996025,https://kpkesihatan.com/2020/12/21/kenyataan-a...


In [50]:
combined_df.State.values

array(['SABAH', 'SELANGOR', 'WP\xa0KUALA LUMPUR', 'NEGERI SEMBILAN',
       'JOHOR', 'PULAU PINANG', 'KEDAH', 'PERAK', 'WP\xa0LABUAN',
       'SARAWAK', 'PAHANG', 'MELAKA', 'KELANTAN', 'TERENGGANU',
       'WP\xa0PUTRAJAYA', 'PERLIS', 'JUMLAH KESELURUHAN', 'SABAH',
       'SELANGOR', 'WP KUALA LUMPUR', 'NEGERI SEMBILAN', 'JOHOR',
       'PULAU PINANG', 'KEDAH', 'PERAK', 'WP LABUAN', 'SARAWAK', 'PAHANG',
       'MELAKA', 'KELANTAN', 'TERENGGANU', 'WP\xa0PUTRAJAYA', 'PERLIS',
       'JUMLAH KESELURUHAN'], dtype=object)

In [52]:
combined_df.State = combined_df.State.str.replace('\xa0', ' ').str.replace('.', '', regex=False)
combined_df.State.values
# combined_df.State = combined_df.State.str.replace('.', '', regex=False)

array(['SABAH', 'SELANGOR', 'WP KUALA LUMPUR', 'NEGERI SEMBILAN', 'JOHOR',
       'PULAU PINANG', 'KEDAH', 'PERAK', 'WP LABUAN', 'SARAWAK', 'PAHANG',
       'MELAKA', 'KELANTAN', 'TERENGGANU', 'WP PUTRAJAYA', 'PERLIS',
       'JUMLAH KESELURUHAN', 'SABAH', 'SELANGOR', 'WP KUALA LUMPUR',
       'NEGERI SEMBILAN', 'JOHOR', 'PULAU PINANG', 'KEDAH', 'PERAK',
       'WP LABUAN', 'SARAWAK', 'PAHANG', 'MELAKA', 'KELANTAN',
       'TERENGGANU', 'WP PUTRAJAYA', 'PERLIS', 'JUMLAH KESELURUHAN'],
      dtype=object)

In [55]:
new_df = combined_df.pivot_table(index='Date', columns='State', values='New Case', aggfunc='max')
new_df

State,JOHOR,JUMLAH KESELURUHAN,KEDAH,KELANTAN,MELAKA,NEGERI SEMBILAN,PAHANG,PERAK,PERLIS,PULAU PINANG,SABAH,SARAWAK,SELANGOR,TERENGGANU,WP KUALA LUMPUR,WP LABUAN,WP PUTRAJAYA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-04-23 13:25:35.996025,156,"1,340 (3)",6,1,1,169,12,10,0,30,196,3 (2),441 (1),2,284,27,2
2021-04-24 13:25:35.996025,278 (1),"2,018 (7)",3,6,9,58,12,21,0,18,247,0,"1,204 (1)",1,127 (4),29 (1),5


In [56]:
cumu_df = combined_df.pivot_table(index='Date', columns='State', values='Cumulative Case', aggfunc='max')
cumu_df

State,JOHOR,JUMLAH KESELURUHAN,KEDAH,KELANTAN,MELAKA,NEGERI SEMBILAN,PAHANG,PERAK,PERLIS,PULAU PINANG,SABAH,SARAWAK,SELANGOR,TERENGGANU,WP KUALA LUMPUR,WP LABUAN,WP PUTRAJAYA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-04-23 13:25:35.996025,3143,93309,2888,494,695,7045,889,2867,45,2960,34478,1090,24465,287,10188,1541,234
2021-04-24 13:25:35.996025,3421,95327,2891,500,704,7103,901,2888,45,2978,34725,1090,25669,288,10315,1570,239


In [83]:
pd.merge(left=new_df, right=url_df, how='left', left_index=True, right_on='Date')
# x.drop_duplicates()
# x

Unnamed: 0,JOHOR,JUMLAH KESELURUHAN,KEDAH,KELANTAN,MELAKA,NEGERI SEMBILAN,PAHANG,PERAK,PERLIS,PULAU PINANG,SABAH,SARAWAK,SELANGOR,TERENGGANU,WP KUALA LUMPUR,WP LABUAN,WP PUTRAJAYA,Date,URL
0,156,"1,340 (3)",6,1,1,169,12,10,0,30,196,3 (2),441 (1),2,284,27,2,2021-04-23 13:25:35.996025,https://kpkesihatan.com/2020/12/20/kenyataan-a...
0,278 (1),"2,018 (7)",3,6,9,58,12,21,0,18,247,0,"1,204 (1)",1,127 (4),29 (1),5,2021-04-24 13:25:35.996025,https://kpkesihatan.com/2020/12/21/kenyataan-a...
