In [1]:
import shutil
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Scrape the state-wise COVID-19 table from the MoHFW landing page.
# Produces `covid19_data` (list of row tuples) and `date` (today's date string),
# both of which are consumed by later cells.
BASE_URL = 'https://www.mohfw.gov.in/'
# Seconds to wait for the server; without a timeout requests.get() can block forever.
REQUEST_TIMEOUT = 30

# One (date, state, confirmed, cured, deaths) tuple is appended per scraped table row.
covid19_data = []

# Download the page and fail loudly on an HTTP error instead of parsing an error page.
response = requests.get(BASE_URL, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")

# Identify the table we want to scrape.
covid19_table = soup.find('table', {"class" : "table table-striped"})
if covid19_table is None:
    # soup.find() returns None when nothing matches; surface that clearly
    # rather than as an AttributeError on the loop below.
    raise RuntimeError(
        "Could not find a <table class='table table-striped'> at " + BASE_URL
        + "; the page layout may have changed."
    )

# Today's date in dd-mm-yyyy format
date = datetime.today().strftime('%d-%m-%Y')

# Walk the table rows and keep only rows with exactly five <td> cells.
# The first cell (index 0) is skipped -- presumably a serial number; verify against the page.
for row in covid19_table.find_all('tr'):
    cols = row.find_all('td')
    if len(cols) == 5:
        covid19_data.append((date, cols[1].text.strip(), cols[2].text.strip(), cols[3].text.strip(), cols[4].text.strip()))

if not covid19_data:
    # Later cells assume five columns of data; stop here with a clear message
    # instead of a column-count mismatch further down.
    raise RuntimeError("The table was found but no 5-column data rows were scraped.")

In [3]:
# Pack the scraped row tuples into a NumPy array, then display the row count
# as a quick sanity check on how many rows were scraped.
covid19_array = np.array(covid19_data)
covid19_array.shape[0]

32

In [4]:
# Wrap the scraped array in a DataFrame; columns are still positional (0..4)
# at this point and get their names in a later cell.
df = pd.DataFrame(data=covid19_array)

In [6]:
# Name the five scraped columns.
df.columns = ['Date', 'State', 'Total Confirmed Cases','Cured/Discharged/Migrated', 'Death']

# Columns that hold counts and must end up as integers.
NUMERIC_COLUMNS = ['Total Confirmed Cases', 'Cured/Discharged/Migrated', 'Death']

# Strip '#' markers from every string cell FIRST, so that the exact-match
# spelling fix below also catches marked-up names such as 'Telengana#'.
df = df.replace(regex=r'#+', value='')
# Correct the 'Telengana' spelling mistake (whole-cell match).
df = df.replace('Telengana', 'Telangana')

# Convert all numeric columns to int32 (the scraped values arrive as strings).
df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].astype('int32')

# Show the states with the most confirmed cases first.
df = df.sort_values(by='Total Confirmed Cases', ascending=False)

In [7]:
# Back up the existing time-series CSV, then append today's rows to it.
#
# Paths are built with pathlib instead of backslash string literals: the old
# '..\data\...' literals contained the invalid escape sequences '\d' and '\c'
# and only worked on Windows.
data_file = Path('..') / 'data' / 'covid19-india-statewise-timeseries.csv'
backup_dir = Path('..') / 'backup'
destination = str(backup_dir / ('covid19-india-statewise-timeseries-' + date + '.csv'))

# Take a backup (only possible when there is already a file to back up).
data_file_exists = data_file.exists()
if data_file_exists:
    backup_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy(data_file, destination)
else:
    # First run: make sure the target directory exists so the export can create the file.
    data_file.parent.mkdir(parents=True, exist_ok=True)

# Export data.
# NOTE: mode='a' appends, so re-running this cell on the same day adds the same
# rows again and overwrites that day's backup with the already-appended file.
# A header row is written only when the file is being created by this call.
df.to_csv(data_file, mode='a', header=not data_file_exists, index=False)