In [23]:
# libraries

from datetime import datetime
import os
import re
import glob
import requests 
import pandas as pd
from bs4 import BeautifulSoup
import logging

# Web Scrapping

In [34]:
# web scrapping

link = 'https://www.mohfw.gov.in/'
req = requests.get(link)
soup = BeautifulSoup(req.content, "html.parser")

thead = soup.find_all('thead')[-1]
# print(thead)
head = thead.find_all('tr')

tbody = soup.find_all('tbody')[-1]
body = tbody.find_all('tr')

# print(rows)

head_rows = []
body_rows = []

for tr in head:
    td = tr.find_all(['th', 'td'])
    row = [i.text for i in td]
    head_rows.append(row)
    
for tr in body:
    td = tr.find_all(['th', 'td'])
    row = [i.text for i in td]
    body_rows.append(row)
    
# print(head_rows)
    
df_bs = pd.DataFrame(body_rows[:len(body_rows)-2], columns=head_rows[0])
    
df_bs.drop('S. No.', axis=1, inplace=True)
df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Including 72 foreign Nationals),Cured/Discharged/Migrated,Death
0,Andhra Pradesh,432,11,7
1,Andaman and Nicobar Islands,11,10,0
2,Arunachal Pradesh,1,0,0
3,Assam,31,0,1
4,Bihar,64,26,1
5,Chandigarh,21,7,0
6,Chhattisgarh,31,10,0
7,Delhi,1154,27,24
8,Goa,7,5,0
9,Gujarat,539,47,26


# Data Cleaning

In [35]:
# date-time information
# ---------------------

now  = datetime.now()
df_bs['Date'] = now.strftime("%m/%d/%Y") 
df_bs['Date'] = pd.to_datetime(df_bs['Date'], format='%m/%d/%Y')
df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Including 72 foreign Nationals),Cured/Discharged/Migrated,Death,Date
0,Andhra Pradesh,432,11,7,2020-04-13
1,Andaman and Nicobar Islands,11,10,0,2020-04-13
2,Arunachal Pradesh,1,0,0,2020-04-13
3,Assam,31,0,1,2020-04-13
4,Bihar,64,26,1,2020-04-13
5,Chandigarh,21,7,0,2020-04-13
6,Chhattisgarh,31,10,0,2020-04-13
7,Delhi,1154,27,24,2020-04-13
8,Goa,7,5,0,2020-04-13
9,Gujarat,539,47,26,2020-04-13


In [36]:
df_bs['Name of State / UT'].unique()

array(['Andhra Pradesh', 'Andaman and Nicobar Islands',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Delhi', 'Goa', 'Gujarat', 'Haryana',
       'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand', 'Karnataka',
       'Kerala', 'Ladakh', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
       'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab',
       'Rajasthan', 'Tamil Nadu', 'Telengana', 'Tripura', 'Uttarakhand',
       'Uttar Pradesh', 'West Bengal'], dtype=object)

In [37]:
# latitude and longitude information
# ----------------------------------

lat = {'Delhi':28.7041, 'Haryana':29.0588, 'Kerala':10.8505, 'Rajasthan':27.0238,
       'Telengana':18.1124, 'Uttar Pradesh':26.8467, 'Ladakh':34.2996, 'Tamil Nadu':11.1271,
       'Jammu and Kashmir':33.7782, 'Punjab':31.1471, 'Karnataka':15.3173, 'Maharashtra':19.7515,
       'Andhra Pradesh':15.9129, 'Odisha':20.9517, 'Uttarakhand':30.0668, 'West Bengal':22.9868, 
       'Puducherry': 11.9416, 'Chandigarh': 30.7333, 'Chhattisgarh':21.2787, 'Gujarat': 22.2587, 
       'Himachal Pradesh': 31.1048, 'Madhya Pradesh': 22.9734, 'Bihar': 25.0961, 'Manipur':24.6637, 
       'Mizoram':23.1645, 'Goa': 15.2993, 'Andaman and Nicobar Islands': 11.7401, 'Assam' : 26.2006, 
       'Jharkhand': 23.6102, 'Arunachal Pradesh': 28.2180, 'Tripura': 23.9408}

long = {'Delhi':77.1025, 'Haryana':76.0856, 'Kerala':76.2711, 'Rajasthan':74.2179,
        'Telengana':79.0193, 'Uttar Pradesh':80.9462, 'Ladakh':78.2932, 'Tamil Nadu':78.6569,
        'Jammu and Kashmir':76.5762, 'Punjab':75.3412, 'Karnataka':75.7139, 'Maharashtra':75.7139,
        'Andhra Pradesh':79.7400, 'Odisha':85.0985, 'Uttarakhand':79.0193, 'West Bengal':87.8550, 
        'Puducherry': 79.8083, 'Chandigarh': 76.7794, 'Chhattisgarh':81.8661, 'Gujarat': 71.1924, 
        'Himachal Pradesh': 77.1734, 'Madhya Pradesh': 78.6569, 'Bihar': 85.3131, 'Manipur':93.9063, 
        'Mizoram':92.9376, 'Goa': 74.1240, 'Andaman and Nicobar Islands': 92.6586, 'Assam' : 92.9376, 
        'Jharkhand': 85.2799, 'Arunachal Pradesh': 94.7278, 'Tripura': 91.9882}

df_bs['Latitude'] = df_bs['Name of State / UT'].map(lat)
df_bs['Longitude'] = df_bs['Name of State / UT'].map(long)

df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Including 72 foreign Nationals),Cured/Discharged/Migrated,Death,Date,Latitude,Longitude
0,Andhra Pradesh,432,11,7,2020-04-13,15.9129,79.74
1,Andaman and Nicobar Islands,11,10,0,2020-04-13,11.7401,92.6586
2,Arunachal Pradesh,1,0,0,2020-04-13,28.218,94.7278
3,Assam,31,0,1,2020-04-13,26.2006,92.9376
4,Bihar,64,26,1,2020-04-13,25.0961,85.3131
5,Chandigarh,21,7,0,2020-04-13,30.7333,76.7794
6,Chhattisgarh,31,10,0,2020-04-13,21.2787,81.8661
7,Delhi,1154,27,24,2020-04-13,28.7041,77.1025
8,Goa,7,5,0,2020-04-13,15.2993,74.124
9,Gujarat,539,47,26,2020-04-13,22.2587,71.1924


In [38]:
df_bs.isna().sum()

Name of State / UT                                         0
Total Confirmed cases (Including 72 foreign Nationals)     0
Cured/Discharged/Migrated                                  0
Death                                                      0
Date                                                       0
Latitude                                                   1
Longitude                                                  1
dtype: int64

# Saving data

In [39]:
# saving data
# -----------


file_loc = './covid_19_india.csv'
df_bs.to_csv(file_loc, index=False)

df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Including 72 foreign Nationals),Cured/Discharged/Migrated,Death,Date,Latitude,Longitude
0,Andhra Pradesh,432,11,7,2020-04-13,15.9129,79.74
1,Andaman and Nicobar Islands,11,10,0,2020-04-13,11.7401,92.6586
2,Arunachal Pradesh,1,0,0,2020-04-13,28.218,94.7278
3,Assam,31,0,1,2020-04-13,26.2006,92.9376
4,Bihar,64,26,1,2020-04-13,25.0961,85.3131
5,Chandigarh,21,7,0,2020-04-13,30.7333,76.7794
6,Chhattisgarh,31,10,0,2020-04-13,21.2787,81.8661
7,Delhi,1154,27,24,2020-04-13,28.7041,77.1025
8,Goa,7,5,0,2020-04-13,15.2993,74.124
9,Gujarat,539,47,26,2020-04-13,22.2587,71.1924


In [17]:
df_bs.columns

Index(['Name of State / UT',
       'Total Confirmed cases (Including 72 foreign Nationals) ',
       'Cured/Discharged/Migrated', 'Death', 'Date', 'Latitude', 'Longitude'],
      dtype='object')

# Combining data