In [18]:
# libraries

from datetime import datetime
import os
import glob
import requests 
import pandas as pd
from bs4 import BeautifulSoup

# Web Scrapping

In [19]:
# web scrapping

link = 'https://www.mohfw.gov.in/'
req = requests.get(link)
soup = BeautifulSoup(req.content, "html.parser")

thead = soup.find_all('thead')[-1]
# print(thead)
head = thead.find_all('tr')

tbody = soup.find_all('tbody')[-1]
body = tbody.find_all('tr')

# print(rows)

head_rows = []
body_rows = []

for tr in head:
    td = tr.find_all(['th', 'td'])
    row = [i.text for i in td]
    head_rows.append(row)
    
for tr in body:
    td = tr.find_all(['th', 'td'])
    row = [i.text for i in td]
    body_rows.append(row)
    
# print(head_rows)
    
df_bs = pd.DataFrame(body_rows[:len(body_rows)-1], columns=head_rows[0])
    
df_bs.drop('S. No.', axis=1, inplace=True)
df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Death
0,Andhra Pradesh,8,0,0,0
1,Bihar,3,0,0,1
2,Chhattisgarh,1,0,0,0
3,Delhi,29,1,6,1
4,Gujarat,32,1,0,1
5,Haryana,14,14,11,0
6,Himachal Pradesh,3,0,0,1
7,Karnataka,37,0,3,1
8,Kerala,87,8,4,0
9,Madhya Pradesh,7,0,0,0


# Data Cleaning

In [20]:
# date-time information
# ---------------------

now  = datetime.now()
df_bs['Date'] = now.strftime("%m/%d/%Y") 
df_bs['Date'] = pd.to_datetime(df_bs['Date'], format='%m/%d/%Y')
df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Death,Date
0,Andhra Pradesh,8,0,0,0,2020-03-24
1,Bihar,3,0,0,1,2020-03-24
2,Chhattisgarh,1,0,0,0,2020-03-24
3,Delhi,29,1,6,1,2020-03-24
4,Gujarat,32,1,0,1,2020-03-24
5,Haryana,14,14,11,0,2020-03-24
6,Himachal Pradesh,3,0,0,1,2020-03-24
7,Karnataka,37,0,3,1,2020-03-24
8,Kerala,87,8,4,0,2020-03-24
9,Madhya Pradesh,7,0,0,0,2020-03-24


In [21]:
df_bs['Name of State / UT'].unique()

array(['Andhra Pradesh', 'Bihar', 'Chhattisgarh', 'Delhi', 'Gujarat',
       'Haryana', 'Himachal Pradesh', 'Karnataka', 'Kerala',
       'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Odisha', 'Puducherry',
       'Punjab', 'Rajasthan', 'Tamil Nadu', 'Telengana', 'Chandigarh',
       'Jammu and Kashmir', 'Ladakh', 'Uttar Pradesh', 'Uttarakhand',
       'West Bengal'], dtype=object)

In [22]:
# latitude and longitude information
# ----------------------------------

lat = {'Delhi':28.7041,
       'Haryana':29.0588,
       'Kerala':10.8505,
       'Rajasthan':27.0238,
       'Telengana':18.1124,
       'Uttar Pradesh':26.8467,
       'Ladakh':34.2996,
       'Tamil Nadu':11.1271,
       'Jammu and Kashmir':33.7782,
       'Punjab':31.1471,
       'Karnataka':15.3173,
       'Maharashtra':19.7515,
       'Andhra Pradesh':15.9129, 
       'Odisha':20.9517, 
       'Uttarakhand':30.0668, 
       'West Bengal':22.9868, 
       'Puducherry': 11.9416, 
       'Chandigarh': 30.7333, 
       'Chhattisgarh':21.2787, 
       'Gujarat': 22.2587, 
       'Himachal Pradesh': 31.1048, 
       'Madhya Pradesh': 22.9734, 
       'Bihar': 25.0961, 
       'Manipur':24.6637}

long = {'Delhi':77.1025,
        'Haryana':76.0856,
        'Kerala':76.2711,
        'Rajasthan':74.2179,
        'Telengana':79.0193,
        'Uttar Pradesh':80.9462,
        'Ladakh':78.2932,
        'Tamil Nadu':78.6569,
        'Jammu and Kashmir':76.5762,
        'Punjab':75.3412,
        'Karnataka':75.7139,
        'Maharashtra':75.7139,
        'Andhra Pradesh':79.7400, 
        'Odisha':85.0985, 
        'Uttarakhand':79.0193, 
        'West Bengal':87.8550, 
        'Puducherry': 79.8083, 
        'Chandigarh': 76.7794, 
        'Chhattisgarh':81.8661, 
        'Gujarat': 71.1924, 
        'Himachal Pradesh': 77.1734, 
        'Madhya Pradesh': 78.6569, 
        'Bihar': 85.3131, 
        'Manipur':93.9063}

df_bs['Latitude'] = df_bs['Name of State / UT'].map(lat)
df_bs['Longitude'] = df_bs['Name of State / UT'].map(long)

df_bs.head()

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Death,Date,Latitude,Longitude
0,Andhra Pradesh,8,0,0,0,2020-03-24,15.9129,79.74
1,Bihar,3,0,0,1,2020-03-24,25.0961,85.3131
2,Chhattisgarh,1,0,0,0,2020-03-24,21.2787,81.8661
3,Delhi,29,1,6,1,2020-03-24,28.7041,77.1025
4,Gujarat,32,1,0,1,2020-03-24,22.2587,71.1924


In [23]:
df_bs.isna().sum()

Name of State / UT                            0
Total Confirmed cases (Indian National)       0
Total Confirmed cases ( Foreign National )    0
Cured/Discharged/Migrated                     0
Death                                         0
Date                                          0
Latitude                                      0
Longitude                                     0
dtype: int64

# Saving data

In [24]:
# saving data
# -----------

file_name = now.strftime("%Y_%m_%d")+'.csv'
file_loc = 'C:\\Users\\imdevskp\\Desktop\\covid_india\\.day_by_day_data\\'
df_bs.to_csv(file_loc + file_name, index=False)

df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Death,Date,Latitude,Longitude
0,Andhra Pradesh,8,0,0,0,2020-03-24,15.9129,79.74
1,Bihar,3,0,0,1,2020-03-24,25.0961,85.3131
2,Chhattisgarh,1,0,0,0,2020-03-24,21.2787,81.8661
3,Delhi,29,1,6,1,2020-03-24,28.7041,77.1025
4,Gujarat,32,1,0,1,2020-03-24,22.2587,71.1924
5,Haryana,14,14,11,0,2020-03-24,29.0588,76.0856
6,Himachal Pradesh,3,0,0,1,2020-03-24,31.1048,77.1734
7,Karnataka,37,0,3,1,2020-03-24,15.3173,75.7139
8,Kerala,87,8,4,0,2020-03-24,10.8505,76.2711
9,Madhya Pradesh,7,0,0,0,2020-03-24,22.9734,78.6569


In [25]:
df_bs.columns

Index(['Name of State / UT', 'Total Confirmed cases (Indian National)',
       'Total Confirmed cases ( Foreign National )',
       'Cured/Discharged/Migrated', 'Death', 'Date', 'Latitude', 'Longitude'],
      dtype='object')

# Combining data

In [26]:
! ls C:\Users\imdevskp\Desktop\covid_india\.day_by_day_data

2020_03_21.csv
2020_03_22.csv
2020_03_23.csv
2020_03_24.csv


In [27]:
# pd.read_csv?

In [28]:
# complete data

loc = "C:\\Users\\imdevskp\\Desktop\\covid_india\\.day_by_day_data\\"

files = glob.glob(loc+'2020*.csv')
dfs = []
for i in files:
    df_temp = pd.read_csv(i)
    df_temp = df_temp.rename(columns={'Cured':'Cured/Discharged'})
    df_temp = df_temp.rename(columns={'Cured/Discharged':'Cured/Discharged/Migrated'})
    dfs.append(df_temp)
    
# print(dfs)

complete_data = pd.concat(dfs, ignore_index=True).sort_values(['Date'], ascending=True).reset_index(drop=True)
complete_data['Date'] = pd.to_datetime(complete_data['Date'])
complete_data = complete_data.sort_values(['Date', 'Name of State / UT']).reset_index(drop=True)

cols = ['Total Confirmed cases (Indian National)', 'Total Confirmed cases ( Foreign National )', 
              'Cured/Discharged/Migrated', 'Death']

complete_data[cols] = complete_data[cols].fillna(0).astype('int')

# complete_data.tail(50)

In [29]:
complete_data.columns

Index(['Date', 'Name of State / UT', 'Total Confirmed cases (Indian National)',
       'Total Confirmed cases ( Foreign National )',
       'Cured/Discharged/Migrated', 'Latitude', 'Longitude', 'Death'],
      dtype='object')

In [30]:
complete_data['Name of State / UT'].replace('Chattisgarh', 'Chhattisgarh', inplace=True)
complete_data['Name of State / UT'].replace('Pondicherry', 'Puducherry', inplace=True) 

In [31]:
complete_data['Name of State / UT'].unique()

array(['Kerala', 'Delhi', 'Telengana', 'Rajasthan', 'Haryana',
       'Uttar Pradesh', 'Tamil Nadu', 'Union Territory of Ladakh',
       'Karnataka', 'Maharashtra', 'Punjab',
       'Union Territory of Jammu and Kashmir', 'Andhra Pradesh',
       'Uttarakhand', 'Odisha', 'Puducherry', 'West Bengal',
       'Chhattisgarh', 'Union Territory of Chandigarh', 'Gujarat',
       'Chandigarh', 'Himachal Pradesh', 'Jammu and Kashmir', 'Ladakh',
       'Madhya Pradesh', 'Bihar', 'Manipur'], dtype=object)

In [32]:
# sorted(complete_data['Name of State / UT'].unique())

In [33]:
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 8 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   Date                                        340 non-null    datetime64[ns]
 1   Name of State / UT                          340 non-null    object        
 2   Total Confirmed cases (Indian National)     340 non-null    int32         
 3   Total Confirmed cases ( Foreign National )  340 non-null    int32         
 4   Cured/Discharged/Migrated                   340 non-null    int32         
 5   Latitude                                    340 non-null    float64       
 6   Longitude                                   340 non-null    float64       
 7   Death                                       340 non-null    int32         
dtypes: datetime64[ns](1), float64(2), int32(4), object(1)
memory usage: 16.1+ KB


In [34]:
complete_data.to_csv('complete.csv', index=False)