In [1]:
# libraries

from datetime import datetime
import os
import re
import glob
import requests 
import pandas as pd
from bs4 import BeautifulSoup

# Web Scrapping

In [2]:
# web scrapping

link = 'https://www.mohfw.gov.in/'
req = requests.get(link)
soup = BeautifulSoup(req.content, "html.parser")

thead = soup.find_all('thead')[-1]
# print(thead)
head = thead.find_all('tr')

tbody = soup.find_all('tbody')[-1]
body = tbody.find_all('tr')

# print(rows)

head_rows = []
body_rows = []

for tr in head:
    td = tr.find_all(['th', 'td'])
    row = [i.text for i in td]
    head_rows.append(row)
    
for tr in body:
    td = tr.find_all(['th', 'td'])
    row = [i.text for i in td]
    body_rows.append(row)
    
# print(head_rows)
    
df_bs = pd.DataFrame(body_rows[:len(body_rows)-2], columns=head_rows[0])
    
df_bs.drop('S. No.', axis=1, inplace=True)
df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Including 55 foreign Nationals),Cured/Discharged/Migrated,Death
0,Andhra Pradesh,132,1,1
1,Andaman and Nicobar Islands,10,0,0
2,Arunachal Pradesh,1,0,0
3,Assam,16,0,0
4,Bihar,24,0,1
5,Chandigarh,18,0,0
6,Chhattisgarh,9,2,0
7,Delhi,219,8,4
8,Goa,6,0,0
9,Gujarat,87,8,7


# Data Cleaning

In [3]:
# date-time information
# ---------------------

now  = datetime.now()
df_bs['Date'] = now.strftime("%m/%d/%Y") 
df_bs['Date'] = pd.to_datetime(df_bs['Date'], format='%m/%d/%Y')
df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Including 55 foreign Nationals),Cured/Discharged/Migrated,Death,Date
0,Andhra Pradesh,132,1,1,2020-04-03
1,Andaman and Nicobar Islands,10,0,0,2020-04-03
2,Arunachal Pradesh,1,0,0,2020-04-03
3,Assam,16,0,0,2020-04-03
4,Bihar,24,0,1,2020-04-03
5,Chandigarh,18,0,0,2020-04-03
6,Chhattisgarh,9,2,0,2020-04-03
7,Delhi,219,8,4,2020-04-03
8,Goa,6,0,0,2020-04-03
9,Gujarat,87,8,7,2020-04-03


In [4]:
df_bs['Name of State / UT'].unique()

array(['Andhra Pradesh', 'Andaman and Nicobar Islands',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Delhi', 'Goa', 'Gujarat', 'Haryana',
       'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand', 'Karnataka',
       'Kerala', 'Ladakh', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
       'Mizoram', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan',
       'Tamil Nadu', 'Telengana', 'Uttarakhand', 'Uttar Pradesh',
       'West Bengal'], dtype=object)

In [5]:
# latitude and longitude information
# ----------------------------------

lat = {'Delhi':28.7041, 'Haryana':29.0588, 'Kerala':10.8505, 'Rajasthan':27.0238,
       'Telengana':18.1124, 'Uttar Pradesh':26.8467, 'Ladakh':34.2996, 'Tamil Nadu':11.1271,
       'Jammu and Kashmir':33.7782, 'Punjab':31.1471, 'Karnataka':15.3173, 'Maharashtra':19.7515,
       'Andhra Pradesh':15.9129, 'Odisha':20.9517, 'Uttarakhand':30.0668, 'West Bengal':22.9868, 
       'Puducherry': 11.9416, 'Chandigarh': 30.7333, 'Chhattisgarh':21.2787, 'Gujarat': 22.2587, 
       'Himachal Pradesh': 31.1048, 'Madhya Pradesh': 22.9734, 'Bihar': 25.0961, 'Manipur':24.6637, 
       'Mizoram':23.1645, 'Goa': 15.2993, 'Andaman and Nicobar Islands': 11.7401, 'Assam' : 26.2006, 
       'Jharkhand': 23.6102, 'Arunachal Pradesh': 28.2180}

long = {'Delhi':77.1025, 'Haryana':76.0856, 'Kerala':76.2711, 'Rajasthan':74.2179,
        'Telengana':79.0193, 'Uttar Pradesh':80.9462, 'Ladakh':78.2932, 'Tamil Nadu':78.6569,
        'Jammu and Kashmir':76.5762, 'Punjab':75.3412, 'Karnataka':75.7139, 'Maharashtra':75.7139,
        'Andhra Pradesh':79.7400, 'Odisha':85.0985, 'Uttarakhand':79.0193, 'West Bengal':87.8550, 
        'Puducherry': 79.8083, 'Chandigarh': 76.7794, 'Chhattisgarh':81.8661, 'Gujarat': 71.1924, 
        'Himachal Pradesh': 77.1734, 'Madhya Pradesh': 78.6569, 'Bihar': 85.3131, 'Manipur':93.9063, 
        'Mizoram':92.9376, 'Goa': 74.1240, 'Andaman and Nicobar Islands': 92.6586, 'Assam' : 92.9376, 
        'Jharkhand': 85.2799, 'Arunachal Pradesh': 94.7278}

df_bs['Latitude'] = df_bs['Name of State / UT'].map(lat)
df_bs['Longitude'] = df_bs['Name of State / UT'].map(long)

df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Including 55 foreign Nationals),Cured/Discharged/Migrated,Death,Date,Latitude,Longitude
0,Andhra Pradesh,132,1,1,2020-04-03,15.9129,79.74
1,Andaman and Nicobar Islands,10,0,0,2020-04-03,11.7401,92.6586
2,Arunachal Pradesh,1,0,0,2020-04-03,28.218,94.7278
3,Assam,16,0,0,2020-04-03,26.2006,92.9376
4,Bihar,24,0,1,2020-04-03,25.0961,85.3131
5,Chandigarh,18,0,0,2020-04-03,30.7333,76.7794
6,Chhattisgarh,9,2,0,2020-04-03,21.2787,81.8661
7,Delhi,219,8,4,2020-04-03,28.7041,77.1025
8,Goa,6,0,0,2020-04-03,15.2993,74.124
9,Gujarat,87,8,7,2020-04-03,22.2587,71.1924


In [6]:
df_bs.isna().sum()

Name of State / UT                                         0
Total Confirmed cases (Including 55 foreign Nationals)     0
Cured/Discharged/Migrated                                  0
Death                                                      0
Date                                                       0
Latitude                                                   0
Longitude                                                  0
dtype: int64

# Saving data

In [7]:
# saving data
# -----------

file_name = now.strftime("%Y_%m_%d")+'.csv'
file_loc = 'C:\\Users\\imdevskp\\Desktop\\covid_india\\.day_by_day_data\\'
df_bs.to_csv(file_loc + file_name, index=False)

df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Including 55 foreign Nationals),Cured/Discharged/Migrated,Death,Date,Latitude,Longitude
0,Andhra Pradesh,132,1,1,2020-04-03,15.9129,79.74
1,Andaman and Nicobar Islands,10,0,0,2020-04-03,11.7401,92.6586
2,Arunachal Pradesh,1,0,0,2020-04-03,28.218,94.7278
3,Assam,16,0,0,2020-04-03,26.2006,92.9376
4,Bihar,24,0,1,2020-04-03,25.0961,85.3131
5,Chandigarh,18,0,0,2020-04-03,30.7333,76.7794
6,Chhattisgarh,9,2,0,2020-04-03,21.2787,81.8661
7,Delhi,219,8,4,2020-04-03,28.7041,77.1025
8,Goa,6,0,0,2020-04-03,15.2993,74.124
9,Gujarat,87,8,7,2020-04-03,22.2587,71.1924


In [8]:
df_bs.columns

Index(['Name of State / UT',
       'Total Confirmed cases (Including 55 foreign Nationals) ',
       'Cured/Discharged/Migrated', 'Death', 'Date', 'Latitude', 'Longitude'],
      dtype='object')

# Combining data

In [9]:
! ls C:\Users\imdevskp\Desktop\covid_india\.day_by_day_data

2020_03_21.csv
2020_03_22.csv
2020_03_23.csv
2020_03_24.csv
2020_03_25.csv
2020_03_26.csv
2020_03_27.csv
2020_03_28.csv
2020_03_29.csv
2020_03_30.csv
2020_03_31.csv
2020_04_01.csv
2020_04_02.csv
2020_04_03.csv


In [10]:
# pd.read_csv?

In [11]:
# complete data

loc = "C:\\Users\\imdevskp\\Desktop\\covid_india\\.day_by_day_data\\"

files = glob.glob(loc+'2020*.csv')
   
dfs = []
for i in files:
    df_temp = pd.read_csv(i)
    df_temp = df_temp.rename(columns={'Cured':'Cured/Discharged'})
    df_temp = df_temp.rename(columns={'Cured/Discharged':'Cured/Discharged/Migrated', 
                                      'Total Confirmed cases *': 'Total Confirmed cases', 
                                      'Total Confirmed cases ': 'Total Confirmed cases'})
    df_temp = df_temp.rename(columns=lambda x: re.sub('Total Confirmed cases \(Including .. foreign Nationals\) ',
                                                      'Total Confirmed cases',x))
    dfs.append(df_temp)
    
# print(dfs)

complete_data = pd.concat(dfs, ignore_index=True).sort_values(['Date'], ascending=True).reset_index(drop=True)
complete_data['Date'] = pd.to_datetime(complete_data['Date'])
complete_data = complete_data.sort_values(['Date', 'Name of State / UT']).reset_index(drop=True)

cols = ['Total Confirmed cases (Indian National)', 'Total Confirmed cases ( Foreign National )', 
              'Cured/Discharged/Migrated', 'Death']

# complete_data['Death'] = complete_data['Death'].str.extract('(\d+)')
complete_data[cols] = complete_data[cols].fillna(0).astype('int')

# complete_data.tail(50)

In [12]:
complete_data.columns

Index(['Date', 'Name of State / UT', 'Total Confirmed cases (Indian National)',
       'Total Confirmed cases ( Foreign National )',
       'Cured/Discharged/Migrated', 'Latitude', 'Longitude', 'Death',
       'Total Confirmed cases'],
      dtype='object')

In [13]:
complete_data['Name of State / UT'].replace('Chattisgarh', 'Chhattisgarh', inplace=True)
complete_data['Name of State / UT'].replace('Pondicherry', 'Puducherry', inplace=True) 

In [14]:
complete_data['Name of State / UT'].unique()

array(['Kerala', 'Delhi', 'Telengana', 'Rajasthan', 'Haryana',
       'Uttar Pradesh', 'Tamil Nadu', 'Union Territory of Ladakh',
       'Karnataka', 'Maharashtra', 'Punjab',
       'Union Territory of Jammu and Kashmir', 'Andhra Pradesh',
       'Uttarakhand', 'Odisha', 'Puducherry', 'West Bengal',
       'Chhattisgarh', 'Union Territory of Chandigarh', 'Gujarat',
       'Chandigarh', 'Himachal Pradesh', 'Jammu and Kashmir', 'Ladakh',
       'Madhya Pradesh', 'Bihar', 'Manipur', 'Mizoram',
       'Andaman and Nicobar Islands', 'Goa', 'Assam', 'Jharkhand',
       'Arunachal Pradesh'], dtype=object)

In [15]:
complete_data.tail()

Unnamed: 0,Date,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Latitude,Longitude,Death,Total Confirmed cases
608,2020-04-03,Tamil Nadu,0,0,6,11.1271,78.6569,1,309
609,2020-04-03,Telengana,0,0,1,18.1124,79.0193,3,107
610,2020-04-03,Uttar Pradesh,0,0,14,26.8467,80.9462,2,113
611,2020-04-03,Uttarakhand,0,0,2,30.0668,79.0193,0,10
612,2020-04-03,West Bengal,0,0,3,22.9868,87.855,3,53


In [16]:
# sorted(complete_data['Name of State / UT'].unique())

In [17]:
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 9 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   Date                                        613 non-null    datetime64[ns]
 1   Name of State / UT                          613 non-null    object        
 2   Total Confirmed cases (Indian National)     613 non-null    int32         
 3   Total Confirmed cases ( Foreign National )  613 non-null    int32         
 4   Cured/Discharged/Migrated                   613 non-null    int32         
 5   Latitude                                    613 non-null    float64       
 6   Longitude                                   613 non-null    float64       
 7   Death                                       613 non-null    int32         
 8   Total Confirmed cases                       613 non-null    int64         
dtypes: datetime

In [18]:
complete_data.to_csv('complete.csv', index=False)