In [1]:
# libraries

from datetime import datetime
import os
import glob
import requests 
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Web scrapping

In [2]:
# download data 
# =============

link = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSc_2y5N0I67wDU38DjDh35IZSIS30rQf7_NYZhtYYGU1jJYT6_kDx4YpF-qw0LSlGsBYP8pqM_a1Pd/pubhtml#'
req = requests.get(link)
soup = BeautifulSoup(req.content, "html.parser")

In [3]:
tbody = soup.find_all('tbody')[0]
body = tbody.find_all('tr')

# print(tbody)
# print(body)

In [4]:
head_row = [i.text for i in body[0].find_all('td')]
head_row

['Patient Number',
 'State Patient Number',
 'Date Announced',
 'Age Bracket',
 'Gender',
 'Detected City',
 'Detected District',
 'Detected State',
 'Current Status',
 'Notes',
 'Contracted from which Patient (Suspected)',
 'Nationality',
 'Status Change Date',
 'Source_1',
 'Source_2',
 'Source_3',
 'Backup Notes',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [5]:
contents = []

for i in range(len(body)):
    contents.append([i.text for i in body[i].find_all('td')])

# Saving to Dataframe

In [6]:
p_df = pd.DataFrame(contents[2:len(contents)], columns=head_row)
p_df.head()

Unnamed: 0,Patient Number,State Patient Number,Date Announced,Age Bracket,Gender,Detected City,Detected District,Detected State,Current Status,Notes,...,Source_2,Source_3,Backup Notes,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,1,KL-TS-P1,30/01/2020,20.0,F,Thrissur,Thrissur,Kerala,Recovered,Travelled from Wuhan,...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,,,,,,,,
1,2,KL-AL-P1,02/02/2020,,,Alappuzha,Alappuzha,Kerala,Recovered,Travelled from Wuhan,...,https://weather.com/en-IN/india/news/news/2020...,,Student from Wuhan,,,,,,,
2,3,KL-KS-P1,03/02/2020,,,Kasaragod,Kasaragod,Kerala,Recovered,Travelled from Wuhan,...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,,,,,,,
3,4,DL-P1,02/03/2020,45.0,M,East Delhi (Mayur Vihar),East Delhi,Delhi,Recovered,"Travelled from Austria, Italy",...,https://economictimes.indiatimes.com/news/poli...,,Travel history to Italy and Austria,,,,,,,
4,5,TS-P1,02/03/2020,24.0,M,Hyderabad,Hyderabad,Telangana,Recovered,,...,https://www.indiatoday.in/india/story/coronavi...,https://www.thehindu.com/news/national/coronav...,"Travel history to Dubai, Singapore contact",,,,,,,


# Data cleaning and transformations

In [7]:
# shape of dataframe
# ==================

p_df.shape

(2072, 24)

In [8]:
# columns
# =======

p_df.columns

Index(['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Status Change Date', 'Source_1', 'Source_2', 'Source_3',
       'Backup Notes', '', '', '', '', '', '', ''],
      dtype='object')

In [9]:
# selecting important columns only
# ================================

p_df = p_df[['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Status Change Date', 'Source_1', 'Source_2', 'Source_3',
       'Backup Notes']]

In [10]:
# looking for missing values 
# ==========================

p_df.isna().sum()

Patient Number                               0
State Patient Number                         0
Date Announced                               0
Age Bracket                                  0
Gender                                       0
Detected City                                0
Detected District                            0
Detected State                               0
Current Status                               0
Notes                                        0
Contracted from which Patient (Suspected)    0
Nationality                                  0
Status Change Date                           0
Source_1                                     0
Source_2                                     0
Source_3                                     0
Backup Notes                                 0
dtype: int64

In [11]:
# no. of empty strings in each column
# ===================================

print(p_df.shape)

for i in p_df.columns:
    print(i, '\t', p_df[p_df[i]==''].shape[0])

(2072, 17)
Patient Number 	 767
State Patient Number 	 1783
Date Announced 	 873
Age Bracket 	 1536
Gender 	 1495
Detected City 	 1403
Detected District 	 1029
Detected State 	 873
Current Status 	 873
Notes 	 881
Contracted from which Patient (Suspected) 	 1823
Nationality 	 1656
Status Change Date 	 876
Source_1 	 879
Source_2 	 1486
Source_3 	 1931
Backup Notes 	 1712


In [12]:
# replacing empty strings with np.nan
# ==================================-

print(p_df.shape)

p_df = p_df.replace(r'', np.nan, regex=True)
p_df.isna().sum()

(2072, 17)


Patient Number                                767
State Patient Number                         1783
Date Announced                                873
Age Bracket                                  1536
Gender                                       1495
Detected City                                1403
Detected District                            1029
Detected State                                873
Current Status                                873
Notes                                         881
Contracted from which Patient (Suspected)    1823
Nationality                                  1656
Status Change Date                            876
Source_1                                      879
Source_2                                     1486
Source_3                                     1931
Backup Notes                                 1712
dtype: int64

In [13]:
# droping empty rows (row with just row number but without patient entry
# ======================================================================

print(p_df.shape)
p_df.dropna(subset=['Detected State'], inplace=True)
print(p_df.shape)

(2072, 17)
(1199, 17)


In [14]:
p_df.columns

Index(['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Status Change Date', 'Source_1', 'Source_2', 'Source_3',
       'Backup Notes'],
      dtype='object')

In [15]:
# rename dateframe columns 
# ========================

p_df.columns = ['_'.join(col.lower().split()) for col in p_df.columns]
p_df.rename(columns = {'contracted_from_which_patient_(suspected)':'suspected_contacted_patient'})
p_df.sample(5)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,current_status,notes,contracted_from_which_patient_(suspected),nationality,status_change_date,source_1,source_2,source_3,backup_notes
1152,1153,,30/03/2020,41.0,M,,Jodhpur,Rajasthan,Hospitalized,Travlled from Iran Resident of Ladakh,,,30/03/2020,https://twitter.com/ANI/status/124448466809679...,,,
879,880,,27/03/2020,,M,Jabalpur,Jabalpur,Madhya Pradesh,Hospitalized,Travelled from Dubai.,,,27/03/2020,https://twitter.com/PTI_News/status/1243582277...,https://www.businessinsider.in/india/news/coro...,,
409,410,,23/03/2020,,,Mumbai,Mumbai,Maharashtra,Hospitalized,Details awaited,,,23/03/2020,https://twitter.com/PTI_News/status/1241931502...,https://www.livemint.com/news/india/coronaviru...,,
239,240,,20/03/2020,27.0,M,Kasaragod,Kasaragod,Kerala,Hospitalized,Travelled from Dubai via Mangalore,,India,20/03/2020,https://twitter.com/ANI/status/124099693767274...,https://www.thenewsminute.com/article/covid-19...,https://www.thehindu.com/news/national/kerala/...,Returned from Dubai via Mangalore
629,630,KA-P43,25/03/2020,63.0,M,Bengaluru,Bengaluru,Karnataka,Hospitalized,"Travel History to Brazil, Argentina and arrive...",,,25/03/2020,https://twitter.com/DHFWKA/status/124280466314...,State Health Bulletin,,


# Saving Data

In [16]:
p_df.sample(5)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,current_status,notes,contracted_from_which_patient_(suspected),nationality,status_change_date,source_1,source_2,source_3,backup_notes
43,44,,09/03/2020,,M,Pune,Pune,Maharashtra,Recovered,"Travelled from Dubai on 1 March, Husband of P45",,India,25/03/2020,https://www.indiatoday.in/india/story/two-with...,https://www.firstpost.com/health/coronavirus-o...,https://www.thehindu.com/news/national/coronav...,"Returned from Dubai on 1 March, Husband of P45"
191,192,TS-P16,19/03/2020,,,Hyderabad,Hyderabad,Telangana,Hospitalized,Travelled from UK,,India,19/03/2020,https://twitter.com/IPRTelangana/status/124068...,https://www.indiatoday.in/india/story/coronavi...,,Returned from London
30,31,DL-P3,06/03/2020,,,Uttam Nagar,West Delhi,Delhi,Recovered,Travelled from Thailand and Malaysia,,India,15/03/2020,https://indianexpress.com/article/india/corona...,https://www.indiatoday.in/india/story/delhi-po...,http://health.delhigovt.nic.in/wps/wcm/connect...,Travelled to Thailand and Malaysia
865,866,,27/03/2020,6.0,F,,Nadia,West Bengal,Hospitalized,"Family, History of contact to a positive case ...",,,27/03/2020,https://www.wbhealth.gov.in/uploaded_files/cor...,,,
253,254,,20/03/2020,32.0,M,West Delhi,West Delhi,Delhi,Hospitalized,Travelled from France,,India,20/03/2020,https://twitter.com/PTI_News/status/1241050375...,https://timesofindia.indiatimes.com/city/delhi...,,Travel history to France


In [17]:
p_df.to_csv('patients_data.csv', index=False)