In [1]:
# libraries

from datetime import datetime
import os
import glob
import requests 
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Web scraping

In [2]:
# download data
# =============

# Published-to-web HTML view of the patient spreadsheet.
link = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSc_2y5N0I67wDU38DjDh35IZSIS30rQf7_NYZhtYYGU1jJYT6_kDx4YpF-qw0LSlGsBYP8pqM_a1Pd/pubhtml#'

# requests applies no timeout by default, so bound the wait explicitly;
# the value is in seconds.
req = requests.get(link, timeout=30)

# Stop here on a 4xx/5xx response instead of parsing an error page as data
# in the cells that follow.
req.raise_for_status()

# Parse the raw response bytes with the built-in HTML parser.
soup = BeautifulSoup(req.content, "html.parser")

In [3]:
# Grab the first <tbody> in the document (stop searching after one match;
# indexing still raises IndexError if the page has none) ...
tbody = soup.find_all('tbody', limit=1)[0]

# ... and collect every <tr> element inside it.
body = tbody.find_all('tr')

In [4]:
# Text of each <td> in the first table row; used as the column labels.
head_row = [cell.text for cell in body[0].find_all('td')]
head_row

['Patient Number',
 'State Patient Number',
 'Date Announced',
 'Age Bracket',
 'Gender',
 'Detected City',
 'Detected District',
 'Detected State',
 'State code',
 'Current Status',
 'Notes',
 'Contracted from which Patient (Suspected)',
 'Nationality',
 'Type of transmission',
 'Status Change Date',
 'Source_1',
 'Source_2',
 'Source_3',
 'Backup Notes',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [5]:
# One list of <td> texts per table row, in document order
# (index 0 is the same row that head_row was read from).
contents = [[cell.text for cell in row.find_all('td')] for row in body]

# Saving to Dataframe

In [6]:
# Build the frame from row index 2 onwards, labelled with the scraped header.
# Row 0 is the header itself; row 1 is skipped as well -- presumably a
# non-data row in the published sheet (TODO confirm).
p_df = pd.DataFrame(contents[2:], columns=head_row)
p_df.head()

Unnamed: 0,Patient Number,State Patient Number,Date Announced,Age Bracket,Gender,Detected City,Detected District,Detected State,State code,Current Status,...,Source_2,Source_3,Backup Notes,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,1,KL-TS-P1,30/01/2020,20.0,F,Thrissur,Thrissur,Kerala,KL,Recovered,...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,,,,,,,,
1,2,KL-AL-P1,02/02/2020,,,Alappuzha,Alappuzha,Kerala,KL,Recovered,...,https://weather.com/en-IN/india/news/news/2020...,,Student from Wuhan,,,,,,,
2,3,KL-KS-P1,03/02/2020,,,Kasaragod,Kasaragod,Kerala,KL,Recovered,...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,,,,,,,
3,4,DL-P1,02/03/2020,45.0,M,East Delhi (Mayur Vihar),East Delhi,Delhi,DL,Recovered,...,https://economictimes.indiatimes.com/news/poli...,,Travel history to Italy and Austria,,,,,,,
4,5,TS-P1,02/03/2020,24.0,M,Hyderabad,Hyderabad,Telangana,TG,Recovered,...,https://www.indiatoday.in/india/story/coronavi...,https://www.thehindu.com/news/national/coronav...,"Travel history to Dubai, Singapore contact",,,,,,,


# Data cleaning and transformations

In [7]:
# shape of dataframe
# ==================
# (rows, columns) of the frame as scraped, before any cleaning step.

p_df.shape

(5099, 26)

In [8]:
# columns
# =======
# Column labels exactly as taken from the scraped header row (head_row).

p_df.columns

Index(['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'State code', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Type of transmission', 'Status Change Date', 'Source_1', 'Source_2',
       'Source_3', 'Backup Notes', '', '', '', '', '', '', ''],
      dtype='object')

In [9]:
# selecting important columns only
# ================================
# Label slice from the first column up to and including 'Backup Notes';
# this drops the trailing columns whose header text is the empty string.

p_df = p_df.loc[:, :'Backup Notes']

In [10]:
# looking for missing values
# ==========================
# isna() counts only None/NaN. The cells were scraped as text, so blank cells
# are empty strings and are not counted here; they are counted separately in
# the next cell.

p_df.isna().sum()

Patient Number                               0
State Patient Number                         0
Date Announced                               0
Age Bracket                                  0
Gender                                       0
Detected City                                0
Detected District                            0
Detected State                               0
State code                                   0
Current Status                               0
Notes                                        0
Contracted from which Patient (Suspected)    0
Nationality                                  0
Type of transmission                         0
Status Change Date                           0
Source_1                                     0
Source_2                                     0
Source_3                                     0
Backup Notes                                 0
dtype: int64

In [11]:
# count of empty-string cells per column
# ======================================

print(p_df.shape)

# Summing the boolean mask gives the number of cells equal to '' in each column.
for col in p_df.columns:
    print(col, '\t', (p_df[col] == '').sum())

(5099, 19)
Patient Number 	 85
State Patient Number 	 4039
Date Announced 	 277
Age Bracket 	 4147
Gender 	 3777
Detected City 	 4232
Detected District 	 1157
Detected State 	 277
State code 	 277
Current Status 	 278
Notes 	 1499
Contracted from which Patient (Suspected) 	 3990
Nationality 	 4663
Type of transmission 	 3264
Status Change Date 	 387
Source_1 	 465
Source_2 	 3576
Source_3 	 4826
Backup Notes 	 4739


In [12]:
# replacing empty strings with np.nan
# ===================================

print(p_df.shape)

# Exact-match replacement: only cells whose value is '' become NaN.
# Regex mode is deliberately not used -- an empty regular expression matches
# every string, so it does not express "the cell is empty".
p_df = p_df.replace('', np.nan)

# Per-column NaN counts; these should equal the empty-string counts printed
# in the previous cell.
p_df.isna().sum()

(5099, 19)


Patient Number                                 85
State Patient Number                         4039
Date Announced                                277
Age Bracket                                  4147
Gender                                       3777
Detected City                                4232
Detected District                            1157
Detected State                                277
State code                                    277
Current Status                                278
Notes                                        1499
Contracted from which Patient (Suspected)    3990
Nationality                                  4663
Type of transmission                         3264
Status Change Date                            387
Source_1                                      465
Source_2                                     3576
Source_3                                     4826
Backup Notes                                 4739
dtype: int64

In [13]:
# dropping empty rows (rows with just a row number but no patient entry)
# ======================================================================

# A row is treated as empty when it has no 'Detected State' value.
p_df = p_df.dropna(subset=['Detected State'])
print(p_df.shape)
p_df.isna().sum()

(4822, 19)


Patient Number                                  0
State Patient Number                         3762
Date Announced                                  0
Age Bracket                                  3870
Gender                                       3500
Detected City                                3955
Detected District                             880
Detected State                                  0
State code                                      0
Current Status                                  1
Notes                                        1222
Contracted from which Patient (Suspected)    3713
Nationality                                  4386
Type of transmission                         2987
Status Change Date                            110
Source_1                                      188
Source_2                                     3299
Source_3                                     4549
Backup Notes                                 4462
dtype: int64

In [14]:
# Column labels at this point, still in their original spelling from the sheet.
p_df.columns

Index(['Patient Number', 'State Patient Number', 'Date Announced',
       'Age Bracket', 'Gender', 'Detected City', 'Detected District',
       'Detected State', 'State code', 'Current Status', 'Notes',
       'Contracted from which Patient (Suspected)', 'Nationality',
       'Type of transmission', 'Status Change Date', 'Source_1', 'Source_2',
       'Source_3', 'Backup Notes'],
      dtype='object')

In [15]:
# rename dataframe columns
# ========================

# Lower-case every label and join its whitespace-separated words with '_'.
p_df = p_df.rename(columns=lambda label: '_'.join(label.lower().split()))

# Give the one long, parenthesised label a shorter name.
p_df = p_df.rename(
    columns={'contracted_from_which_patient_(suspected)': 'suspected_contacted_patient'}
)
p_df.sample(5)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,state_code,current_status,notes,suspected_contacted_patient,nationality,type_of_transmission,status_change_date,source_1,source_2,source_3,backup_notes
2971,2972,,03/04/2020,,,,,Telangana,TG,Hospitalized,Details awaited,,,,03/04/2020,https://twitter.com/Eatala_Rajender/status/124...,,,
285,286,,21/03/2020,,,Noida,Gautam Buddha Nagar,Uttar Pradesh,UP,Hospitalized,"Supertech Capetown, Sector 74, NOIDA",,India,Imported,21/03/2020,https://twitter.com/ANINewsUP/status/124123429...,,,"Supertech Capetown, Sector 74, NOIDA"
3887,3888,TN-P537,05/04/2020,,F,,Namakkal,Tamil Nadu,TN,Hospitalized,"Contact of Patient, ID Unknown",,,,05/04/2020,State Bulletin: 05 Apr Evening,,,
3031,3032,,03/04/2020,,,,Thane,Maharashtra,MH,Hospitalized,,,,,03/04/2020,https://twitter.com/PTI_News/status/1246097072...,https://experience.arcgis.com/experience/8167a...,,
4718,4719,,06/04/2020,,,,Bhopal,Madhya Pradesh,MP,Hospitalized,Details awaited,,,,06/04/2020,https://twitter.com/JansamparkMP/status/124718...,,,


In [16]:
# creating patient id column from patient number
# ==============================================

# Prefix each patient number with 'P' (e.g. '1' -> 'P1').
p_df['p_id'] = p_df['patient_number'].map('P{}'.format)
p_df.columns

Index(['patient_number', 'state_patient_number', 'date_announced',
       'age_bracket', 'gender', 'detected_city', 'detected_district',
       'detected_state', 'state_code', 'current_status', 'notes',
       'suspected_contacted_patient', 'nationality', 'type_of_transmission',
       'status_change_date', 'source_1', 'source_2', 'source_3',
       'backup_notes', 'p_id'],
      dtype='object')

In [17]:
# Keep the columns from the first one up to and including 'backup_notes'.
# NOTE(review): 'p_id' was appended after 'backup_notes' in the previous cell,
# so this slice discards it and it never reaches the saved CSV -- confirm
# that dropping it is intended.
p_df = p_df.loc[:, :'backup_notes']

# Saving Data

In [18]:
# Spot-check five randomly chosen rows before saving
# (no random_state is passed, so the rows shown differ between runs).
p_df.sample(5)

Unnamed: 0,patient_number,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,state_code,current_status,notes,suspected_contacted_patient,nationality,type_of_transmission,status_change_date,source_1,source_2,source_3,backup_notes
3660,3661,,04/04/2020,,,,Mumbai,Maharashtra,MH,Hospitalized,,,,,,https://twitter.com/rajeshtope11/status/124648...,,,
126,127,TS-P3,16/03/2020,,M,Hyderabad,Hyderabad,Telangana,TG,Hospitalized,Travelled from Scotland,,India,Imported,16/03/2020,https://www.thenewsminute.com/article/another-...,,,Returned from Scotland
612,613,,25/03/2020,,,,Thane,Maharashtra,MH,Hospitalized,Details awaited,,,TBD,25/03/2020,https://twitter.com/ANI/status/124277728098574...,,,
1602,1603,,31/03/2020,,,,,Delhi,DL,Hospitalized,Details Awaited,,,TBD,31/03/2020,https://twitter.com/ANI/status/124502337994339...,,,
1608,1609,,31/03/2020,,,,Raipur,Chhattisgarh,CT,Hospitalized,Travelled from UK,,,Imported,31/03/2020,https://twitter.com/ANI/status/124502522709808...,,,


In [19]:
# Write the cleaned frame to 'patients_data.csv' (path relative to the current
# working directory), leaving the row index out of the file.
p_df.to_csv('patients_data.csv', index=False)