# Exploring COVID-19 data and analyzing various factors

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/covid19-in-india/IndividualDetails.csv
/kaggle/input/covid19-in-india/StatewiseTestingDetails.csv
/kaggle/input/covid19-in-india/HospitalBedsIndia.csv
/kaggle/input/covid19-in-india/covid_19_india.csv
/kaggle/input/covid19-in-india/AgeGroupDetails.csv
/kaggle/input/covid19-in-india/ICMRTestingLabs.csv
/kaggle/input/covid19-in-india/population_india_census2011.csv
/kaggle/input/lat-lon-indian-states/datasets_652259_1374178_Total_India_covid-19.csv


# Importing required libraries

In [2]:
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import folium

# Reading required data for further analysis

In [3]:
# 'covid_19_india.csv' contains cured, deaths and confirmed cases on a day-to-day basis.
cases = pd.read_csv("/kaggle/input/covid19-in-india/covid_19_india.csv")
cases.head()

# It contains latitude and longitude coordinates of indian states.
lat_long = pd.read_csv("/kaggle/input/lat-lon-indian-states/datasets_652259_1374178_Total_India_covid-19.csv")
lat_long.head()

# It can provide us the ratio or rate or percentage of positive cases per population statewise.
popul = pd.read_csv("/kaggle/input/covid19-in-india/population_india_census2011.csv")
popul

Unnamed: 0,Sno,State / Union Territory,Population,Rural population,Urban population,Area,Density,Gender Ratio
0,1,Uttar Pradesh,199812341,155317278,44495063,"240,928 km2 (93,023 sq mi)","828/km2 (2,140/sq mi)",912
1,2,Maharashtra,112374333,61556074,50818259,"307,713 km2 (118,809 sq mi)",365/km2 (950/sq mi),929
2,3,Bihar,104099452,92341436,11758016,"94,163 km2 (36,357 sq mi)","1,102/km2 (2,850/sq mi)",918
3,4,West Bengal,91276115,62183113,29093002,"88,752 km2 (34,267 sq mi)","1,029/km2 (2,670/sq mi)",953
4,5,Madhya Pradesh,72626809,52557404,20069405,"308,245 km2 (119,014 sq mi)",236/km2 (610/sq mi),931
5,6,Tamil Nadu,72147030,37229590,34917440,"130,058 km2 (50,216 sq mi)","555/km2 (1,440/sq mi)",996
6,7,Rajasthan,68548437,51500352,17048085,"342,239 km2 (132,139 sq mi)",201/km2 (520/sq mi),928
7,8,Karnataka,61095297,37469335,23625962,"191,791 km2 (74,051 sq mi)",319/km2 (830/sq mi),973
8,9,Gujarat,60439692,34694609,25745083,"196,024 km2 (75,685 sq mi)",308/km2 (800/sq mi),919
9,10,Andhra Pradesh,49577103,34966693,14610410,"162,968 km2 (62,922 sq mi)",303/km2 (780/sq mi),993


# Checking whether the data contains any null values

In [4]:
# No null value or no null record is found.
cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5756 entries, 0 to 5755
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Sno                       5756 non-null   int64 
 1   Date                      5756 non-null   object
 2   Time                      5756 non-null   object
 3   State/UnionTerritory      5756 non-null   object
 4   ConfirmedIndianNational   5756 non-null   object
 5   ConfirmedForeignNational  5756 non-null   object
 6   Cured                     5756 non-null   int64 
 7   Deaths                    5756 non-null   int64 
 8   Confirmed                 5756 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 404.8+ KB


# Renaming state names that've different names for same state
Since the given .csv file has different names for same state like "Telangana" -> "Telangana***" or "Telengana". So, replacing the name as "Telangana" only. We have entries in state names like "Unassigned" or "Cases being reassigned to states" so, for now we are going to ignore them and removing those rows from the data.

In [5]:
cases['State/UnionTerritory'].unique()

array(['Kerala', 'Telengana', 'Delhi', 'Rajasthan', 'Uttar Pradesh',
       'Haryana', 'Ladakh', 'Tamil Nadu', 'Karnataka', 'Maharashtra',
       'Punjab', 'Jammu and Kashmir', 'Andhra Pradesh', 'Uttarakhand',
       'Odisha', 'Puducherry', 'West Bengal', 'Chhattisgarh',
       'Chandigarh', 'Gujarat', 'Himachal Pradesh', 'Madhya Pradesh',
       'Bihar', 'Manipur', 'Mizoram', 'Andaman and Nicobar Islands',
       'Goa', 'Unassigned', 'Assam', 'Jharkhand', 'Arunachal Pradesh',
       'Tripura', 'Nagaland', 'Meghalaya', 'Dadar Nagar Haveli',
       'Cases being reassigned to states', 'Sikkim', 'Daman & Diu',
       'Dadra and Nagar Haveli and Daman and Diu', 'Telangana',
       'Telangana***', 'Telengana***'], dtype=object)

In [6]:
cases['State/UnionTerritory'].replace({"Telengana" : "Telangana", "Telengana***" : "Telangana",
                                        "Telangana***" : "Telangana"}, inplace = True)

cases['State/UnionTerritory'].replace({"Daman & Diu" : "Dadra and Nagar Haveli and Daman and Diu",
                                          "Dadar Nagar Haveli" : "Dadra and Nagar Haveli and Daman and Diu"},
                                         inplace = True)
cases = cases[(cases['State/UnionTerritory'] != 'Unassigned') &
                    (cases['State/UnionTerritory'] != 'Cases being reassigned to states')]
cases['State/UnionTerritory'].unique()

array(['Kerala', 'Telangana', 'Delhi', 'Rajasthan', 'Uttar Pradesh',
       'Haryana', 'Ladakh', 'Tamil Nadu', 'Karnataka', 'Maharashtra',
       'Punjab', 'Jammu and Kashmir', 'Andhra Pradesh', 'Uttarakhand',
       'Odisha', 'Puducherry', 'West Bengal', 'Chhattisgarh',
       'Chandigarh', 'Gujarat', 'Himachal Pradesh', 'Madhya Pradesh',
       'Bihar', 'Manipur', 'Mizoram', 'Andaman and Nicobar Islands',
       'Goa', 'Assam', 'Jharkhand', 'Arunachal Pradesh', 'Tripura',
       'Nagaland', 'Meghalaya',
       'Dadra and Nagar Haveli and Daman and Diu', 'Sikkim'], dtype=object)

In [7]:
popul.rename(columns={'State / Union Territory':'State/UnionTerritory'}, inplace=True)
popul['State/UnionTerritory'].replace({"Telengana" : "Telangana"})

0                                Uttar Pradesh
1                                  Maharashtra
2                                        Bihar
3                                  West Bengal
4                               Madhya Pradesh
5                                   Tamil Nadu
6                                    Rajasthan
7                                    Karnataka
8                                      Gujarat
9                               Andhra Pradesh
10                                      Odisha
11                                   Telangana
12                                      Kerala
13                                   Jharkhand
14                                       Assam
15                                      Punjab
16                                Chhattisgarh
17                                     Haryana
18                                 Uttarakhand
19                            Himachal Pradesh
20                                     Tripura
21           

# Changing dates into datetime & removing unnecessary columns

In [8]:
cases.Date = pd.to_datetime(cases.Date, dayfirst=True)

cases.drop(['Sno', 'Time', 'ConfirmedIndianNational', 'ConfirmedForeignNational'], axis = 1, inplace=True)
cases.head()

Unnamed: 0,Date,State/UnionTerritory,Cured,Deaths,Confirmed
0,2020-01-30,Kerala,0,0,1
1,2020-01-31,Kerala,0,0,1
2,2020-02-01,Kerala,0,0,2
3,2020-02-02,Kerala,0,0,3
4,2020-02-03,Kerala,0,0,3


# Getting the time-period of given data

In [9]:
print("Starting date : ", min(cases.Date.values))
print("Ending date : ", max(cases.Date.values))

Starting date :  2020-01-30T00:00:00.000000000
Ending date :  2020-08-30T00:00:00.000000000


# Calculating total active cases per day across India

In [10]:
daily_cases = cases.groupby('Date').sum().reset_index()
daily_cases['Active'] = 1

for val in daily_cases.index:
    if val != 0:
        daily_cases['Active'].loc[val] = daily_cases['Confirmed'].loc[val] - daily_cases['Cured'].loc[val-1] - daily_cases['Deaths'].loc[val-1]
    
daily_cases

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,Date,Cured,Deaths,Confirmed,Active
0,2020-01-30,0,0,1,1
1,2020-01-31,0,0,1,1
2,2020-02-01,0,0,2,2
3,2020-02-02,0,0,3,3
4,2020-02-03,0,0,3,3
...,...,...,...,...,...
209,2020-08-26,2467758,59449,3234474,771499
210,2020-08-27,2523771,60472,3310234,783027
211,2020-08-28,2583948,61529,3387500,803257
212,2020-08-29,2648998,62550,3463972,818495


# Visualization of Cured, Death, Confirmed & Active Cases by Date 
The following graph shows us how the cases are increasing day by day.

In [11]:
fig = go.Figure()
fig.add_trace(go.Scatter(x = daily_cases.Date, y = daily_cases.Confirmed, name = 'Confirmed'))
fig.add_trace(go.Scatter(x = daily_cases.Date, y = daily_cases.Cured, name = 'Cured'))
fig.add_trace(go.Scatter(x = daily_cases.Date, y = daily_cases.Deaths, name = 'Deaths'))
fig.add_trace(go.Scatter(x = daily_cases.Date, y = daily_cases.Active, name = 'Active Cases'))

fig.update_layout(title = 'CORONA VIRUS CASES IN INDIA', yaxis_title = 'Cases Count (in lakhs)')

fig.show()

# Analyzing total cases uptill now 
Considering the total cases uptill now on a day-to-day basis to further visualize the increase in cases.

In [12]:
country_cases = cases[cases.Date == max(cases.Date)]
print(country_cases.shape)
country_cases.head()

(35, 5)


Unnamed: 0,Date,State/UnionTerritory,Cured,Deaths,Confirmed
5721,2020-08-30,Andaman and Nicobar Islands,2519,44,3081
5722,2020-08-30,Andhra Pradesh,312687,3796,414164
5723,2020-08-30,Arunachal Pradesh,2754,5,3877
5724,2020-08-30,Assam,82510,289,103794
5725,2020-08-30,Bihar,114772,561,133003


This '**lat_long**' dataframe contains the latitude & longitude coordinates which we use to locate the states on the map so to properly visualize the number of cases a state have till the date.

In [13]:
lat_long.rename(columns={"State":"State/UnionTerritory"}, inplace=True)
lat_long = lat_long[['State/UnionTerritory', 'Latitude', 'Longitude']]
country_cases = pd.merge(country_cases, lat_long, on='State/UnionTerritory')
country_cases.head()

Unnamed: 0,Date,State/UnionTerritory,Cured,Deaths,Confirmed,Latitude,Longitude
0,2020-08-30,Andaman and Nicobar Islands,2519,44,3081,11.7401,92.6586
1,2020-08-30,Andhra Pradesh,312687,3796,414164,15.9129,79.74
2,2020-08-30,Arunachal Pradesh,2754,5,3877,28.218,94.7278
3,2020-08-30,Assam,82510,289,103794,26.2006,92.9376
4,2020-08-30,Bihar,114772,561,133003,25.0961,85.3131


# Visualization of Confirmed cases on Indian map using Folium

In [14]:
# Make an empty map
m = folium.Map(location=[28,77], zoom_start=4)
country_cases['Confirmed'] = country_cases['Confirmed'].astype(float)

# I can add marker one by one on the map
for i in range(0,len(country_cases)):
    folium.Circle(location = [country_cases.iloc[i]['Latitude'], country_cases.iloc[i]['Longitude']],
                popup = [country_cases.iloc[i]['State/UnionTerritory'],country_cases.iloc[i]['Confirmed']],
                radius = country_cases.iloc[i]['Confirmed']/2,
                color = 'crimson', fill = True, fill_color='crimson').add_to(m)
    
m

# Calculating how much percentage of population gets infected till the date

In [15]:
# We're saving it for further use
total_pop = popul['Population'].sum()
print("The total population of India is : ", total_pop)

popul = cases.merge(popul[['State/UnionTerritory', 'Population']])
popul['ConfirmPerc'] = 0
popul['ConfirmPerc'] = (popul['Confirmed']/popul['Population'])*100

The total population of India is :  1210568111


# Visualizing how different states have increament in positive cases by considering the percentage

In [16]:
fig = go.Figure()
for st in popul['State/UnionTerritory'].unique():
    df = popul[popul['State/UnionTerritory'] == st]
    fig.add_trace(go.Scatter(x = df.Date, y = df.ConfirmPerc, name = st))
    
fig.update_layout(title = 'Positive Cases Percentage Per Population', yaxis_title = 'Percentage (%)')
fig.show()

# Analyzing the data & calculating positive percentage on a day-to-day basis

In [17]:
# Here, we're grouping the data by date bcoz we want to visualize the data on per basis
popul_dates = popul.drop('ConfirmPerc', axis=1).groupby('Date').sum()

# In this, population should be same, as we're talking about the positive percentage per population
popul_dates['Population'] = total_pop

# Calculating total percentage of positive cases
popul_dates['TotConfirmPerc'] = (popul_dates['Confirmed']/popul_dates['Population'])*100
popul_dates

Unnamed: 0_level_0,Cured,Deaths,Confirmed,Population,TotConfirmPerc
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-30,0,0,1,1210568111,8.260584e-08
2020-01-31,0,0,1,1210568111,8.260584e-08
2020-02-01,0,0,2,1210568111,1.652117e-07
2020-02-02,0,0,3,1210568111,2.478175e-07
2020-02-03,0,0,3,1210568111,2.478175e-07
...,...,...,...,...,...
2020-08-26,2382535,58669,3122786,1210568111,2.579604e-01
2020-08-27,2437676,59684,3195751,1210568111,2.639877e-01
2020-08-28,2496273,60730,3270085,1210568111,2.701281e-01
2020-08-29,2559648,61742,3343806,1210568111,2.762179e-01


# Visualizing the increase in percentage

In [18]:
fig = go.Figure()
fig.add_trace(go.Scatter(x = popul_dates.index, y = popul_dates.TotConfirmPerc))
fig.update_layout(title = 'Percentage of positive cases across India', yaxis_title = 'Percentage (%)')
fig.show()

If you want to know how to flatten the curve or wanted to get the idea of capacity of patients a state can have look at my another notebook where I've described the hospitals capacity of sustaining patients and how many more beds it needed to have. 
**Link:-** https://www.kaggle.com/ayushirastogi15/covid-19-flattening-the-curve

If you want to look at the interactive & interesting visualization of how the corona virus cases increases in each state then take a look at my another notebook which shows you the interesting bar-chart race of Indian states. 
**Link:-** https://www.kaggle.com/ayushirastogi15/covid-19-india-bar-chart-race

You can also look at the analysis & interesting visualizations of corona-virus cases all over the world. How the confirmed cases increasing, death cases increasing & the recovery rate as well. The link for the notebook is given below.
**Link:-** https://www.kaggle.com/ayushirastogi15/covid-19

*If you found anything wrong, do let me know. I'll be happy to get any kind of feed back.
Thanks in advance for the whole data science kaggle community.*