# Data Cleaning and Data Analysis on Covid-19 dataset (India).

![corona](https://c.files.bbci.co.uk/14A35/production/_115033548_gettyimages-1226314512.jpg)



In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import missingno
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

ModuleNotFoundError: No module named 'geopandas'

## Loading the dataset.

In [None]:
# Load the state-wise COVID-19 CSV; the first CSV column is used as the row index.
data = pd.read_csv("/kaggle/input/covid19-in-india/covid_19_india.csv",index_col=0)

### **We will be analyzing the data of Covid-19 spread over the Indian states. The data ranges from 01-January-2020 to 26-November-2020.**

In [None]:
# Keep only the first 8835 rows (positional slice).
# NOTE(review): 8835 is a hard-coded row count, presumably the cut-off for the
# date range stated above — confirm, or filter on the date column instead.
data = data.iloc[:8835]

In [None]:
# Preview the first rows of the truncated frame.
data.head()

### Exploring the missing values in the dataset using missingno package and info of the dataset.

In [None]:
# Draw missingno's nullity matrix to see which cells are recorded as missing.
missingno.matrix(data)

### Checking duplicated values in the dataset.

In [None]:
# Count rows that exactly repeat an earlier row.
data.duplicated().sum()

In [None]:
# Print column dtypes and non-null counts.
data.info()

### Now we look at the unique timestamps in the dataset; afterwards we will merge the ***date*** and ***time*** columns.

In [None]:
# List the distinct values found in the Time column.
data['Time'].unique()

In [None]:
# The Date column will hold the merged date-and-time value, so name it Datetime.
data = data.rename(columns={"Date" : "Datetime"})

### Convert the Datetime column to dtype of datetime64[ns]

In [None]:
def _to_24_hour(stamp):
    """Convert a 12-hour string such as '6:00 PM' to zero-padded 24-hour 'HH:MM'."""
    # The last two characters are the meridiem; one separator character precedes it.
    clock, meridiem = stamp[:-3], stamp[-2:].upper()
    hour, _, minute = clock.strip().partition(":")
    # 12 AM is hour 0 and 12 PM is hour 12, so reduce modulo 12 before adding
    # the PM offset (a plain "+12" would turn 12:00 PM into 24:00).
    hour_24 = int(hour) % 12 + (12 if meridiem == "PM" else 0)
    return "{:02d}:{}".format(hour_24, minute)


def timeconv(df):
    """Append the 24-hour time of each row to its date string, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Datetime' column of date strings and a 'Time' column
        of 12-hour strings of the form 'H:MM AM' / 'HH:MM PM'.

    Returns
    -------
    None
        ``df['Datetime']`` is overwritten with '<date> <HH:MM>' strings.
    """
    # Series.map keeps df's own index, so each time is added to its own row.
    # Building a fresh pd.Series from a list would be indexed 0..n-1 and would
    # misalign (and produce NaN) whenever df's index is not exactly 0..n-1.
    times_24h = df["Time"].map(_to_24_hour)
    df['Datetime'] = df['Datetime'] + " " + times_24h
            
       
        
        
# Merge each row's time into data["Datetime"] (modifies `data` in place).
timeconv(data)

In [None]:
# Time now lives inside Datetime, so the separate column is no longer needed.
data = data.drop(columns=["Time"])

In [None]:
# Parse the combined date-and-time strings into pandas datetimes.
# NOTE(review): no explicit format/dayfirst is passed, so parsing relies on
# pandas' inference — confirm against the raw date format in the CSV.
data["Datetime"] = pd.to_datetime(data["Datetime"])

In [None]:
# Fill the last row's timestamp from the previous row, but only when it is
# actually missing, so a valid value is never overwritten on a re-run.
# The column is located by name rather than assumed to sit at position 0.
datetime_pos = data.columns.get_loc("Datetime")
if pd.isna(data.iloc[-1, datetime_pos]):
    timest = data.iloc[-2, datetime_pos]
    data.iloc[-1, datetime_pos] = timest

### We discovered that there are several missing values marked as "-", let's take a look.

In [None]:
# Display the frame to inspect the "-" placeholder values.
data

In [None]:
# Group the rows by state / union territory.
# NOTE(review): `l` is reassigned before it is read anywhere in the visible
# notebook, so this cell appears to have no effect — confirm and remove.
l = data.groupby('State/UnionTerritory')

In [None]:
# Turn cells that are exactly "-" into proper missing values (NaN).
data = data.replace("-",np.nan)

In [None]:
# Redraw the nullity matrix now that "-" placeholders count as missing.
missingno.matrix(data)

### Whoa! Most of the values in the 3rd and 4th columns are missing, so we had better drop those columns for now.

In [None]:
# Drop the two columns at positions 2 and 3 (the 3rd and 4th columns).
# NOTE: the selection is positional, so running this cell a second time
# would drop the next two columns as well.
sparse_columns = data.columns[2:4]
data = data.drop(columns=sparse_columns)

### Moving forward, let's take a look at distinct state names for further analysis.

In [None]:
# List the distinct state / union-territory names.
data['State/UnionTerritory'].unique()

### Keeping it simple, we will drop the rows whose state name ends with "***", as they seem to be rows with incomplete values.

In [None]:
def drop_star(df):
    """Remove, in place, every row whose state name ends with "***".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'State/UnionTerritory' column of strings; rows are
        removed from this object directly.

    Returns
    -------
    None
    """
    # Series.iteritems() no longer exists in pandas 2.x, and dropping one label
    # per loop iteration is slow. Build a single boolean mask instead; missing
    # names are treated as "not starred" rather than raising.
    starred = df['State/UnionTerritory'].str.endswith("***", na=False)
    df.drop(index=df.index[starred.to_numpy(dtype=bool)], inplace=True)
        
# Remove the "***"-suffixed rows from `data` in place, then re-list the
# remaining distinct state names to check the result.
drop_star(data)
data['State/UnionTerritory'].unique()

### There are still several typos in the state names; we will deal with these manually.

In [None]:
# Remove the rows recorded under these three name variants, then re-list the
# distinct state names that remain.
variant_names = ['Telangana', 'Daman & Diu', 'Dadar Nagar Haveli']
is_variant = data['State/UnionTerritory'].isin(variant_names)
data.drop(data[is_variant].index, inplace=True)
data['State/UnionTerritory'].unique()

In [None]:
# Re-check dtypes and non-null counts after the clean-up steps.
data.info()

In [None]:
# Inspect the rows recorded for Tripura.
data[data['State/UnionTerritory']=='Tripura']

### Extracting the latest insights from the data, we will later visualize the trends.

In [None]:
# Group the rows by state and take, for every column, the last non-missing
# value within each group; the result is indexed by state name.
# NOTE(review): this is the "latest" figure only if the rows of each state are
# in chronological order — confirm.
l = data.groupby('State/UnionTerritory')
current = l.last()

In [None]:
# Display the per-state snapshot.
current

### Plotting a bar plot to show the spread of Covid-19 across the states in decreasing order.

In [None]:
# Bar chart of the per-state confirmed count, largest first.
fig ,ax = plt.subplots(figsize= (12,8))
fig.set_facecolor("white")
current = current.sort_values("Confirmed",ascending=False)
p = sns.barplot(ax=ax,x= current.index,y=current['Confirmed'])
# Rotate the tick labels seaborn already placed instead of re-assigning their text.
p.tick_params(axis="x", rotation=90)

# Show plain (non-scientific) counts on the y axis. Unlike set_yticklabels on
# auto-located ticks, this keeps the label text tied to the tick positions.
p.ticklabel_format(axis="y", style="plain")


### Now let's look at the Cured/Death ratio of these states using pie plot.

In [None]:
# Size the 3-column grid from the number of states that will be plotted rather
# than hard-coding 12 rows; 35 states still gives 12 rows and a 16x30 figure.
pie_skipped = ['Cases being reassigned to states', 'Unassigned']
pie_count = sum(name not in pie_skipped for name in current.index)
pie_rows = max(1, -(-pie_count // 3))  # ceiling division
# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(pie_rows, 3, figsize=(16, 2.5 * pie_rows), squeeze=False)
# Remove the grid cells that will not receive a pie.
for unused_ax in axs.flat[pie_count:]:
    fig.delaxes(unused_ax)
fig.set_facecolor("white")
def plotpie(ax,cplot,data,state):
    """Draw one state's Cured / Deaths / remaining-cases pie in a 3-column grid.

    Parameters
    ----------
    ax : 2-D indexable of matplotlib Axes
        Grid addressed as ``ax[row, col]``.
    cplot : int
        Zero-based running plot number; drawn at row ``cplot // 3``,
        column ``cplot % 3``.
    data : pandas.DataFrame
        Indexed by state name, with 'Confirmed', 'Cured' and 'Deaths' columns.
    state : str
        Index label of the row to plot; also used in the title.

    Returns
    -------
    None
    """
    labels = ['Cured', 'Deaths','Ambiguous']
    colors = ['green', 'red','gray']
    row = data.loc[state]
    confirmed, cured, deaths = row['Confirmed'], row['Cured'], row['Deaths']
    # The third wedge is what is left after cured and deaths are both removed:
    # Confirmed - (Cured + Deaths). Without the parentheses the deaths would be
    # added back and the wedges would no longer sum to the confirmed total.
    # Clamp at zero because pie() rejects negative wedge sizes.
    amb = max(confirmed - (cured + deaths), 0)
    size = [cured, deaths, amb]
    target = ax[cplot // 3, cplot % 3]
    target.pie(size,labels=labels, colors=colors, startangle=0, autopct='%1.1f%%')
    target.set_title(state+'\n'+"Total cases : {}".format(confirmed))
    target.axis('equal')

# Draw one pie per state in alphabetical order, skipping the two non-state rows.
cplot = 0
for i in sorted(current.index):
    if i in ['Cases being reassigned to states', 'Unassigned'] :
        continue
    plotpie(axs,cplot,current,i)
    cplot+=1
fig.tight_layout()
# plt.plot() with no arguments draws nothing and only echoes an empty list;
# plt.show() renders the finished figure.
plt.show()

### Zooming to the span of 21 days lockdown from 25-March-2020. 


In [None]:
import datetime
import matplotlib.dates as mdates
# Size the 2-column grid from the number of states that will be plotted rather
# than hard-coding 18 rows; 35 states still gives 18 rows and a 16x100 figure.
timeplot_skipped = ['Cases being reassigned to states', 'Unassigned']
timeplot_count = sum(name not in timeplot_skipped for name in current.index)
timeplot_rows = max(1, -(-timeplot_count // 2))  # ceiling division
# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(timeplot_rows, 2, figsize=(16, 100 * timeplot_rows / 18), squeeze=False)
fig.set_facecolor("white")
# Remove the grid cells that will not receive a plot.
for unused_ax in axs.flat[timeplot_count:]:
    fig.delaxes(unused_ax)

def statewise_timeplot(ax,cplot,data,state,start='2020-3-25',days=21,ymax=1000000):
    """Plot Confirmed / Cured / Deaths over a date window for one state.

    Parameters
    ----------
    ax : 2-D indexable of matplotlib Axes
        Grid addressed as ``ax[row, col]``.
    cplot : int
        Zero-based running plot number; drawn at row ``cplot // 2``,
        column ``cplot % 2``.
    data : pandas.DataFrame
        Must contain 'State/UnionTerritory', a datetime 'Datetime' column, and
        'Confirmed', 'Cured' and 'Deaths'.
    state : str
        Value of 'State/UnionTerritory' to plot; also used as the title.
    start : str or datetime-like, default '2020-3-25'
        First day of the window.
    days : int, default 21
        Length of the window in days.
    ymax : number or None, default 1000000
        Upper y-limit shared by every panel; ``None`` lets matplotlib autoscale.

    Returns
    -------
    None
    """
    window_start = pd.to_datetime(start)
    window_end = window_start + pd.Timedelta(days=days)

    # Keep this state's rows from the start of the window up to, but not
    # including, its end.
    in_state = data["State/UnionTerritory"] == state
    in_window = (data["Datetime"] >= window_start) & (data["Datetime"] < window_end)
    toplot = data.loc[in_state & in_window].set_index("Datetime")

    target = ax[cplot // 2, cplot % 2]
    for column in ("Confirmed", "Cured", "Deaths"):
        sns.lineplot(data=toplot,x=toplot.index,y=toplot[column],ax=target,label=column)
    target.set_title(state)
    # All three series share the axis, so use a neutral label instead of the
    # name of whichever column was drawn last.
    target.set_ylabel("Cases")
    # Show the whole filtered window; a shorter fixed end date would hide the
    # final days of the data selected above.
    target.set_xlim(window_start, window_end)
    if ymax is not None:
        target.set_ylim(0, ymax)
    target.xaxis.set_major_formatter(mdates.DateFormatter("%m-%d"))
    target.xaxis.set_minor_formatter(mdates.DateFormatter("%m-%d"))
    target.tick_params(axis='x', rotation=45)
    target.legend()



# Draw one time-series panel per state in alphabetical order, skipping the two
# non-state rows.
cplot = 0
for i in sorted(current.index):
    if i in ['Cases being reassigned to states', 'Unassigned'] :
        continue
    statewise_timeplot(axs,cplot,data,i)
    cplot+=1

fig.tight_layout()
# plt.plot() with no arguments draws nothing and only echoes an empty list;
# plt.show() renders the finished figure.
plt.show()


### Preparing data for the geoplot using geopandas.

In [None]:
# Read the state boundary shapefile and preview it.
fp = "/kaggle/input/india-states/Igismap/Indian_States.shp"
map_df = gpd.read_file(fp)
display(map_df)

# Rename state labels in `current`, presumably to the spellings used in the
# shapefile so the later merge can match them — verify against map_df.
state_name_map = {
    "Andaman and Nicobar Islands": "Andaman & Nicobar Island",
    "Delhi": "NCT of Delhi",
    "Arunachal Pradesh": "Arunanchal Pradesh",
    "Dadra and Nagar Haveli and Daman and Diu": "Dadara & Nagar Havelli",
    "Jammu and Kashmir": "Jammu & Kashmir",
    "Telengana": "Telangana",
}
current = current.rename(index=state_name_map)

In [None]:
# drop() and reset_index() return new frames, so their results must be kept;
# calling them without assignment changes nothing.
# errors="ignore" tolerates either non-state label being absent.
non_state_rows = ['Cases being reassigned to states', 'Unassigned']
current_states = current.drop(non_state_rows, axis = 0, errors = "ignore").reset_index()

# Attach the per-state figures to the shapes, keep only the shapes that found
# a match (Datetime present), and renumber the rows from 0.
merged = map_df.merge(current_states, left_on = 'st_nm', right_on = 'State/UnionTerritory', how = 'left')
merged = merged[~merged['Datetime'].isna()].reset_index(drop = True)
merged


### Finally, here is the geoplot showing the Covid-19 spread across the Indian states with severity.

In [None]:
# Choropleth of confirmed cases per state; darker shades mean more cases.
fig, ax = plt.subplots(1, figsize=(10, 10))
ax.axis('off')
ax.set_title('Covid-19 data', fontdict={'fontsize': '25', 'fontweight' : '10'})

# No markersize is passed: it applies to point geometries only and has no
# meaning when a column is drawn as filled state shapes.
merged.plot(column='Confirmed',cmap='YlOrRd', linewidth=0.8, ax=ax, edgecolor='0', legend=True)