# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Reading the data

In [2]:
df = pd.read_csv('covid_19_data.csv')

# EDA

In [3]:
df.head(10)

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0
5,6,01/22/2020,Guangdong,Mainland China,1/22/2020 17:00,26.0,0.0,0.0
6,7,01/22/2020,Guangxi,Mainland China,1/22/2020 17:00,2.0,0.0,0.0
7,8,01/22/2020,Guizhou,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
8,9,01/22/2020,Hainan,Mainland China,1/22/2020 17:00,4.0,0.0,0.0
9,10,01/22/2020,Hebei,Mainland China,1/22/2020 17:00,1.0,0.0,0.0


In [4]:
df.drop(['SNo', 'Last Update'], axis=1, inplace=True)

In [5]:
names_dict = {'ObservationDate':'Date'
              , 'Province/State':'State'
              ,'Country/Region':'Country'}
df.rename(columns=names_dict, inplace=True)
df['Date'] = pd.to_datetime(df['Date'])
df.head()

Unnamed: 0,Date,State,Country,Confirmed,Deaths,Recovered
0,2020-01-22,Anhui,Mainland China,1.0,0.0,0.0
1,2020-01-22,Beijing,Mainland China,14.0,0.0,0.0
2,2020-01-22,Chongqing,Mainland China,6.0,0.0,0.0
3,2020-01-22,Fujian,Mainland China,1.0,0.0,0.0
4,2020-01-22,Gansu,Mainland China,0.0,0.0,0.0


# Impute values

In [6]:
imputer = SimpleImputer(strategy='constant')
df2 = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [7]:
df2.head()

Unnamed: 0,Date,State,Country,Confirmed,Deaths,Recovered
0,2020-01-22,Anhui,Mainland China,1.0,0.0,0.0
1,2020-01-22,Beijing,Mainland China,14.0,0.0,0.0
2,2020-01-22,Chongqing,Mainland China,6.0,0.0,0.0
3,2020-01-22,Fujian,Mainland China,1.0,0.0,0.0
4,2020-01-22,Gansu,Mainland China,0.0,0.0,0.0


# Exploring Groupby

In [9]:
df3 = df2.groupby('Country')[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()

In [10]:
df3.head()

Unnamed: 0,Country,Confirmed,Deaths,Recovered
0,Azerbaijan,1.0,0.0,0.0
1,"('St. Martin',)",2.0,0.0,0.0
2,Afghanistan,303643.0,6712.0,31482.0
3,Albania,50228.0,1893.0,32569.0
4,Algeria,314591.0,28423.0,152649.0


In [14]:
df4 = df2.groupby(['Country', 'Date'])[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()

In [15]:
df4.head()

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered
0,Azerbaijan,2020-02-28,1.0,0.0,0.0
1,"('St. Martin',)",2020-03-10,2.0,0.0,0.0
2,Afghanistan,2020-02-24,1.0,0.0,0.0
3,Afghanistan,2020-02-25,1.0,0.0,0.0
4,Afghanistan,2020-02-26,1.0,0.0,0.0


In [16]:
confirmed_cases_more_than_hundred = df4[df4['Confirmed'] > 100]

In [17]:
confirmed_cases_more_than_hundred

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered
34,Afghanistan,2020-03-27,110.0,4.0,2.0
35,Afghanistan,2020-03-28,110.0,4.0,2.0
36,Afghanistan,2020-03-29,120.0,4.0,2.0
37,Afghanistan,2020-03-30,170.0,4.0,2.0
38,Afghanistan,2020-03-31,174.0,4.0,5.0
...,...,...,...,...,...
17539,Zimbabwe,2020-05-30,174.0,4.0,29.0
17540,Zimbabwe,2020-05-31,178.0,4.0,29.0
17541,Zimbabwe,2020-06-01,203.0,4.0,29.0
17542,Zimbabwe,2020-06-02,206.0,4.0,29.0


In [18]:
# end for now