# Data Cleaning Covid 19 India

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
# Load the raw dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
data = pd.read_csv('covid_19_india.csv')

In [51]:
# Show the raw frame. pandas truncates the middle rows, so both the first and
# the last records are visible -- the last ones carry the '-' placeholder.
data

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,1,30/01/20,6:00 PM,Kerala,1,0,0,0,1
1,2,31/01/20,6:00 PM,Kerala,1,0,0,0,1
2,3,01/02/20,6:00 PM,Kerala,2,0,0,0,2
3,4,02/02/20,6:00 PM,Kerala,3,0,0,0,3
4,5,03/02/20,6:00 PM,Kerala,3,0,0,0,3
...,...,...,...,...,...,...,...,...,...
9286,9287,09/12/20,8:00 AM,Telengana,-,-,266120,1480,275261
9287,9288,09/12/20,8:00 AM,Tripura,-,-,32169,373,32945
9288,9289,09/12/20,8:00 AM,Uttarakhand,-,-,72435,1307,79141
9289,9290,09/12/20,8:00 AM,Uttar Pradesh,-,-,528832,7967,558173


##### Dari tabel data tersebut terlihat bahwa terdapat baris dengan data kosong yang ditandai dengan '-' pada kolom 'ConfirmedIndianNational' dan 'ConfirmedForeignNational'.

In [39]:
# Print each column's dtype and non-null count to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9291 entries, 0 to 9290
Data columns (total 9 columns):
Sno                         9291 non-null int64
Date                        9291 non-null object
Time                        9291 non-null object
State/UnionTerritory        9291 non-null object
ConfirmedIndianNational     9291 non-null object
ConfirmedForeignNational    9291 non-null object
Cured                       9291 non-null int64
Deaths                      9291 non-null int64
Confirmed                   9291 non-null int64
dtypes: int64(4), object(5)
memory usage: 653.4+ KB


##### Seperti yang dapat dilihat, info tabel tidak menunjukkan adanya data null, tetapi jika dicek di dalam tabel ada data kosong yang diisi dengan simbol '-'.

In [40]:
# Element-wise missing-value mask for the raw frame (True where a cell is NaN/None).
data.isnull()

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
9286,False,False,False,False,False,False,False,False,False
9287,False,False,False,False,False,False,False,False,False
9288,False,False,False,False,False,False,False,False,False
9289,False,False,False,False,False,False,False,False,False


In [41]:
# Count the missing values pandas recognises in each column.
data.isna().sum()

Sno                         0
Date                        0
Time                        0
State/UnionTerritory        0
ConfirmedIndianNational     0
ConfirmedForeignNational    0
Cured                       0
Deaths                      0
Confirmed                   0
dtype: int64

In [49]:
# Summary statistics; only the numeric (int64) columns are included.
data.describe()

Unnamed: 0,Sno,Cured,Deaths,Confirmed
count,9291.0,9291.0,9291.0,9291.0
mean,4646.0,78632.66,1487.620385,91839.78
std,2682.225009,193110.2,4713.81369,216601.4
min,1.0,0.0,0.0,0.0
25%,2323.5,152.0,2.0,538.5
50%,4646.0,4308.0,66.0,6832.0
75%,6968.5,57726.5,926.5,78856.0
max,9291.0,1737080.0,47827.0,1859367.0


In [43]:
# List the distinct raw values to expose any non-numeric placeholder.
data.loc[:, 'ConfirmedIndianNational'].unique()

array(['1', '2', '3', '0', '6', '7', '8', '9', '4', '15', '5', '17', '10',
       '11', '19', '14', '22', '32', '12', '23', '24', '36', '25', '39',
       '44', '18', '16', '26', '49', '33', '60', '13', '28', '45', '64',
       '21', '29', '71', '30', '37', '87', '86', '41', '101', '125', '34',
       '35', '42', '55', '110', '20', '121', '40', '129', '127', '38',
       '168', '177', '52', '46', '54', '-'], dtype=object)

In [45]:
# Same check for the foreign-national column: list its distinct raw values.
data.loc[:, 'ConfirmedForeignNational'].unique()

array(['0', '1', '2', '14', '3', '9', '7', '11', '10', '8', '6', '-'],
      dtype=object)

##### Oleh karena itu, langkah berikutnya adalah mengubah data kosong yang ditandai dengan '-' menjadi 'NaN'.

In [55]:
# Clean a copy so the raw `data` frame stays untouched; a plain `df = data`
# only aliases it, so every edit to `df` would also rewrite `data`.
df = data.copy()
# '-' is the dataset's placeholder for a missing count. Replace it with a real
# missing value (np.nan) instead of the string 'NaN', which pandas would still
# treat as non-null in isna()/info()/dropna(). The other values stay as strings.
df['ConfirmedIndianNational'] = data['ConfirmedIndianNational'].replace('-', np.nan)

In [56]:
# Replace the '-' placeholder with a real missing value (np.nan) rather than the
# string 'NaN', so that isna()/info()/dropna() recognise these cells as missing.
# The other values stay as strings.
df['ConfirmedForeignNational'] = data['ConfirmedForeignNational'].replace('-', np.nan)

In [57]:
# Display the frame after the placeholder replacement to check the last rows.
df

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,1,30/01/20,6:00 PM,Kerala,1,0,0,0,1
1,2,31/01/20,6:00 PM,Kerala,1,0,0,0,1
2,3,01/02/20,6:00 PM,Kerala,2,0,0,0,2
3,4,02/02/20,6:00 PM,Kerala,3,0,0,0,3
4,5,03/02/20,6:00 PM,Kerala,3,0,0,0,3
...,...,...,...,...,...,...,...,...,...
9286,9287,09/12/20,8:00 AM,Telengana,,,266120,1480,275261
9287,9288,09/12/20,8:00 AM,Tripura,,,32169,373,32945
9288,9289,09/12/20,8:00 AM,Uttarakhand,,,72435,1307,79141
9289,9290,09/12/20,8:00 AM,Uttar Pradesh,,,528832,7967,558173


#### Tahap selanjutnya yaitu menghapus kolom yang dianggap tidak dibutuhkan. Dalam kasus ini, kolom yang di-drop (dihapus) adalah 'Sno' dan 'Time'.

In [58]:
data_prepos = df
# Build a new frame without the unneeded columns instead of dropping in place:
# `data_prepos` is then independent of `df`, and the cell can be re-run safely
# (an in-place drop raises KeyError the second time, once the columns are gone).
data_prepos = df.drop(columns=['Sno', 'Time'])

#### Berikut adalah hasil akhirnya. Pemakaian simbol '-' untuk mengisi data yang kosong sangat jarang ditemui; biasanya data kosong diisi dengan 'NaN' atau 'null'. Penghapusan kolom dapat dilakukan bila kolom tersebut dianggap tidak diperlukan dalam pengolahan data nantinya.

In [59]:
# Display the final preprocessed frame (without the 'Sno' and 'Time' columns).
data_prepos

Unnamed: 0,Date,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,30/01/20,Kerala,1,0,0,0,1
1,31/01/20,Kerala,1,0,0,0,1
2,01/02/20,Kerala,2,0,0,0,2
3,02/02/20,Kerala,3,0,0,0,3
4,03/02/20,Kerala,3,0,0,0,3
...,...,...,...,...,...,...,...
9286,09/12/20,Telengana,,,266120,1480,275261
9287,09/12/20,Tripura,,,32169,373,32945
9288,09/12/20,Uttarakhand,,,72435,1307,79141
9289,09/12/20,Uttar Pradesh,,,528832,7967,558173
