## IMPORT PACKAGE

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Untuk Data Split
from sklearn.model_selection import train_test_split

# Untuk Data Transform
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Untuk Data Cleaning
from sklearn.impute import SimpleImputer

# Untuk Data Encoding
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

## AMBIL DATASET "covid_19_india.csv" SERTA INFONYA 

In [9]:
# Load the COVID-19 India case data from a CSV file in the working directory.
df = pd.read_csv('covid_19_india.csv') # read the CSV into a DataFrame
df.head(10) # display the first 10 records

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1
1,2,2020-01-31,6:00 PM,Kerala,1,0,0,0,1
2,3,2020-02-01,6:00 PM,Kerala,2,0,0,0,2
3,4,2020-02-02,6:00 PM,Kerala,3,0,0,0,3
4,5,2020-02-03,6:00 PM,Kerala,3,0,0,0,3
5,6,2020-02-04,6:00 PM,Kerala,3,0,0,0,3
6,7,2020-02-05,6:00 PM,Kerala,3,0,0,0,3
7,8,2020-02-06,6:00 PM,Kerala,3,0,0,0,3
8,9,2020-02-07,6:00 PM,Kerala,3,0,0,0,3
9,10,2020-02-08,6:00 PM,Kerala,3,0,0,0,3


In [15]:
df.info() # inspect the column names, non-null counts and dtypes of the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18110 entries, 0 to 18109
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Sno                       18110 non-null  int64 
 1   Date                      18110 non-null  object
 2   Time                      18110 non-null  object
 3   State/UnionTerritory      18110 non-null  object
 4   ConfirmedIndianNational   18110 non-null  object
 5   ConfirmedForeignNational  18110 non-null  object
 6   Cured                     18110 non-null  int64 
 7   Deaths                    18110 non-null  int64 
 8   Confirmed                 18110 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 1.2+ MB


In [14]:
df.isna().sum() # count the NaN values in each column

Sno                         0
Date                        0
Time                        0
State/UnionTerritory        0
ConfirmedIndianNational     0
ConfirmedForeignNational    0
Cured                       0
Deaths                      0
Confirmed                   0
dtype: int64

## DATA SPLIT

In [12]:
# Features: every column of df except 'Sno' and the target column.
data = df[['Date', 'Time', 'ConfirmedIndianNational', 'ConfirmedForeignNational', 'Cured', 'Deaths', 'Confirmed']]
# Target label: the state / union territory of each record.
label = df['State/UnionTerritory']

# Split the data together with its labels, holding out 30% of the rows for testing.
# random_state pins the shuffle so that re-running the notebook on a fresh kernel
# reproduces exactly the same train/test partition.
data_train, data_test, label_train, label_test = train_test_split(
    data, label, test_size=.3, random_state=42
)

In [13]:
# Report the number of rows in each part of the split; every data part should
# have the same length as its label part.
split_sizes = [
    f'Panjang data training {len(data_train)}',
    f'Panjang label training {len(label_train)}',
    f'Panjang data testing {len(data_test)}',
    f'Panjang label testing {len(label_test)}',
]
print('\n'.join(split_sizes))

Panjang data training 12677
Panjang label training 12677
Panjang data testing 5433
Panjang label testing 5433


## NORMALISASI

In [18]:
df_normalisasi = df.copy() # work on a copy of the dataset so df itself is not modified

In [21]:
# Min-max normalisation of the three count columns.
scaled_columns = ['Cured', 'Deaths', 'Confirmed']

scaler = MinMaxScaler()  # declare the min-max scaler

# fit_transform returns a plain array, so wrap it in a DataFrame and
# restore the attribute names in the same step.
dataset_normalisasi = pd.DataFrame(
    scaler.fit_transform(df_normalisasi[scaled_columns]),
    columns=scaled_columns,
)

dataset_normalisasi.head(5)

Unnamed: 0,Cured,Deaths,Confirmed
0,0.0,0.0,1.571477e-07
1,0.0,0.0,1.571477e-07
2,0.0,0.0,3.142953e-07
3,0.0,0.0,4.71443e-07
4,0.0,0.0,4.71443e-07


## STANDARDISASI

In [26]:
# Attributes of dtype object; they are removed before scaling.
object_columns = [
    'Date',
    'Time',
    'State/UnionTerritory',
    'ConfirmedIndianNational',
    'ConfirmedForeignNational',
]

# Two independent copies of the dataset without the object-typed attributes:
# one is fed to the scaler, the other is kept unscaled for comparison.
dataset_standarirasi = df.drop(columns=object_columns)
dataset_pembanding = df.drop(columns=object_columns)

# Standardisation
standart_std = StandardScaler()
scaled_data = standart_std.fit_transform(dataset_standarirasi)

# BEFORE: np.std on the DataFrame is reported per column (see the printed output).
std_before = np.std(dataset_pembanding)
print(
    'Dataset sebelum scalling:',
    dataset_pembanding.head(6),
    f'Nilai standar deviasi:\n{std_before}',
    sep='\n',
    end='\n\n',
)

# AFTER: scaled_data is a NumPy array, so np.std without an axis yields one
# value computed over all elements rather than one value per column.
std_after = np.std(scaled_data)
print(
    'Dataset setelah scalling:',
    scaled_data[:6],
    f'Nilai standar deviasi: {std_after}',
    sep='\n',
)

Dataset sebelum scalling:
   Sno  Cured  Deaths  Confirmed
0    1      0       0          1
1    2      0       0          1
2    3      0       0          2
3    4      0       0          3
4    5      0       0          3
5    6      0       0          3
Nilai standar deviasi:
Sno            5227.906680
Cured        614873.917633
Deaths        10918.774942
Confirmed    656130.757064
dtype: float64

Dataset setelah scalling:
[[-1.73195517 -0.45316204 -0.37114074 -0.45879636]
 [-1.73176389 -0.45316204 -0.37114074 -0.45879636]
 [-1.73157261 -0.45316204 -0.37114074 -0.45879483]
 [-1.73138133 -0.45316204 -0.37114074 -0.45879331]
 [-1.73119005 -0.45316204 -0.37114074 -0.45879331]
 [-1.73099876 -0.45316204 -0.37114074 -0.45879331]]
Nilai standar deviasi: 0.9999999999999999


## AMBIL DATASET "covid_vaccine_statewise.csv" (DIBACA DARI FILE "dataset_rusak.csv") SERTA INFONYA

In [70]:
df2 = pd.read_csv('dataset_rusak.csv') # read the damaged ("rusak") dataset
df2.head(10) # display the first 10 records

Unnamed: 0,Updated On,State,Total Doses Administered,Sessions,Sites,First Dose Administered,Second Dose Administered,Male (Doses Administered),Female (Doses Administered),Transgender (Doses Administered),...,18-44 Years (Doses Administered),45-60 Years (Doses Administered),60+ Years (Doses Administered),18-44 Years(Individuals Vaccinated),45-60 Years(Individuals Vaccinated),60+ Years(Individuals Vaccinated),Male(Individuals Vaccinated),Female(Individuals Vaccinated),Transgender(Individuals Vaccinated),Total Individuals Vaccinated
0,16/01/2021,India,48276.0,3455.0,2957.0,48276.0,0.0,,,,...,,,,,,,23757.0,24517.0,2.0,48276.0
1,17/01/2021,India,58604.0,8532.0,4954.0,58604.0,0.0,,,,...,,,,,,,27348.0,31252.0,4.0,58604.0
2,18/01/2021,India,99449.0,13611.0,6583.0,99449.0,0.0,,,,...,,,,,,,41361.0,58083.0,5.0,99449.0
3,19/01/2021,India,195525.0,17855.0,7951.0,195525.0,0.0,,,,...,,,,,,,81901.0,113613.0,11.0,195525.0
4,20/01/2021,India,251280.0,25472.0,10504.0,251280.0,0.0,,,,...,,,,,,,98111.0,153145.0,24.0,251280.0
5,21/01/2021,India,365965.0,32226.0,12600.0,365965.0,0.0,,,,...,,,,,,,132784.0,233143.0,38.0,365965.0
6,22/01/2021,India,549381.0,36988.0,14115.0,549381.0,0.0,,,,...,,,,,,,193899.0,355402.0,80.0,549381.0
7,23/01/2021,India,759008.0,43076.0,15605.0,759008.0,0.0,,,,...,,,,,,,267856.0,491049.0,103.0,759008.0
8,24/01/2021,India,835058.0,49851.0,18111.0,835058.0,0.0,,,,...,,,,,,,296283.0,538647.0,128.0,835058.0
9,25/01/2021,India,1277104.0,55151.0,19682.0,1277104.0,0.0,,,,...,,,,,,,444137.0,832766.0,201.0,1277104.0


In [71]:
dataset_rusak = df2.copy() # make a duplicate of the dataset; df2 keeps the data as loaded

In [72]:
dataset_rusak.info() # inspect the column names, non-null counts and dtypes of the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8016 entries, 0 to 8015
Data columns (total 24 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Updated On                           8016 non-null   object 
 1   State                                8016 non-null   object 
 2   Total Doses Administered             7792 non-null   float64
 3   Sessions                             7792 non-null   float64
 4    Sites                               7792 non-null   float64
 5   First Dose Administered              7792 non-null   float64
 6   Second Dose Administered             7792 non-null   float64
 7   Male (Doses Administered)            7475 non-null   float64
 8   Female (Doses Administered)          7475 non-null   float64
 9   Transgender (Doses Administered)     7475 non-null   float64
 10   Covaxin (Doses Administered)        7792 non-null   float64
 11  CoviShield (Doses Administered

In [73]:
dataset_rusak.isna().sum() # count the NaN values in each column

Updated On                                0
State                                     0
Total Doses Administered                224
Sessions                                224
 Sites                                  224
First Dose Administered                 224
Second Dose Administered                224
Male (Doses Administered)               541
Female (Doses Administered)             541
Transgender (Doses Administered)        541
 Covaxin (Doses Administered)           224
CoviShield (Doses Administered)         224
Sputnik V (Doses Administered)         4972
AEFI                                   2463
18-44 Years (Doses Administered)       6300
45-60 Years (Doses Administered)       6300
60+ Years (Doses Administered)         6300
18-44 Years(Individuals Vaccinated)    4182
45-60 Years(Individuals Vaccinated)    4181
60+ Years(Individuals Vaccinated)      4181
Male(Individuals Vaccinated)           7699
Female(Individuals Vaccinated)         7699
Transgender(Individuals Vaccinat

In [74]:
dataset_rusak.duplicated().sum() # count the duplicated records

171

## DATA CLEANING

In [75]:
# membuat variabel yang berisi strategi simple imputer
mean = SimpleImputer(strategy='mean')
median = SimpleImputer(strategy='median')
modus = SimpleImputer(strategy='most_frequent')

In [77]:
# mengisi data NaN dari tiap tiap record
dataset_rusak['Total Doses Administered'] = mean.fit_transform(df2[['Total Doses Administered']])
dataset_rusak['Sessions'] = mean.fit_transform(df2[['Sessions']])
dataset_rusak[' Sites '] = mean.fit_transform(df2[[' Sites ']])
dataset_rusak['First Dose Administered'] = mean.fit_transform(df2[['First Dose Administered']])
dataset_rusak['Second Dose Administered'] = mean.fit_transform(df2[['Second Dose Administered']])
dataset_rusak['Male (Doses Administered)'] = mean.fit_transform(df2[['Male (Doses Administered)']])
dataset_rusak['Female (Doses Administered)'] = mean.fit_transform(df2[['Female (Doses Administered)']])
dataset_rusak['Transgender (Doses Administered)'] = mean.fit_transform(df2[['Transgender (Doses Administered)']])
dataset_rusak[' Covaxin (Doses Administered)'] = mean.fit_transform(df2[[' Covaxin (Doses Administered)']])
dataset_rusak['CoviShield (Doses Administered)'] = mean.fit_transform(df2[['Total Doses Administered']])
dataset_rusak['Sputnik V (Doses Administered)'] = mean.fit_transform(df2[['CoviShield (Doses Administered)']])
dataset_rusak['AEFI'] = mean.fit_transform(df2[['AEFI']])
dataset_rusak['18-44 Years (Doses Administered)'] = mean.fit_transform(df2[['18-44 Years (Doses Administered)']])
dataset_rusak['45-60 Years (Doses Administered)'] = mean.fit_transform(df2[['45-60 Years (Doses Administered)']])
dataset_rusak['60+ Years (Doses Administered)'] = mean.fit_transform(df2[['60+ Years (Doses Administered)']])
dataset_rusak['18-44 Years(Individuals Vaccinated)'] = mean.fit_transform(df2[['18-44 Years(Individuals Vaccinated)']])
dataset_rusak['45-60 Years(Individuals Vaccinated)'] = mean.fit_transform(df2[['45-60 Years(Individuals Vaccinated)']])
dataset_rusak['60+ Years(Individuals Vaccinated)'] = mean.fit_transform(df2[['60+ Years(Individuals Vaccinated)']])
dataset_rusak['Male(Individuals Vaccinated)'] = mean.fit_transform(df2[['Male(Individuals Vaccinated)']])
dataset_rusak['Female(Individuals Vaccinated)'] = mean.fit_transform(df2[['Female(Individuals Vaccinated)']])
dataset_rusak['Transgender(Individuals Vaccinated)'] = mean.fit_transform(df2[['Transgender(Individuals Vaccinated)']])
dataset_rusak['Total Individuals Vaccinated'] = mean.fit_transform(df2[['Total Individuals Vaccinated']])

# cek jumlah NaN tiap tiap atribut
dataset_rusak.isna().sum()

Updated On                             0
State                                  0
Total Doses Administered               0
Sessions                               0
 Sites                                 0
First Dose Administered                0
Second Dose Administered               0
Male (Doses Administered)              0
Female (Doses Administered)            0
Transgender (Doses Administered)       0
 Covaxin (Doses Administered)          0
CoviShield (Doses Administered)        0
Sputnik V (Doses Administered)         0
AEFI                                   0
18-44 Years (Doses Administered)       0
45-60 Years (Doses Administered)       0
60+ Years (Doses Administered)         0
18-44 Years(Individuals Vaccinated)    0
45-60 Years(Individuals Vaccinated)    0
60+ Years(Individuals Vaccinated)      0
Male(Individuals Vaccinated)           0
Female(Individuals Vaccinated)         0
Transgender(Individuals Vaccinated)    0
Total Individuals Vaccinated           0
dtype: int64

In [79]:
# menghapus data duplikat 
# Remove the duplicated records from the working copy.
dataset_rusak.drop_duplicates(inplace=True)

# Compare the duplicate count of the dataset as loaded (df2) with the count
# that remains in the cleaned working copy.
duplicates_before = df2.duplicated().sum()
duplicates_after = dataset_rusak.duplicated().sum()

print(f'Jumlah data duplikat sebelum dibersihkan : {duplicates_before}')
print(f'Jumlah data duplikat setelah dibersihkan : {duplicates_after}')

Jumlah data duplikat sebelum dibersihkan : 171
Jumlah data duplikat setelah dibersihkan : 0


## ENCODING


In [89]:
# mengcopy dataset
# copy the cleaned dataset: one independent copy per encoding technique
dataset_ordinal = dataset_rusak.copy()
dataset_one_hot = dataset_rusak.copy()

In [90]:
# cek isi dari atribut State secara unique
# list the unique values of the State attribute
dataset_rusak['State'].unique()

array(['India', 'Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Dadra and Nagar Haveli and Daman and Diu',
       'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh',
       'Jammu and Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh',
       'Lakshadweep', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
       'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry',
       'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana',
       'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal'],
      dtype=object)

#### ORDINAL

In [91]:
# Ordinal encoding: replace each State name with an integer code.
ordinal = OrdinalEncoder()

# The encoder expects a 2-D input, hence the double brackets around 'State'.
state_codes = ordinal.fit_transform(dataset_ordinal[['State']])
dataset_ordinal['State'] = state_codes.astype('int64')

# check the result
dataset_ordinal.head()

Unnamed: 0,Updated On,State,Total Doses Administered,Sessions,Sites,First Dose Administered,Second Dose Administered,Male (Doses Administered),Female (Doses Administered),Transgender (Doses Administered),...,18-44 Years (Doses Administered),45-60 Years (Doses Administered),60+ Years (Doses Administered),18-44 Years(Individuals Vaccinated),45-60 Years(Individuals Vaccinated),60+ Years(Individuals Vaccinated),Male(Individuals Vaccinated),Female(Individuals Vaccinated),Transgender(Individuals Vaccinated),Total Individuals Vaccinated
0,16/01/2021,13,48276.0,3455.0,2957.0,48276.0,0.0,3953314.0,3454847.0,1268.463144,...,9726643.0,8340992.0,6366581.0,2039368.0,4259870.0,3837511.0,23757.0,24517.0,2.0,48276.0
1,17/01/2021,13,58604.0,8532.0,4954.0,58604.0,0.0,3953314.0,3454847.0,1268.463144,...,9726643.0,8340992.0,6366581.0,2039368.0,4259870.0,3837511.0,27348.0,31252.0,4.0,58604.0
2,18/01/2021,13,99449.0,13611.0,6583.0,99449.0,0.0,3953314.0,3454847.0,1268.463144,...,9726643.0,8340992.0,6366581.0,2039368.0,4259870.0,3837511.0,41361.0,58083.0,5.0,99449.0
3,19/01/2021,13,195525.0,17855.0,7951.0,195525.0,0.0,3953314.0,3454847.0,1268.463144,...,9726643.0,8340992.0,6366581.0,2039368.0,4259870.0,3837511.0,81901.0,113613.0,11.0,195525.0
4,20/01/2021,13,251280.0,25472.0,10504.0,251280.0,0.0,3953314.0,3454847.0,1268.463144,...,9726643.0,8340992.0,6366581.0,2039368.0,4259870.0,3837511.0,98111.0,153145.0,24.0,251280.0


In [92]:
# cek isi dari atribut State secara unique
# list the unique values of the State attribute in dataset_ordinal
dataset_ordinal['State'].unique()

array([13,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36], dtype=int64)

#### ONE-HOT

In [93]:
# One-hot encoding: build one indicator column per State value.
# get_dummies already returns a DataFrame, with columns named 'State_<value>'.
temp_dataset = pd.get_dummies(dataset_one_hot[['State']])

# Replace the original State attribute with the indicator columns.
dataset_one_hot = dataset_one_hot.drop(columns='State').join(temp_dataset)

dataset_one_hot.head(5)

Unnamed: 0,Updated On,Total Doses Administered,Sessions,Sites,First Dose Administered,Second Dose Administered,Male (Doses Administered),Female (Doses Administered),Transgender (Doses Administered),Covaxin (Doses Administered),...,State_Puducherry,State_Punjab,State_Rajasthan,State_Sikkim,State_Tamil Nadu,State_Telangana,State_Tripura,State_Uttar Pradesh,State_Uttarakhand,State_West Bengal
0,16/01/2021,48276.0,3455.0,2957.0,48276.0,0.0,3953314.0,3454847.0,1268.463144,579.0,...,0,0,0,0,0,0,0,0,0,0
1,17/01/2021,58604.0,8532.0,4954.0,58604.0,0.0,3953314.0,3454847.0,1268.463144,635.0,...,0,0,0,0,0,0,0,0,0,0
2,18/01/2021,99449.0,13611.0,6583.0,99449.0,0.0,3953314.0,3454847.0,1268.463144,1299.0,...,0,0,0,0,0,0,0,0,0,0
3,19/01/2021,195525.0,17855.0,7951.0,195525.0,0.0,3953314.0,3454847.0,1268.463144,3017.0,...,0,0,0,0,0,0,0,0,0,0
4,20/01/2021,251280.0,25472.0,10504.0,251280.0,0.0,3953314.0,3454847.0,1268.463144,3946.0,...,0,0,0,0,0,0,0,0,0,0
