In [50]:
import seaborn as sns

### Import Dataset

In [51]:
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [52]:
df.shape

(344, 7)

## Data Cleaning

In [53]:
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

### Clean Null

**SimpleImputer**

In [54]:
from sklearn.impute import SimpleImputer

In [55]:
#Membuat Copy Dataset
dfSimpleImputer = df.copy()

In [56]:
imputer = SimpleImputer(strategy='mean') #Mean,Median,Mode,Constant

dfSimpleImputer['flipper_length_mm'] = imputer.fit_transform(dfSimpleImputer[['flipper_length_mm']])

In [57]:
dfSimpleImputer.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     0
body_mass_g           2
sex                  11
dtype: int64

**Fillna**

In [58]:
dfSimpleImputer['bill_length_mm'].fillna(dfSimpleImputer['bill_length_mm'].mean(),inplace=True)

In [59]:
dfSimpleImputer.isna().sum()

species               0
island                0
bill_length_mm        0
bill_depth_mm         2
flipper_length_mm     0
body_mass_g           2
sex                  11
dtype: int64

**Automasi Pengisian Null**
- Tipe Data Float => Median
- Tipe Data Integer => Mean (Rata - Rata)
- Tipe Data Object => Modus

In [60]:
#Membuat Copy Dataset
dfImpute = df.copy()
dfImpute.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [61]:
dfImpute.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [62]:
# Pilih Kolom yang memiliki nilai null dan tipe data float
dfImputeFloat = dfImpute.select_dtypes(include='float64').columns

# Pilih Kolom yang memiliki nilai null dan tipe data integer
dfImputeInteger = dfImpute.select_dtypes(include='int64').columns

# Pilih Kolom yang memiliki nilai null dan tipe data object
dfImputeObject = dfImpute.select_dtypes(include='object').columns

In [63]:
# Mengisi Nilai Null tipe data float dengan median
for i in dfImputeFloat:
  dfImpute[i].fillna(dfImpute[i].median(),inplace=True)

# Mengisi Nilai Null tipe data int dengan rata - rata
for i in dfImputeInteger:
  dfImpute[i].fillna(dfImpute[i].mean(),inplace=True)

# Mengisi Nilai Null tipe data object dengan modus
for i in dfImputeObject:
  dfImpute[i].fillna(dfImpute[i].mode()[0],inplace=True)

**Membuat Null Secara Manual**

In [64]:
dfImpute.loc[0,"sex"] = None

In [65]:
dfImpute.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,44.45,17.3,197.0,4050.0,Male
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [66]:
import numpy as np

dfImpute.loc[0,"island"] = np.nan

dfImpute.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,,39.1,18.7,181.0,3750.0,
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,44.45,17.3,197.0,4050.0,Male
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


### Atasi Duplikat

In [67]:
dfDuplikat = df.copy()
dfDuplikat.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [68]:
#Periksa adanya duplikat
print(f'Jumlah Duplikat dalam dataset : {dfDuplikat.duplicated().sum()}')

Jumlah Duplikat dalam dataset : 0


**Membuat Duplikat**

In [69]:
dfDuplikat.iloc[0,:] = dfDuplikat.iloc[1,:]
dfDuplikat.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [70]:
#Periksa adanya duplikat
print(f'Jumlah Duplikat dalam dataset : {dfDuplikat.duplicated().sum()}')

Jumlah Duplikat dalam dataset : 1


**Hapus Duplikat**

In [71]:
dfDuplikat.drop_duplicates(inplace=True)
print(f'Jumlah Duplikat dalam dataset : {dfDuplikat.duplicated().sum()}')

Jumlah Duplikat dalam dataset : 0


In [72]:
dfDuplikat.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


### Mengubah Format Data

In [73]:
df['flipper_length_mm'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 344 entries, 0 to 343
Series name: flipper_length_mm
Non-Null Count  Dtype  
--------------  -----  
342 non-null    float64
dtypes: float64(1)
memory usage: 2.8 KB


**Ubah Tipe Data**

In [74]:
dfImpute['flipper_length_mm'] = dfImpute['flipper_length_mm'].astype('int64')

In [75]:
dfImpute['flipper_length_mm'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 344 entries, 0 to 343
Series name: flipper_length_mm
Non-Null Count  Dtype
--------------  -----
344 non-null    int64
dtypes: int64(1)
memory usage: 2.8 KB


## Data Transforming

### **Normalisasi**

In [76]:
from sklearn.preprocessing import MinMaxScaler

In [77]:
dfMinMax = dfImpute.copy()
dfMinMax.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,,39.1,18.7,181,3750.0,
1,Adelie,Torgersen,39.5,17.4,186,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195,3250.0,Female
3,Adelie,Torgersen,44.45,17.3,197,4050.0,Male
4,Adelie,Torgersen,36.7,19.3,193,3450.0,Female


In [78]:
minmax = MinMaxScaler() #Buat Objek scaler

dfMinMax['bill_length_mm'] = minmax.fit_transform(dfMinMax[['bill_length_mm']])

In [79]:
print('Validasi MinMaxScaler')
print(f'Nilai Min : ', dfMinMax['bill_length_mm'].min())
print(f'Nilai Max : ', dfMinMax['bill_length_mm'].max())

Validasi MinMaxScaler
Nilai Min :  0.0
Nilai Max :  0.9999999999999998


### **Standarisasi**

In [80]:
from sklearn.preprocessing import StandardScaler

In [81]:
dfStandard = dfImpute.copy()
dfStandard.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,,39.1,18.7,181,3750.0,
1,Adelie,Torgersen,39.5,17.4,186,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195,3250.0,Female
3,Adelie,Torgersen,44.45,17.3,197,4050.0,Male
4,Adelie,Torgersen,36.7,19.3,193,3450.0,Female


In [82]:
ss = StandardScaler() #Buat Objek scaler

dfStandard['bill_length_mm'] = ss.fit_transform(dfStandard[['bill_length_mm']])

In [83]:
print('Validasi Standard Scaler')
print(f'Nilai Rata-Rata       : ', dfStandard['bill_length_mm'].mean())
print(f'Nilai Standar Deviasi : ', dfStandard['bill_length_mm'].std())

Validasi Standard Scaler
Nilai Rata-Rata       :  -7.435912350977793e-16
Nilai Standar Deviasi :  1.0014566650110446


## Data Encoding

### **Ordinal Encoding**

In [84]:
from sklearn.preprocessing import OrdinalEncoder

In [85]:
oe = OrdinalEncoder()

df['species'] = oe.fit_transform(df[['species']])

In [86]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0.0,Torgersen,39.1,18.7,181.0,3750.0,Male
1,0.0,Torgersen,39.5,17.4,186.0,3800.0,Female
2,0.0,Torgersen,40.3,18.0,195.0,3250.0,Female
3,0.0,Torgersen,,,,,
4,0.0,Torgersen,36.7,19.3,193.0,3450.0,Female


### **One Hot Encoding**

**One Hot Encoder**

In [87]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [88]:
OneHot = OneHotEncoder(sparse_output=False)

dfEnc = OneHot.fit_transform(df[['island']])

#Ubah Ke Dataframe
dfEnc = pd.DataFrame(
    dfEnc
)

In [89]:
dfEnc.head()

Unnamed: 0,0,1,2
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [90]:
#Buat Kolom baru
df['Torgersen'] = dfEnc[0]
df['Biscoe'] = dfEnc[1]
df['Dream'] = dfEnc[2]

In [91]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Torgersen,Biscoe,Dream
0,0.0,Torgersen,39.1,18.7,181.0,3750.0,Male,0.0,0.0,1.0
1,0.0,Torgersen,39.5,17.4,186.0,3800.0,Female,0.0,0.0,1.0
2,0.0,Torgersen,40.3,18.0,195.0,3250.0,Female,0.0,0.0,1.0
3,0.0,Torgersen,,,,,,0.0,0.0,1.0
4,0.0,Torgersen,36.7,19.3,193.0,3450.0,Female,0.0,0.0,1.0


**get_dummies (Pandas)**

In [92]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Torgersen,Biscoe,Dream
0,0.0,Torgersen,39.1,18.7,181.0,3750.0,Male,0.0,0.0,1.0
1,0.0,Torgersen,39.5,17.4,186.0,3800.0,Female,0.0,0.0,1.0
2,0.0,Torgersen,40.3,18.0,195.0,3250.0,Female,0.0,0.0,1.0
3,0.0,Torgersen,,,,,,0.0,0.0,1.0
4,0.0,Torgersen,36.7,19.3,193.0,3450.0,Female,0.0,0.0,1.0


In [93]:
enc = pd.get_dummies(df[['island']]) 

df = df.join(enc.astype('int64')) #Gabungkan dan Ubah menjadi integer

df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Torgersen,Biscoe,Dream,island_Biscoe,island_Dream,island_Torgersen
0,0.0,Torgersen,39.1,18.7,181.0,3750.0,Male,0.0,0.0,1.0,0,0,1
1,0.0,Torgersen,39.5,17.4,186.0,3800.0,Female,0.0,0.0,1.0,0,0,1
2,0.0,Torgersen,40.3,18.0,195.0,3250.0,Female,0.0,0.0,1.0,0,0,1
3,0.0,Torgersen,,,,,,0.0,0.0,1.0,0,0,1
4,0.0,Torgersen,36.7,19.3,193.0,3450.0,Female,0.0,0.0,1.0,0,0,1


## Data Splitting

In [94]:
from sklearn.model_selection import train_test_split

In [95]:
#Pisah Target dan Feature
X = df.drop('species',axis=1) #Feature
y = df['species'] #Target

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2) # Train => 80% & Test => 20%

In [97]:
print("Dimensi X_train : ", X_train.shape)
print("Dimensi X_train : ", X_test.shape)
print("Dimensi y_train : ", y_train.shape)
print("Dimensi y_test : ", y_test.shape)

Dimensi X_train :  (275, 12)
Dimensi X_train :  (69, 12)
Dimensi y_train :  (275,)
Dimensi y_test :  (69,)
