In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Read the iris dataset directly from the seaborn-data repository on GitHub.
IRIS_CSV_URL = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
df = pd.read_csv(IRIS_CSV_URL)

# Data Preprocessing

## Data Splitting

In [3]:
# Features are every column except the last one; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [4]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle deterministic, so the split (and anything computed from it) is
# reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
# Report the shape of each split. Each label names the array whose shape is
# printed next to it (the second line previously said "X_train" for X_test).
print("Dimensi X_train : ", X_train.shape)
print("Dimensi X_test : ", X_test.shape)
print("Dimensi y_train : ", y_train.shape)
print("Dimensi y_test : ", y_test.shape)

Dimensi X_train :  (120, 4)
Dimensi X_train :  (30, 4)
Dimensi y_train :  (120,)
Dimensi y_test :  (30,)


## Data Transformation - Standardisasi

In [6]:
from sklearn.preprocessing import StandardScaler
import numpy as np

In [7]:
# Small 4x2 example matrix whose two columns live on very different scales.
data = np.array(
    [
        (100, 0.001),
        (8, 0.05),
        (50, 0.005),
        (88, 0.07),
    ]
)

In [8]:
# Fit the scaler on the example matrix, then transform that same matrix.
ss = StandardScaler()
Scaled_data = ss.fit(data).transform(data)

In [9]:
# Print each matrix followed by its standard deviation. np.std is called
# without an axis, so it is computed over all elements of the matrix at once.
for heading, values in (
    ("Nilai Data Sebelum scaling : ", data),
    ("\nNilai Data setelah scaling : ", Scaled_data),
):
    print(heading)
    print(values)
    print("Nilai standar deviasi: ", np.std(values))

Nilai Data Sebelum scaling : 
[[1.0e+02 1.0e-03]
 [8.0e+00 5.0e-02]
 [5.0e+01 5.0e-03]
 [8.8e+01 7.0e-02]]
Nilai standar deviasi:  39.89949316955668

Nilai Data setelah scaling : 
[[ 1.06996056 -1.03748098]
 [-1.48682831  0.62929174]
 [-0.31959861 -0.9014179 ]
 [ 0.73646636  1.30960714]]
Nilai standar deviasi:  1.0


## Data Cleaning - Null Handling

### Simple Imputer

In [10]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Example student records; two of the IPK entries are missing (NaN).
Nama = [
    "Lusi", "Aisyah", "Rizky", "Dendi", "Bagus",
    "Ajeng", "Putri", "Yusuf", "Kulus", "Lusi",
]
Jurusan = [
    "Informatika", "Kimia", "Sipil", "Kimia", "Informatika",
    "Elektro", "Industri", "Sipil", "Kimia", "Informatika",
]
IPK = [2.5, 3.2, np.nan, 2.5, 3.1, 3.8, 3.1, np.nan, 3.8, 2.5]

df = pd.DataFrame({"Nama": Nama, "Jurusan": Jurusan, "IPK": IPK})

# Null count per column before imputation.
print("Jumlah record yang memiliki nilai null: ")
print(df.isna().sum())

# Fill the missing IPK entries using the imputer's 'mean' strategy.
imputer = SimpleImputer(strategy='mean')
imputed_ipk = imputer.fit_transform(df[["IPK"]])
df["IPK"] = imputed_ipk

# Null count per column after imputation.
print("\nJumlah null setelah menggunakan SimpleImputer: ")
print(df.isna().sum())

Jumlah record yang memiliki nilai null: 
Nama       0
Jurusan    0
IPK        2
dtype: int64

Jumlah null setelah menggunakan SimpleImputer: 
Nama       0
Jurusan    0
IPK        0
dtype: int64


In [11]:
# Show the per-column count of missing values (isnull is an alias of isna).
df.isnull().sum()

Nama       0
Jurusan    0
IPK        0
dtype: int64

### fillna

In [18]:
import pandas as pd
import numpy as np

# Build the columns for the dataframe; two IPK entries are missing (NaN).
Nama = ["Lusi","Aisyah","Rizky","Dendi","Bagus","Ajeng","Putri","Yusuf","Kulus","Lusi"]
Jurusan = ["Informatika","Kimia","Sipil","Kimia","Informatika","Elektro","Industri","Sipil","Kimia","Informatika"]
IPK = [2.5,3.2,np.nan,2.5,3.1,3.8,3.1,np.nan,3.8,2.5]

# Build the dataframe
df = pd.DataFrame(
   {
       "Nama":Nama,
       "Jurusan" : Jurusan,
       "IPK" : IPK
   }
)

print("Jumlah record yang memiliki nilai null: ")
print(df.isna().sum())

# Fill the missing IPK values with the column mean and assign the result back.
# Calling fillna(..., inplace=True) on the df["IPK"] selection is a chained
# in-place operation: pandas 2.x warns about it, and with Copy-on-Write it
# acts on a temporary object and leaves df unchanged.
df["IPK"] = df["IPK"].fillna(df["IPK"].mean())

print("\nJumlah null setelah menggunakan fungsi fillna(): ")
print(df.isna().sum())

Jumlah record yang memiliki nilai null: 
Nama       0
Jurusan    0
IPK        2
dtype: int64

Jumlah null setelah menggunakan fungsi fillna(): 
Nama       0
Jurusan    0
IPK        0
dtype: int64


### Dropna

In [19]:
import pandas as pd
import numpy as np

# Build the columns for the dataframe; four of the eight IPK entries are NaN.
Nama = ["Lusi", "Aisyah", "Rizky", "Dendi", "Bagus", "Ajeng", "Putri", "Yusuf"]
IPK = [2.5, 3.2, np.nan, np.nan, 3.1, 3.8, np.nan, np.nan]

# Build the dataframe
df = pd.DataFrame({"Nama": Nama, "IPK": IPK})

print("Sebelum : \n", df)

# Drop every column that contains at least one missing value.
# NOTE(review): with the column axis this removes the whole IPK column rather
# than the incomplete rows — confirm that column-wise dropping is intended.
df = df.dropna(axis="columns")

print("Sesudah : \n", df)

Sebelum : 
      Nama  IPK
0    Lusi  2.5
1  Aisyah  3.2
2   Rizky  NaN
3   Dendi  NaN
4   Bagus  3.1
5   Ajeng  3.8
6   Putri  NaN
7   Yusuf  NaN
Sesudah : 
      Nama
0    Lusi
1  Aisyah
2   Rizky
3   Dendi
4   Bagus
5   Ajeng
6   Putri
7   Yusuf


## Data Cleaning - Drop Duplicates

In [17]:
import pandas as pd

# Build the columns for the dataframe; the last record repeats the first one.
Nama = ["Lusi", "Aisyah", "Rizky", "Dendi", "Bagus", "Ajeng", "Putri", "Yusuf", "Kulus", "Lusi"]
IPK = [2.5, 3.2, 2.5, 2.5, 3.1, 3.8, 3.1, 2.4, 3.8, 2.5]

# Build the dataframe
df = pd.DataFrame({"Nama": Nama, "IPK": IPK})

# Rows flagged as repeats of an earlier, fully identical row.
duplicate_mask = df.duplicated()
print("Nilai Duplikat: ")
print(df[duplicate_mask])
print("Jumlah Nilai Duplikat:", duplicate_mask.sum())

# Keep only the first occurrence of each distinct row.
df = df.drop_duplicates()

print("\nSetelah menggunakan fungsi drop_duplicates()")
print("Jumlah Nilai Duplikat:", df.duplicated().sum())


Nilai Duplikat: 
   Nama  IPK
9  Lusi  2.5
Jumlah Nilai Duplikat: 1

Setelah menggunakan fungsi drop_duplicates()
Jumlah Nilai Duplikat: 0


## Data Encoding

### Ordinal Encoder

In [14]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Build the columns for the dataframe
Nama = ["Lusi", "Aisyah", "Rizky", "Dendi", "Bagus", "Ajeng", "Putri", "Yusuf", "Kulus", "Lusi"]
Jurusan = ["Informatika", "Kimia", "Sipil", "Kimia", "Informatika", "Elektro", "Industri", "Sipil", "Kimia", "Informatika"]

# Build the dataframe
df = pd.DataFrame({"Nama": Nama, "Jurusan": Jurusan})

# Encode the Jurusan column with OrdinalEncoder and store the codes in a new
# JurusanEnc column next to the original text.
encoder = OrdinalEncoder()
jurusan_codes = encoder.fit_transform(df[["Jurusan"]])
df["JurusanEnc"] = jurusan_codes

print(df.head())

     Nama      Jurusan  JurusanEnc
0    Lusi  Informatika         2.0
1  Aisyah        Kimia         3.0
2   Rizky        Sipil         4.0
3   Dendi        Kimia         3.0
4   Bagus  Informatika         2.0


### OneHotEncoder

In [15]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Build the columns for the dataframe
Nama = ["Lusi","Aisyah","Rizky","Dendi","Bagus","Ajeng","Putri","Yusuf","Kulus","Lusi"]
Jurusan = ["Informatika","Kimia","Sipil","Kimia","Informatika","Elektro","Industri","Sipil","Kimia","Informatika"]

# Build the dataframe
df = pd.DataFrame(
   {
       "Nama":Nama,
       "Jurusan" : Jurusan,
   }
)

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so OneHotEncoder(sparse=False) fails on current releases.
# Using the default (sparse) output and densifying it with .toarray() works
# regardless of which keyword the installed version accepts.
encoder = OneHotEncoder()

jurusanEnc = encoder.fit_transform(df[["Jurusan"]]).toarray()

# Wrap the dense matrix in a DataFrame (integer column labels) that shares
# df's index so the join below lines up row by row.
jurusanEnc = pd.DataFrame(jurusanEnc, index=df.index)

df = df.join(jurusanEnc)

print(df)

     Nama      Jurusan    0    1    2    3    4
0    Lusi  Informatika  0.0  0.0  1.0  0.0  0.0
1  Aisyah        Kimia  0.0  0.0  0.0  1.0  0.0
2   Rizky        Sipil  0.0  0.0  0.0  0.0  1.0
3   Dendi        Kimia  0.0  0.0  0.0  1.0  0.0
4   Bagus  Informatika  0.0  0.0  1.0  0.0  0.0
5   Ajeng      Elektro  1.0  0.0  0.0  0.0  0.0
6   Putri     Industri  0.0  1.0  0.0  0.0  0.0
7   Yusuf        Sipil  0.0  0.0  0.0  0.0  1.0
8   Kulus        Kimia  0.0  0.0  0.0  1.0  0.0
9    Lusi  Informatika  0.0  0.0  1.0  0.0  0.0


### get_dummies

In [16]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Build the columns for the dataframe. Nama is defined here so the cell does
# not depend on a variable left over from an earlier cell.
Nama = ["Lusi","Aisyah","Rizky","Dendi","Bagus","Ajeng","Putri","Yusuf","Kulus","Lusi"]
# NOTE(review): IPK is not used by the dataframe below; kept only so the
# name stays defined as before.
IPK = [2.5,3.2,2.5,2.5,3.1,3.8,3.1,2.4,3.8,2.5]
Jurusan = ["Informatika","Kimia","Sipil","Kimia","Informatika","Elektro","Industri","Sipil","Kimia","Informatika"]

# Build the dataframe
df = pd.DataFrame(
   {
       "Nama":Nama,
       "Jurusan" : Jurusan,
   }
)

# One indicator column per Jurusan value. dtype=int keeps the indicators as
# 0/1 integers on every pandas version (pandas >= 2.0 defaults to booleans).
enc = pd.get_dummies(df[["Jurusan"]], dtype=int)

# get_dummies already returns a DataFrame, so no re-wrapping is needed.
jurusanEnc = enc

df = df.join(jurusanEnc)

df.head()

Unnamed: 0,Nama,Jurusan,Jurusan_Elektro,Jurusan_Industri,Jurusan_Informatika,Jurusan_Kimia,Jurusan_Sipil
0,Lusi,Informatika,0,0,1,0,0
1,Aisyah,Kimia,0,0,0,1,0
2,Rizky,Sipil,0,0,0,0,1
3,Dendi,Kimia,0,0,0,1,0
4,Bagus,Informatika,0,0,1,0,0
