In [1]:
import pandas as pd
import numpy as np

# Handling Missing Data

In [3]:
# Small sample of student records used to demonstrate missing-data handling.
# np.nan marks the missing entries: one Age, one Score and one Grade.
data = dict(
    Name=["Michael", "Andreas", "Alice", "Bobby", "Watson"],
    Age=[20, np.nan, 22, 19, 18],
    Score=[85, 90, np.nan, 78, 92],
    Grade=["A", "B", np.nan, "C", "A"],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Score,Grade
0,Michael,20.0,85.0,A
1,Andreas,,90.0,B
2,Alice,22.0,,
3,Bobby,19.0,78.0,C
4,Watson,18.0,92.0,A


Mean/Median/Mode Imputation

In [4]:
# Summary statistics for the numeric columns; `count` excludes missing values.
df.describe()

Unnamed: 0,Age,Score
count,4.0,4.0
mean,19.75,86.25
std,1.707825,6.238322
min,18.0,78.0
25%,18.75,83.25
50%,19.5,87.5
75%,20.5,90.5
max,22.0,92.0


In [5]:
# Compute the median of Age once, report it, then preview the imputed column.
age_median = df["Age"].median()
print("Median kolom Age adalah :", age_median)

# fillna returns a new Series with the missing values filled in;
# nothing is assigned here, so df itself is left unchanged.
df["Age"].fillna(age_median)

Median kolom Age adalah : 19.5


Unnamed: 0,Age
0,20.0
1,19.5
2,22.0
3,19.0
4,18.0


In [6]:
# Write the median-filled values back into the Age column, then show the result.
median_age = df["Age"].median()
df["Age"] = df["Age"].fillna(median_age)
df["Age"]

Unnamed: 0,Age
0,20.0
1,19.5
2,22.0
3,19.0
4,18.0


Forward Fill/Backward Fill

In [7]:
# Inspect the Grade column, which contains one missing entry.
df["Grade"]

Unnamed: 0,Grade
0,A
1,B
2,
3,C
4,A


In [8]:
# Forward fill: each missing value takes the last valid value that precedes it.
# Series.ffill() is used instead of fillna(method="ffill"), whose `method`
# argument is deprecated in pandas and emits a warning.
# Preview only -- the result is not assigned back to df.
df["Grade"].ffill()

  df["Grade"].fillna(method="ffill") #menambahkan nilai dengan forward fill


Unnamed: 0,Grade
0,A
1,B
2,B
3,C
4,A


In [9]:
# Backward fill: each missing value takes the next valid value that follows it.
# Series.bfill() is used instead of fillna(method="bfill"), whose `method`
# argument is deprecated in pandas and emits a warning.
# Preview only -- the result is not assigned back to df.
df["Grade"].bfill()

  df["Grade"].fillna(method="bfill") #menambahkan nilai dengan backward fill


Unnamed: 0,Grade
0,A
1,B
2,C
3,C
4,A


Interpolation

In [13]:
df["Score"].interpolate() # preview: fill the missing values by interpolation (result is not assigned back)


Unnamed: 0,Score
0,85.0
1,90.0
2,84.0
3,78.0
4,92.0


In [14]:
df["Score"] = df["Score"].interpolate() # write the interpolated values back into the Score column

# Handling Categorical Data

In [15]:
# Two purely categorical columns used to demonstrate the encoders.
# Note that "Orange" appears both as a fruit and as a color.
data = dict(
    Fruit=["Grape", "Apple", "Orange", "Banana", "Melon"],
    Color=["Purple", "Red", "Orange", "Yellow", "Green"],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Fruit,Color
0,Grape,Purple
1,Apple,Red
2,Orange,Orange
3,Banana,Yellow
4,Melon,Green


One-hot Encoding

In [17]:
from sklearn.preprocessing import OneHotEncoder

# Columns to expand into indicator (0/1) columns.
categorical_columns = ["Fruit", "Color"]

# Set up the encoder: return a dense array and drop the first category of
# each column.
encoder = OneHotEncoder(drop="first", sparse_output=False)

# Fit and encode in one step, then label the resulting columns with the names
# generated by the fitted encoder.
encoded = encoder.fit_transform(df[categorical_columns])
feature_names = encoder.get_feature_names_out(categorical_columns)
df_one_hot = pd.DataFrame(encoded, columns=feature_names)

df_one_hot

Unnamed: 0,Fruit_Banana,Fruit_Grape,Fruit_Melon,Fruit_Orange,Color_Orange,Color_Purple,Color_Red,Color_Yellow
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Column names generated by the one-hot encoder, in the form <column>_<category>.
feature_names

array(['Fruit_Banana', 'Fruit_Grape', 'Fruit_Melon', 'Fruit_Orange',
       'Color_Orange', 'Color_Purple', 'Color_Red', 'Color_Yellow'],
      dtype=object)

Label Encoding

In [19]:
from sklearn.preprocessing import LabelEncoder

# Set up the encoder.
encoder = LabelEncoder()

# Encode on a copy so the original string columns in df stay intact.
df_label_encoded = df.copy()

# NOTE: the same encoder object is refit for every column, so once the loop
# finishes it only holds the classes of the last column ("Color").
for column in ("Fruit", "Color"):
    df_label_encoded[column] = encoder.fit_transform(df[column])

df_label_encoded

Unnamed: 0,Fruit,Color
0,2,2
1,0,3
2,4,1
3,1,4
4,3,0


# Feature Scaling

In [20]:
# Three integer columns used to demonstrate the scalers.
data = dict(
    Age=[20, 21, 22, 19, 18],
    Score=[85, 90, 93, 78, 92],
    Grade=[90, 85, 80, 70, 95],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Age,Score,Grade
0,20,85,90
1,21,90,85
2,22,93,80
3,19,78,70
4,18,92,95


Min-Max Scaling

In [21]:
from sklearn.preprocessing import MinMaxScaler

# Set up the scaler.
scaler = MinMaxScaler()

# Fit and transform in one step. fit_transform returns a plain array, so the
# result is wrapped back into a DataFrame carrying the original column names.
df_minmax = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df_minmax

Unnamed: 0,Age,Score,Grade
0,0.5,0.466667,0.8
1,0.75,0.8,0.6
2,1.0,1.0,0.4
3,0.25,0.0,0.0
4,0.0,0.933333,1.0


Standardization

In [22]:
from sklearn.preprocessing import StandardScaler

# Set up the scaler (this rebinds the `scaler` name to a StandardScaler).
scaler = StandardScaler()

# Fit and transform in one step. fit_transform returns a plain array, so the
# result is wrapped back into a DataFrame carrying the original column names.
df_scaler = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df_scaler

Unnamed: 0,Age,Score,Grade
0,0.0,-0.469709,0.697486
1,0.707107,0.433578,0.116248
2,1.414214,0.97555,-0.464991
3,-0.707107,-1.73431,-1.627467
4,-1.414214,0.794892,1.278724
