In [None]:
# ==========================================
# TUGAS DATA MINING - PREPROCESSING DATA
# Nama  : Alfina Damayanti (A11.2023.14968)
# Kelas : A11.4517
# File  : preprocessing.ipynb
# ==========================================

# Tujuan:
# Melakukan tahapan preprocessing data berdasarkan materi kuliah Data Mining.
# Tahapan yang dilakukan:
# 1. Membaca dataset resmi dari BPS
# 2. Menangani missing value
# 3. Encoding data kategori (Pulau)
# 4. Membagi dataset menjadi training/testing
# 5. Feature scaling



In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


In [3]:
# Read the IPM table from a CSV file (path is relative to the working
# directory), then show the whole frame under a banner for inspection.
csv_path = 'data_ipm.csv'
dataset = pd.read_csv(csv_path)
print("=== DATASET ASLI ===")
display(dataset)


=== DATASET ASLI ===


Unnamed: 0,Provinsi,IPM,Harapan_Hidup,Rata_Lama_Sekolah,Pengeluaran_per_Kapita,Pulau
0,Aceh,71.99,69.6,9.14,11000000,Sumatera
1,Sumatera Utara,71.77,69.1,9.3,11200000,Sumatera
2,DKI Jakarta,80.76,74.91,11.26,21400000,Jawa
3,Jawa Barat,72.09,70.3,8.3,12500000,Jawa
4,Jawa Tengah,72.73,71.2,8.4,11800000,Jawa
5,Yogyakarta,80.22,74.52,10.68,17000000,Jawa
6,Kalimantan Timur,76.24,73.29,9.97,19000000,Kalimantan
7,Sulawesi Selatan,75.31,70.67,9.83,13500000,Sulawesi
8,Bali,77.17,73.83,9.81,18800000,Bali
9,Papua,60.44,65.15,6.7,9200000,Papua


In [4]:
# Separate the table into a feature matrix X and a target vector Y.
# The feature list mixes three numeric columns with the text column 'Pulau',
# so X comes back as an object-dtype NumPy array.
feature_columns = ['Harapan_Hidup', 'Rata_Lama_Sekolah', 'Pengeluaran_per_Kapita', 'Pulau']
X = dataset[feature_columns].to_numpy()
Y = dataset['IPM'].to_numpy()

# Preview the first five entries of each.
print("=== FITUR (X) ===")
print(X[:5])
print("\n=== TARGET (Y) ===")
print(Y[:5])


=== FITUR (X) ===
[[69.6 9.14 11000000 'Sumatera']
 [69.1 9.3 11200000 'Sumatera']
 [74.91 11.26 21400000 'Jawa']
 [70.3 8.3 12500000 'Jawa']
 [71.2 8.4 11800000 'Jawa']]

=== TARGET (Y) ===
[71.99 71.77 80.76 72.09 72.73]


In [5]:
# Replace NaN entries in the three numeric feature columns (indices 0-2) with
# the mean of the respective column; column 3 ('Pulau') is not touched.
# NOTE(review): the imputer is fitted on every row, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 0:3])
X[:, 0:3] = imputer.transform(X[:, 0:3])

print("=== SETELAH MENGISI NILAI HILANG ===")
print(X[:5])


=== SETELAH MENGISI NILAI HILANG ===
[[69.6 9.14 11000000.0 'Sumatera']
 [69.1 9.3 11200000.0 'Sumatera']
 [74.91 11.26 21400000.0 'Jawa']
 [70.3 8.3 12500000.0 'Jawa']
 [71.2 8.4 11800000.0 'Jawa']]


In [6]:
# One-hot encode the categorical 'Pulau' column (index 3) and pass the three
# numeric columns through unchanged. In the result the encoded indicator
# columns come first, followed by the passthrough columns.
#
# sparse_threshold=0 makes ColumnTransformer always return a dense array.
# With the default threshold (0.3) the result becomes a SciPy sparse matrix
# as soon as the one-hot block makes the combined output sparse enough, and
# np.array() around a sparse matrix produces a 0-d object array rather than a
# 2-D feature matrix, which would break the row slicing and the split that
# follow.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

print("=== SETELAH ENCODING (Pulau → Numerik) ===")
print(X[:5])


=== SETELAH ENCODING (Pulau → Numerik) ===
[[0.0 0.0 0.0 0.0 0.0 1.0 69.6 9.14 11000000.0]
 [0.0 0.0 0.0 0.0 0.0 1.0 69.1 9.3 11200000.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 74.91 11.26 21400000.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 70.3 8.3 12500000.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 71.2 8.4 11800000.0]]


In [7]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Report the size of each partition and preview the training rows.
n_train, n_test = len(X_train), len(X_test)
print("=== JUMLAH DATA TRAINING ===", n_train)
print("=== JUMLAH DATA TESTING ===", n_test)
print("\nContoh Data Training (5 baris):")
print(X_train[:5])


=== JUMLAH DATA TRAINING === 8
=== JUMLAH DATA TESTING === 2

Contoh Data Training (5 baris):
[[0.0 1.0 0.0 0.0 0.0 0.0 74.52 10.68 17000000.0]
 [0.0 0.0 0.0 0.0 0.0 1.0 69.6 9.14 11000000.0]
 [0.0 0.0 0.0 0.0 1.0 0.0 70.67 9.83 13500000.0]
 [0.0 1.0 0.0 0.0 0.0 0.0 74.91 11.26 21400000.0]
 [0.0 0.0 0.0 1.0 0.0 0.0 65.15 6.7 9200000.0]]


In [8]:
# Standardise the features: the scaler is fitted on the training rows only,
# and that same fitted scaler is then applied to the test rows.
# NOTE(review): every column is scaled, including the one-hot 'Pulau'
# indicator columns — confirm that is intended.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

print("=== SETELAH FEATURE SCALING ===")
print(X_train[:5])


=== SETELAH FEATURE SCALING ===
[[ 0.          1.         -0.37796447 -0.37796447 -0.37796447 -0.37796447
   1.12544529  1.01640862  0.64705692]
 [ 0.         -1.         -0.37796447 -0.37796447 -0.37796447  2.64575131
  -0.54489885 -0.10564821 -0.86064853]
 [ 0.         -1.         -0.37796447 -0.37796447  2.64575131 -0.37796447
  -0.18163295  0.39709154 -0.23243792]
 [ 0.          1.         -0.37796447 -0.37796447 -0.37796447 -0.37796447
   1.25785061  1.43900145  1.75270759]
 [ 0.         -1.         -0.37796447  2.64575131 -0.37796447 -0.37796447
  -2.05567759 -1.88345253 -1.31296017]]


In [1]:
# Closing banner followed by a recap of the preprocessing stages.
# The text is user-facing output and is therefore kept exactly as written.
recap_text = """
Tahapan yang telah saya lakukan:
1. Membaca dataset IPM Indonesia (BPS)
2. Mengatasi missing value
3. Encoding data kategori 'Pulau'
4. Membagi dataset menjadi training & testing
5. Feature scaling (StandardScaler)

Hasil data siap untuk digunakan pada algoritma data mining berikutnya.
"""
print("✅ Berhasil melakukan processing data!\n")
print(recap_text)


✅ Berhasil melakukan processing data!


Tahapan yang telah saya lakukan:
1. Membaca dataset IPM Indonesia (BPS)
2. Mengatasi missing value
3. Encoding data kategori 'Pulau'
4. Membagi dataset menjadi training & testing
5. Feature scaling (StandardScaler)

Hasil data siap untuk digunakan pada algoritma data mining berikutnya.

