<a href="https://colab.research.google.com/github/buburayam112/221230029-PengantarML/blob/main/praktikum_2_numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

In [2]:
'''Implementasi Preprocessing Pipeline (Modifikasi)'''
import numpy as np

# Simulated dataset: 120 samples x 4 features.
# The global NumPy RNG is seeded so every run draws the same values.
np.random.seed(123)
X = 10 + 8 * np.random.randn(120, 4)  # shift/scale standard normal draws: mean=10, std=8

# Stage 1: Z-score normalization -> (x - mean) / std
def zscore_normalize(data):
    """Standardize each column of *data* to zero mean and unit variance.

    Statistics are computed along axis 0 (per feature/column).

    Args:
        data: Array-like of numeric values; rows are samples, columns are
            features.

    Returns:
        Array with the same shape as *data* holding ``(data - mean) / std``.
        Columns whose standard deviation is exactly zero (constant features)
        are mapped to 0 instead of NaN/inf.
    """
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # A constant column has std == 0; dividing by it would produce NaN/inf
    # and a RuntimeWarning. Use a unit scale there so the column becomes 0.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std

# Apply the z-score normalization to the simulated dataset.
X_norm = zscore_normalize(X)

# Stage 2: outlier handling -> clip values beyond ±2.5 std
def clip_outliers(data, std_limit=2.5):
    """Clip each column of *data* to ``mean ± std_limit * std``.

    The mean and standard deviation are computed per column (axis 0) from
    *data* itself; values outside the resulting band are replaced by the
    nearest band edge.

    Args:
        data: Array-like of numeric values; rows are samples.
        std_limit: Half-width of the allowed band, in standard deviations.

    Returns:
        Array of the same shape as *data* with out-of-band values clipped.
    """
    center = np.mean(data, axis=0)
    half_width = std_limit * np.std(data, axis=0)
    return np.clip(data, center - half_width, center + half_width)

# Clip the normalized features using the function's default limit (2.5 std per column).
X_clean = clip_outliers(X_norm)

# Stage 3: one-hot encoding for class labels
def one_hot_encode(labels):
    """Return a one-hot matrix for a 1-D sequence of class labels.

    Columns follow the sorted order of the unique labels, as returned by
    ``np.unique``.

    Args:
        labels: 1-D array-like of labels (numeric or string).

    Returns:
        Float array of shape ``(len(labels), n_unique_labels)`` with a single
        1 per row in the column of that row's class.

    Raises:
        ValueError: If *labels* is not one-dimensional.
    """
    labels = np.asarray(labels)
    if labels.ndim != 1:
        raise ValueError("labels must be a 1-D sequence")
    # return_inverse yields, for every sample, the index of its class in
    # `classes`, so no per-sample search is needed.
    classes, class_idx = np.unique(labels, return_inverse=True)
    one_hot = np.zeros((labels.shape[0], classes.shape[0]))
    one_hot[np.arange(labels.shape[0]), class_idx] = 1
    return one_hot

# Example categorical labels (3 classes)
labels = np.array([0, 1, 2, 1, 0, 2, 1, 0])
one_hot_labels = one_hot_encode(labels)

# Stage 4: manual train/test split
def manual_train_test_split(X, y, test_ratio=0.25, random_state=None):
    """Randomly split samples and labels into train and test subsets.

    Args:
        X: Array-like of samples; the first axis indexes samples.
        y: Array-like of labels with the same length as *X*.
        test_ratio: Fraction of samples assigned to the test set, in
            ``[0, 1]``. The test size is ``int(n * test_ratio)``.
        random_state: Optional seed (or ``numpy.random.Generator``) for a
            reproducible split. When ``None``, the global ``np.random``
            state is used, so ``np.random.seed`` still controls the result.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If *X* and *y* differ in length or *test_ratio* is
            outside ``[0, 1]``.
    """
    # Convert up front so plain lists support the fancy indexing below.
    X = np.asarray(X)
    y = np.asarray(y)
    n = X.shape[0]
    if y.shape[0] != n:
        raise ValueError(
            f"X and y must have the same number of samples: {n} != {y.shape[0]}"
        )
    # A negative ratio would turn into a negative slice bound and silently
    # assign most of the data to the test set.
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio}")

    n_test = int(n * test_ratio)
    if random_state is None:
        indices = np.random.permutation(n)
    else:
        indices = np.random.default_rng(random_state).permutation(n)
    test_idx = indices[:n_test]
    train_idx = indices[n_test:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

# Create random labels for the 120 samples (classes 0–2)
y = np.random.randint(0, 3, 120)
# NOTE(review): the split is applied to the raw X, not to X_norm/X_clean — confirm this is intended.
X_train, X_test, y_train, y_test = manual_train_test_split(X, y, test_ratio=0.25)

# --- OUTPUT ---
# Report shapes and summary statistics produced by each pipeline stage.
print("Original shape:", X.shape)
# Per-column mean/std of the normalized data, rounded to 3 decimals for display.
print("Z-score mean:", np.mean(X_norm, axis=0).round(3))
print("Z-score std:", np.std(X_norm, axis=0).round(3))
print("Cleaned data sample (first 3 rows):\n", X_clean[:3])
print("One-hot labels shape:", one_hot_labels.shape)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# --- VALIDATION ---
# Sanity checks on the normalized data and the one-hot matrix of the example labels.
assert X_norm.shape == X.shape, "Shape data harus tetap sama"
assert np.allclose(X_norm.mean(axis=0), 0, atol=1e-8), "Mean harus ~0"
assert np.allclose(X_norm.std(axis=0), 1, atol=1e-8), "Std harus ~1"
assert one_hot_labels.shape[1] == len(np.unique(labels)), "Jumlah kolom = jumlah kelas unik"

print("\n✅ Semua proses berhasil! Pipeline berjalan dengan benar.")


Original shape: (120, 4)
Z-score mean: [0. 0. 0. 0.]
Z-score std: [1. 1. 1. 1.]
Cleaned data sample (first 3 rows):
 [[-1.26086833e+00  1.13105114e+00  3.36124225e-01 -1.32254786e+00]
 [-7.00832048e-01  1.83135250e+00 -2.21649714e+00 -3.14037298e-01]
 [ 1.33653587e+00 -8.64728759e-01 -5.69996211e-01 -1.19756701e-03]]
One-hot labels shape: (8, 3)
Train shape: (90, 4) Test shape: (30, 4)

✅ Semua proses berhasil! Pipeline berjalan dengan benar.
