# Preprocessing Data - Latihan
Template notebook untuk melakukan preprocessing sesuai slide.

Letakkan file `Data.csv` di folder yang sama sebelum menjalankan.

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Read the raw data; the relative path resolves against the current
# working directory.
df = pd.read_csv(filepath_or_buffer='Data.csv')

# Column dtypes and non-null counts first, then a preview of the first rows.
df.info()
df.head()

## Menangani Missing Value
Isi nilai yang hilang dengan rata-rata (mean) untuk kolom numerik dan dengan modus (nilai yang paling sering muncul) untuk kolom kategorikal.

In [None]:
from sklearn.impute import SimpleImputer

# Partition the columns by dtype: numeric columns get mean imputation,
# object/category columns get most-frequent (mode) imputation.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()

# By default SimpleImputer drops columns that have no observed value at all,
# so its output would have fewer columns than the assignment target and the
# assignment would raise. Only impute columns with at least one real value;
# fully-empty columns are left untouched and stay visible in the null report.
num_to_impute = [c for c in num_cols if df[c].notna().any()]
cat_to_impute = [c for c in cat_cols if df[c].notna().any()]

# NOTE(review): every numeric column is mean-imputed here, including a
# numeric label column if the dataset has one -- confirm that is intended.
if num_to_impute:
    imp_num = SimpleImputer(strategy='mean')
    df[num_to_impute] = imp_num.fit_transform(df[num_to_impute])
if cat_to_impute:
    imp_cat = SimpleImputer(strategy='most_frequent')
    df[cat_to_impute] = imp_cat.fit_transform(df[cat_to_impute])

# Remaining null count per column, followed by a preview of the result.
print(df.isnull().sum())
df.head()

## Encoding, Split, dan Scaling

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Pick the label column by (case-insensitive) name; the first match wins.
# str() keeps the lookup from failing on non-string column labels.
candidates = [c for c in df.columns if str(c).lower() in ['target','lulus_tepat_waktu','kelulusan','status']]
if candidates:
    target = candidates[0]
    le = LabelEncoder()
    df[target] = le.fit_transform(df[target])
    print('Target:', target)
else:
    target = None
    print('No target column found; skipping split and scaling.')

# One-hot encode the categorical features. The target is excluded because it
# was label-encoded above; columns no longer present in df are skipped so the
# cell can be re-run after the dummies have already been created.
cat_cols = [c for c in cat_cols if c != target and c in df.columns]
if cat_cols:
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Splitting and scaling need a label column, so everything that touches X
# lives under this guard (previously X was used even when it was undefined).
if target is not None:
    X = df.drop(columns=[target])
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Take explicit copies so the column assignments below modify independent
    # frames rather than slices derived from X.
    X_train, X_test = X_train.copy(), X_test.copy()
    print('Train shape:', X_train.shape)

    num_feats = X.select_dtypes(include=[np.number]).columns.tolist()
    if num_feats:
        scaler = StandardScaler()
        # Fit on the training rows only, then apply the same statistics to the
        # test rows, so no information from the test set leaks into scaling.
        # X itself is left unscaled.
        X_train[num_feats] = scaler.fit_transform(X_train[num_feats])
        X_test[num_feats] = scaler.transform(X_test[num_feats])

# Save processed: df holds the imputed and encoded data; scaling is applied
# only to the train/test splits, so the saved file is unscaled.
df.to_csv('processed_data.csv',index=False)
print('Saved processed_data.csv')

## Catatan
Setelah selesai, commit file notebook atau skrip (`preprocessing.ipynb` atau `preprocessing.py`) beserta `processed_data.csv` ke GitHub, lalu cantumkan URL repository pada pengumpulan tugas.