In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_predict,
    cross_val_score,
    train_test_split,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE


In [None]:
# Read the modeling-ready table (path is relative to the notebook's working
# directory) and preview its first rows.
df = pd.read_csv(filepath_or_buffer="readyForModeling.csv")
df.head(n=5)


In [None]:
# Split the frame into the label series (`cuaca`) and the predictor columns
# (everything except the label).
y = df["cuaca"]
X = df.drop(columns=[y.name])


In [None]:
# Column groups routed to each preprocessing branch.
numeric_features = ["Tn", "Tx", "Tavg", "RH_avg", "RR", "ss", "ff_x", "ddd_x", "ff_avg"]
categorical_features = ["ddd_car"]

# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array. With handle_unknown="ignore", a category that was
# not seen during fit is encoded as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Combine both branches. Columns of X that are not listed in either group are
# dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


In [None]:
# End-to-end model: preprocessing -> SMOTE oversampling -> random forest.
# The imblearn pipeline is used (rather than sklearn's) because one of the
# steps is a sampler.
# NOTE(review): with SMOTE's default strategy the classifier should already
# receive balanced classes, so class_weight="balanced" likely has little or no
# additional effect — confirm whether both are intended.
pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42, k_neighbors=3)),
    ("classifier", RandomForestClassifier(class_weight="balanced", random_state=42)),
])


In [None]:
# Stratified, shuffled 5-fold cross-validation scored with macro-averaged F1.
# The whole pipeline object is passed in, so cross_val_score fits a fresh
# clone of it on each training fold.
# Note: this runs on the full X / y, i.e. it also includes the rows that the
# later train/test split holds out for the final evaluation.
# (StratifiedKFold is imported in the notebook's top import cell.)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='f1_macro')
print("F1 Macro Scores (CV):", scores)
print("Mean F1 Macro:", scores.mean())


In [None]:
# Hold out a stratified 20% test split for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the full pipeline on the training portion only, then predict the
# held-out rows. `pipeline` stays fitted on X_train after this cell.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Confusion matrix for the held-out split.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred, cmap="Blues")
plt.title("Confusion Matrix - Random Forest")
plt.show()

# Per-class classification report for the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))


## 📌 Penjelasan Ilmiah & Best Practice

1. **Pipeline** digunakan untuk memastikan **tidak terjadi data leakage** — semua preprocessing (imputasi, scaling, encoding) dilakukan di dalam pipeline yang hanya dilatih di training data.
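2. **SMOTE** digunakan untuk mengatasi **class imbalance**. SMOTE ditaruh setelah preprocessing agar bekerja di ruang fitur tertransformasi; karena berada di dalam pipeline `imblearn`, oversampling hanya diterapkan saat `fit` (pada data training tiap fold), bukan saat prediksi, sehingga data validasi/test tidak pernah berisi sampel sintetis.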
3. **Cross-validation (StratifiedKFold)** digunakan agar pembagian data tetap menjaga proporsi kelas (stratifikasi), dan hasil evaluasi lebih stabil.
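4. **RandomForest** digunakan dengan `class_weight='balanced'` untuk memperhatikan kelas minoritas. Catatan: karena SMOTE (dengan strategi bawaan) sudah menyeimbangkan jumlah sampel tiap kelas sebelum classifier dilatih, bobot kelas yang dihasilkan praktis sama untuk semua kelas, sehingga pengaruh tambahannya terhadap ketidakseimbangan kemungkinan kecil.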
5. Evaluasi akhir dilakukan di test set dengan **Confusion Matrix** dan **Classification Report** agar performa model dapat dilihat untuk setiap kelas.


In [None]:
# Confusion matrix built from cross-validated predictions: cross_val_predict
# returns, for every row, the prediction made by the model of the fold in
# which that row was held out.
# (cross_val_predict and ConfusionMatrixDisplay are imported in the notebook's
# top import cell.)
y_pred_cv = cross_val_predict(pipeline, X, y, cv=cv)

# from_predictions labels the axes with the actual class values found in
# y / y_pred_cv; building the display from a bare matrix would label them with
# positional indices 0..n-1 instead. Same call style as the hold-out matrix.
disp = ConfusionMatrixDisplay.from_predictions(y, y_pred_cv, cmap='Blues')
cm = disp.confusion_matrix  # raw count matrix, kept under its original name
plt.title("Cross-Validated Confusion Matrix")
plt.show()


In [None]:
# Score a single hand-entered observation with the pipeline fitted on X_train
# in the hold-out evaluation cell.
# NOTE(review): `ddd_car` is given as the integer 8. The encoder uses
# handle_unknown='ignore', so a value that does not match a category seen
# during fit would be silently encoded as all zeros — confirm that 8 matches
# how this column is stored in the training data.
new_data = pd.DataFrame([{
    'Tn': 24.4,
    'Tx': 31.3,
    'Tavg': 26.5,
    'RH_avg': 86.0,
    'RR': 0.0,
    'ss': 7.2,
    'ff_x': 3.0,
    'ddd_x': 70.0,
    'ff_avg': 1.0,
    'ddd_car': 8,
}])

# One input row -> one prediction; take the single element.
predicted_class = pipeline.predict(new_data)[0]
print(f"Prediksi kelas cuaca untuk data baru: {predicted_class}")
