In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE


In [None]:
# Read the modeling-ready CSV from the working directory into `df`,
# then preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="readyForModeling.csv")
df.head(n=5)


In [None]:
# Correlation heatmap over the numeric columns of `df` only
# (non-numeric columns are excluded by numeric_only=True).
plt.figure(figsize=(10, 8))
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(
    data=corr_matrix,
    square=True,
    annot=True,   # write each coefficient inside its cell
    fmt=".2f",    # ...rounded to two decimals
    cmap="coolwarm",
)
plt.gca().set_title("Heatmap Korelasi antar Fitur dan Target (cuaca)")
plt.tight_layout()
plt.show()


In [None]:
# Separate the target column ("cuaca") from the remaining feature columns.
y = df["cuaca"]
X = df.drop(columns="cuaca")

# Column groups routed to the numeric and categorical preprocessing branches.
categorical_features = ["ddd_car"]
numeric_features = [
    "Tn", "Tx", "Tavg", "RH_avg", "RR", "ss", "ff_x", "ddd_x", "ff_avg",
]

# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array; categories not seen during fit
# are ignored instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Apply each branch to its own column group. Columns of X that appear in
# neither list are not passed through (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


In [None]:
# Full modeling pipeline: preprocessing -> SMOTE oversampling -> random forest.
# The imblearn Pipeline is used (rather than sklearn's) so the SMOTE sampler
# can sit between the transformer and the classifier.
pipeline = ImbPipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("smote", SMOTE(random_state=42, k_neighbors=3)),
        (
            "classifier",
            RandomForestClassifier(random_state=42, class_weight="balanced"),
        ),
    ]
)

# 5-fold cross-validation on the full X / y, scored with macro-averaged F1.
scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring="f1_macro", cv=5)
print(f"F1 Macro CV Scores: {scores!s}")
print(f"Mean F1 Macro: {scores.mean()!s}")


In [None]:
# Hold out 20% of the rows for the final evaluation; the split is stratified
# on the target and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training portion and predict the held-out rows
# (fit returns the fitted pipeline, so the calls can be chained).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Confusion matrix of true vs. predicted labels on the held-out rows.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred, cmap="Blues")
plt.gca().set_title("Confusion Matrix")
plt.show()

# Per-class precision / recall / F1 report.
print(classification_report(y_true=y_test, y_pred=y_pred))
