In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split,  cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, accuracy_score
import joblib
import numpy as np

risiko_list = ['RH', 'RD', 'RJ', 'RO']

for risiko in risiko_list:
    df = pd.read_csv(f'sampling_{risiko}.csv')

    x = df.drop(columns=risiko_list)
    y = df[risiko]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    rf = RandomForestClassifier(
        n_estimators=80,
        max_depth=10,
        min_samples_leaf=10,
        max_features='sqrt',
        class_weight=None, 
        random_state=42
    )
    rf.fit(x_train, y_train)
    
    cv_scores = cross_val_score(rf, x_train, y_train, cv=5, scoring='accuracy')

    calibrate_method = 'sigmoid' 
    model = CalibratedClassifierCV(rf, method=calibrate_method, cv=3)
    model.fit(x_train, y_train)
    
    filename = f'model_{risiko.lower()}.pkl'
    joblib.dump(model, filename)

    print(f"Cross-Validation Akurasi (rata-rata) untuk {risiko}: {np.mean(cv_scores):.4f}")
    print(f"Cross-Validation Akurasi per Fold: {cv_scores}")
    
    print(f"Evaluasi Model {risiko}")
    y_pred = model.predict(x_test)
    print("Akurasi (Test set):", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    importances = rf.feature_importances_
    feature_names = x.columns
    importance_df = pd.DataFrame({'Fitur': feature_names, 'Importance': importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
   
    print(f"\nFeature Importance untuk Risiko {risiko}:")
    print(importance_df.to_string(index=False))
    

Cross-Validation Akurasi (rata-rata) untuk RH: 0.9267
Cross-Validation Akurasi per Fold: [0.92268041 0.92268041 0.92783505 0.93814433 0.92227979]
Evaluasi Model RH
Akurasi (Test set): 0.934156378600823
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       119
           1       0.95      0.92      0.93       124

    accuracy                           0.93       243
   macro avg       0.93      0.93      0.93       243
weighted avg       0.93      0.93      0.93       243


Feature Importance untuk Risiko RH:
Fitur  Importance
   I4    0.238142
   C1    0.178171
   C3    0.150720
   I2    0.133359
   C2    0.069541
   I1    0.039427
   I3    0.022301
   C4    0.019945
   E4    0.005856
   N3    0.005819
   N1    0.005034
   H2    0.004888
   O3    0.004580
   K3    0.004559
   M3    0.004538
   K1    0.004397
   G1    0.004173
   A2    0.004070
   H4    0.003913
   N2    0.003860
   E2    0.003782
   F2    0.003780
   D4    0.003741
  