In [2]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import optuna

In [3]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
        'random_state': 42,
        'class_weight': 'balanced'
    }
    
    model = RandomForestClassifier(**params)
    
    # Use f1 score for optimization
    scores = cross_val_score(model, X_resampled, y_resampled, cv=5, scoring='f1')
    
    return scores.mean()

In [4]:
dev_data = pd.read_csv("Dev_data_to_be_shared.csv")
validation_data = pd.read_csv("validation_data_to_be_shared.csv")

# Columns to drop
columns_to_drop = ['account_number',
    'bureau_enquiry_17', 'bureau_16', 'bureau_38', 'bureau_enquiry_37', 'bureau_182', 'bureau_282',
    'bureau_120', 'bureau_142', 'bureau_362', 'bureau_242', 'bureau_162', 'bureau_382', 'bureau_262',
    'bureau_322', 'bureau_202', 'bureau_100', 'bureau_342', 'onus_attribute_28', 'bureau_56', 'bureau_222',
    'bureau_302', 'bureau_423', 'bureau_402', 'bureau_80', 'bureau_enquiry_7', 'bureau_4', 'bureau_26',
    'bureau_enquiry_27', 'bureau_47', 'bureau_enquiry_47', 'bureau_292', 'bureau_172', 'bureau_392',
    'bureau_192', 'bureau_131', 'bureau_252', 'bureau_110', 'bureau_352', 'bureau_272', 'bureau_152',
    'bureau_372', 'bureau_212', 'bureau_312', 'bureau_232', 'bureau_332', 'bureau_412', 'bureau_90', 'bureau_70'
]

# Drop specified columns
dev_data.drop(columns=columns_to_drop, axis=1, errors='ignore', inplace=True)
validation_data.drop(columns=columns_to_drop, axis=1, errors='ignore', inplace=True)

# Drop columns with all missing values
dev_data.dropna(axis=1, how="all", inplace=True)
validation_data.dropna(axis=1, how="all", inplace=True)

# Separate target variable from features
X_dev = dev_data.drop(columns=["bad_flag"])
y_dev = dev_data["bad_flag"]

# Handle missing values by imputing with median
imputer = SimpleImputer(strategy="median")
X_dev_imputed = pd.DataFrame(imputer.fit_transform(X_dev), columns=X_dev.columns)
validation_data_imputed = pd.DataFrame(imputer.transform(validation_data), columns=validation_data.columns)

# Add the target column back to the development data
dev_data_imputed = X_dev_imputed.copy()
dev_data_imputed["bad_flag"] = y_dev.reset_index(drop=True)

# Define features and target
X = dev_data_imputed.drop(columns=['bad_flag'], axis=1)
y = dev_data_imputed['bad_flag']

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply SMOTE for oversampling
smote = SMOTE(random_state=17)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.04, random_state=42)

# # Train a Random Forest Classifier
# model = RandomForestClassifier(random_state=42, n_estimators=101, class_weight='balanced')
# model.fit(X_train, y_train)

# # Evaluate the model on the test set
# y_pred = model.predict(X_test)
# Create and run study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print results and train final model
print("Best parameters:", study.best_params)
print("Best f1-score:", study.best_value)

# Train final model with best parameters
final_model = RandomForestClassifier(**study.best_params)
final_model.fit(X_train, y_train)

# Evaluate
y_pred = final_model.predict(X_test)
print("\nFinal F1 Score:", f1_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(final_model, 'optimized_rf_model.pkl')

y_pred_proba = final_model.predict_proba(X_test)[:, 1]

: 

In [3]:
print("Classification Report:")
print(classification_report(y_test, y_pred, digits = 8))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.8f}")

# Step 2: Prepare validation data
validation_X = validation_data_imputed.drop(columns=['bad_flag'], axis=1, errors='ignore')  # Assume bad_flag may not exist
validation_X_scaled = scaler.transform(validation_X)

# Predict probabilities for validation data
validation_pred_proba = model.predict_proba(validation_X_scaled)[:, 1]

# Save the results
validation_data_imputed['predicted_probability'] = validation_pred_proba
validation_data_imputed[['account_number', 'predicted_probability']].to_csv(
    "validation_predictions.csv", index=False
)

print("Validation predictions saved as 'validation_predictions.csv'.")

# # Step 3: Save the trained model
# joblib.dump(model, 'random_forest_model.pkl')  # Save model in .h5 format (though .pkl is more typical)


# print("Model saved as 'random_forest_model.pkl'.")

Classification Report:
              precision    recall  f1-score   support

           0  0.99073584 0.99973291 0.99521404      3744
           1  0.99974073 0.99100488 0.99535364      3891

    accuracy                      0.99528487      7635
   macro avg  0.99523829 0.99536889 0.99528384      7635
weighted avg  0.99532497 0.99528487 0.99528518      7635

ROC AUC Score: 0.99990723
Validation predictions saved as 'validation_predictions.csv'.
Model saved as 'random_forest_model.pkl'.


In [9]:
# Ensure validation set columns match training set columns
validation_X_imputed = validation_data_imputed.drop(columns=['bad_flag'], axis=1, errors='ignore')

# Reindex the validation set columns to match the training set's columns
validation_X_imputed = validation_X_imputed.reindex(columns=X.columns, fill_value=0)

# Standardize the validation set using the same scaler
validation_X_scaled = scaler.transform(validation_X_imputed)

# Convert to DataFrame to retain column names after scaling
validation_X_scaled_df = pd.DataFrame(validation_X_scaled, columns=X.columns)

# Predict probabilities for validation data
validation_pred_proba = model.predict_proba(validation_X_scaled_df)[:, 1]

# Save the results with 7 decimal places
validation_data_imputed['predicted_probability'] = validation_pred_proba

# Save to CSV
validation_data_imputed[['account_number', 'predicted_probability']].to_csv(
    "validation_predictions.csv", index=False
)

print("Validation predictions saved as 'validation_predictions.csv'.")




Validation predictions saved as 'validation_predictions.csv'.


In [None]:
# Add this code block after getting y_pred
import matplotlib.pyplot as plt

# Create and plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Not Default', 'Default'])
disp.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix')
plt.show()

# Print numeric values
print("\nConfusion Matrix:")
print(cm)