In [None]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [None]:
# Fetch version 1 of the "credit-g" dataset from OpenML. With as_frame=True
# the returned bunch carries pandas objects; `.frame` is the single DataFrame
# holding the feature columns together with the target column.
data = fetch_openml(
    name='credit-g',
    version=1,
    as_frame=True,
)
df = data.frame

In [None]:
# Predictors: every column except the label.
X = df.drop(columns='class')

# Target: recode the label so that 'good' becomes 0 and 'bad' becomes 1.
# Any label value outside this mapping would come out as missing.
y = df['class'].map({'good': 0, 'bad': 1})

In [None]:
# Hold out 20% of the rows for the final evaluation. Stratifying on y keeps
# the class proportions the same in both parts, and the fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Split the feature columns by dtype so each group gets its own preprocessing.
#
# fetch_openml(..., as_frame=True) stores nominal attributes as pandas
# `category` columns, not `object`. Selecting only `object` therefore yields
# an empty cat_cols, and because ColumnTransformer drops unlisted columns by
# default (remainder='drop'), every categorical feature would silently vanish
# from the model input. Selecting both `object` and `category` fixes that.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

# Guard against silently losing features: every column must fall into one of
# the two groups, otherwise it would be dropped by the preprocessor.
assert len(num_cols) + len(cat_cols) == X.shape[1], (
    "Some feature columns are neither numeric nor categorical: "
    f"{list(X.columns.difference(num_cols.union(cat_cols)))}"
)

In [None]:
# Numeric branch: fill missing values with the column median, then
# standardise each column.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode. handle_unknown='ignore' keeps transform from failing on
# levels that were not present when the encoder was fitted.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols),
])

In [None]:
# Candidate 1: logistic regression on top of the shared preprocessor.
# max_iter is raised to 1000 to give the solver more iterations to converge.
log_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# 5-fold cross-validated F1, computed on the training split only so the
# held-out test set stays untouched.
log_scores = cross_val_score(log_pipeline, X_train, y_train, scoring='f1', cv=5)

print("Logistic Regression CV F1:", np.mean(log_scores))

Logistic Regression CV F1: 0.1701432140299318


In [None]:
# Candidate 2: random forest on top of the shared preprocessor.
# random_state is fixed so repeated runs build the same forest.
rf_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Same protocol as the other candidates: 5-fold CV F1 on the training split.
rf_scores = cross_val_score(rf_pipeline, X_train, y_train, scoring='f1', cv=5)

print("Random Forest CV F1:", np.mean(rf_scores))

Random Forest CV F1: 0.327830964536987


In [None]:
# Candidate 3: support vector classifier with its default settings, fed by
# the shared preprocessor.
svm_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', SVC()),
])

# Same protocol as the other candidates: 5-fold CV F1 on the training split.
svm_scores = cross_val_score(svm_pipeline, X_train, y_train, scoring='f1', cv=5)

print("SVM CV F1:", np.mean(svm_scores))

SVM CV F1: 0.12885143710703267


In [None]:
# Hyper-parameter grid for the random forest. The `classifier__` prefix
# addresses the 'classifier' step inside rf_pipeline.
# 2 x 3 x 2 = 12 combinations, each evaluated with 5-fold CV.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5],
}

# Exhaustive search scored by F1; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

# Tune on the training split only; the test split is kept for the final check.
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Best CV Score: 0.3449250674996009


In [None]:
# Take the pipeline that the grid search refitted with the winning
# parameters and score it once on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Rows of the confusion matrix are true labels, columns are predictions.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Per-class precision / recall / F1 plus the averaged summaries.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

Confusion Matrix:
[[120  20]
 [ 41  19]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.86      0.80       140
           1       0.49      0.32      0.38        60

    accuracy                           0.69       200
   macro avg       0.62      0.59      0.59       200
weighted avg       0.67      0.69      0.67       200



In [None]:
# Persist the whole fitted pipeline (preprocessing + classifier) so it can be
# reloaded later with joblib.load. The file is pickle-based: only load it
# from a trusted location.
joblib.dump(value=best_model, filename="best_credit_model.pkl")
print("Model saved successfully!")

Model saved successfully!
