In [19]:
from sklearn.datasets import fetch_openml
import pandas as pd

data = fetch_openml(name='credit-g', version=1, as_frame=True)
df = data.frame

In [20]:
X = df.drop('class', axis=1)
y = df['class'].map({'good':0, 'bad':1})

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [22]:
num_cols = X.select_dtypes(include=['int64','float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols)
])

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

log_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [25]:
from sklearn.model_selection import cross_val_score

log_scores = cross_val_score(
    log_pipeline,
    X_train,
    y_train,
    cv=5,
    scoring='f1'
)

print("Logistic Regression CV F1:", log_scores.mean())

Logistic Regression CV F1: 0.1701432140299318


In [26]:
from sklearn.ensemble import RandomForestClassifier

rf_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [27]:
rf_scores = cross_val_score(
    rf_pipeline,
    X_train,
    y_train,
    cv=5,
    scoring='f1'
)

print("Random Forest CV F1:", rf_scores.mean())

Random Forest CV F1: 0.327830964536987


In [28]:
from sklearn.svm import SVC

svm_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', SVC())
])

In [29]:
svm_scores = cross_val_score(
    svm_pipeline,
    X_train,
    y_train,
    cv=5,
    scoring='f1'
)

print("SVM CV F1:", svm_scores.mean())

SVM CV F1: 0.12885143710703267
