In [5]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer

# ===============================
# 1. LOAD DATA
# ===============================
columns = [import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer

# ===============================
# OPTIONAL ONNX SUPPORT
# ===============================
ONNX_AVAILABLE = True
try:
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import FloatTensorType, StringTensorType
except ImportError:
    ONNX_AVAILABLE = False
    print("‚ö†Ô∏è skl2onnx not installed. ONNX export will be skipped.")

# ===============================
# 1. LOAD DATA
# ===============================
columns = [
    "status", "duration", "credit_history", "purpose", "credit_amount",
    "savings", "employment", "installment_rate", "personal_status_sex",
    "other_debtors", "residence_since", "property", "age",
    "other_installment_plans", "housing", "existing_credits",
    "job", "people_liable", "telephone", "foreign_worker", "target"
]

data = pd.read_csv(
    "german.data",
    sep=" ",
    header=None,
    names=columns
)

# ===============================
# 2. TARGET CLEANING
# ===============================
# 1 = Good Credit, 2 = Bad Credit
data["target"] = data["target"].map({1: 1, 2: 0})

X = data.drop("target", axis=1)
y = data["target"]

# ===============================
# 3. FEATURE TYPES
# ===============================
numeric_features = [
    "duration",
    "credit_amount",
    "installment_rate",
    "residence_since",
    "age",
    "existing_credits",
    "people_liable"
]

categorical_features = [c for c in X.columns if c not in numeric_features]

# ===============================
# 4. PREPROCESSING
# ===============================
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features)
    ]
)

# ===============================
# 5. MODEL (Explainable + Stable)
# ===============================
model = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", model)
])

# ===============================
# 6. TRAIN / TEST SPLIT
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# ===============================
# 7. TRAIN MODEL
# ===============================
pipeline.fit(X_train, y_train)

# ===============================
# 8. EVALUATION
# ===============================
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print("\nüìä Model Performance")
print(f"Accuracy : {accuracy:.4f}")
print(f"ROC-AUC  : {auc:.4f}")

# ===============================
# 9. SAVE SKLEARN MODEL (.pkl)
# ===============================
joblib.dump(pipeline, "fintrust_credit_model.pkl")
print("\n‚úÖ Saved sklearn model: fintrust_credit_model.pkl")

# ===============================
# 10. OPTIONAL: CONVERT TO ONNX
# ===============================
if ONNX_AVAILABLE:
    print("\nüîÑ Converting model to ONNX...")

    initial_types = []

    for col in numeric_features:
        initial_types.append((col, FloatTensorType([None, 1])))

    for col in categorical_features:
        initial_types.append((col, StringTensorType([None, 1])))

    onnx_model = convert_sklearn(
        pipeline,
        initial_types=initial_types,
        target_opset=15
    )

    with open("fintrust_credit_model.onnx", "wb") as f:
        f.write(onnx_model.SerializeToString())

    print("‚úÖ Saved ONNX model: fintrust_credit_model.onnx")
else:
    print("\n‚ÑπÔ∏è ONNX export skipped (skl2onnx not installed)")

    "status", "duration", "credit_history", "purpose", "credit_amount",
    "savings", "employment", "installment_rate", "personal_status_sex",
    "other_debtors", "residence_since", "property", "age",
    "other_installment_plans", "housing", "existing_credits",
    "job", "people_liable", "telephone", "foreign_worker", "target"
]

data = pd.read_csv(
    "german.data",
    sep=" ",
    header=None,
    names=columns
)

# ===============================
# 2. TARGET CLEANING
# ===============================
# Original: 1 = Good, 2 = Bad
data["target"] = data["target"].map({1: 1, 2: 0})

X = data.drop("target", axis=1)
y = data["target"]

# ===============================
# 3. FEATURE TYPES
# ===============================
numeric_features = [
    "duration", "credit_amount", "installment_rate",
    "residence_since", "age", "existing_credits", "people_liable"
]

categorical_features = list(set(X.columns) - set(numeric_features))

# ===============================
# 4. PREPROCESSING PIPELINES
# ===============================
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features)
])

# ===============================
# 5. MODEL (EXPLAINABLE)
# ===============================
model = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", model)
])

# ===============================
# 6. TRAIN / TEST SPLIT
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# ===============================
# 7. TRAIN MODEL
# ===============================
pipeline.fit(X_train, y_train)

# ===============================
# 8. EVALUATION
# ===============================
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print("Model Performance:")
print(f"Accuracy : {accuracy:.4f}")
print(f"ROC-AUC  : {auc:.4f}")

# ===============================
# 9. SAVE MODEL
# ===============================
joblib.dump(pipeline, "fintrust_credit_model.pkl")

print("\nModel saved as: fintrust_credit_model.pkl")


SyntaxError: invalid syntax (3369205545.py, line 16)

In [3]:
pip install skl2onnx onnx onnxruntime


^C
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer

# ONNX imports
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType

# ===============================
# 1. LOAD DATA
# ===============================
columns = [
    "status", "duration", "credit_history", "purpose", "credit_amount",
    "savings", "employment", "installment_rate", "personal_status_sex",
    "other_debtors", "residence_since", "property", "age",
    "other_installment_plans", "housing", "existing_credits",
    "job", "people_liable", "telephone", "foreign_worker", "target"
]

data = pd.read_csv(
    "german.data",
    sep=" ",
    header=None,
    names=columns
)

# ===============================
# 2. TARGET CLEANING
# ===============================
# Original: 1 = Good, 2 = Bad
data["target"] = data["target"].map({1: 1, 2: 0})

X = data.drop("target", axis=1)
y = data["target"]

# ===============================
# 3. FEATURE TYPES
# ===============================
numeric_features = [
    "duration",
    "credit_amount",
    "installment_rate",
    "residence_since",
    "age",
    "existing_credits",
    "people_liable"
]

categorical_features = list(set(X.columns) - set(numeric_features))

# ===============================
# 4. PREPROCESSING PIPELINES
# ===============================
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

# Categorical pipeline: OneHotEncoder handles unknown/missing with handle_unknown="ignore"
categorical_pipeline = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features)
])

# ===============================
# 5. MODEL
# ===============================
model = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", model)
])

# ===============================
# 6. TRAIN / TEST SPLIT
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# ===============================
# 7. TRAIN MODEL
# ===============================
pipeline.fit(X_train, y_train)

# ===============================
# 8. EVALUATION
# ===============================
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print("Model Performance:")
print(f"Accuracy : {accuracy:.4f}")
print(f"ROC-AUC  : {auc:.4f}")

# ===============================
# 9. SAVE SKLEARN MODEL
# ===============================
joblib.dump(pipeline, "fintrust_credit_model.pkl")
print("Saved sklearn model: fintrust_credit_model.pkl")

# ===============================
# 10. CONVERT TO ONNX
# ===============================

# Define ONNX input schema
initial_types = []

for col in numeric_features:
    initial_types.append((col, FloatTensorType([None, 1])))

for col in categorical_features:
    initial_types.append((col, StringTensorType([None, 1])))

# Convert pipeline to ONNX
onnx_model = convert_sklearn(
    pipeline,
    initial_types=initial_types,
    target_opset=15
)

# Save ONNX model
with open("fintrust_credit_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

print("Saved ONNX model: fintrust_credit_model.onnx")


Model Performance:
Accuracy : 0.7600
ROC-AUC  : 0.7862
Saved sklearn model: fintrust_credit_model.pkl
Saved ONNX model: fintrust_credit_model.onnx
