In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

import xgboost as xgb

In [27]:
df = pd.read_csv('CreditScoring.csv')

In [28]:
df.columns = df.columns.str.lower()

In [29]:
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
    }
df.status = df.status.map(status_values)

home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}
df.home = df.home.map(home_values)

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}
df.marital = df.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}
df.records = df.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}
df.job = df.job.map(job_values)

In [30]:
df[['income', 'assets', 'debt']] = df[['income', 'assets', 'debt']].replace(to_replace=99999999, value=np.nan)

In [31]:
df = df[df.status != 'unk'].reset_index(drop=True)

In [32]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=11)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = (df_train.status == 'default').astype('int').values
y_test = (df_test.status == 'default').astype('int').values

del df_train['status']
del df_test['status']

In [33]:
dv = DictVectorizer(sparse=False)

train_dicts = df_train.fillna(0).to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
test_dicts = df_test.fillna(0).to_dict(orient='records')
X_test = dv.fit_transform(test_dicts)


In [34]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [35]:
xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 4,
    'seed': 1,
    'verbosity': 1, 
}
model = xgb.train(xgb_params, dtrain, num_boost_round=175)

In [36]:
y_pred = model.predict(dtest)

In [37]:
roc_auc_score(y_test, y_pred)

0.6555153707052441

#### Saving the model

In [38]:
import bentoml

In [39]:
bentoml.xgboost.save_model(
    'credit_risk_model',
    model,
    custom_objects={'dict_vectorizer': dv},
    signatures={'predict': {
        'batchable': True,
        'batch_dim': 0,
    }}
    )

Model(tag="credit_risk_model:z65gwykpu6p2s5cm", path="C:\Users\Alex\bentoml\models\credit_risk_model\z65gwykpu6p2s5cm\")

In [40]:
import json

In [41]:
json.dumps(test_dicts[1])

'{"seniority": 10, "home": "rent", "time": 60, "age": 28, "marital": "married", "records": "no", "job": "fixed", "expenses": 78, "income": 325.0, "assets": 18.0, "debt": 3000.0, "amount": 2250, "price": 2250}'

In [42]:
X_test[1]

array([2.80e+01, 2.25e+03, 1.80e+01, 3.00e+03, 7.80e+01, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 3.25e+02,
       1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 2.25e+03, 1.00e+00, 0.00e+00,
       1.00e+01, 6.00e+01])

In [43]:
X_test_1 = dv.transform(test_dicts[1])
dtest = xgb.DMatrix(
    X_test_1,
    feature_names=list(dv.get_feature_names_out())
)
model.predict(dtest)

array([0.92579806], dtype=float32)