### GLM and Gradient Boosted Decision Tree Example

#### Data Loading and Preparation

In [3]:
import pandas as pd
from modeva import DataSet # Import Data Processing Library
# Read the credit example CSV into a pandas DataFrame.
df = pd.read_csv("credit_example.csv")

# Create the DataSet holder and hand it the DataFrame; `ds` is what the
# preprocessing and modeling cells below operate on.
ds = DataSet()
ds.load_dataframe(data=df)

# Show the loaded frame as the cell output.
df

✓ Auth code found in local storage.
Authenticating Modeva...
✓ License is active and valid.
✓ Authenticated successfully!


Unnamed: 0,Mortgage,Balance,Amount Past Due,Delinquency,Inquiry,Open Trade,Utilization,Gender,Race,Status
0,139734.22,2717.87,0.00,0,0,0,0.785162,0,0,0
1,243359.62,193.60,0.00,0,0,0,0.254759,0,0,1
2,187784.19,395.05,0.00,0,1,0,0.360995,0,0,1
3,594626.89,180.94,0.00,0,0,0,0.128144,0,1,1
4,166771.42,1241.13,0.00,0,0,0,0.702958,0,0,0
...,...,...,...,...,...,...,...,...,...,...
19995,226806.34,670.99,0.00,0,1,0,0.922122,0,1,1
19996,308625.65,3223.94,0.00,0,0,0,0.989716,1,1,1
19997,375035.34,133.05,131.15,1,0,0,0.092523,0,0,1
19998,165377.42,2256.07,0.00,0,0,0,0.630330,1,0,1


##### Data Preprocessing

In [107]:
# Ordinal-encode the categorical variables.
ds.encode_categorical(method="ordinal")

# Standardize every feature the dataset lists as numerical.
ds.scale_numerical(
    features=tuple(ds.feature_names_numerical),
    method="standardize",
)

# Exclude Gender and Race from modeling.
ds.set_inactive_features(["Gender", "Race"])

# Status is the prediction target.
ds.set_target("Status")

# Run pre-processing.
ds.preprocess()

# Random train/test split with test_ratio=0.2.
ds.set_random_split(test_ratio=0.2)

##### Basic EDA

In [16]:
# Two-variable EDA: Utilization against the target, Status.
result = ds.eda_2d(
    feature_x="Utilization",
    feature_y="Status",
)
result.plot(figsize=(5, 4))

#### Modeling using GLM and xgboost

In [65]:
# Model wrappers: xgboost classifier and logistic regression (GLM).
from modeva.models import MoLogisticRegression, MoXGBClassifier

# Set up the model objects.
# Depth-1 trees: GAM with xgboost.
model_xgb1 = MoXGBClassifier(name="XGB1_model", max_depth=1)
# Depth-2 trees: GAMI with xgboost.
model_xgb2 = MoXGBClassifier(name="XGB2_model", max_depth=2)

# GLM: logistic regression, given the dataset's feature names and types.
model_glm = MoLogisticRegression(
    name="GLM",
    feature_names=ds.feature_names,
    feature_types=ds.feature_types,
)

##### Train GLM Model

In [66]:
# Fit the GLM (logistic regression) on the training split
# (input: ds.train_x, target: ds.train_y).
model_glm.fit(ds.train_x, ds.train_y)

In [67]:
# Evaluate the logistic regression fit.
from modeva import TestSuite  # evaluation/testing library

# Bundle the dataset with the GLM so diagnostics can be run on the pair.
ts_glm = TestSuite(ds, model_glm)

# Performance metrics table, shown as the cell output.
results_glm = ts_glm.diagnose_accuracy_table()
results_glm.table

Unnamed: 0,AUC,ACC,F1,LogLoss,Brier
train,0.809769,0.727437,0.768414,0.535235,0.178494
test,0.810225,0.72975,0.767577,0.530331,0.177499
GAP,0.000457,0.002313,-0.000837,-0.004904,-0.000995


In [68]:
# Inspect the GLM coefficients for all dataset features
# (the GLM counterpart of a feature-importance check).
results_glm = ts_glm.interpret_coef(
    features=tuple(ds.feature_names),
)
results_glm.plot(figsize=(5, 4))

##### Train GAM with xgboost Depth-1

In [69]:
# Train the depth-1 xgboost model (model_xgb1) on the training split
# with input: ds.train_x and target: ds.train_y
model_xgb1.fit(ds.train_x, ds.train_y)


In [70]:
# Evaluate the depth-1 xgboost model.
from modeva import TestSuite

# Bundle the dataset with model_xgb1 so diagnostics can be run on the pair.
ts_xgb1 = TestSuite(ds, model_xgb1)

# Performance metrics table, shown as the cell output.
results_xgb1 = ts_xgb1.diagnose_accuracy_table()
results_xgb1.table

Unnamed: 0,AUC,ACC,F1,LogLoss,Brier
train,0.834243,0.758062,0.788065,0.501044,0.165555
test,0.834383,0.75975,0.786776,0.500489,0.165062
GAP,0.00014,0.001688,-0.001288,-0.000555,-0.000493


In [71]:
# Feature importance for the depth-1 model; plot is limited to 5 bars.
results_xgb1 = ts_xgb1.interpret_fi()
results_xgb1.plot(
    n_bars=5,
    figsize=(5, 4),
)

In [75]:
# Feature effect of Utilization (treated here as the most important feature)
# for the depth-1 model.
results_xgb1 = ts_xgb1.interpret_effects(features="Utilization")
results_xgb1.plot(figsize=(6, 4))

##### Train GAMI with xgboost Depth-2

In [76]:
# Train the depth-2 xgboost model (model_xgb2) on the training split
# with input: ds.train_x and target: ds.train_y
model_xgb2.fit(ds.train_x, ds.train_y)

In [77]:
# Bundle the dataset with model_xgb2 so diagnostics can be run on the pair.
ts_xgb2 = TestSuite(ds, model_xgb2)

# Performance metrics table, shown as the cell output.
results_xgb2 = ts_xgb2.diagnose_accuracy_table()
results_xgb2.table

Unnamed: 0,AUC,ACC,F1,LogLoss,Brier
train,0.8513,0.768875,0.796701,0.473657,0.156304
test,0.848288,0.7695,0.794563,0.479782,0.157486
GAP,-0.003012,0.000625,-0.002138,0.006125,0.001182


In [78]:
# Importance check for the depth-2 model via interpret_ei; plot shows up to 10 bars.
results_xgb2 = ts_xgb2.interpret_ei()
results_xgb2.plot(
    n_bars=10,
    figsize=(5, 4),
)

In [88]:
# Feature effect of Utilization (treated here as the most important feature)
# for the depth-2 model, evaluated with grid_size=200.
results_xgb2 = ts_xgb2.interpret_effects(
    features="Utilization",
    grid_size=200,
)
results_xgb2.plot(figsize=(6, 4))

#### Train monotonic xgboost by applying monotonicity constraints

In [85]:
# Depth-2 xgboost with monotonic constraints.
# The constraint string carries seven signed entries.
# NOTE(review): entries presumably follow the order of the active features in ds - confirm.
model_xgb2_mono = MoXGBClassifier(
    name="XGB2_Mono",
    max_depth=2,
    monotone_constraints="(1, 1, -1, -1, -1, -1, -1)",
)

# Train on the training split (input: ds.train_x, target: ds.train_y).
model_xgb2_mono.fit(ds.train_x, ds.train_y)

In [86]:
# Bundle the dataset with the monotonic model so diagnostics can be run on the pair.
ts_xgb2_mono = TestSuite(ds, model_xgb2_mono)

# Performance metrics table, shown as the cell output.
results_xgb2_mono = ts_xgb2_mono.diagnose_accuracy_table()
results_xgb2_mono.table

Unnamed: 0,AUC,ACC,F1,LogLoss,Brier
train,0.847934,0.766625,0.794948,0.47935,0.15816
test,0.845584,0.76875,0.793941,0.483397,0.158977
GAP,-0.002351,0.002125,-0.001007,0.004047,0.000817


In [87]:
# Importance check for the monotonic depth-2 model via interpret_ei.
# Uses the monotonic model's TestSuite (ts_xgb2_mono) and its own result
# variable, so this section reports on XGB2_Mono instead of re-plotting the
# unconstrained XGB2 model and overwriting results_xgb2.
results_xgb2_mono = ts_xgb2_mono.interpret_ei()
results_xgb2_mono.plot(n_bars=10, figsize=(5,4))

In [90]:
# Feature effect of Utilization (treated here as the most important feature)
# for the monotonic model, evaluated with grid_size=200.
results_xgb2_mono = ts_xgb2_mono.interpret_effects(
    features="Utilization",
    grid_size=200,
)
results_xgb2_mono.plot(figsize=(6, 4))

#### Tune Monotonic GAMI model

In [91]:
from modeva.models import ModelTuneGridSearch

# Search grid: three values each for n_estimators and learning_rate.
param_grid = {
    "n_estimators": [300, 500, 1000],
    "learning_rate": [0.01, 0.1, 0.3],
}

# Base estimator for the search: depth-2 xgboost with the same monotonic
# constraints as XGB2_Mono.
model = MoXGBClassifier(
    max_depth=2,
    monotone_constraints="(1, 1, -1, -1, -1, -1, -1)",
    verbose=-1,
)

# Grid search on the dataset with cv=5, scored on four metrics.
hpo = ModelTuneGridSearch(dataset=ds, model=model)
result = hpo.run(
    param_grid=param_grid,
    n_jobs=20,
    metric=("AUC", "ACC", "LogLoss", "Brier"),
    cv=5,
)

# Tuning results table, shown as the cell output.
result.table

Unnamed: 0,n_estimators,learning_rate,AUC,ACC,LogLoss,Brier,AUC_rank,ACC_rank,LogLoss_rank,Brier_rank,Time
3,300,0.1,0.838027,0.758438,0.493099,0.163268,1,1,1,1,0.709649
4,500,0.1,0.83778,0.758312,0.493858,0.163432,2,2,2,2,1.003222
5,1000,0.1,0.837707,0.75825,0.494074,0.16348,3,3,3,3,1.180231
6,300,0.3,0.837481,0.75825,0.494539,0.163679,4,3,5,5,0.542775
7,500,0.3,0.837481,0.75825,0.494539,0.163679,4,3,6,6,0.693511
8,1000,0.3,0.837481,0.75825,0.494539,0.163679,4,3,4,4,0.994753
2,1000,0.01,0.835732,0.75625,0.498568,0.16482,7,7,7,7,1.612083
1,500,0.01,0.828375,0.75125,0.512048,0.169303,8,8,8,8,1.226749
0,300,0.01,0.820926,0.743062,0.526623,0.174455,9,9,9,9,0.953711


In [92]:
# Plot the grid-search result using the "parallel" plot type.
# NOTE(review): presumably a parallel-coordinates view of the grid - confirm against the Modeva docs.
result.plot("parallel", figsize=(8, 6))

In [104]:
# Rebuild the monotonic depth-2 model with the tuned hyperparameters.
# Index 3 is the row ranked first on every metric in the recorded tuning table
# (n_estimators=300, learning_rate=0.1); re-check the index if the search is re-run
# or the grid changes.
model_xgb2_mono_tuned = MoXGBClassifier(
    **result.value["params"][3],
    name="XGB2-Mono-Tuned",
    max_depth=2,
    monotone_constraints="(1, 1, -1, -1, -1, -1, -1)",
    verbose=-1,
)

# Train on the training split (input: ds.train_x, target: ds.train_y).
model_xgb2_mono_tuned.fit(ds.train_x, ds.train_y)

In [105]:
# Bundle the dataset with the tuned monotonic model so diagnostics can be run on the pair.
ts_xgb2_mono_tuned = TestSuite(ds, model_xgb2_mono_tuned)

# Performance metrics table, shown as the cell output.
results_xgb2_mono_tuned = ts_xgb2_mono_tuned.diagnose_accuracy_table()
results_xgb2_mono_tuned.table

Unnamed: 0,AUC,ACC,F1,LogLoss,Brier
train,0.848142,0.767375,0.795315,0.479263,0.158045
test,0.845525,0.77075,0.795084,0.483798,0.159026
GAP,-0.002617,0.003375,-0.000231,0.004534,0.000982


##### Compare model performance

In [99]:
# Plot the accuracy-table result of the tuned monotonic model
# (results_xgb2_mono_tuned comes from diagnose_accuracy_table above).
results_xgb2_mono_tuned.plot()

In [106]:
# Compare the GLM, both unconstrained xgboost models, and the tuned monotonic
# model in one TestSuite.
tsc = TestSuite(
    ds,
    models=[model_glm, model_xgb1, model_xgb2, model_xgb2_mono_tuned],
)

# Train-vs-test comparison on AUC, Brier and LogLoss, then plot it.
results = tsc.compare_accuracy_table(
    train_dataset="train",
    test_dataset="test",
    metric=("AUC", "Brier", "LogLoss"),
)
results.plot()