### Robustness Analysis Example

#### Data Loading and Preparation

In [108]:
import pandas as pd
from modeva import DataSet # Import Data Processing Library
# Read the credit example CSV into a pandas DataFrame.
df = pd.read_csv("credit_example.csv")

# Create an empty modeva DataSet holder and hand it the DataFrame.
ds = DataSet()
ds.load_dataframe(data=df)

# Last expression of the cell: display the loaded frame.
df

Unnamed: 0,Mortgage,Balance,Amount Past Due,Delinquency,Inquiry,Open Trade,Utilization,Gender,Race,Status
0,139734.22,2717.87,0.00,0,0,0,0.785162,0,0,0
1,243359.62,193.60,0.00,0,0,0,0.254759,0,0,1
2,187784.19,395.05,0.00,0,1,0,0.360995,0,0,1
3,594626.89,180.94,0.00,0,0,0,0.128144,0,1,1
4,166771.42,1241.13,0.00,0,0,0,0.702958,0,0,0
...,...,...,...,...,...,...,...,...,...,...
19995,226806.34,670.99,0.00,0,1,0,0.922122,0,1,1
19996,308625.65,3223.94,0.00,0,0,0,0.989716,1,1,1
19997,375035.34,133.05,131.15,1,0,0,0.092523,0,0,1
19998,165377.42,2256.07,0.00,0,0,0,0.630330,1,0,1


##### Data Preprocessing

In [109]:
# Encode categorical variables with the ordinal method.
ds.encode_categorical(method="ordinal")

# Standardize every feature the dataset reports as numerical.
numerical_features = tuple(ds.feature_names_numerical)
ds.scale_numerical(features=numerical_features, method="standardize")

# Exclude Gender and Race from modeling.
ds.set_inactive_features(["Gender", "Race"])

# Status is the target column.
ds.set_target("Status")

# Run pre-processing.
ds.preprocess()

# Random train/test split with test_ratio=0.2.
ds.set_random_split(test_ratio=0.2)

#### Modeling using GLM, GAMI-Tree (XGBoost depth-2), and GAMI-Net

In [111]:
# GLM: logistic regression model
from modeva.models import MoLogisticRegression

model_glm = MoLogisticRegression(
    name="GLM",
    feature_names=ds.feature_names,
    feature_types=ds.feature_types,
)
# Fit on the training split prepared by the dataset.
model_glm.fit(ds.train_x, ds.train_y)

# GAMI-Tree: XGBoost classifier restricted to max_depth=2
from modeva.models import MoXGBClassifier

model_xgb2 = MoXGBClassifier(name="XGB2_model", max_depth=2)
# Fit on the training split prepared by the dataset.
model_xgb2.fit(ds.train_x, ds.train_y)

# GAMI-Net: classifier with monotonicity settings on four features
from modeva.models import MoGAMINetClassifier

model_GAMI = MoGAMINetClassifier(
    name="GAMI_Net",
    feature_names=ds.feature_names,
    subnet_size_main_effect=(20,),
    subnet_size_interaction=(20, 20),
    learning_rates=(0.001, 0.001, 0.001),
    reg_mono=0.1,
    # Features listed as monotonically increasing / decreasing.
    mono_increasing_list=["Mortgage", "Balance"],
    mono_decreasing_list=("Delinquency", "Utilization"),
)

# Train with input ds.train_x and target ds.train_y (flattened with ravel()).
model_GAMI.fit(ds.train_x, ds.train_y.ravel())

##### Compare performance

In [115]:
# TestSuite is not imported anywhere earlier in this notebook (only DataSet is),
# so import it here; without it this cell fails with NameError on a fresh kernel.
from modeva import TestSuite

# Build one test suite over the three fitted models so they can be compared
# side by side on the same dataset.
tsc = TestSuite(ds, models=[model_xgb2, model_GAMI, model_glm])

# Accuracy table on the "train" and "test" splits for AUC, Brier and LogLoss.
results = tsc.compare_accuracy_table(
    train_dataset="train",
    test_dataset="test",
    metric=("AUC", "Brier", "LogLoss"),
)

# Last expression of the cell: display the comparison table.
results.table

Unnamed: 0_level_0,XGB2_model,XGB2_model,XGB2_model,GAMI_Net,GAMI_Net,GAMI_Net,GLM,GLM,GLM
Unnamed: 0_level_1,AUC,Brier,LogLoss,AUC,Brier,LogLoss,AUC,Brier,LogLoss
train,0.8513,0.156304,0.473657,0.839359,0.162869,0.492682,0.809769,0.178494,0.535235
test,0.848288,0.157486,0.479782,0.843605,0.160227,0.486758,0.810225,0.177499,0.530331
GAP,-0.003012,0.001182,0.006125,0.004246,-0.002642,-0.005925,0.000457,-0.000995,-0.004904


#### Robustness Analysis

In [116]:
# Robustness comparison of the three models held by tsc:
# features are perturbed with the "quantile" method at each noise level
# and AUC is the reported metric.
results = tsc.compare_robustness(
    perturb_features=None,
    noise_levels=(0.1, 0.2, 0.3, 0.4),
    perturb_method="quantile",
    metric="AUC",
)
results.plot()

##### Robustness Clustering

In [118]:
# Robustness check on the depth-2 XGBoost model alone.
ts = TestSuite(ds, model=model_xgb2)

results = ts.diagnose_residual_cluster(
    dataset="test",                        # split to diagnose
    response_type="abs_residual_perturb",  # response used for robustness clustering
    metric="AUC",                          # performance metric
    n_clusters=10,                         # how many clusters to form
    cluster_method="pam",                  # clustering method
    sample_size=2000,                      # sample size
    rf_n_estimators=100,                   # number of trees
    rf_max_depth=5,                        # maximum tree depth
)

# Plot the residuals per cluster.
results.plot("cluster_residual", figsize=(6, 4))

In [119]:
# Feature importance plot from the residual-cluster diagnosis above.
results.plot(
    "feature_importance",
    figsize=(6, 4),
)

In [121]:
# Inspect the data distribution of one cluster from the diagnosis results.
cluster_id = 1  # which cluster to examine

# The selected cluster's "data_info" entry supplies the keyword arguments
# that tell the drift test which data to use.
cluster_data_info = results.value["clusters"][cluster_id]["data_info"]

data_results = ds.data_drift_test(
    **cluster_data_info,
    distance_metric="PSI",  # distance measured with PSI
    psi_method="uniform",   # "uniform" PSI method
    psi_bins=10,            # number of PSI bins
)

# Summary plot of the data drift test.
data_results.plot("summary", figsize=(6, 4))

In [122]:
# Density plot of the drift test for the "Utilization" feature.
data_results.plot(
    name=("density", "Utilization"),
    figsize=(6, 4),
)