#### Outcome Analysis
##### - Performance Weakness
##### - Resilience
##### - Reliability
##### - Robustness

In [1]:
from modeva import DataSet
from modeva import TestSuite
from modeva.models import MoElasticNet, MoMoERegressor, MoXGBRegressor

#### Data Loading and Preparation

In [2]:
# Create an empty Modeva DataSet container; every later cell works through `ds`.
ds = DataSet()
# Load the data identified by the name "BikeSharing" (no file path is given, so
# presumably a dataset bundled with Modeva — TODO confirm).
ds.load(name="BikeSharing")
# Bare last expression: render the loaded data as this cell's output.
ds.data

✓ Auth code found in local storage.
Authenticating Modeva...
✓ License is active and valid.
✓ Authenticated successfully!


Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,16
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,40
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,32
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,13
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,119
17375,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,89
17376,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,90
17377,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,61


In [3]:
# Response column and the predictors the models are allowed to use.
target_name = "cnt"
active_features = ["mnth", "hr", "holiday", "weekday", "weathersit", "temp", "hum", "windspeed"]

# Split the data, declare the target, and request a log1p scaling of the target column.
ds.set_random_split()
ds.set_target(target_name)
ds.scale_numerical(features=(target_name,), method="log1p")

# Restrict modelling to the chosen predictors, then run the preprocessing.
ds.set_active_features(active_features)
ds.preprocess()

# Bare last expression: show the resulting feature names as the cell output.
ds.feature_names

['mnth', 'hr', 'holiday', 'weekday', 'weathersit', 'temp', 'hum', 'windspeed']

#### Build GLM, XGBoost (Depth-2 and Depth-5), and MoE of XGBoost Depth-2

In [4]:
# Fit four regressors on the training split and compare their accuracy.
# All models are trained on ds.train_x (inputs) and ds.train_y (target).

# GLM baseline: elastic net with alpha=0.01, given the dataset's feature names/types.
model_glm = MoElasticNet(
    name="GLM",
    feature_names=ds.feature_names,
    feature_types=ds.feature_types,
    alpha=0.01,
)
model_glm.fit(ds.train_x, ds.train_y)

# XGBoost, depth 2 (interpretable)
model_xgb2 = MoXGBRegressor(name="XGB_model_2", max_depth=2)
model_xgb2.fit(ds.train_x, ds.train_y)

# XGBoost, depth 5 (black box)
model_xgb5 = MoXGBRegressor(name="XGB_model_5", max_depth=5)
model_xgb5.fit(ds.train_x, ds.train_y)

# Mixture of experts of depth-2 XGBoost models (interpretable)
model_moe = MoMoERegressor(name="MOE_XGB2", max_depth=2, n_estimators=200, n_clusters=20, n_jobs=20)
model_moe.fit(ds.train_x, ds.train_y)

# Performance comparison.
# TestSuite is already imported in the notebook's import cell, so it is not
# re-imported here (imports belong in one place at the top of the notebook).
tsc = TestSuite(ds, models=[model_glm, model_xgb2, model_xgb5, model_moe])
results = tsc.compare_accuracy_table(
    train_dataset="train",
    test_dataset="test",
    metric=("MAE", "MSE", "R2"),
)
# Bare last expression: plot the MSE comparison as the cell output.
results.plot("MSE", figsize=(6, 4))

#### Permutation Feature Importance

In [5]:
# Dedicated TestSuite for the depth-2 XGBoost model.
ts_xgb_2 = TestSuite(ds, model_xgb2)

# Permutation feature importance on the test split; sample size, repeat count
# and seed are fixed here so the call is easy to tune in one place.
pfi_options = dict(
    dataset="test",
    sample_size=2000,
    n_repeats=5,
    random_state=0,
)
results = ts_xgb_2.explain_pfi(**pfi_options)

# Bare last expression: draw the importance chart (n_bars=5) as the cell output.
results.plot(n_bars=5, figsize=(6, 3))

#### Feature Importance

In [6]:
# Feature importance as reported by interpret_fi() for the model wrapped in ts_xgb_2
# (the depth-2 XGBoost model). Note: this rebinds `results` from the previous cell.
results = ts_xgb_2.interpret_fi()
# n_bars=5 — presumably limits the chart to five features; TODO confirm against Modeva docs.
results.plot(n_bars=5, figsize=(6,3))

#### PDP Plot

In [7]:
# Partial dependence of the depth-2 XGBoost model on "hr", computed on the
# training split with a fixed seed; options gathered in one place for readability.
pdp_options = dict(
    features="hr",
    dataset="train",
    sample_size=2000,
    percentiles=(0, 1),
    grid_resolution=100,
    response_method="auto",
    random_state=0,
)
results = ts_xgb_2.explain_pdp(**pdp_options)

# Bare last expression: render the PDP figure as the cell output.
results.plot(figsize=(6, 3))

#### fANOVA Main Effect

In [8]:
# Effect of the single feature "hr" as reported by interpret_effects() for the
# depth-2 XGBoost model (the section header calls this the fANOVA main effect).
results = ts_xgb_2.interpret_effects(features="hr")
# Bare last expression: render the effect plot as the cell output.
results.plot(figsize=(6,3))

#### Residual Analysis

In [9]:
# Residual diagnosis for the depth-2 XGBoost model on the test split.
# The n_estimators/max_depth arguments configure the interpretable GBDT that the
# diagnosis trains (presumably fitted to the residuals — TODO confirm in Modeva docs).
results_xgb_res = ts_xgb_2.diagnose_residual_interpret(dataset='test', n_estimators=100, max_depth=2) # train interpretable GBDT model with depth-2
# Bare last expression: show the "feature_importance" view of that result (n_bars = 5).
results_xgb_res.plot("feature_importance", figsize=(6,3), n_bars = 5) # plot feature importance

In [10]:
# Requires `results_xgb_res` from the residual-diagnosis cell above.
# The result's `.value` mapping holds a TestSuite under the "TestSuite" key.
ts_residual = results_xgb_res.value["TestSuite"] # get the testsuite object
# Plot the effect of "hr" from that TestSuite on the test split
# (presumably the effect of "hr" on the residuals — verify against Modeva docs).
ts_residual.interpret_effects("hr", dataset="test").plot(figsize=(6,3)) 

In [11]:
# Test using random forest proximity: group test samples into clusters and
# summarise the depth-2 XGBoost model's residuals per cluster.
rf_cluster_options = dict(
    dataset="test",                 # dataset to use
    response_type="abs_residual",   # response type
    metric="MSE",                   # metric to use
    n_clusters=10,                  # number of clusters
    cluster_method="pam",           # clustering method
    sample_size=2000,               # sample size
    rf_n_estimators=100,            # number of trees
    rf_max_depth=5,                 # max depth of trees
)
results_RF = ts_xgb_2.diagnose_residual_cluster(**rf_cluster_options)

# Bare last expression: table of cluster performance.
results_RF.table

Unnamed: 0,MSE,Size,abs_residual
0,0.789152,184.0,0.757785
5,0.494682,201.0,0.604346
9,0.488497,152.0,0.533402
2,0.416254,239.0,0.528869
7,0.293216,243.0,0.427448
8,0.273597,430.0,0.41141
4,0.229774,333.0,0.373669
3,0.203313,590.0,0.347204
6,0.165594,551.0,0.310079
1,0.090054,553.0,0.241253


In [12]:
# Show cluster residuals: the "cluster_residual" view of the clustering result.
# Requires `results_RF` from the residual-clustering cell above.
results_RF.plot("cluster_residual", figsize=(6,4))

In [13]:
# Check data distribution for one residual cluster.
# Requires `results_RF` from the residual-clustering cell.
cluster_id = 0  # which cluster to inspect

# Per-cluster "data_info" mapping, forwarded to the drift test as keyword arguments.
cluster_data_info = results_RF.value["clusters"][cluster_id]["data_info"]

data_results = ds.data_drift_test(
    **cluster_data_info,
    distance_metric="PSI",   # distance metric using PSI
    psi_method="uniform",    # psi method using uniform distribution
    psi_bins=10,             # psi bins
)

# Bare last expression: plot the summary of the data drift test.
data_results.plot("summary", figsize=(6, 4))

In [14]:
# Density view of the drift-test result for feature "hr".
# Requires `data_results` from the data-drift cell above.
data_results.plot(name=('density','hr'), figsize=(6,4)) 

In [15]:
# Compare residual clusters across the three tree-based models on the test split.
# Note: this rebinds `tsc` and `results`, which earlier cells also assign.
tsc = TestSuite(ds, models=[model_xgb2, model_xgb5, model_moe])
results = tsc.compare_residual_cluster(dataset="test")
# Bare last expression: show the "cluster_performance" view as the cell output.
results.plot("cluster_performance")

In [16]:
# Second mixture-of-experts model: same depth and estimator count as the first
# MoE, but with 15 clusters and an explicit list of features passed as
# `cluster_features`.
cluster_features = ['hr', 'mnth', 'temp']

moe_2_options = dict(
    name="MOE_3_cluster_features",
    max_depth=2,
    n_estimators=200,
    n_clusters=15,
    feature_names=ds.feature_names,
    cluster_features=cluster_features,
)
model_moe_2 = MoMoERegressor(**moe_2_options)

# Train on the training split; the fit() return value is the cell output.
model_moe_2.fit(ds.train_x, ds.train_y)

In [18]:
# Repeat the residual-cluster comparison with the second MoE model (model_moe_2) added.
# Note: this rebinds `tsc` and `results` from the earlier comparison cell.
tsc = TestSuite(ds, models=[model_xgb2, model_xgb5, model_moe, model_moe_2])
results = tsc.compare_residual_cluster(dataset="test")
# Bare last expression: show the "cluster_performance" view as the cell output.
results.plot("cluster_performance")