#### Outcome Analysis: Reliability


In [19]:
from modeva import DataSet
from modeva import TestSuite
from modeva.models import MoXGBRegressor

#### Data Loading and Preparation

In [2]:
# Create a Modeva DataSet container and load the "BikeSharing" data by name.
ds = DataSet()
ds.load(name="BikeSharing")
# Last expression of the cell: display the loaded table as the cell output.
ds.data

✓ Auth code found in local storage.
Authenticating Modeva...
✓ License is active and valid.
✓ Authenticated successfully!


Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,16
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,40
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,32
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,13
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,119
17375,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,89
17376,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,90
17377,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,61


In [3]:
# Configure the shared `ds` object step by step.
# NOTE(review): each call mutates `ds`; the relative order is presumably
# significant, so keep it as is unless verified against the Modeva docs.
# Split into the "train"/"test" sets used below; no ratio or seed is passed,
# so the library defaults apply — TODO confirm the default seed is fixed.
ds.set_random_split()
# Use the "cnt" column as the prediction target.
ds.set_target("cnt")
# Apply a log1p transform to the target column.
# NOTE(review): downstream metrics (MSE, interval widths) are then presumably
# reported on the log1p scale rather than in raw counts — confirm.
ds.scale_numerical(features=("cnt",), method="log1p")
# Restrict the model inputs to these eight columns; season, yr, workingday and
# atemp from the loaded table are left out.
ds.set_active_features(["mnth", "hr", "holiday", "weekday", "weathersit", "temp", "hum", "windspeed"])
ds.preprocess()
# Last expression of the cell: display the resulting feature names.
ds.feature_names

['mnth', 'hr', 'holiday', 'weekday', 'weathersit', 'temp', 'hum', 'windspeed']

#### Build XGBoost Models (Depth 2 and Depth 3)

In [18]:
# XGBoost regressor with depth-2 trees (interpretable).
model_xgb2 = MoXGBRegressor(name="XGB_model_2", max_depth=2)
# Train on the training split: inputs ds.train_x, target ds.train_y.
model_xgb2.fit(ds.train_x, ds.train_y)

# XGBoost regressor with depth-3 trees (a plain XGBoost model, not a mixture of experts).
model_xgb3 = MoXGBRegressor(name="XGB_model_3", max_depth=3)
# Train on the same training split so the two models are directly comparable.
model_xgb3.fit(ds.train_x, ds.train_y)

# Performance comparison of the two models on the train and test splits.
# TestSuite comes from the notebook's top import cell; it is not re-imported here.
tsc = TestSuite(ds, models=[model_xgb2, model_xgb3])
results = tsc.compare_accuracy_table(train_dataset="train", test_dataset="test",
                                     metric=("MAE", "MSE", "R2"))
# Plot the MSE comparison from the accuracy results.
results.plot("MSE", figsize=(6,4))

#### Reliability Analysis

In [20]:
# Reliability comparison of the models held by `tsc`.
# The resulting table reports Avg.Width and Avg.Coverage per model.
# NOTE(review): alpha=0.1 presumably corresponds to a 90% target coverage
# (the reported Avg.Coverage values are ~0.90 and ~0.89) — confirm in the docs.
reliability_settings = dict(
    train_dataset="train",
    test_dataset="test",
    test_size=0.5,
    alpha=0.1,
    max_depth=5,
    random_state=0,
)
results = tsc.compare_reliability(**reliability_settings)
# Last expression of the cell: display the comparison table.
results.table

Unnamed: 0_level_0,XGB_model_2,XGB_model_2,XGB_model_3,XGB_model_3
Unnamed: 0_level_1,Avg.Width,Avg.Coverage,Avg.Width,Avg.Coverage
0,1.299987,0.89931,1.171526,0.894131


In [24]:
# Single-model TestSuite for the depth-2 XGBoost model.
ts_xgb2 = TestSuite(ds, model=model_xgb2)
# Cluster the test data and summarize each cluster; the resulting table has
# one row per cluster with MSE, Size and pi_width columns.
results_xgb2 = ts_xgb2.diagnose_residual_cluster(
    dataset="test", # run the diagnosis on the "test" split
    response_type="pi_width", # quantity reported per cluster (pi_width column of the table)
    metric="MSE", # performance metric reported per cluster
    n_clusters=10, # number of clusters (the table below has 10 rows)
    cluster_method="pam", # clustering method; presumably partitioning around medoids — confirm
    sample_size=2000, # presumably caps the number of rows used for clustering — confirm
    rf_n_estimators=100, # presumably the tree count of an internal random forest — confirm
    rf_max_depth=5, # presumably the max depth of those trees — confirm
)
results_xgb2.table # display the per-cluster table (MSE, Size, pi_width)

Unnamed: 0,MSE,Size,pi_width
7,0.536379,113.0,1.958585
3,0.565227,124.0,1.806343
8,0.426081,142.0,1.720377
1,0.275907,194.0,1.625687
5,0.319123,120.0,1.57444
0,0.234538,197.0,1.570958
4,0.286485,163.0,1.464876
9,0.224802,218.0,1.307055
2,0.215491,211.0,1.306436
6,0.098305,256.0,1.115857


In [26]:
# Plot the residual-cluster diagnosis result for the depth-2 model.
results_xgb2.plot(figsize=(6,4))

In [28]:
# Inspect how the data of one cluster is distributed.
# Cluster 7 is the first row of the residual-cluster table and has the
# largest pi_width there (1.958585), so it is the one examined.
cluster_id = 7
# The stored "data_info" entry of that cluster is unpacked as keyword
# arguments for the drift test.
cluster_data_info = results_xgb2.value["clusters"][cluster_id]["data_info"]
data_results = ds.data_drift_test(
    **cluster_data_info,
    distance_metric="PSI",   # distance metric: PSI
    psi_method="uniform",    # PSI method: "uniform"
    psi_bins=10,             # number of PSI bins
)
# Summary plot of the data drift test.
data_results.plot("summary", figsize=(6,4))

In [29]:
# Density plot for the 'mnth' feature from the data drift test result.
data_results.plot(name=('density','mnth'), figsize=(6,4)) 