In [1]:
import sys

sys.path.append('../')

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from probabilistic_covshift.constants.automl_constants import AutoMLConfig as AutoMLConfig
from probabilistic_covshift.constants.automl_constants import H2OServerInfo as H2OServerInfo
from probabilistic_covshift.constants.main_constants import OriginFeatures as OriginFeatures
from probabilistic_covshift.constants.main_constants import WeightFeatures as WeightFeatures
from probabilistic_covshift.probabilistic_classification_covshift import ProbabilisticClassification

In [2]:
# Spark session for this example: application name 'main', master 'local[4]'.
# getOrCreate() hands back an already-active session when there is one.
spark = (
    SparkSession.builder
    .appName('main')
    .master('local[4]')
    .getOrCreate()
)

In [3]:
# Source sample. The data is one eight-row cycle repeated six times (48 rows),
# so only the cycle is spelled out; tiling it yields the rows in the same order
# as listing all 48 by hand.
source_cycle = [
    ('A', 38.9, 40.0, 55, 'E', 10.0),
    ('A', 88.9, 50.0, 15, 'E', 20.0),
    ('B', 18.9, 40.0, 55, 'P', 30.0),
    ('A', 38.9, 50.0, 15, 'E', 10.0),
    ('B', 48.9, 40.0, 55, 'P', 20.0),
    ('B', 58.9, 50.0, 15, 'E', 30.0),
    ('A', 38.9, 40.0, 55, 'P', 10.0),
    ('B', 98.9, 50.0, 15, 'E', 20.0),
]
source_columns = ['col_a', 'col_b', 'col_c', 'col_d', 'col_e', 'col_f']

source_df = spark.createDataFrame(source_cycle * 6, source_columns)

In [4]:
# Target sample: 36 explicit rows, one per line, sharing the source sample's
# column names.
target_rows = [
    ('A', 48.9, 40.0, 55, 'E', 10.0),
    ('A', 58.9, 50.0, 15, 'E', 10.0),
    ('B', 38.9, 40.0, 55, 'P', 30.0),
    ('A', 48.9, 50.0, 15, 'E', 10.0),
    ('B', 58.9, 40.0, 55, 'P', 10.0),
    ('B', 38.9, 50.0, 15, 'E', 30.0),
    ('A', 48.9, 40.0, 55, 'P', 10.0),
    ('B', 58.9, 50.0, 15, 'E', 10.0),
    ('B', 38.9, 50.0, 15, 'E', 20.0),
    ('A', 48.9, 40.0, 55, 'E', 10.0),
    ('A', 58.9, 50.0, 15, 'E', 10.0),
    ('B', 38.9, 40.0, 55, 'P', 30.0),
    ('A', 48.9, 50.0, 15, 'E', 10.0),
    ('B', 38.9, 40.0, 55, 'P', 10.0),
    ('B', 38.9, 50.0, 15, 'E', 30.0),
    ('A', 48.9, 40.0, 55, 'P', 10.0),
    ('B', 38.9, 50.0, 15, 'E', 10.0),
    ('B', 38.9, 50.0, 15, 'E', 20.0),
    ('A', 58.9, 40.0, 55, 'E', 10.0),
    ('A', 38.9, 50.0, 15, 'E', 10.0),
    ('B', 38.9, 40.0, 55, 'P', 30.0),
    ('A', 58.9, 50.0, 15, 'E', 10.0),
    ('B', 148.9, 140.0, 55, 'P', 10.0),
    ('B', 38.9, 50.0, 15, 'E', 30.0),
    ('A', 58.9, 40.0, 55, 'P', 110.0),
    ('B', 198.9, 150.0, 15, 'E', 10.0),
    ('B', 38.9, 50.0, 15, 'E', 20.0),
    ('A', 58.9, 40.0, 55, 'E', 110.0),
    ('A', 38.9, 50.0, 15, 'E', 10.0),
    ('B', 38.9, 140.0, 55, 'P', 30.0),
    ('A', 58.9, 50.0, 15, 'E', 110.0),
    ('B', 38.9, 40.0, 55, 'P', 10.0),
    ('B', 38.9, 150.0, 15, 'E', 30.0),
    ('A', 58.9, 40.0, 55, 'P', 110.0),
    ('B', 38.9, 50.0, 15, 'E', 10.0),
    ('B', 38.9, 150.0, 15, 'E', 20.0),
]
target_columns = ['col_a', 'col_b', 'col_c', 'col_d', 'col_e', 'col_f']

target_df = spark.createDataFrame(target_rows, target_columns)

In [5]:
# Settings passed to ProbabilisticClassification, built one section at a time
# and then assembled in a fixed key order.

# Label / weight column names plus two artefact paths (relative to the
# notebook's working directory).
data_section = {
    AutoMLConfig.LABEL_COL: OriginFeatures.ORIGIN,
    AutoMLConfig.WEIGHT_COL: WeightFeatures.WEIGHT,
    AutoMLConfig.BASE_TABLE_PATH: 'data/base_table.parquet',
    AutoMLConfig.WEIGHT_PATH: 'data/weight.csv',
}

# Address of the H2O server; note the port is given as a string.
h2o_server_section = {
    H2OServerInfo.IP: 'localhost',
    H2OServerInfo.PORT: '54321',
}

# NOTE(review): the recorded run log reports that H2O ignores nfolds when a
# fold column is supplied — confirm how the library uses NFOLDS.
cross_val_section = {
    AutoMLConfig.FOLD_COL: 'fold',
    AutoMLConfig.NFOLDS: 8,
}

# AutoML budget (runtime in seconds, number of models) and the metric used
# for both early stopping and leaderboard sorting.
modeling_section = {
    AutoMLConfig.MAX_RUNTIME_SECS: 3600,
    AutoMLConfig.MAX_MODELS: 10,
    AutoMLConfig.STOPPING_METRIC: 'logloss',
    AutoMLConfig.SORT_METRIC: 'logloss',
}

# Algorithm families left out of the AutoML search.
excluded_algos = ['StackedEnsemble', 'DeepLearning']

model_section = {
    AutoMLConfig.BEST_MODEL_PATH: 'data/model/',
}

automl_conf = {
    AutoMLConfig.DATA: data_section,
    AutoMLConfig.SERVER_CONN_INFO: h2o_server_section,
    AutoMLConfig.CROSS_VAL: cross_val_section,
    AutoMLConfig.MODELING: modeling_section,
    AutoMLConfig.EXCLUDE_ALGOS: excluded_algos,
    AutoMLConfig.MODEL: model_section,
    AutoMLConfig.SEED: 23,
}

In [6]:
# Wire the source sample, the target sample and the AutoML settings into the
# project's ProbabilisticClassification object.
pc = ProbabilisticClassification(source_df, target_df, automl_conf)
# Run the pipeline. The captured output shows it connecting to an H2O instance,
# training AutoML models, printing the leaderboard and cross-validation metrics,
# then loading the best model, predicting and exporting a file.
# NOTE(review): presumably requires an H2O server already reachable at the
# configured ip/port — confirm before a fresh Restart & Run All.
pc.run()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,1 hour 45 mins
H2O cluster timezone:,Asia/Jakarta
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.2
H2O cluster version age:,2 months and 5 days
H2O cluster name:,H2O_from_python_albertus_kelvin_xeikza
H2O cluster total nodes:,1
H2O cluster free memory:,3.533 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


Parse progress: |█████████████████████████████████████████████████████████| 100%
Base table inferred column types: {'col_b': 'real', 'col_c': 'int', 'col_d': 'int', 'col_f': 'int', 'origin': 'enum', 'row_id': 'int'}


INFO:probabilistic_covshift.automl.trainer:Base table inferred column types: {'col_b': 'real', 'col_c': 'int', 'col_d': 'int', 'col_f': 'int', 'origin': 'enum', 'row_id': 'int'}


AutoML progress: |
19:51:51.192: Fold column fold will be used for cross-validation. nfolds parameter will be ignored.

██████████████
19:52:00.250: Skipping training of model GBM_5_AutoML_20200325_195151 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20200325_195151.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 84.0.


██████████████████████████████████████████| 100%


model_id,logloss,auc,aucpr,mean_per_class_error,rmse,mse,training_time_ms,predict_time_per_row_ms
GBM_1_AutoML_20200325_195151,0.186556,0.968171,0.938223,0.0833333,0.245425,0.0602334,72,0.032016
XRT_1_AutoML_20200325_195151,0.271628,0.96412,0.935878,0.0833333,0.275913,0.0761279,38,0.018486
DRF_1_AutoML_20200325_195151,0.273576,0.968171,0.938223,0.0833333,0.279075,0.0778831,30,0.019686
GBM_3_AutoML_20200325_195151,0.48854,0.855324,0.801219,0.180556,0.399826,0.159861,34,0.019885
GBM_4_AutoML_20200325_195151,0.494387,0.832755,0.787799,0.225694,0.403841,0.163087,33,0.02046
GBM_2_AutoML_20200325_195151,0.496639,0.83044,0.764129,0.232639,0.40595,0.164795,36,0.018627
XGBoost_3_AutoML_20200325_195151,0.614755,0.74537,0.592607,0.28125,0.461703,0.213169,63,0.014403
GLM_1_AutoML_20200325_195151,0.619592,0.647569,0.634618,0.361111,0.467038,0.218125,14,0.017523
XGBoost_1_AutoML_20200325_195151,0.69784,0.445602,0.398517,0.5,0.502057,0.252061,37,0.01466
XGBoost_2_AutoML_20200325_195151,0.698578,0.320891,0.311554,0.5,0.502556,0.252562,27,0.015767


Leaderboard: 



INFO:probabilistic_covshift.automl.trainer:Leaderboard: 



Cross validation metrics summary


INFO:probabilistic_covshift.automl.util:Cross validation metrics summary



Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid
0,accuracy,0.96666664,0.06172134,0.8666667,0.8666667,1.0,1.0,1.0,1.0,1.0,1.0
1,auc,0.98830354,0.021959204,0.96,0.9464286,1.0,1.0,1.0,1.0,1.0,1.0
2,aucpr,0.64072007,0.2852017,0.88166666,0.8107607,0.5,0.6666667,0.8,0.0,0.8,0.6666667
3,err,0.033333335,0.06172134,0.13333334,0.13333334,0.0,0.0,0.0,0.0,0.0,0.0
4,err_count,0.5,0.9258201,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
5,f0point5,0.9649015,0.06500317,0.86206895,0.85714287,1.0,1.0,1.0,1.0,1.0,1.0
6,f1,0.97077924,0.055859257,0.90909094,0.85714287,1.0,1.0,1.0,1.0,1.0,1.0
7,f2,0.97733516,0.05039543,0.96153843,0.85714287,1.0,1.0,1.0,1.0,1.0,1.0
8,lift_top_group,3.0636904,1.611714,1.5,2.142857,5.0,2.6666667,2.6,6.0,1.6,3.0
9,logloss,0.16246316,0.09694263,0.2795328,0.2753104,0.12911491,0.20788385,0.20727022,0.00010080564,0.10568059,0.09481172



See the whole table with table.as_data_frame()


Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid
0,accuracy,0.96666664,0.06172134,0.8666667,0.8666667,1.0,1.0,1.0,1.0,1.0,1.0
1,auc,0.98830354,0.021959204,0.96,0.9464286,1.0,1.0,1.0,1.0,1.0,1.0
2,aucpr,0.64072007,0.2852017,0.88166666,0.8107607,0.5,0.6666667,0.8,0.0,0.8,0.6666667
3,err,0.033333335,0.06172134,0.13333334,0.13333334,0.0,0.0,0.0,0.0,0.0,0.0
4,err_count,0.5,0.9258201,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
5,f0point5,0.9649015,0.06500317,0.86206895,0.85714287,1.0,1.0,1.0,1.0,1.0,1.0
6,f1,0.97077924,0.055859257,0.90909094,0.85714287,1.0,1.0,1.0,1.0,1.0,1.0
7,f2,0.97733516,0.05039543,0.96153843,0.85714287,1.0,1.0,1.0,1.0,1.0,1.0
8,lift_top_group,3.0636904,1.611714,1.5,2.142857,5.0,2.6666667,2.6,6.0,1.6,3.0
9,logloss,0.16246316,0.09694263,0.2795328,0.2753104,0.12911491,0.20788385,0.20727022,0.00010080564,0.10568059,0.09481172



See the whole table with table.as_data_frame()


INFO:probabilistic_covshift.automl.util:


Cross validation model performance


INFO:probabilistic_covshift.automl.util:Cross validation model performance



ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.06023344517183121
RMSE: 0.24542502963599946
LogLoss: 0.1865563749719294
Mean Per-Class Error: 0.08333333333333326
AUC: 0.9681712962962963
AUCPR: 0.9382228444309199
Gini: 0.9363425925925926

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6378108050725811: 


Unnamed: 0,Unnamed: 1,source,target,Error,Rate
0,source,48.0,0.0,0.0,(0.0/48.0)
1,target,6.0,30.0,0.1667,(6.0/36.0)
2,Total,54.0,30.0,0.0714,(6.0/84.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.637811,0.909091,26.0
1,max f2,0.303129,0.923913,32.0
2,max f0point5,0.637811,0.961538,26.0
3,max accuracy,0.637811,0.928571,26.0
4,max precision,0.999816,1.0,0.0
5,max recall,0.092886,1.0,39.0
6,max specificity,0.999816,1.0,0.0
7,max absolute_mcc,0.637811,0.860663,26.0
8,max min_per_class_accuracy,0.364555,0.875,31.0
9,max mean_per_class_accuracy,0.637811,0.916667,26.0



Gains/Lift Table: Avg response rate: 42.86 %, avg score: 42.26 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.011905,0.999664,2.333333,2.333333,1.0,0.999816,1.0,0.999816,0.027778,0.027778,133.333333,133.333333
1,,2,0.02381,0.999163,2.333333,2.333333,1.0,0.999633,1.0,0.999725,0.027778,0.055556,133.333333,133.333333
2,,3,0.035714,0.998818,2.333333,2.333333,1.0,0.998921,1.0,0.999457,0.027778,0.083333,133.333333,133.333333
3,,4,0.047619,0.998655,2.333333,2.333333,1.0,0.998711,1.0,0.99927,0.027778,0.111111,133.333333,133.333333
4,,5,0.059524,0.998501,2.333333,2.333333,1.0,0.998536,1.0,0.999123,0.027778,0.138889,133.333333,133.333333
5,,6,0.107143,0.997284,2.333333,2.333333,1.0,0.998051,1.0,0.998647,0.111111,0.25,133.333333,133.333333
6,,7,0.154762,0.991797,2.333333,2.333333,1.0,0.994838,1.0,0.997475,0.111111,0.361111,133.333333,133.333333
7,,8,0.202381,0.981584,2.333333,2.333333,1.0,0.988946,1.0,0.995468,0.111111,0.472222,133.333333,133.333333
8,,9,0.297619,0.962344,2.333333,2.333333,1.0,0.971351,1.0,0.987751,0.222222,0.694444,133.333333,133.333333
9,,10,0.404762,0.447427,1.296296,2.058824,0.555556,0.715528,0.882353,0.915692,0.138889,0.833333,29.62963,105.882353





ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.06023344517183121
RMSE: 0.24542502963599946
LogLoss: 0.1865563749719294
Mean Per-Class Error: 0.08333333333333326
AUC: 0.9681712962962963
AUCPR: 0.9382228444309199
Gini: 0.9363425925925926

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6378108050725811: 


Unnamed: 0,Unnamed: 1,source,target,Error,Rate
0,source,48.0,0.0,0.0,(0.0/48.0)
1,target,6.0,30.0,0.1667,(6.0/36.0)
2,Total,54.0,30.0,0.0714,(6.0/84.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.637811,0.909091,26.0
1,max f2,0.303129,0.923913,32.0
2,max f0point5,0.637811,0.961538,26.0
3,max accuracy,0.637811,0.928571,26.0
4,max precision,0.999816,1.0,0.0
5,max recall,0.092886,1.0,39.0
6,max specificity,0.999816,1.0,0.0
7,max absolute_mcc,0.637811,0.860663,26.0
8,max min_per_class_accuracy,0.364555,0.875,31.0
9,max mean_per_class_accuracy,0.637811,0.916667,26.0



Gains/Lift Table: Avg response rate: 42.86 %, avg score: 42.26 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.011905,0.999664,2.333333,2.333333,1.0,0.999816,1.0,0.999816,0.027778,0.027778,133.333333,133.333333
1,,2,0.02381,0.999163,2.333333,2.333333,1.0,0.999633,1.0,0.999725,0.027778,0.055556,133.333333,133.333333
2,,3,0.035714,0.998818,2.333333,2.333333,1.0,0.998921,1.0,0.999457,0.027778,0.083333,133.333333,133.333333
3,,4,0.047619,0.998655,2.333333,2.333333,1.0,0.998711,1.0,0.99927,0.027778,0.111111,133.333333,133.333333
4,,5,0.059524,0.998501,2.333333,2.333333,1.0,0.998536,1.0,0.999123,0.027778,0.138889,133.333333,133.333333
5,,6,0.107143,0.997284,2.333333,2.333333,1.0,0.998051,1.0,0.998647,0.111111,0.25,133.333333,133.333333
6,,7,0.154762,0.991797,2.333333,2.333333,1.0,0.994838,1.0,0.997475,0.111111,0.361111,133.333333,133.333333
7,,8,0.202381,0.981584,2.333333,2.333333,1.0,0.988946,1.0,0.995468,0.111111,0.472222,133.333333,133.333333
8,,9,0.297619,0.962344,2.333333,2.333333,1.0,0.971351,1.0,0.987751,0.222222,0.694444,133.333333,133.333333
9,,10,0.404762,0.447427,1.296296,2.058824,0.555556,0.715528,0.882353,0.915692,0.138889,0.833333,29.62963,105.882353





INFO:probabilistic_covshift.automl.util:


Threshold that maximizes F1: 0.6378108050725811


INFO:probabilistic_covshift.automl.util:Threshold that maximizes F1: 0.6378108050725811


F1: 0.9090909090909091


INFO:probabilistic_covshift.automl.util:F1: 0.9090909090909091


/Users/albertus.kelvin/Documents/PROJECTS/probabilistic-covshift/example/data/model/GBM_1_AutoML_20200325_195151
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Load the model and leaderboard


INFO:probabilistic_covshift.automl.predictor:Load the model and leaderboard


Type(model): <class 'h2o.estimators.gbm.H2OGradientBoostingEstimator'>


INFO:probabilistic_covshift.automl.predictor:Type(model): <class 'h2o.estimators.gbm.H2OGradientBoostingEstimator'>



Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,89.0,89.0,19129.0,3.0,6.0,5.123596,5.0,20.0,12.505618


Loaded model: 


INFO:probabilistic_covshift.automl.predictor:Loaded model: 


gbm prediction progress: |████████████████████████████████████████████████| 100%
Export File progress: |███████████████████████████████████████████████████| 100%
