In [1]:
import sys

sys.path.append('../')

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from probabilistic_covshift.constants.automl_constants import AutoMLConfig
from probabilistic_covshift.constants.automl_constants import H2OServerInfo
from probabilistic_covshift.constants.main_constants import OriginFeatures
from probabilistic_covshift.constants.main_constants import WeightFeatures
from probabilistic_covshift.probabilistic_classification_covshift import ProbabilisticClassification

In [2]:
spark = SparkSession.builder.appName('main').master('local[4]').getOrCreate()

In [3]:
source_df = spark.createDataFrame([
    (38.9, 40.0, 55, 10.0), (88.9, 50.0, 15, 20.0),
    (38.9, 50.0, 15, 10.0), (48.9, 40.0, 55, 20.0),
    (38.9, 40.0, 55, 10.0), (98.9, 50.0, 15, 20.0),
    (88.9, 50.0, 15, 20.0), (18.9, 40.0, 55, 30.0),
    (48.9, 40.0, 55, 20.0), (58.9, 50.0, 15, 30.0),
    (98.9, 50.0, 15, 20.0), (38.9, 40.0, 55, 10.0),
    (18.9, 40.0, 55, 30.0), (38.9, 50.0, 15, 10.0),
    (58.9, 50.0, 15, 30.0), (38.9, 40.0, 55, 10.0),
    (38.9, 40.0, 55, 10.0), (88.9, 50.0, 15, 20.0),
    (38.9, 50.0, 15, 10.0), (48.9, 40.0, 55, 20.0),
    (38.9, 40.0, 55, 10.0), (98.9, 50.0, 15, 20.0),
    (88.9, 50.0, 15, 20.0), (18.9, 40.0, 55, 30.0),
    (48.9, 40.0, 55, 20.0), (58.9, 50.0, 15, 30.0),
    (98.9, 50.0, 15, 20.0), (38.9, 40.0, 55, 10.0),
    (18.9, 40.0, 55, 30.0), (38.9, 50.0, 15, 10.0),
    (58.9, 50.0, 15, 30.0), (38.9, 40.0, 55, 10.0)],
    ['col_a', 'col_b', 'col_c', 'col_d'])

In [4]:
target_df = spark.createDataFrame([
    (18.9, 40.0, 95, 10.0), (38.9, 50.0, 15, 20.0),
    (18.9, 50.0, 95, 10.0), (38.9, 40.0, 55, 20.0),
    (18.9, 40.0, 95, 10.0), (38.9, 50.0, 15, 20.0),
    (18.9, 50.0, 95, 30.0), (38.9, 40.0, 55, 30.0),
    (18.9, 40.0, 95, 30.0), (38.9, 50.0, 15, 30.0),
    (38.9, 50.0, 95, 30.0), (18.9, 40.0, 55, 10.0),
    (38.9, 40.0, 95, 30.0), (18.9, 50.0, 15, 10.0),
    (38.9, 50.0, 95, 30.0), (18.9, 40.0, 55, 10.0),
    (38.9, 40.0, 55, 30.0), (58.9, 50.0, 15, 20.0),
    (38.9, 50.0, 15, 30.0), (58.9, 40.0, 55, 20.0),
    (38.9, 40.0, 55, 30.0), (58.9, 50.0, 15, 20.0),
    (58.9, 50.0, 15, 30.0), (58.9, 40.0, 55, 30.0),
    (58.9, 40.0, 55, 30.0), (58.9, 50.0, 15, 30.0),
    (58.9, 50.0, 15, 30.0), (58.9, 40.0, 55, 10.0),
    (58.9, 40.0, 55, 30.0), (58.9, 50.0, 15, 10.0),
    (58.9, 50.0, 15, 30.0), (58.9, 40.0, 55, 10.0)],
    ['col_a', 'col_b', 'col_c', 'col_d'])

# Compute weights

In [5]:
conf = {
    AutoMLConfig.DATA: {
        AutoMLConfig.LABEL_COL: OriginFeatures.ORIGIN,
        AutoMLConfig.WEIGHT_COL: WeightFeatures.WEIGHT,
        AutoMLConfig.BASE_TABLE_PATH: 'data/base_table.parquet',
        AutoMLConfig.WEIGHT_PATH: 'data/weight.csv'
    },
    AutoMLConfig.SERVER_CONN_INFO: {
        H2OServerInfo.IP: 'localhost',
        H2OServerInfo.PORT: '54321'
    },
    AutoMLConfig.CROSS_VAL: {
        AutoMLConfig.FOLD_COL: "fold",
        AutoMLConfig.NFOLDS: 8,
    },
    AutoMLConfig.MODELING: {
        AutoMLConfig.MAX_RUNTIME_SECS: 3600,
        AutoMLConfig.MAX_MODELS: 10,
        AutoMLConfig.STOPPING_METRIC: 'logloss',
        AutoMLConfig.SORT_METRIC: 'logloss'
    },
    AutoMLConfig.EXCLUDE_ALGOS: [
        "StackedEnsemble",
        "DeepLearning"
    ],
    AutoMLConfig.MODEL: {
        AutoMLConfig.BEST_MODEL_PATH: 'data/model/'
    },
    AutoMLConfig.SEED: 23
}

In [6]:
pc = ProbabilisticClassification(source_df, target_df, conf)
pc.run()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_191"; Java(TM) SE Runtime Environment (build 1.8.0_191-b12); Java HotSpot(TM) 64-Bit Server VM (build 25.191-b12, mixed mode)
  Starting server from /Users/albertus.kelvin/Documents/GLAIR_OWN/invstr-credit-scoring/env/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/c4/drbdtk0d67b4g99gp19b655c0000gp/T/tmpb4xrfjav
  JVM stdout: /var/folders/c4/drbdtk0d67b4g99gp19b655c0000gp/T/tmpb4xrfjav/h2o_albertus_kelvin_started_from_python.out
  JVM stderr: /var/folders/c4/drbdtk0d67b4g99gp19b655c0000gp/T/tmpb4xrfjav/h2o_albertus_kelvin_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,07 secs
H2O cluster timezone:,Asia/Jakarta
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.2
H2O cluster version age:,2 months and 5 days
H2O cluster name:,H2O_from_python_albertus_kelvin_l5wysn
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,0
H2O cluster allowed cores:,0


Parse progress: |█████████████████████████████████████████████████████████| 100%
Base table inferred column types: {'col_a': 'real', 'col_b': 'int', 'col_c': 'int', 'col_d': 'int', 'origin': 'enum', 'row_id': 'int'}


INFO:probabilistic_covshift.automl.trainer:Base table inferred column types: {'col_a': 'real', 'col_b': 'int', 'col_c': 'int', 'col_d': 'int', 'origin': 'enum', 'row_id': 'int'}


AutoML progress: |
20:50:13.449: Fold column fold will be used for cross-validation. nfolds parameter will be ignored.

███████████████
20:50:24.691: Skipping training of model GBM_5_AutoML_20200325_205013 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20200325_205013.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 64.0.


█████████████████████████████████████████| 100%


model_id,logloss,auc,aucpr,mean_per_class_error,rmse,mse,training_time_ms,predict_time_per_row_ms
GBM_1_AutoML_20200325_205013,0.158705,0.983398,0.95298,0.0625,0.23743,0.0563728,128,0.060321
XRT_1_AutoML_20200325_205013,0.277691,0.97168,0.909793,0.078125,0.285155,0.0813134,51,0.039699
DRF_1_AutoML_20200325_205013,0.313299,0.960938,0.869001,0.078125,0.30806,0.0949008,48,0.029327
GLM_1_AutoML_20200325_205013,0.646755,0.676758,0.677642,0.3125,0.480984,0.231345,23,0.034154
GBM_3_AutoML_20200325_205013,0.690877,0.615723,0.550971,0.34375,0.498013,0.248017,32,0.035117
XGBoost_1_AutoML_20200325_205013,0.702867,0.380371,0.404435,0.5,0.504831,0.254855,68,0.028668
XGBoost_2_AutoML_20200325_205013,0.703422,0.363281,0.380878,0.5,0.505105,0.255131,42,0.030702
XGBoost_3_AutoML_20200325_205013,0.703889,0.535156,0.499382,0.5,0.504634,0.254656,87,0.024903
GBM_4_AutoML_20200325_205013,0.706368,0.600098,0.536821,0.328125,0.503695,0.253708,32,0.029798
GBM_2_AutoML_20200325_205013,0.714846,0.57959,0.531972,0.328125,0.507338,0.257392,39,0.028417


Leaderboard: 



INFO:probabilistic_covshift.automl.trainer:Leaderboard: 



Cross validation metrics summary


INFO:probabilistic_covshift.automl.util:Cross validation metrics summary



Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid
0,accuracy,0.9704861,0.054775022,0.8888889,1.0,1.0,1.0,0.875,1.0,1.0,1.0
1,auc,0.9859375,0.026252126,0.95,1.0,1.0,1.0,0.9375,1.0,1.0,1.0
2,aucpr,0.6919271,0.09008674,0.7083333,0.75,0.8,0.75,0.69375,0.6666667,0.6666667,0.5
3,err,0.029513888,0.054775022,0.11111111,0.0,0.0,0.0,0.125,0.0,0.0,0.0
4,err_count,0.25,0.46291006,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,f0point5,0.9713542,0.05990392,0.9375,1.0,1.0,1.0,0.8333333,1.0,1.0,1.0
6,f1,0.96825397,0.059391387,0.85714287,1.0,1.0,1.0,0.8888889,1.0,1.0,1.0
7,f2,0.96773183,0.073929526,0.7894737,1.0,1.0,1.0,0.95238096,1.0,1.0,1.0
8,lift_top_group,2.1208334,0.6163609,2.25,2.0,1.8,1.25,2.0,1.6666666,3.0,3.0
9,logloss,0.15396479,0.14058098,0.276085,0.13929272,0.005406139,0.034427386,0.4014059,0.21450697,0.0057743485,0.15481982



See the whole table with table.as_data_frame()


Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid
0,accuracy,0.9704861,0.054775022,0.8888889,1.0,1.0,1.0,0.875,1.0,1.0,1.0
1,auc,0.9859375,0.026252126,0.95,1.0,1.0,1.0,0.9375,1.0,1.0,1.0
2,aucpr,0.6919271,0.09008674,0.7083333,0.75,0.8,0.75,0.69375,0.6666667,0.6666667,0.5
3,err,0.029513888,0.054775022,0.11111111,0.0,0.0,0.0,0.125,0.0,0.0,0.0
4,err_count,0.25,0.46291006,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,f0point5,0.9713542,0.05990392,0.9375,1.0,1.0,1.0,0.8333333,1.0,1.0,1.0
6,f1,0.96825397,0.059391387,0.85714287,1.0,1.0,1.0,0.8888889,1.0,1.0,1.0
7,f2,0.96773183,0.073929526,0.7894737,1.0,1.0,1.0,0.95238096,1.0,1.0,1.0
8,lift_top_group,2.1208334,0.6163609,2.25,2.0,1.8,1.25,2.0,1.6666666,3.0,3.0
9,logloss,0.15396479,0.14058098,0.276085,0.13929272,0.005406139,0.034427386,0.4014059,0.21450697,0.0057743485,0.15481982



See the whole table with table.as_data_frame()


INFO:probabilistic_covshift.automl.util:


Cross validation model performance


INFO:probabilistic_covshift.automl.util:Cross validation model performance



ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.056372835246841585
RMSE: 0.2374296427298866
LogLoss: 0.15870474746278895
Mean Per-Class Error: 0.0625
AUC: 0.9833984375
AUCPR: 0.9529801361832612
Gini: 0.966796875

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.12361275820647073: 


Unnamed: 0,Unnamed: 1,source,target,Error,Rate
0,source,28.0,4.0,0.125,(4.0/32.0)
1,target,0.0,32.0,0.0,(0.0/32.0)
2,Total,28.0,36.0,0.0625,(4.0/64.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.123613,0.941176,30.0
1,max f2,0.123613,0.97561,30.0
2,max f0point5,0.797886,0.964286,24.0
3,max accuracy,0.123613,0.9375,30.0
4,max precision,0.999957,1.0,0.0
5,max recall,0.123613,1.0,30.0
6,max specificity,0.999957,1.0,0.0
7,max absolute_mcc,0.123613,0.881917,30.0
8,max min_per_class_accuracy,0.619261,0.875,25.0
9,max mean_per_class_accuracy,0.123613,0.9375,30.0



Gains/Lift Table: Avg response rate: 50.00 %, avg score: 48.15 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.015625,0.999956,2.0,2.0,1.0,0.999957,1.0,0.999957,0.03125,0.03125,100.0,100.0
1,,2,0.03125,0.999944,2.0,2.0,1.0,0.999956,1.0,0.999956,0.03125,0.0625,100.0,100.0
2,,3,0.03125,0.999917,0.0,2.0,0.0,0.0,1.0,0.999956,0.0,0.0625,-100.0,100.0
3,,4,0.046875,0.999856,2.0,2.0,1.0,0.999912,1.0,0.999942,0.03125,0.09375,100.0,100.0
4,,5,0.078125,0.999805,2.0,2.0,1.0,0.999805,1.0,0.999887,0.0625,0.15625,100.0,100.0
5,,6,0.109375,0.998633,2.0,2.0,1.0,0.998916,1.0,0.999609,0.0625,0.21875,100.0,100.0
6,,7,0.15625,0.996086,2.0,2.0,1.0,0.99803,1.0,0.999136,0.09375,0.3125,100.0,100.0
7,,8,0.21875,0.991707,2.0,2.0,1.0,0.993018,1.0,0.997388,0.125,0.4375,100.0,100.0
8,,9,0.296875,0.980537,2.0,2.0,1.0,0.987172,1.0,0.994699,0.15625,0.59375,100.0,100.0
9,,10,0.40625,0.847003,2.0,2.0,1.0,0.941268,1.0,0.980314,0.21875,0.8125,100.0,100.0





ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.056372835246841585
RMSE: 0.2374296427298866
LogLoss: 0.15870474746278895
Mean Per-Class Error: 0.0625
AUC: 0.9833984375
AUCPR: 0.9529801361832612
Gini: 0.966796875

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.12361275820647073: 


Unnamed: 0,Unnamed: 1,source,target,Error,Rate
0,source,28.0,4.0,0.125,(4.0/32.0)
1,target,0.0,32.0,0.0,(0.0/32.0)
2,Total,28.0,36.0,0.0625,(4.0/64.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.123613,0.941176,30.0
1,max f2,0.123613,0.97561,30.0
2,max f0point5,0.797886,0.964286,24.0
3,max accuracy,0.123613,0.9375,30.0
4,max precision,0.999957,1.0,0.0
5,max recall,0.123613,1.0,30.0
6,max specificity,0.999957,1.0,0.0
7,max absolute_mcc,0.123613,0.881917,30.0
8,max min_per_class_accuracy,0.619261,0.875,25.0
9,max mean_per_class_accuracy,0.123613,0.9375,30.0



Gains/Lift Table: Avg response rate: 50.00 %, avg score: 48.15 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.015625,0.999956,2.0,2.0,1.0,0.999957,1.0,0.999957,0.03125,0.03125,100.0,100.0
1,,2,0.03125,0.999944,2.0,2.0,1.0,0.999956,1.0,0.999956,0.03125,0.0625,100.0,100.0
2,,3,0.03125,0.999917,0.0,2.0,0.0,0.0,1.0,0.999956,0.0,0.0625,-100.0,100.0
3,,4,0.046875,0.999856,2.0,2.0,1.0,0.999912,1.0,0.999942,0.03125,0.09375,100.0,100.0
4,,5,0.078125,0.999805,2.0,2.0,1.0,0.999805,1.0,0.999887,0.0625,0.15625,100.0,100.0
5,,6,0.109375,0.998633,2.0,2.0,1.0,0.998916,1.0,0.999609,0.0625,0.21875,100.0,100.0
6,,7,0.15625,0.996086,2.0,2.0,1.0,0.99803,1.0,0.999136,0.09375,0.3125,100.0,100.0
7,,8,0.21875,0.991707,2.0,2.0,1.0,0.993018,1.0,0.997388,0.125,0.4375,100.0,100.0
8,,9,0.296875,0.980537,2.0,2.0,1.0,0.987172,1.0,0.994699,0.15625,0.59375,100.0,100.0
9,,10,0.40625,0.847003,2.0,2.0,1.0,0.941268,1.0,0.980314,0.21875,0.8125,100.0,100.0





INFO:probabilistic_covshift.automl.util:


Threshold that maximizes F1: 0.12361275820647073


INFO:probabilistic_covshift.automl.util:Threshold that maximizes F1: 0.12361275820647073


F1: 0.9411764705882353


INFO:probabilistic_covshift.automl.util:F1: 0.9411764705882353


/Users/albertus.kelvin/Documents/PROJECTS/probabilistic-covshift/example/data/model/GBM_1_AutoML_20200325_205013
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Load the model and leaderboard


INFO:probabilistic_covshift.automl.predictor:Load the model and leaderboard


Type(model): <class 'h2o.estimators.gbm.H2OGradientBoostingEstimator'>


INFO:probabilistic_covshift.automl.predictor:Type(model): <class 'h2o.estimators.gbm.H2OGradientBoostingEstimator'>



Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,95.0,95.0,20622.0,0.0,6.0,5.189474,1.0,19.0,12.642105


Loaded model: 


INFO:probabilistic_covshift.automl.predictor:Loaded model: 


gbm prediction progress: |████████████████████████████████████████████████| 100%
Export File progress: |███████████████████████████████████████████████████| 100%


# Append the weight to the base table

In [7]:
base_frame_df = spark.read.parquet(conf[AutoMLConfig.DATA][AutoMLConfig.BASE_TABLE_PATH])\
                     .drop(conf[AutoMLConfig.DATA][AutoMLConfig.LABEL_COL])
base_frame_df.toPandas().head()

Unnamed: 0,col_a,col_b,col_c,col_d,row_id
0,38.9,40.0,55,10.0,1
1,88.9,50.0,15,20.0,2
2,38.9,50.0,15,10.0,3
3,48.9,40.0,55,20.0,4
4,38.9,40.0,55,10.0,5


In [8]:
weight_df = spark.read.csv(conf[AutoMLConfig.DATA][AutoMLConfig.WEIGHT_PATH], header=True)
weight_df.toPandas().head()

Unnamed: 0,row_id,weight
0,1,0.0036418907088322
1,2,0.0008131772727181886
2,3,0.003894342741075
3,4,0.0009556472283483082
4,5,0.0036418907088322


In [9]:
weighted_base_frame_df = base_frame_df.join(weight_df, how='left', on='row_id')
weighted_base_frame_df.toPandas().head()

Unnamed: 0,row_id,col_a,col_b,col_c,col_d,weight
0,1,38.9,40.0,55,10.0,0.0036418907088322
1,2,88.9,50.0,15,20.0,0.0008131772727181886
2,3,38.9,50.0,15,10.0,0.003894342741075
3,4,48.9,40.0,55,20.0,0.0009556472283483082
4,5,38.9,40.0,55,10.0,0.0036418907088322
