In [1]:
import sys

# Extend the module search path with the parent directory so that the
# `probabilistic_covshift` imports below can be resolved from a source checkout.
# NOTE(review): this is relative to the kernel's working directory -- presumably
# the notebook is started from the repository's example/ folder; confirm.
sys.path.append('../')

from pyspark.sql import SparkSession
# NOTE(review): `F` is not referenced in any of the cells visible here.
from pyspark.sql import functions as F

# Configuration keys and column-name constants used to build `conf`.
from probabilistic_covshift.constants.automl_constants import AutoMLConfig
from probabilistic_covshift.constants.automl_constants import H2OServerInfo
from probabilistic_covshift.constants.main_constants import OriginFeatures
from probabilistic_covshift.constants.main_constants import WeightFeatures
# Entry point that is constructed and run further down in the notebook.
from probabilistic_covshift.probabilistic_classification_covshift import ProbabilisticClassification

In [2]:
# Get (or create) a Spark session named 'main' using the 'local[4]' master.
spark = (
    SparkSession.builder
    .appName('main')
    .master('local[4]')
    .getOrCreate()
)

In [3]:
# Source sample. The original literal listed the same 16 rows twice in a row,
# so they are written once and the list is repeated to yield the same 32 rows
# in the same order.
source_df = spark.createDataFrame(
    [
        (38.9, 40.0, 55, 10.0), (88.9, 50.0, 15, 20.0),
        (38.9, 50.0, 15, 10.0), (48.9, 40.0, 55, 20.0),
        (38.9, 40.0, 55, 10.0), (98.9, 50.0, 15, 20.0),
        (88.9, 50.0, 15, 20.0), (18.9, 40.0, 55, 30.0),
        (48.9, 40.0, 55, 20.0), (58.9, 50.0, 15, 30.0),
        (98.9, 50.0, 15, 20.0), (38.9, 40.0, 55, 10.0),
        (18.9, 40.0, 55, 30.0), (38.9, 50.0, 15, 10.0),
        (58.9, 50.0, 15, 30.0), (38.9, 40.0, 55, 10.0),
    ] * 2,
    ['col_a', 'col_b', 'col_c', 'col_d'],
)

In [4]:
# Target sample: 32 rows with the same four columns as the source sample.
target_df = spark.createDataFrame(
    [
        (18.9, 40.0, 95, 10.0),
        (38.9, 50.0, 15, 20.0),
        (18.9, 50.0, 95, 10.0),
        (38.9, 40.0, 55, 20.0),
        (18.9, 40.0, 95, 10.0),
        (38.9, 50.0, 15, 20.0),
        (18.9, 50.0, 95, 30.0),
        (38.9, 40.0, 55, 30.0),
        (18.9, 40.0, 95, 30.0),
        (38.9, 50.0, 15, 30.0),
        (38.9, 50.0, 95, 30.0),
        (18.9, 40.0, 55, 10.0),
        (38.9, 40.0, 95, 30.0),
        (18.9, 50.0, 15, 10.0),
        (38.9, 50.0, 95, 30.0),
        (18.9, 40.0, 55, 10.0),
        (38.9, 40.0, 55, 30.0),
        (58.9, 50.0, 15, 20.0),
        (38.9, 50.0, 15, 30.0),
        (58.9, 40.0, 55, 20.0),
        (38.9, 40.0, 55, 30.0),
        (58.9, 50.0, 15, 20.0),
        (58.9, 50.0, 15, 30.0),
        (58.9, 40.0, 55, 30.0),
        (58.9, 40.0, 55, 30.0),
        (58.9, 50.0, 15, 30.0),
        (58.9, 50.0, 15, 30.0),
        (58.9, 40.0, 55, 10.0),
        (58.9, 40.0, 55, 30.0),
        (58.9, 50.0, 15, 10.0),
        (58.9, 50.0, 15, 30.0),
        (58.9, 40.0, 55, 10.0),
    ],
    ['col_a', 'col_b', 'col_c', 'col_d'],
)

# Compute weights

In [5]:
# Settings handed to ProbabilisticClassification, filled in one section at a
# time. Sections are added in the same order as the original literal.
conf = {}

# Label / weight column names and the two artifact paths that the later cells
# of this notebook read back (base table parquet, weight CSV).
conf[AutoMLConfig.DATA] = {
    AutoMLConfig.LABEL_COL: OriginFeatures.ORIGIN,
    AutoMLConfig.WEIGHT_COL: WeightFeatures.WEIGHT,
    AutoMLConfig.BASE_TABLE_PATH: 'data/base_table.parquet',
    AutoMLConfig.WEIGHT_PATH: 'data/weight.csv'
}

# Address of the H2O server.
conf[AutoMLConfig.SERVER_CONN_INFO] = {
    H2OServerInfo.IP: 'localhost',
    H2OServerInfo.PORT: '54321'
}

# Cross-validation setup.
# NOTE(review): the H2O log captured in this notebook reports that the nfolds
# parameter is ignored when a fold column is used -- confirm NFOLDS is still
# consumed elsewhere (e.g. to build the fold column).
conf[AutoMLConfig.CROSS_VAL] = {
    AutoMLConfig.FOLD_COL: "fold",
    AutoMLConfig.NFOLDS: 8,
}

# AutoML budget and the metric used for stopping and for sorting.
conf[AutoMLConfig.MODELING] = {
    AutoMLConfig.MAX_RUNTIME_SECS: 3600,
    AutoMLConfig.MAX_MODELS: 10,
    AutoMLConfig.STOPPING_METRIC: 'logloss',
    AutoMLConfig.SORT_METRIC: 'logloss'
}

# Algorithm families listed for exclusion.
conf[AutoMLConfig.EXCLUDE_ALGOS] = [
    "StackedEnsemble",
    "DeepLearning"
]

# Directory for the best model.
conf[AutoMLConfig.MODEL] = {
    AutoMLConfig.BEST_MODEL_PATH: 'data/model/'
}

# Seed value passed along with the rest of the settings.
conf[AutoMLConfig.SEED] = 23

In [6]:
# Build the estimator from the source sample, the target sample and the
# settings dict, then execute it. Judging by the output logged for this cell,
# run() connects to (or starts) an H2O server, runs AutoML, and exports a
# file -- see the library for the authoritative description.
pc = ProbabilisticClassification(
    source_df,
    target_df,
    conf,
)
pc.run()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_191"; Java(TM) SE Runtime Environment (build 1.8.0_191-b12); Java HotSpot(TM) 64-Bit Server VM (build 25.191-b12, mixed mode)
  Starting server from /Users/albertus.kelvin/Documents/GLAIR_OWN/invstr-credit-scoring/env/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/c4/drbdtk0d67b4g99gp19b655c0000gp/T/tmpqr_c1xkb
  JVM stdout: /var/folders/c4/drbdtk0d67b4g99gp19b655c0000gp/T/tmpqr_c1xkb/h2o_albertus_kelvin_started_from_python.out
  JVM stderr: /var/folders/c4/drbdtk0d67b4g99gp19b655c0000gp/T/tmpqr_c1xkb/h2o_albertus_kelvin_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,07 secs
H2O cluster timezone:,Asia/Jakarta
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.2
H2O cluster version age:,2 months and 9 days
H2O cluster name:,H2O_from_python_albertus_kelvin_7w9vio
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,0
H2O cluster allowed cores:,0


Parse progress: |█████████████████████████████████████████████████████████| 100%
Base table inferred column types: {'col_a': 'real', 'col_b': 'int', 'col_c': 'int', 'col_d': 'int', 'origin': 'enum', 'row_id': 'int'}


INFO:probabilistic_covshift.automl.trainer:Base table inferred column types: {'col_a': 'real', 'col_b': 'int', 'col_c': 'int', 'col_d': 'int', 'origin': 'enum', 'row_id': 'int'}


AutoML progress: |
13:57:47.521: Fold column fold will be used for cross-validation. nfolds parameter will be ignored.

███████████████
13:57:58.763: Skipping training of model GBM_5_AutoML_20200330_135747 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20200330_135747.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 64.0.


█████████████████████████████████████████| 100%


model_id,logloss,auc,aucpr,mean_per_class_error,rmse,mse,training_time_ms,predict_time_per_row_ms
GBM_1_AutoML_20200330_135747,0.133964,0.984863,0.954195,0.0625,0.213472,0.0455702,131,0.050899
XRT_1_AutoML_20200330_135747,0.249173,0.977051,0.946114,0.0625,0.260918,0.068078,54,0.041943
DRF_1_AutoML_20200330_135747,0.301337,0.972168,0.878091,0.0625,0.301149,0.0906909,53,0.030976
GLM_1_AutoML_20200330_135747,0.615117,0.70459,0.697265,0.3125,0.468291,0.219297,44,0.031624
GBM_2_AutoML_20200330_135747,0.670214,0.603027,0.587405,0.5,0.487551,0.237706,34,0.043735
GBM_4_AutoML_20200330_135747,0.673993,0.581543,0.554126,0.5,0.48941,0.239522,30,0.031008
GBM_3_AutoML_20200330_135747,0.674321,0.60498,0.599117,0.484375,0.489962,0.240063,36,0.031841
XGBoost_3_AutoML_20200330_135747,0.685844,0.557129,0.561425,0.484375,0.496277,0.246291,64,0.031508
XGBoost_2_AutoML_20200330_135747,0.711761,0.297852,0.363119,0.5,0.50921,0.259295,50,0.038593
XGBoost_1_AutoML_20200330_135747,0.715397,0.29541,0.375297,0.5,0.510986,0.261107,67,0.032117


Leaderboard: 



INFO:probabilistic_covshift.automl.trainer:Leaderboard: 



Cross validation model performance


INFO:probabilistic_covshift.automl.util:Cross validation model performance



ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.04557016059815419
RMSE: 0.2134716857059835
LogLoss: 0.13396398737117182
Mean Per-Class Error: 0.0625
AUC: 0.98486328125
AUCPR: 0.9541949331816059
Gini: 0.9697265625

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.1787750713564958: 


Unnamed: 0,Unnamed: 1,source,target,Error,Rate
0,source,28.0,4.0,0.125,(4.0/32.0)
1,target,0.0,32.0,0.0,(0.0/32.0)
2,Total,28.0,36.0,0.0625,(4.0/64.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.178775,0.941176,30.0
1,max f2,0.178775,0.97561,30.0
2,max f0point5,0.728565,0.964286,24.0
3,max accuracy,0.178775,0.9375,30.0
4,max precision,0.99997,1.0,0.0
5,max recall,0.178775,1.0,30.0
6,max specificity,0.99997,1.0,0.0
7,max absolute_mcc,0.178775,0.881917,30.0
8,max min_per_class_accuracy,0.503337,0.90625,27.0
9,max mean_per_class_accuracy,0.178775,0.9375,30.0



Gains/Lift Table: Avg response rate: 50.00 %, avg score: 47.66 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.015625,0.999952,2.0,2.0,1.0,0.99997,1.0,0.99997,0.03125,0.03125,100.0,100.0
1,,2,0.03125,0.999942,2.0,2.0,1.0,0.999942,1.0,0.999956,0.03125,0.0625,100.0,100.0
2,,3,0.03125,0.999941,0.0,2.0,0.0,0.0,1.0,0.999956,0.0,0.0625,-100.0,100.0
3,,4,0.046875,0.999938,2.0,2.0,1.0,0.999941,1.0,0.999951,0.03125,0.09375,100.0,100.0
4,,5,0.0625,0.999929,2.0,2.0,1.0,0.999936,1.0,0.999947,0.03125,0.125,100.0,100.0
5,,6,0.109375,0.998949,2.0,2.0,1.0,0.999303,1.0,0.999671,0.09375,0.21875,100.0,100.0
6,,7,0.15625,0.997221,2.0,2.0,1.0,0.998435,1.0,0.9993,0.09375,0.3125,100.0,100.0
7,,8,0.203125,0.991421,2.0,2.0,1.0,0.994585,1.0,0.998212,0.09375,0.40625,100.0,100.0
8,,9,0.296875,0.965151,2.0,2.0,1.0,0.982402,1.0,0.993219,0.1875,0.59375,100.0,100.0
9,,10,0.40625,0.86062,2.0,2.0,1.0,0.938129,1.0,0.978387,0.21875,0.8125,100.0,100.0





ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.04557016059815419
RMSE: 0.2134716857059835
LogLoss: 0.13396398737117182
Mean Per-Class Error: 0.0625
AUC: 0.98486328125
AUCPR: 0.9541949331816059
Gini: 0.9697265625

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.1787750713564958: 


Unnamed: 0,Unnamed: 1,source,target,Error,Rate
0,source,28.0,4.0,0.125,(4.0/32.0)
1,target,0.0,32.0,0.0,(0.0/32.0)
2,Total,28.0,36.0,0.0625,(4.0/64.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.178775,0.941176,30.0
1,max f2,0.178775,0.97561,30.0
2,max f0point5,0.728565,0.964286,24.0
3,max accuracy,0.178775,0.9375,30.0
4,max precision,0.99997,1.0,0.0
5,max recall,0.178775,1.0,30.0
6,max specificity,0.99997,1.0,0.0
7,max absolute_mcc,0.178775,0.881917,30.0
8,max min_per_class_accuracy,0.503337,0.90625,27.0
9,max mean_per_class_accuracy,0.178775,0.9375,30.0



Gains/Lift Table: Avg response rate: 50.00 %, avg score: 47.66 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.015625,0.999952,2.0,2.0,1.0,0.99997,1.0,0.99997,0.03125,0.03125,100.0,100.0
1,,2,0.03125,0.999942,2.0,2.0,1.0,0.999942,1.0,0.999956,0.03125,0.0625,100.0,100.0
2,,3,0.03125,0.999941,0.0,2.0,0.0,0.0,1.0,0.999956,0.0,0.0625,-100.0,100.0
3,,4,0.046875,0.999938,2.0,2.0,1.0,0.999941,1.0,0.999951,0.03125,0.09375,100.0,100.0
4,,5,0.0625,0.999929,2.0,2.0,1.0,0.999936,1.0,0.999947,0.03125,0.125,100.0,100.0
5,,6,0.109375,0.998949,2.0,2.0,1.0,0.999303,1.0,0.999671,0.09375,0.21875,100.0,100.0
6,,7,0.15625,0.997221,2.0,2.0,1.0,0.998435,1.0,0.9993,0.09375,0.3125,100.0,100.0
7,,8,0.203125,0.991421,2.0,2.0,1.0,0.994585,1.0,0.998212,0.09375,0.40625,100.0,100.0
8,,9,0.296875,0.965151,2.0,2.0,1.0,0.982402,1.0,0.993219,0.1875,0.59375,100.0,100.0
9,,10,0.40625,0.86062,2.0,2.0,1.0,0.938129,1.0,0.978387,0.21875,0.8125,100.0,100.0





INFO:probabilistic_covshift.automl.util:


Threshold = 0.1787750713564958 for maximum F1 = 0.9411764705882353


INFO:probabilistic_covshift.automl.util:Threshold = 0.1787750713564958 for maximum F1 = 0.9411764705882353


/Users/albertus.kelvin/Documents/PROJECTS/probabilistic-covshift/example/data/model/GBM_1_AutoML_20200330_135747
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
Export File progress: |███████████████████████████████████████████████████| 100%


# Append the weight to the base table

In [None]:
# Read the base table from the configured parquet path and drop the label
# column, leaving the remaining columns (including `row_id`, used for the
# join further down).
base_frame_df = (
    spark.read
    .parquet(conf[AutoMLConfig.DATA][AutoMLConfig.BASE_TABLE_PATH])
    .drop(conf[AutoMLConfig.DATA][AutoMLConfig.LABEL_COL])
)

# Preview five rows. Limiting in Spark before converting means only those rows
# are collected to the driver, instead of the whole table as with
# `.toPandas().head()`.
base_frame_df.limit(5).toPandas()

In [None]:
# Read the weight CSV from the configured path.
# `inferSchema=True` makes Spark derive numeric column types; without it every
# CSV column (including `row_id` and the weight values) is loaded as a string,
# which does not match the integer `row_id` of the base table.
weight_df = spark.read.csv(
    conf[AutoMLConfig.DATA][AutoMLConfig.WEIGHT_PATH],
    header=True,
    inferSchema=True,
)

# Preview five rows; limit in Spark so only those rows reach the driver.
weight_df.limit(5).toPandas()

In [None]:
# Attach the weights to the base table. A left join on `row_id` keeps every
# base-table row even when no matching weight row exists.
weighted_base_frame_df = base_frame_df.join(
    weight_df,
    on='row_id',
    how='left',
)

# Preview five rows; limit in Spark so the full joined table is not collected
# to the driver. Row order after a join is not guaranteed, so treat this as a
# sample rather than "the first five rows".
weighted_base_frame_df.limit(5).toPandas()