# XGBoost
XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides parallel tree boosting (also known as GBDT, GBM) that solves many data science problems in a fast and accurate way. XGBoost was first added in MADlib 1.20.0.

In [1]:
%load_ext sql

In [4]:
# Greenplum Database 6.X
%sql postgresql://okislal@localhost:6600/madlib

u'Connected: okislal@madlib'

In [5]:
%sql select madlib.version();
#%sql select version();

 * postgresql://okislal@localhost:6600/madlib
1 rows affected.


version
"MADlib version: 1.20.0, git revision: rc/1.20.0-rc2-6-gb07f7466, cmake configuration time: Fri Jul 29 14:31:52 UTC 2022, build type: RelWithDebInfo, build system: Darwin-20.6.0, C compiler: Clang, C++ compiler: Clang"


# 1.  Load data

The sample data for XGBoost can be downloaded from the examples section of the MADlib documentation. Direct link: https://madlib.apache.org/docs/latest/example/madlib_xgboost_example.sql

In [6]:
%%sql 
-- Preview ten rows of the abalone sample table.
SELECT *
FROM abalone
LIMIT 10;

 * postgresql://okislal@localhost:6600/madlib
10 rows affected.


id,sex,length,diameter,height,whole,shucked,viscera,shell,rings
2026,F,0.55,0.47,0.15,0.9205,0.381,0.2435,0.2675,10
1796,F,0.58,0.43,0.17,1.48,0.6535,0.324,0.4155,10
829,I,0.41,0.325,0.1,0.394,0.208,0.0655,0.106,6
3703,F,0.665,0.54,0.195,1.764,0.8505,0.3615,0.47,11
1665,I,0.605,0.47,0.145,0.8025,0.379,0.2265,0.22,9
3901,M,0.445,0.345,0.14,0.476,0.2055,0.1015,0.1085,15
2734,I,0.415,0.335,0.1,0.358,0.169,0.067,0.105,7
1155,M,0.6,0.455,0.17,1.1915,0.696,0.2395,0.24,8
3467,M,0.64,0.5,0.17,1.4545,0.642,0.3575,0.354,9
2433,F,0.61,0.485,0.165,1.087,0.4255,0.232,0.38,11


# 2. Run a single XGBoost training
Note that the function collates the data into a single segment and runs the xgboost python process on that machine.

In [17]:
%%sql
-- Remove output tables left over from an earlier run, if any.
DROP TABLE IF EXISTS
    xgb_single_out,
    xgb_single_out_summary;

-- Train with one value per hyperparameter, i.e. a single configuration.
SELECT madlib.xgboost(
    'abalone',         -- training table
    'xgb_single_out',  -- results table
    'id',              -- id column
    'sex',             -- class label column
    '*',               -- independent variables
    NULL,              -- columns to exclude from the features
    $$ 
    {
        'learning_rate': [0.01], #Regularization on weights (eta). For smaller values, increase n_estimators
        'max_depth': [9],#Larger values could lead to overfitting
        'subsample': [0.85],#introduce randomness in samples picked to prevent overfitting
        'colsample_bytree': [0.85],#introduce randomness in features picked to prevent overfitting
        'min_child_weight': [10],#larger values will prevent over-fitting
        'n_estimators':[100] #More estimators, lesser variance (better fit on test set) 
    } 
    $$,                -- XGBoost grid search parameters
    '',                -- class weights
    0.8,               -- training set size ratio
    NULL               -- variable used to do the test/train split
);

-- Inspect the feature importances and classification metrics from the summary table.
SELECT
    features,
    importance,
    precision,
    recall,
    fscore,
    support
FROM xgb_single_out_summary;

 * postgresql://okislal@localhost:6600/madlib
Done.
1 rows affected.
1 rows affected.


features,importance,precision,recall,fscore,support
"[u'length', u'diameter', u'height', u'whole_weight', u'shucked_weight', u'viscera_weight', u'shell_weight', u'rings']","[u'1205', u'1179', u'1115', u'941', u'926', u'711', u'580', u'454']","[u'0.45390070921985815', u'0.6984615384615385', u'0.4780701754385965']","[u'0.4866920152091255', u'0.8315018315018315', u'0.36454849498327757']","[u'0.46972477064220186', u'0.7591973244147157', u'0.413662239089184']","[u'263.0', u'273.0', u'299.0']"


# 3. Run XGBoost Prediction

In [25]:
%%sql
-- Remove prediction outputs left over from an earlier run, if any.
DROP TABLE IF EXISTS
    xgb_single_score_out,
    xgb_single_score_out_metrics,
    xgb_single_score_out_roc_curve;

-- Score the abalone table with the model stored in xgb_single_out.
SELECT madlib.xgboost_predict(
    'abalone',               -- test_table
    'xgb_single_out',        -- model_table
    'xgb_single_score_out',  -- predict_output_table
    'id',                    -- id_column
    'sex',                   -- class_label
    1                        -- model_filters
);

-- Show a sample of the predicted labels and class probabilities.
SELECT *
FROM xgb_single_score_out
LIMIT 10;

 * postgresql://okislal@localhost:6600/madlib
Done.
1 rows affected.
10 rows affected.


id,sex_predicted,sex_proba_predicted
2,I,"[0.180475369096, 0.575919687748, 0.243604928255]"
3,I,"[0.27669274807, 0.44246467948, 0.280842572451]"
4,M,"[0.319970279932, 0.313613921404, 0.366415828466]"
7,F,"[0.384111016989, 0.266917943954, 0.348971098661]"
8,F,"[0.344503968954, 0.315709024668, 0.339786976576]"
16,F,"[0.401963979006, 0.242080762982, 0.355955272913]"
18,I,"[0.315914690495, 0.363648235798, 0.32043710351]"
19,I,"[0.184259131551, 0.606196165085, 0.209544733167]"
22,M,"[0.27689999342, 0.361068278551, 0.362031728029]"
24,F,"[0.367550551891, 0.345346838236, 0.287102639675]"


# 4. Run XGBoost with grid search
The parameter options are combined to form a grid, and the resulting configurations are explored in parallel by running distinct xgboost processes on different segments. The following example will generate 4 configurations to test by combining 'learning_rate': [0.01,0.1] and 'max_depth': [9,12].

In [32]:
%%sql
-- Remove grid-search output tables left over from an earlier run, if any.
DROP TABLE IF EXISTS xgb_grid_out, xgb_grid_out_summary;

-- Train one model per combination of the listed parameter values.
-- The call is schema-qualified (madlib.xgboost) like the other MADlib calls in
-- this notebook, so it does not depend on the session search_path containing
-- the madlib schema.
SELECT madlib.xgboost(
    'abalone',  -- Training table
    'xgb_grid_out',  -- Grid search results table.
    'id',       -- Id column
    'sex',      -- Class label column
    '*',        -- Independent variables
    NULL,       -- Columns to exclude from features
    $$
    {
        'learning_rate': [0.01,0.1], #Regularization on weights (eta). For smaller values, increase n_estimators
        'max_depth': [9,12],#Larger values could lead to overfitting
        'subsample': [0.85],#introduce randomness in samples picked to prevent overfitting
        'colsample_bytree': [0.85],#introduce randomness in features picked to prevent overfitting
        'min_child_weight': [10],#larger values will prevent over-fitting
        'n_estimators':[100] #More estimators, lesser variance (better fit on test set)
    }
    $$,         -- XGBoost grid search parameters
    '',         -- Class weights
    0.8,        -- Training set size ratio
    NULL        -- Variable used to do the test/train split.
);

-- One summary row per explored configuration; params_index identifies each one.
SELECT
    features,
    params,
    importance,
    precision,
    recall,
    fscore,
    support,
    params_index
FROM xgb_grid_out_summary;

 * postgresql://okislal@localhost:6600/madlib
Done.
1 rows affected.
4 rows affected.


features,params,importance,precision,recall,fscore,support,params_index
"[u'length', u'diameter', u'height', u'whole_weight', u'shucked_weight', u'viscera_weight', u'shell_weight', u'rings']","('colsample_bytree=0.85', 'learning_rate=0.01', 'min_child_weight=10', 'n_estimators=100', 'subsample=0.85', 'max_depth=12')","[u'1294', u'1183', u'1069', u'974', u'900', u'717', u'608', u'490']","[u'0.48148148148148145', u'0.6883561643835616', u'0.47619047619047616']","[u'0.4642857142857143', u'0.788235294117647', u'0.43333333333333335']","[u'0.4727272727272727', u'0.7349177330895795', u'0.4537521815008726']","[u'280.0', u'255.0', u'300.0']",2
"[u'length', u'diameter', u'height', u'whole_weight', u'shucked_weight', u'viscera_weight', u'shell_weight', u'rings']","('colsample_bytree=0.85', 'learning_rate=0.1', 'min_child_weight=10', 'n_estimators=100', 'subsample=0.85', 'max_depth=9')","[u'953', u'882', u'872', u'848', u'579', u'500', u'454', u'429']","[u'0.4259927797833935', u'0.7080536912751678', u'0.47307692307692306']","[u'0.44696969696969696', u'0.7962264150943397', u'0.4019607843137255']","[u'0.4362292051756007', u'0.74955595026643', u'0.43462897526501765']","[u'264.0', u'265.0', u'306.0']",3
"[u'length', u'diameter', u'height', u'whole_weight', u'shucked_weight', u'viscera_weight', u'shell_weight', u'rings']","('colsample_bytree=0.85', 'learning_rate=0.1', 'min_child_weight=10', 'n_estimators=100', 'subsample=0.85', 'max_depth=12')","[u'1168', u'1099', u'1069', u'908', u'717', u'534', u'471', u'462']","[u'0.4007220216606498', u'0.775', u'0.49640287769784175']","[u'0.4605809128630705', u'0.775', u'0.4394904458598726']","[u'0.42857142857142855', u'0.775', u'0.46621621621621623']","[u'241.0', u'280.0', u'314.0']",4
"[u'length', u'diameter', u'height', u'whole_weight', u'shucked_weight', u'viscera_weight', u'shell_weight', u'rings']","('colsample_bytree=0.85', 'learning_rate=0.01', 'min_child_weight=10', 'n_estimators=100', 'subsample=0.85', 'max_depth=9')","[u'1257', u'1211', u'1105', u'904', u'867', u'824', u'649', u'400']","[u'0.40148698884758366', u'0.6488095238095238', u'0.49130434782608695']","[u'0.45188284518828453', u'0.8352490421455939', u'0.3373134328358209']","[u'0.4251968503937008', u'0.7303182579564489', u'0.4']","[u'239.0', u'261.0', u'335.0']",1


# 5.  Run XGBoost Prediction on Grid Output Table
Let's say we are interested in model 2 and want to run a prediction using it.

In [31]:
%%sql

-- Remove prediction outputs left over from an earlier run, if any.
DROP TABLE IF EXISTS
    xgb_grid_score_out,
    xgb_grid_score_out_metrics,
    xgb_grid_score_out_roc_curve;

-- Score the abalone table using the grid-search results in xgb_grid_out,
-- passing 2 as the model filter.
SELECT madlib.xgboost_predict(
    'abalone',             -- test_table
    'xgb_grid_out',        -- model_table
    'xgb_grid_score_out',  -- predict_output_table
    'id',                  -- id_column
    'sex',                 -- class_label
    2                      -- model_filters
);

-- Show a sample of the predicted labels and class probabilities.
SELECT *
FROM xgb_grid_score_out
LIMIT 10;

 * postgresql://okislal@localhost:6600/madlib
Done.
1 rows affected.
10 rows affected.


id,sex_predicted,sex_proba_predicted
1,I,"[0.312986373901, 0.34792137146, 0.339092254639]"
12,I,"[0.337030380964, 0.379457473755, 0.283512145281]"
15,I,"[0.292645961046, 0.382402688265, 0.324951350689]"
20,I,"[0.235972866416, 0.479740768671, 0.284286379814]"
23,M,"[0.3711399436, 0.21823567152, 0.410624355078]"
26,M,"[0.343350559473, 0.223895892501, 0.432753533125]"
30,M,"[0.359976351261, 0.246053755283, 0.393969863653]"
31,F,"[0.437169611454, 0.199478805065, 0.363351553679]"
35,F,"[0.516167163849, 0.170660674572, 0.313172131777]"
36,I,"[0.252817928791, 0.48461201787, 0.262570023537]"
