In [15]:
import pandas as pd
import os
from xgboost import DMatrix, train
import time

In [18]:
# Parameter dictionary handed to `train_xgboost` as its `config` argument.
xgboost_params = dict(
    tree_method="approx",
    objective="binary:logistic",
    # Both metrics are requested; the training helper reports "error" at the end.
    eval_metric=["logloss", "error"],
)

In [8]:
# Relative location of the training example, resolved from the notebook's
# starting working directory.
PROJECT_DIR = "ray-experience-program-v1/ray-experience-program/train_xgboost"

# Guard the chdir so the cell is idempotent: once the kernel is already inside
# PROJECT_DIR the relative path no longer resolves, and an unconditional
# os.chdir would raise FileNotFoundError when the cell is re-run.
if not os.path.normpath(os.getcwd()).endswith(os.sep + os.path.normpath(PROJECT_DIR)):
    os.chdir(PROJECT_DIR)

In [9]:
# Read the parquet data found at ./data (relative to the current working
# directory) into a single pandas DataFrame.
data = pd.read_parquet("./data")

In [10]:
# Show the column labels of the loaded frame (features plus the label column).
data.columns

Index(['feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
       'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10',
       'label'],
      dtype='object')

In [11]:
# Display the loaded frame; the notebook renders a truncated preview
# (first and last rows) rather than every row.
data

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,label
0,-1.466814,-1.265149,0.777963,0.723106,-0.358261,-0.064106,-1.630237,1.740549,-3.441059,-0.021078,1
1,0.138483,0.924616,-0.374949,1.639720,1.148096,1.234976,-0.924511,0.018798,-0.768708,0.858501,1
2,0.881383,-0.109776,0.435453,1.151986,-0.459115,-1.335648,-0.801152,-1.948106,0.507930,-0.914044,0
3,-2.044750,-0.481905,0.162026,-0.621405,1.103516,-0.753945,-1.724052,-1.345771,-0.367444,-2.337029,1
4,-1.346097,2.129415,1.862751,0.282567,-0.028876,1.290923,0.736878,1.595458,-0.382979,-0.750874,1
...,...,...,...,...,...,...,...,...,...,...,...
1999995,0.580033,-0.660796,-0.401391,1.586322,1.132132,0.594638,1.721991,2.220757,-2.783139,-0.342065,1
1999996,0.009184,-1.159101,0.404769,2.489561,-0.197882,3.360334,5.327104,-1.401125,-1.833050,-0.451552,1
1999997,-0.503484,-0.394713,-1.192797,0.846218,0.032064,1.930239,2.886316,-0.512531,-0.537099,0.564459,1
1999998,1.673476,-1.754599,0.143146,-2.539083,0.612207,2.260775,0.851629,0.149056,-1.043119,-0.514377,1


In [12]:
# Per-column mean; the mean of `label` shows the share of positive labels.
data.mean()

feature_1    -0.038141
feature_2    -0.023049
feature_3     0.074320
feature_4    -0.125593
feature_5    -0.022516
feature_6    -0.000108
feature_7    -0.086874
feature_8     0.009272
feature_9    -0.017591
feature_10   -0.014548
label         0.500004
dtype: float64

In [14]:
# Per-column standard deviation of the features and the label.
data.std()

feature_1     1.345333
feature_2     1.484504
feature_3     1.356264
feature_4     1.549881
feature_5     1.471910
feature_6     1.541667
feature_7     1.489877
feature_8     1.563940
feature_9     1.523952
feature_10    1.416065
label         0.500000
dtype: float64

In [16]:
def train_xgboost(
    config: dict, data: pd.DataFrame, target_column: str, test_fraction: float = 0.3
):
    """
    Train an XGBoost model on the data specified in the `data` arg.
    Args:
        config (Dict): A dictionary for XGBoost specific configurations.
        data: (pd.DataFrame): A pandas dataframe containing the data to train on.
        target_column (str): The name of the column in the dataframe to use as the labels.
        test_fraction (float): What fraction of the data to use as the test set. 
            The test set data will be used only for evaluation and not for training. 
    """

    # Get the current time. We will use this later to see how long training takes.
    start_time = time.time()

    # Split data into train and test.
    # First sample 1-test_fraction of the data to use as the training set.
    X_train = data.sample(frac=1 - test_fraction)
    # Then take the remainder of the data and use that as the test set.
    X_test = data.drop(X_train.index)

    # Pass Pandas dataframe to DMatrix. 
    train_set = DMatrix(X_train.drop(target_column, axis=1), X_train[target_column])
    test_set = DMatrix(X_test.drop(target_column, axis=1), X_test[target_column])

    evals_result = {}

    # Run the training
    bst = train(
        params=config,
        dtrain=train_set,
        evals=[(test_set, "eval")],
        evals_result=evals_result,
        verbose_eval=False,
        num_boost_round=10,
    )
    print(f"Total time taken: {time.time()-start_time}")

    # Get a model back
    model_path = "model.xgb"
    bst.save_model(model_path)
    print("Final validation error: {:.4f}".format(evals_result["eval"]["error"][-1]))

    return bst

In [19]:
# Train on the loaded frame with the default 30% hold-out, using the "label"
# column as the target; the trained booster is kept in `bst`.
bst = train_xgboost(config=xgboost_params, data=data, target_column="label")

Total time taken: 7.502196311950684
Final validation error: 0.3683
