# Case One: Project Notebook
By August and William

In [5]:
### Imports
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import wandb

# 1. Data Loading
## Load data and remove nan's
realized = pd.read_csv('data/realized_preprocessed_data.csv', index_col=0)
future =pd.read_csv('data/future_preprocessed_data.csv', index_col=0)

# Remove target from data
y = realized.LoadFactor
realized = realized.loc[:, realized.columns != 'LoadFactor']

## Make copy of **SeatCapacity** for computing forecast accuracy
realized['SeatCapacityOriginal'] = realized.SeatCapacity
future['SeatCapacityOriginal'] = future.SeatCapacity

X = realized

# 3. Data splitting
## Split data into modeling data (will be training and validation) and test data
from sklearn.model_selection import train_test_split

### Make train/val set *0.8 and test *0.2
def split_model_test(X, y, seed=0, shuffle=False, stratify=False):
    if stratify:
        X_model, X_test, y_model, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=shuffle, stratify=y)
    else:
        X_model, X_test, y_model, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=shuffle)
    return X_model, X_test, y_model, y_test

def split_train_val(X_m, y_m, seed=0, shuffle=False, stratify=False):
    if stratify:
        X_train, X_val, y_train, y_val = train_test_split(X_m, y_m, test_size=0.25, random_state=seed, shuffle=shuffle, stratify=y_m)
    else:
        X_train, X_val, y_train, y_val = train_test_split(X_m, y_m, test_size=0.25, random_state=seed, shuffle=shuffle)
    return X_train, X_val, y_train, y_val

def seperate_SCO(X_train_model, X_val_test):
    X_train_model_SCO, X_val_test_SCO = X_train_model.SeatCapacityOriginal, X_val_test.SeatCapacityOriginal

    X_train_model = X_train_model.loc[:, ~X_train_model.columns.isin(['SeatCapacityOriginal'])]
    X_val_test = X_val_test.loc[:, ~X_val_test.columns.isin(['SeatCapacityOriginal'])]

    return X_train_model, X_val_test, X_train_model_SCO, X_val_test_SCO


# 4. Define validation setup for different models
## Define forecast accuracy function
def mean_forecast_accuracy(loadfactor_forecasted, loadfactor_true, seatcapacity):

    passengers_true = loadfactor_true * seatcapacity
    passengers_forecasted = loadfactor_forecasted * seatcapacity
    
    abs_deviation_per_flight = np.abs((passengers_true-passengers_forecasted) / passengers_true)
    abs_deviation_per_flight[abs_deviation_per_flight >= 10000] = 100

    mean_forecast_acc = np.mean(1 - abs_deviation_per_flight*1)*100
    return mean_forecast_acc

## Define normalizer for training on **SeatCapacity**
def normalize_seatcapacity_fit(X_train):
    scaler = MinMaxScaler()
    scaler.fit(X_train.SeatCapacity.values.reshape(-1, 1))
    X_train.SeatCapacity = scaler.transform(X_train.SeatCapacity.values.reshape(-1, 1))
    return X_train, scaler

def normalize_seatcapacity(X_val, scaler):
    X_val.SeatCapacity = scaler.transform(X_val.SeatCapacity.values.reshape(-1, 1))
    return X_val

## Functions for fitting+validating models, as well as testing models
### Make function for fitting and validating model
def fit_evaluate_model(X_tr_m, X_v_te, y_tr_m, y_v_te, model):
    
    ## Remove original seatcapacity
    X_tr_m, X_v_te, X_tr_m_SCO, X_v_te_SCO = seperate_SCO(X_train_model=X_tr_m, X_val_test=X_v_te)

    ## Normalize seatcapacity
    X_tr_m, fitted_scaler = normalize_seatcapacity_fit(X_train=X_tr_m)
    ## Fit model to the training data
    model.fit(X=X_tr_m, y=y_tr_m)

    ## Normalize validation data SeatCapacity for predictions
    X_v_te = normalize_seatcapacity(X_val=X_v_te, scaler=fitted_scaler)
    ## Make predictions
    pred = model.predict(X_v_te)

    ## Compute forecasting accuracy
    acc = mean_forecast_accuracy(loadfactor_forecasted=pred, loadfactor_true=y_v_te.to_numpy(), seatcapacity=X_v_te_SCO.to_numpy())

    return acc, model

# Compute EPE ground function

In [None]:
# shuffle = True
# M = 100
# forecast_acc = []

# for m in range(M):

#     ## Split data
#     seed = np.random.randint(10000)
#     X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=seed, shuffle=shuffle)
#     X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=seed, shuffle=shuffle)

#     ## Train model on training data with different model parameters
#     #TODO MODEL TRAINING FUNCTION
#     val_accs, best_model = 

#     ## Evaluate best model on test data
#     test_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=best_model)
#     forecast_acc.append(test_acc)


# XGBoost

In [6]:
shuffle = True
M = 100
config = {
    "shuffle": True,
    "booster": "gbtree",
    "max_depth": 10,
    "learning_rate": 0.2,
    "subsample": 0.5,
    "tree_method": "gpu_hist",
    }

np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='XGB')

for m in range(M):

    ## Split data
    seed = np.random.randint(10000)
    X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=seed, shuffle=shuffle)
    X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=seed, shuffle=shuffle)

    ## Train and evaluate model
    model = XGBRegressor(booster=config['booster'], 
                        max_depth=config['max_depth'],
                        learning_rate=config['learning_rate'], 
                        subsample=config['subsample'], 
                        tree_method=config['tree_method'])

    ## Evaluate best model on test data
    model_test_acc, _ = fit_evaluate_model(X_tr_m=X_model, X_v_te=X_test, y_tr_m=y_model, y_v_te=y_test, model=model)
    forecast_acc.append(model_test_acc)
    wandb.log({"accuracy": model_test_acc})


print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.var(forecast_acc)}')

wandb: Currently logged in as: augustsemrau (use `wandb login --relogin` to force relogin)


Mean of test accuracies: 40.944463024185325
Std. of test accuracies: 21.06142943120369


# Gradient Boosted Trees (SKLearn)

In [None]:
shuffle = True
M = 100
config = {
    'n_estimators': 10,
    'max_depth': 6,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    }

np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='SKLEARN_GB')

for m in range(M):

    ## Split data
    seed = np.random.randint(10000)
    X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=seed, shuffle=shuffle)
    X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=seed, shuffle=shuffle)

    ## Train and evaluate model
    model = GradientBoostingRegressor(n_estimators=config['n_estimators'], 
                                    max_features=config['max_features'],
                                    max_depth=config['max_depth'], 
                                    min_samples_split=config['min_samples_split'], 
                                    min_samples_leaf=config['min_samples_leaf'])

    ## Evaluate best model on test data
    model_test_acc, _ = fit_evaluate_model(X_tr_m=X_model, X_v_te=X_test, y_tr_m=y_model, y_v_te=y_test, model=model)
    forecast_acc.append(model_test_acc)
    wandb.log({"accuracy": model_test_acc})


print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.var(forecast_acc)}')

# Random Forest

In [None]:
shuffle = True
M = 100
config = {
    # 'bootstrap': True,
    # 'criterion': 'mse',
    # 'max_leaf_nodes': None,
    # 'min_impurity_decrease': 0.0,
    # 'min_impurity_split': None,
    # 'min_weight_fraction_leaf': 0.0,
    # 'n_jobs': 1,
    # 'oob_score': False,
    # 'random_state': 42,
    # 'verbose': 0,
    # 'warm_start': False,
    # "shuffle": True,
    'max_features': 'auto',
    'n_estimators': 2000,
    'max_depth': 30,
    'min_samples_leaf': 4,
    'min_samples_split': 4,
    }

np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='RF')

for m in range(M):

    ## Split data
    seed = np.random.randint(10000)
    X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=seed, shuffle=shuffle)
    X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=seed, shuffle=shuffle)

    ## Train and evaluate model
    model = RandomForestRegressor(n_estimators=config['n_estimators'], 
                                    max_features=config['max_features'],
                                    max_depth=config['max_depth'], 
                                    min_samples_split=config['min_samples_split'], 
                                    min_samples_leaf=config['min_samples_leaf'])

    ## Evaluate best model on test data
    model_test_acc, _ = fit_evaluate_model(X_tr_m=X_model, X_v_te=X_test, y_tr_m=y_model, y_v_te=y_test, model=model)
    forecast_acc.append(model_test_acc)
    wandb.log({"accuracy": model_test_acc})


print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.var(forecast_acc)}')

# Make WandB Sweeps to preliminarily find best ranges for parameters

# XGBoost

In [None]:
sweep_config = {
    "method": "random", # try grid or random
    "metric": {
      "name": "accuracy",
      "goal": "maximize"   
    },
    "parameters": {
        "max_depth": {
            "values": [6, 7, 8, 9]
        },
        "learning_rate": {
            "values": [0.1, 0.15, 0.2]
        },
        "subsample": {
            "distribution": "uniform",
            "min": 0.00001,
            "max": 1
        }
    }
}

sweep_id = wandb.sweep(sweep_config, project='02582_case1_1', entity='tgml')

def train():
    config_defaults = {
    "booster": "gbtree",
    "max_depth": 3,
    "learning_rate": 0.1,
    "subsample": 1,
    "seed": 0,
    "shuffle": True,
    "tree_method": "gpu_hist"
    }

    wandb.init(project='02582_case1_1', entity='tgml', config=config_defaults)  # defaults are over-ridden during the sweep
    config = wandb.config

    X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=config.seed, shuffle=config.shuffle, stratify=False)
    X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=config.seed, shuffle=config.shuffle, stratify=False)

    # fit model on train
    model = XGBRegressor(booster=config.booster, max_depth=config.max_depth,
                            learning_rate=config.learning_rate, subsample=config.subsample, tree_method=config.tree_method)

    # model_on_test_acc, _ = fit_evaluate_model(X_tr_m=X_model, X_v_te=X_test, y_tr_m=y_model, y_v_te=y_test, model=model)
    # train_on_test_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=model)
    train_on_val_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=model)

    wandb.log({"accuracy": train_on_val_acc})
#objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1, alpha = 10, n_estimators = 10

wandb.agent(sweep_id, train, count=50)

# Gradient Boost Tree

In [7]:
sweep_config = {
    "method": "random", # try grid or random
    "metric": {
      "name": "accuracy",
      "goal": "maximize"   
    },
    "parameters": {
        "n_estimators": {
            "values": [400, 800, 1200, 1600, 2000]
        },
        "max_depth": {
            "values": [5,7,9,11,13]
            # "distribution": "uniform",
            # "min": 0.00001,
            # "max": 1
        },
        "learning_rate": {
            "values": [0.1, 0.15, 0.2]
        }
    }
}

sweep_id = wandb.sweep(sweep_config, project='02582_case1_gb', entity='tgml')

def train():
    config_defaults = {
    # 'bootstrap': True,
    # 'criterion': 'mse',
    # 'max_features': 'auto',
    # 'max_leaf_nodes': None,
    # 'min_impurity_decrease': 0.0,
    # 'min_impurity_split': None,
    # 'min_samples_leaf': 1,
    # 'min_samples_split': 2,
    # 'min_weight_fraction_leaf': 0.0,
    # 'n_estimators': 10,
    # 'n_jobs': 1,
    # 'oob_score': False,
    # 'random_state': 42,
    # 'verbose': 0,
    # 'warm_start': False,
    "seed": 0,
    "shuffle": True,
    'max_depth': 3,
    "n_estimators": 100,
    "learning_rate": 0.1,
    }

    wandb.init(project='02582_case1_1', entity='tgml', config=config_defaults)  # defaults are over-ridden during the sweep
    config = wandb.config

    X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=config.seed, shuffle=config.shuffle, stratify=False)
    X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=config.seed, shuffle=config.shuffle, stratify=False)

    # fit model on train
    model = GradientBoostingRegressor(n_estimators=config.n_estimators, 
                                        learning_rate=config.learning_rate,
                                        max_depth=config.max_depth)

    # model_on_test_acc, _ = fit_evaluate_model(X_tr_m=X_model, X_v_te=X_test, y_tr_m=y_model, y_v_te=y_test, model=model)
    # train_on_test_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=model)
    train_on_val_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=model)

    wandb.log({"accuracy": train_on_val_acc})


wandb.agent(sweep_id, train, count=100)



Create sweep with ID: f0ot28x8
Sweep URL: https://wandb.ai/tgml/02582_case1_gb/sweeps/f0ot28x8


wandb: Agent Starting Run: pwxo2wpt with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 11
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.0397


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: j073aehh with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 13
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.81714


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 7wbfyjlh with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 7
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.31831


wandb: Agent Starting Run: gvbtzcit with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,45.02447


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 67emr0wi with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 9
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.12553


wandb: Agent Starting Run: eel8w020 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 13
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.87296


wandb: Agent Starting Run: zywh8uee with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 11
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.62232


wandb: Agent Starting Run: pxzz3jq6 with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 13
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.59118


wandb: Agent Starting Run: tivleyce with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 11
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.34029


wandb: Agent Starting Run: gppfvpth with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 11
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.97717


wandb: Agent Starting Run: jsld03gf with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 5
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.91863


wandb: Agent Starting Run: jtca1dsr with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 5
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,45.20744


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: e0gjry76 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,45.03863


wandb: Agent Starting Run: ha7p22gf with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 13
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.09969


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: atzg527j with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.85996


wandb: Agent Starting Run: 2nzzf2x1 with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 11
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.50714


wandb: Agent Starting Run: ne9xspy4 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,39.97961


wandb: Agent Starting Run: 3vxna5mi with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 9
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.31798


wandb: Agent Starting Run: tfw57hxu with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 13
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.23759


wandb: Agent Starting Run: uhidhkvp with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 13
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.23355


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: m8vpu2ar with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.25032


wandb: Agent Starting Run: p5f91vc4 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 7
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.99802


wandb: Agent Starting Run: zinksxuk with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 7
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.5649


wandb: Agent Starting Run: g5p9h1cv with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 13
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.218 MB of 0.218 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.98488


wandb: Agent Starting Run: ykavfnd6 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 13
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.14286


wandb: Agent Starting Run: 471fjbdp with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 9
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.69167


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 27195708 with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 11
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.4305


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: cg5od53k with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	n_estimators: 800


wandb: Network error (ConnectionError), entering retry loop.





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,45.32083


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: udia9mod with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 11
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.37664


wandb: Agent Starting Run: hhllvr32 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 5
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,45.08065


wandb: Agent Starting Run: m775asef with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 11
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.83739


wandb: Agent Starting Run: uolygjwv with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 7
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.05288


wandb: Agent Starting Run: o34bfkeo with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 9
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.91903


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: n1d9avr8 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 13
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.10517


wandb: Agent Starting Run: uxsf4xls with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 13
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.49003


wandb: Agent Starting Run: 6454ozsv with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,45.09509


wandb: Agent Starting Run: nrx3ofxu with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 9
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.53279


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: s5gixmd3 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.230 MB of 0.230 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.43932


wandb: Agent Starting Run: 09gxs7k6 with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 13
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.17735


wandb: Agent Starting Run: mru8zus7 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 11
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.41952


wandb: Agent Starting Run: zwap97e2 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 9
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.01496


wandb: Agent Starting Run: 08dklad1 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 9
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.57049


wandb: Agent Starting Run: 0pj3fcij with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 7
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,45.19453


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 7gaexs3i with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 11
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.52667


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: e6ws07df with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,45.34872


wandb: Agent Starting Run: guijd50h with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 7
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.99595


wandb: Agent Starting Run: 0zx7hmbw with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 13
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.35554


wandb: Agent Starting Run: 7g2quhav with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 9
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.80405


wandb: Agent Starting Run: uzvv31vz with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 11
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.66546


wandb: Agent Starting Run: 6j39op3v with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 5
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.7369


wandb: Agent Starting Run: qxsu9rr9 with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 11
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.92293


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: rq2xcwi4 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 13
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.60654


wandb: Agent Starting Run: 0k6qzbig with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 9
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,45.30778


wandb: Agent Starting Run: 8ecxscra with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 9
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.41885


wandb: Agent Starting Run: wd8pmooy with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 11
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.08976


wandb: Agent Starting Run: h9vb0vky with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.56243


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 6b59enri with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 13
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,42.93643


wandb: Agent Starting Run: f9ou5cxy with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 13
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.16268


wandb: Agent Starting Run: 7sasap49 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,45.07463


wandb: Agent Starting Run: xgg8ekwn with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 11
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,44.30059


wandb: Agent Starting Run: ylwdyf10 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 13
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.30171


wandb: Agent Starting Run: esnbe5cd with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 9
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,43.20189


wandb: Agent Starting Run: npetdat6 with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 13
wandb: 	n_estimators: 2000


wandb: Ctrl + C detected. Stopping sweep.





VBox(children=(Label(value='0.238 MB of 0.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

# Random Forest

In [None]:
X = tree_realized

sweep_config = {
    "method": "random", # try grid or random
    "metric": {
      "name": "accuracy",
      "goal": "maximize"   
    },
    "parameters": {
        "n_estimators": {
            "values": [1800, 2000, 2400, 2800]
        },
        "max_depth": {
            "values": [20, 30, 40, 50]
            # "distribution": "uniform",
            # "min": 0.00001,
            # "max": 1
        },
        "min_samples_split": {
            "values": [2,5,10]
        },
        "min_samples_leaf": {
            "values": [1,2,4]
        }
    }
}

sweep_id = wandb.sweep(sweep_config, project='02582_case1_rf', entity='tgml')

def train():
    config_defaults = {'bootstrap': True,
    'criterion': 'mse',
    'max_depth': None,
    'max_features': 'auto',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'min_impurity_split': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'min_weight_fraction_leaf': 0.0,
    'n_estimators': 10,
    'n_jobs': 1,
    'oob_score': False,
    'random_state': 42,
    'verbose': 0,
    'warm_start': False,
    "seed": 0,
    "shuffle": True}

    wandb.init(project='02582_case1_1', entity='tgml', config=config_defaults)  # defaults are over-ridden during the sweep
    config = wandb.config

    X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=config.seed, shuffle=config.shuffle, stratify=False)
    X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=config.seed, shuffle=config.shuffle, stratify=False)

    # fit model on train
    model = RandomForestRegressor(n_estimators=config.n_estimators, max_features=config.max_features,
                            max_depth=config.max_depth, min_samples_split=config.min_samples_split, min_samples_leaf=config.min_samples_leaf)

    # model_on_test_acc, _ = fit_evaluate_model(X_tr_m=X_model, X_v_te=X_test, y_tr_m=y_model, y_v_te=y_test, model=model)
    # train_on_test_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=model)
    train_on_val_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=model)

    wandb.log({"accuracy": train_on_val_acc})


wandb.agent(sweep_id, train, count=100)

# Extra

In [None]:
## Initialize WandB for logging config and metrics
wandb.init(project='02582_case1_1', entity='tgml', config=wandb_config)

## Split data
seed = np.random.randint(10000)
X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=seed, shuffle=shuffle)
X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=seed, shuffle=shuffle)

## Train model on training data with different model parameters
model = XGBRegressor(max_depth=wandb_config['max_depth'], tree_method=wandb_config['tree_method'])

model_on_test_acc, _ = fit_evaluate_model(X_tr_m=X_model, X_v_te=X_test, y_tr_m=y_model, y_v_te=y_test, model=model)
## Evaluate best model on test data
train_on_test_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=model)

wandb.log({'model_on_test_acc':model_on_test_acc, 'train_on_test_acc': train_on_test_acc})