# Case One: Project Notebook
By August and William

In [None]:
### Imports
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import wandb

# 1. Data Loading
## Load the four preprocessed CSV files (first CSV column is used as the index).
## NOTE(review): nothing in this cell removes NaNs; presumably that happened
## when the "*_preprocessed_data.csv" files were produced -- confirm.
tree_realized = pd.read_csv('data/tree_realized_preprocessed_data.csv', index_col=0)
tree_future = pd.read_csv('data/tree_future_preprocessed_data.csv', index_col=0)
linear_realized = pd.read_csv('data/linear_realized_preprocessed_data.csv', index_col=0)
linear_future = pd.read_csv('data/linear_future_preprocessed_data.csv', index_col=0)

# Pull out the target (LoadFactor, read from the tree data set) and drop that
# column from both realized feature frames.
y = tree_realized.LoadFactor
tree_realized = tree_realized.loc[:, tree_realized.columns != 'LoadFactor']
linear_realized = linear_realized.loc[:, linear_realized.columns != 'LoadFactor']

## Keep an untouched copy of SeatCapacity in every frame. fit_evaluate_model
## strips this column from the features and hands it to mean_forecast_accuracy,
## while the SeatCapacity feature itself gets min-max scaled for training.
tree_realized['SeatCapacityOriginal'] = tree_realized.SeatCapacity
tree_future['SeatCapacityOriginal'] = tree_future.SeatCapacity
linear_realized['SeatCapacityOriginal'] = linear_realized.SeatCapacity
linear_future['SeatCapacityOriginal'] = linear_future.SeatCapacity

# 3. Data splitting
## Split data into modeling data (will be training and validation) and test data
from sklearn.model_selection import train_test_split

### Hold out 20% of the data as the test set; the remaining 80% (the "modeling" set) is later split 75/25 into training and validation data
def split_model_test(X, y, seed=0, shuffle=False, stratify=False):
    """Split ``X``/``y`` into a modeling part (80%) and a test part (20%).

    Args:
        X: Feature data accepted by ``train_test_split``.
        y: Targets aligned with ``X``.
        seed: Forwarded as ``random_state``.
        shuffle: Forwarded as ``shuffle``.
        stratify: When truthy, the split is stratified on ``y``.
            Note that scikit-learn requires ``shuffle=True`` for stratified
            splits.

    Returns:
        Tuple ``(X_model, X_test, y_model, y_test)``.
    """
    # Passing stratify=None is the same as not passing it at all.
    strata = y if stratify else None
    parts = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=shuffle, stratify=strata)
    return tuple(parts)

def split_train_val(X_m, y_m, seed=0, shuffle=False, stratify=False):
    """Split the modeling data into training (75%) and validation (25%) parts.

    Args:
        X_m: Modeling features accepted by ``train_test_split``.
        y_m: Targets aligned with ``X_m``.
        seed: Forwarded as ``random_state``.
        shuffle: Forwarded as ``shuffle``.
        stratify: When truthy, the split is stratified on ``y_m``.
            Note that scikit-learn requires ``shuffle=True`` for stratified
            splits.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)``.
    """
    # Passing stratify=None is the same as not passing it at all.
    strata = y_m if stratify else None
    parts = train_test_split(X_m, y_m, test_size=0.25, random_state=seed, shuffle=shuffle, stratify=strata)
    return tuple(parts)

def seperate_SCO(X_train_model, X_val_test):
    """Detach the ``SeatCapacityOriginal`` column from two feature frames.

    Args:
        X_train_model: Frame used for fitting; must contain
            ``SeatCapacityOriginal``.
        X_val_test: Frame used for evaluation; must contain
            ``SeatCapacityOriginal``.

    Returns:
        Tuple ``(fit_features, eval_features, fit_sco, eval_sco)`` where the
        feature frames no longer carry ``SeatCapacityOriginal`` and the two
        ``*_sco`` values are that column taken from each input.
    """
    sco_name = 'SeatCapacityOriginal'

    # Grab the columns first, then build the feature frames without them.
    fit_sco = X_train_model.SeatCapacityOriginal
    eval_sco = X_val_test.SeatCapacityOriginal

    fit_features = X_train_model.loc[:, X_train_model.columns != sco_name]
    eval_features = X_val_test.loc[:, X_val_test.columns != sco_name]

    return fit_features, eval_features, fit_sco, eval_sco


# 4. Define validation setup for different models
## Define forecast accuracy function
def mean_forecast_accuracy(loadfactor_forecasted, loadfactor_true, seatcapacity):
    """Return the mean per-flight forecast accuracy in percent.

    Load factors are converted to passenger counts by multiplying with
    ``seatcapacity``. For each flight the relative absolute error
    ``|true - forecast| / true`` is computed; errors of 10000 or more
    (including ``inf`` from a zero true passenger count) are replaced by 100.
    The result is ``mean(1 - error) * 100``.

    Args:
        loadfactor_forecasted: Array of predicted load factors.
        loadfactor_true: Array of observed load factors.
        seatcapacity: Array of seat capacities, aligned with the load factors.

    Returns:
        The mean forecast accuracy as a float (can be negative).
    """
    actual = loadfactor_true * seatcapacity
    predicted = loadfactor_forecasted * seatcapacity

    rel_error = np.abs((actual - predicted) / actual)
    # Cap extreme / infinite relative errors at a fixed penalty of 100.
    rel_error[rel_error >= 10000] = 100

    return np.mean(1 - rel_error) * 100

## Define normalizer for training on **SeatCapacity**
def normalize_seatcapacity_fit(X_train):
    """Fit a ``MinMaxScaler`` on ``SeatCapacity`` and return the scaled frame.

    The scaling is applied to a copy, so the frame passed in by the caller is
    left untouched (the previous version overwrote the caller's column).

    Args:
        X_train: DataFrame with a ``SeatCapacity`` column.

    Returns:
        Tuple ``(X_scaled, scaler)``: a copy of ``X_train`` whose
        ``SeatCapacity`` column is min-max scaled, and the fitted scaler to
        reuse on validation/test data.
    """
    # The scaler expects a 2-D (n_samples, 1) array.
    seat_capacity = X_train.SeatCapacity.values.reshape(-1, 1)

    scaler = MinMaxScaler()
    scaler.fit(seat_capacity)

    X_scaled = X_train.copy()
    # Flatten back to 1-D before storing it as a single column.
    X_scaled['SeatCapacity'] = scaler.transform(seat_capacity).reshape(-1)
    return X_scaled, scaler

def normalize_seatcapacity(X_val, scaler):
    """Scale ``SeatCapacity`` with an already fitted scaler.

    The scaling is applied to a copy, so the frame passed in by the caller is
    left untouched (the previous version overwrote the caller's column, which
    would scale the data twice if the same frame was passed in again).

    Args:
        X_val: DataFrame with a ``SeatCapacity`` column.
        scaler: Fitted object exposing ``transform`` on a 2-D
            ``(n_samples, 1)`` array, e.g. the scaler returned by
            ``normalize_seatcapacity_fit``.

    Returns:
        A copy of ``X_val`` with the ``SeatCapacity`` column scaled.
    """
    seat_capacity = X_val.SeatCapacity.values.reshape(-1, 1)

    X_scaled = X_val.copy()
    # Flatten back to 1-D before storing it as a single column.
    X_scaled['SeatCapacity'] = scaler.transform(seat_capacity).reshape(-1)
    return X_scaled

## Functions for fitting+validating models, as well as testing models
### Make function for fitting and validating model
def fit_evaluate_model(X_tr_m, X_v_te, y_tr_m, y_v_te, model):
    """Fit ``model`` on one data set and score it on another.

    Args:
        X_tr_m: Features to fit on (training or modeling set); must contain
            ``SeatCapacity`` and ``SeatCapacityOriginal``.
        X_v_te: Features to evaluate on (validation or test set); same columns.
        y_tr_m: Targets for ``X_tr_m``.
        y_v_te: Targets for ``X_v_te``; must provide ``to_numpy()``.
        model: Estimator exposing ``fit(X=..., y=...)`` and ``predict``.

    Returns:
        Tuple ``(acc, model)``: the mean forecast accuracy on the evaluation
        set and the fitted model.
    """
    # The unscaled seat capacity is only needed for scoring, not as a feature.
    fit_X, eval_X, _fit_sco, eval_sco = seperate_SCO(X_tr_m, X_v_te)

    # Scale SeatCapacity using statistics from the fitting data only.
    fit_X, seat_scaler = normalize_seatcapacity_fit(fit_X)
    model.fit(X=fit_X, y=y_tr_m)

    # Apply the same scaling to the evaluation data before predicting.
    eval_X = normalize_seatcapacity(eval_X, seat_scaler)
    forecast = model.predict(eval_X)

    accuracy = mean_forecast_accuracy(forecast, y_v_te.to_numpy(), eval_sco.to_numpy())
    return accuracy, model

# Compute EPE ground function

In [None]:
# shuffle = True
# M = 100
# forecast_acc = []

# for m in range(M):

#     ## Split data
#     seed = np.random.randint(10000)
#     X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=seed, shuffle=shuffle)
#     X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=seed, shuffle=shuffle)

#     ## Train model on training data with different model parameters
#     #TODO MODEL TRAINING FUNCTION
#     val_accs, best_model = 

#     ## Evaluate best model on test data
#     test_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=best_model)
#     forecast_acc.append(test_acc)


# XGBoost

In [None]:
## Repeated hold-out evaluation of XGBoost: M random 80/20 splits, each time
## fitting on the modeling set and scoring on the test set.
X = tree_realized

shuffle = True
M = 100
config = {
    "shuffle": True,
    "booster": "gbtree",
    "max_depth": 10,
    "learning_rate": 0.2,
    "subsample": 0.5,
    "tree_method": "gpu_hist",
    }

# Seed the generator that draws the per-repetition split seeds.
np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='XGB')

for m in range(M):

    ## Split data
    seed = np.random.randint(10000)
    X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=seed, shuffle=shuffle)
    # The train/val split is not used for the final evaluation below.
    X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=seed, shuffle=shuffle)

    ## Build a fresh model for this repetition
    model = XGBRegressor(booster=config['booster'], 
                        max_depth=config['max_depth'],
                        learning_rate=config['learning_rate'], 
                        subsample=config['subsample'], 
                        tree_method=config['tree_method'])

    ## Fit on the modeling set and evaluate on the test set
    model_test_acc, _ = fit_evaluate_model(X_tr_m=X_model, X_v_te=X_test, y_tr_m=y_model, y_v_te=y_test, model=model)
    forecast_acc.append(model_test_acc)
    wandb.log({"accuracy": model_test_acc})


# Report the standard deviation (np.std); the label previously showed the variance.
print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.std(forecast_acc)}')

# Gradient Boosted Trees (SKLearn)

In [None]:
## Repeated hold-out evaluation of scikit-learn's GradientBoostingRegressor:
## M random 80/20 splits, each time fitting on the modeling set and scoring on
## the test set.
X = tree_realized

shuffle = True
M = 100
config = {
    'n_estimators': 10,
    # None is the estimator's default; the key was missing although it is
    # read when the model is built, which raised a KeyError.
    'max_features': None,
    'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    }

# Seed the generator that draws the per-repetition split seeds.
np.random.seed(0)
forecast_acc = []

# Group runs under 'GB' (was a copy-pasted 'XGB', which mixed these runs with
# the XGBoost ones).
wandb.init(project='02582_case1_final', entity='tgml', config=config, group='GB')

for m in range(M):

    ## Split data
    seed = np.random.randint(10000)
    X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=seed, shuffle=shuffle)
    # The train/val split is not used for the final evaluation below.
    X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=seed, shuffle=shuffle)

    ## Build a fresh model for this repetition
    model = GradientBoostingRegressor(n_estimators=config['n_estimators'], 
                                    max_features=config['max_features'],
                                    max_depth=config['max_depth'], 
                                    min_samples_split=config['min_samples_split'], 
                                    min_samples_leaf=config['min_samples_leaf'])

    ## Fit on the modeling set and evaluate on the test set
    model_test_acc, _ = fit_evaluate_model(X_tr_m=X_model, X_v_te=X_test, y_tr_m=y_model, y_v_te=y_test, model=model)
    forecast_acc.append(model_test_acc)
    wandb.log({"accuracy": model_test_acc})


# Report the standard deviation (np.std); the label previously showed the variance.
print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.std(forecast_acc)}')

# Random Forest

In [None]:
## Repeated hold-out evaluation of a random forest: M random 80/20 splits,
## each time fitting on the modeling set and scoring on the test set.
X = tree_realized

shuffle = True
M = 100
config = {
    # 1.0 = consider all features at every split. This is what 'auto' meant
    # for regressors; 'auto' itself is rejected by scikit-learn >= 1.3.
    'max_features': 1.0,
    'n_estimators': 2000,
    'max_depth': 30,
    'min_samples_leaf': 4,
    'min_samples_split': 4,
    }

# Seed the generator that draws the per-repetition split seeds.
np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='RF')

for m in range(M):

    ## Split data
    seed = np.random.randint(10000)
    X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=seed, shuffle=shuffle)
    # The train/val split is not used for the final evaluation below.
    X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=seed, shuffle=shuffle)

    ## Build a fresh model for this repetition
    model = RandomForestRegressor(n_estimators=config['n_estimators'], 
                                    max_features=config['max_features'],
                                    max_depth=config['max_depth'], 
                                    min_samples_split=config['min_samples_split'], 
                                    min_samples_leaf=config['min_samples_leaf'])

    ## Fit on the modeling set and evaluate on the test set
    model_test_acc, _ = fit_evaluate_model(X_tr_m=X_model, X_v_te=X_test, y_tr_m=y_model, y_v_te=y_test, model=model)
    forecast_acc.append(model_test_acc)
    wandb.log({"accuracy": model_test_acc})


# Report the standard deviation (np.std); the label previously showed the variance.
print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.std(forecast_acc)}')

# Make WandB Sweeps to preliminarily find best ranges for parameters

# XGBoost

In [None]:
X = tree_realized

## Random search over tree depth, learning rate and row subsampling; the sweep
## maximizes the "accuracy" value logged by train().
sweep_config = dict(
    method="random",  # try grid or random
    metric=dict(name="accuracy", goal="maximize"),
    parameters=dict(
        max_depth=dict(values=[6, 7, 8, 9]),
        learning_rate=dict(values=[0.1, 0.15, 0.2]),
        subsample=dict(distribution="uniform", min=0.00001, max=1),
    ),
)

sweep_id = wandb.sweep(sweep_config, project='02582_case1_1', entity='tgml')


def train():
    """Run one sweep trial: fit XGBoost on the training split and log validation accuracy."""
    defaults = dict(
        booster="gbtree",
        max_depth=3,
        learning_rate=0.1,
        subsample=1,
        seed=0,
        shuffle=True,
        tree_method="gpu_hist",
    )

    # Values chosen by the sweep override these defaults.
    wandb.init(project='02582_case1_1', entity='tgml', config=defaults)
    cfg = wandb.config

    # Carve off the test set first, then split the remainder into train/val.
    X_model, _, y_model, _ = split_model_test(X=X, y=y, seed=cfg.seed, shuffle=cfg.shuffle, stratify=False)
    X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=cfg.seed, shuffle=cfg.shuffle, stratify=False)

    regressor = XGBRegressor(
        booster=cfg.booster,
        max_depth=cfg.max_depth,
        learning_rate=cfg.learning_rate,
        subsample=cfg.subsample,
        tree_method=cfg.tree_method,
    )

    # Hyperparameters are selected on validation accuracy only.
    val_accuracy, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=regressor)

    wandb.log({"accuracy": val_accuracy})


wandb.agent(sweep_id, train, count=50)

# Gradient Boost Tree

In [4]:
X = tree_realized

## Random search over number of trees, tree depth and learning rate; the sweep
## maximizes the "accuracy" value logged by train().
sweep_config = dict(
    method="random",  # try grid or random
    metric=dict(name="accuracy", goal="maximize"),
    parameters=dict(
        n_estimators=dict(values=[400, 800, 1200, 1600, 2000]),
        max_depth=dict(values=[1, 3, 5, 7, 9]),
        learning_rate=dict(values=[0.01, 0.1, 0.2]),
    ),
)

sweep_id = wandb.sweep(sweep_config, project='02582_case1_gb', entity='tgml')


def train():
    """Run one sweep trial: fit gradient boosting on the training split and log validation accuracy."""
    defaults = dict(
        seed=0,
        shuffle=True,
        max_depth=3,
        n_estimators=100,
        learning_rate=0.1,
    )

    # Values chosen by the sweep override these defaults.
    wandb.init(project='02582_case1_1', entity='tgml', config=defaults)
    cfg = wandb.config

    # Carve off the test set first, then split the remainder into train/val.
    X_model, _, y_model, _ = split_model_test(X=X, y=y, seed=cfg.seed, shuffle=cfg.shuffle, stratify=False)
    X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=cfg.seed, shuffle=cfg.shuffle, stratify=False)

    regressor = GradientBoostingRegressor(
        n_estimators=cfg.n_estimators,
        learning_rate=cfg.learning_rate,
        max_depth=cfg.max_depth,
    )

    # Hyperparameters are selected on validation accuracy only.
    val_accuracy, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=regressor)

    wandb.log({"accuracy": val_accuracy})


wandb.agent(sweep_id, train, count=100)

Create sweep with ID: ltwtrer8
Sweep URL: https://wandb.ai/tgml/02582_case1_gb/sweeps/ltwtrer8


wandb: Agent Starting Run: d5vlmbfd with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 3
wandb: 	n_estimators: 800
wandb: Currently logged in as: augustsemrau (use `wandb login --relogin` to force relogin)





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,41.33221


wandb: Agent Starting Run: l7b7af13 with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 3
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,46.84715


wandb: Agent Starting Run: 1wjj6xyq with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 3
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,41.32958


wandb: Agent Starting Run: j14w81q0 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 9
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,52.48463


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: vki46a5x with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,47.1486


wandb: Agent Starting Run: 5pstu1dp with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 3
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,9.91702


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: ymk5hbis with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,47.09535


wandb: Agent Starting Run: 02bb4769 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 9
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,52.40513


wandb: Agent Starting Run: i87wukxb with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,50.23997


wandb: Agent Starting Run: 8p37rccf with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 7
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,51.87396


wandb: Agent Starting Run: c1dss4ee with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 1
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,1.20218


wandb: Agent Starting Run: 462me25q with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 9
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,47.70047


wandb: Agent Starting Run: ng3brp2c with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 7
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,51.95954


wandb: Agent Starting Run: 7kx7jwq2 with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 7
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,41.27337


wandb: Agent Starting Run: 4ysm890i with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 7
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,32.66423


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 927yta1u with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 5
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,51.36916


wandb: Agent Starting Run: u3hxpt2h with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 9
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,50.56893


wandb: Agent Starting Run: 6by0cmng with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 5
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,21.5287


wandb: Agent Starting Run: ap32r1tr with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 9
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,52.30744


wandb: Agent Starting Run: 2gy98axh with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 1
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,18.54516


wandb: Agent Starting Run: 2eaueh6i with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 9
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,50.24883


wandb: Agent Starting Run: gjr93lmg with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,52.63895


wandb: Agent Starting Run: 2twrpzx2 with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 5
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,41.19743


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: jsviy6vp with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 3
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,27.04435


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: mctey0rr with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 5
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,41.20466


wandb: Agent Starting Run: 3z7kqxdx with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 5
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,51.64488


wandb: Agent Starting Run: slrrv1rd with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 7
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,52.06295


wandb: Agent Starting Run: 8m14epa2 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 7
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,52.49303


wandb: Agent Starting Run: k96n3ycn with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 9
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,51.24721


wandb: Agent Starting Run: q380vdjj with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 3
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.41815


wandb: Agent Starting Run: itepuvqh with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 5
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,31.37861


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: u867d3if with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 5
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,21.52565


wandb: Agent Starting Run: iiowg1lr with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 7
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,51.90931


wandb: Agent Starting Run: eu18sf60 with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 5
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,36.46417


wandb: Agent Starting Run: 4m259q94 with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 9
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,50.53016


wandb: Agent Starting Run: t1gu13qp with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 3
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,45.19741


wandb: Agent Starting Run: m2asbsmq with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 3
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,48.11134


wandb: Agent Starting Run: zmxium7g with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 7
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,51.65071


wandb: Agent Starting Run: xxs6sddm with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 3
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,47.60961


wandb: Agent Starting Run: vpmf68xl with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,50.0637


wandb: Agent Starting Run: f21rjc4x with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 7
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,51.69006


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: jbvsnfiq with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 9
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,50.55622


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: px7nmq08 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 9
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,52.22717


wandb: Agent Starting Run: v7c1w32l with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 9
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,47.70992


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: u69bjbjb with config:
wandb: 	learning_rate: 0.01
wandb: 	max_depth: 3
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,18.66551


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: kz079rkt with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 5
wandb: 	n_estimators: 400





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,49.52597


wandb: Agent Starting Run: pey1nxmb with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 1
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,20.31506


wandb: Agent Starting Run: zxt1b0oz with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	n_estimators: 1600





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,52.19643


wandb: Agent Starting Run: rtx2dvbz with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 9
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,52.31624


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: rb0ivpti with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 3
wandb: 	n_estimators: 800





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,41.3314


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: jca9oesr with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 9
wandb: 	n_estimators: 1200





VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

# Random Forest

In [None]:
X = tree_realized

## Random search over forest size, tree depth and the minimum sample counts
## for splits and leaves; the sweep maximizes the "accuracy" value logged by
## train().
sweep_config = {
    "method": "random", # try grid or random
    "metric": {
      "name": "accuracy",
      "goal": "maximize"   
    },
    "parameters": {
        "n_estimators": {
            "values": [1800, 2000, 2400, 2800]
        },
        "max_depth": {
            "values": [20, 30, 40, 50]
        },
        "min_samples_split": {
            "values": [2,5,10]
        },
        "min_samples_leaf": {
            "values": [1,2,4]
        }
    }
}

sweep_id = wandb.sweep(sweep_config, project='02582_case1_rf', entity='tgml')

def train():
    """Run one sweep trial: fit a random forest on the training split and log validation accuracy."""
    # Only n_estimators, max_features, max_depth, min_samples_split,
    # min_samples_leaf, seed and shuffle are read below; the remaining keys
    # are just recorded in the wandb run config.
    config_defaults = {'bootstrap': True,
    'criterion': 'mse',
    'max_depth': None,
    # 1.0 = consider all features at every split. This is what 'auto' meant
    # for regressors; 'auto' itself is rejected by scikit-learn >= 1.3.
    'max_features': 1.0,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'min_impurity_split': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'min_weight_fraction_leaf': 0.0,
    'n_estimators': 10,
    'n_jobs': 1,
    'oob_score': False,
    'random_state': 42,
    'verbose': 0,
    'warm_start': False,
    "seed": 0,
    "shuffle": True}

    wandb.init(project='02582_case1_1', entity='tgml', config=config_defaults)  # defaults are over-ridden during the sweep
    config = wandb.config

    # Carve off the test set first, then split the remainder into train/val.
    X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=config.seed, shuffle=config.shuffle, stratify=False)
    X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=config.seed, shuffle=config.shuffle, stratify=False)

    # fit model on train
    model = RandomForestRegressor(n_estimators=config.n_estimators, max_features=config.max_features,
                            max_depth=config.max_depth, min_samples_split=config.min_samples_split, min_samples_leaf=config.min_samples_leaf)

    # Hyperparameters are selected on validation accuracy only.
    train_on_val_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=model)

    wandb.log({"accuracy": train_on_val_acc})


wandb.agent(sweep_id, train, count=100)

# Extra

In [None]:
## Initialize WandB for logging config and metrics
## NOTE(review): `wandb_config` is not defined anywhere in the visible notebook;
## it must provide at least 'max_depth' and 'tree_method' -- define it before
## running this cell. `X`, `y` and `shuffle` are taken from earlier cells.
wandb.init(project='02582_case1_1', entity='tgml', config=wandb_config)

## Split data (a single random split; the seed is drawn from NumPy's global RNG)
seed = np.random.randint(10000)
X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=seed, shuffle=shuffle)
X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=seed, shuffle=shuffle)

## Build the model from the logged configuration
model = XGBRegressor(max_depth=wandb_config['max_depth'], tree_method=wandb_config['tree_method'])

## Fit on the full modeling set (train + validation) and score on the test set
model_on_test_acc, _ = fit_evaluate_model(X_tr_m=X_model, X_v_te=X_test, y_tr_m=y_model, y_v_te=y_test, model=model)
## Fit on the training set only and score on the same test set.
## The same estimator object is passed to both calls, so it is fitted twice.
train_on_test_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=model)

wandb.log({'model_on_test_acc':model_on_test_acc, 'train_on_test_acc': train_on_test_acc})