# Case One: Project Notebook
By August and William

In [1]:
### Imports
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import wandb
from sklearn.model_selection import train_test_split

# 1. Data Loading
# `train.csv`
# `validation.csv`
# `test.csv`
# `future_preprocessed.csv`
# Read the three pre-split data sets plus the future flights to forecast;
# the first CSV column is used as the row index in every file.
real_train = pd.read_csv('data/train.csv', index_col=0)
real_val = pd.read_csv('data/validation.csv', index_col=0)
real_test = pd.read_csv('data/test.csv', index_col=0)

future = pd.read_csv('data/future_preprocessed.csv', index_col=0)

# Split the regression target (LoadFactor) off the training and validation
# frames. The test frame keeps its target column at this point.
y_train = real_train['LoadFactor']
y_val = real_val['LoadFactor']
real_train = real_train.drop(columns='LoadFactor')
real_val = real_val.drop(columns='LoadFactor')

# Keep an untouched copy of SeatCapacity next to the features: the forecast
# accuracy is computed from the raw capacity, while the model is trained on a
# normalised SeatCapacity column.
real_train = real_train.assign(SeatCapacityOriginal=real_train['SeatCapacity'])
real_val = real_val.assign(SeatCapacityOriginal=real_val['SeatCapacity'])
real_test = real_test.assign(SeatCapacityOriginal=real_test['SeatCapacity'])
# future['SeatCapacityOriginal'] = future.SeatCapacity

# Feature matrices used by the sweeps below (same objects as the frames above).
X_train = real_train
X_val = real_val


# # 3. Data splitting
# ## Split data into modeling data (will be training and validation) and test data

# ### Make train/val set *0.8 and test *0.2
# def split_model_test(X, y, seed=0, shuffle=False, stratify=False):
#     if stratify:
#         X_model, X_test, y_model, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=shuffle, stratify=y)
#     else:
#         X_model, X_test, y_model, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=shuffle)
#     return X_model, X_test, y_model, y_test

# def split_train_val(X_m, y_m, seed=0, shuffle=False, stratify=False):
#     if stratify:
#         X_train, X_val, y_train, y_val = train_test_split(X_m, y_m, test_size=0.25, random_state=seed, shuffle=shuffle, stratify=y_m)
#     else:
#         X_train, X_val, y_train, y_val = train_test_split(X_m, y_m, test_size=0.25, random_state=seed, shuffle=shuffle)
#     return X_train, X_val, y_train, y_val

def seperate_SCO(X_train_model, X_val_test):
    """Split the raw seat-capacity column off two feature frames.

    Parameters
    ----------
    X_train_model, X_val_test : pandas.DataFrame
        Frames that each contain a ``SeatCapacityOriginal`` column.

    Returns
    -------
    tuple
        ``(train_features, eval_features, train_capacity, eval_capacity)``:
        the two frames without ``SeatCapacityOriginal`` (new objects, the
        inputs are left untouched) followed by the two removed columns.
    """
    sco_column = 'SeatCapacityOriginal'

    # Grab the raw capacities first, then build frames without that column.
    train_capacity = X_train_model.SeatCapacityOriginal
    eval_capacity = X_val_test.SeatCapacityOriginal

    train_features = X_train_model.drop(columns=sco_column)
    eval_features = X_val_test.drop(columns=sco_column)

    return train_features, eval_features, train_capacity, eval_capacity


# 4. Define validation setup for different models
## Define forecast accuracy function
def mean_forecast_accuracy(loadfactor_forecasted, loadfactor_true, seatcapacity):
    """Return the mean per-flight forecast accuracy in percent.

    Load factors are converted to passenger counts by multiplying with
    ``seatcapacity``. For each flight the absolute relative deviation
    ``|true - forecast| / true`` is computed; the accuracy of a flight is one
    minus that deviation, and the mean over all flights is scaled by 100.

    Parameters
    ----------
    loadfactor_forecasted, loadfactor_true, seatcapacity : array-like
        Element-wise compatible arrays (the deviation array must support
        boolean-mask assignment).

    Returns
    -------
    float
        Mean forecast accuracy, in percent.
    """
    actual_passengers = loadfactor_true * seatcapacity
    predicted_passengers = loadfactor_forecasted * seatcapacity

    relative_error = np.abs((actual_passengers - predicted_passengers) / actual_passengers)

    # Deviations of 10000 or more (including inf from a zero true passenger
    # count) are replaced by 100 so a single flight cannot dominate the mean.
    is_extreme = relative_error >= 10000
    relative_error[is_extreme] = 100

    per_flight_accuracy = 1 - relative_error
    return np.mean(per_flight_accuracy) * 100

## Define normalizer for training on **SeatCapacity**
def normalize_seatcapacity_fit(X_train):
    """Fit a min-max scaler on ``SeatCapacity`` and scale that column in place.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame with a ``SeatCapacity`` column. The column is overwritten with
        its scaled values, so the caller's frame is modified.

    Returns
    -------
    tuple
        ``(X_train, scaler)``: the same frame object and the fitted
        ``MinMaxScaler``, to be reused on other data.
    """
    # The scaler expects a 2-D array, hence the single-column reshape.
    capacity_column = X_train["SeatCapacity"].values.reshape(-1, 1)

    scaler = MinMaxScaler()
    scaler.fit(capacity_column)

    X_train["SeatCapacity"] = scaler.transform(capacity_column)
    return X_train, scaler

def normalize_seatcapacity(X_val, scaler):
    """Scale ``SeatCapacity`` in place with an already fitted scaler.

    Parameters
    ----------
    X_val : pandas.DataFrame
        Frame with a ``SeatCapacity`` column; the column is overwritten.
    scaler : object
        Fitted scaler exposing ``transform`` on a 2-D single-column array.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    capacity_column = X_val["SeatCapacity"].values.reshape(-1, 1)
    X_val["SeatCapacity"] = scaler.transform(capacity_column)
    return X_val

## Functions for fitting+validating models, as well as testing models
### Make function for fitting and validating model
def fit_evaluate_model(X_tr_m, X_v_te, y_tr_m, y_v_te, model):
    """Fit ``model`` on one data set and score it on another.

    Parameters
    ----------
    X_tr_m, X_v_te : pandas.DataFrame
        Features to fit on and to evaluate on. Both must contain
        ``SeatCapacity`` and ``SeatCapacityOriginal``.
    y_tr_m, y_v_te : pandas.Series
        Load-factor targets matching ``X_tr_m`` and ``X_v_te``.
    model : estimator
        Object exposing ``fit(X=..., y=...)`` and ``predict``.

    Returns
    -------
    tuple
        ``(accuracy, model)``: the mean forecast accuracy in percent on the
        evaluation data, and the fitted model.
    """
    # Set the raw capacities aside; only the evaluation copy is needed later.
    features_fit, features_eval, _, capacity_eval = seperate_SCO(
        X_train_model=X_tr_m, X_val_test=X_v_te
    )

    # The scaler is fitted on the fitting data only and reused for evaluation.
    features_fit, capacity_scaler = normalize_seatcapacity_fit(X_train=features_fit)
    model.fit(X=features_fit, y=y_tr_m)

    features_eval = normalize_seatcapacity(X_val=features_eval, scaler=capacity_scaler)
    predictions = model.predict(features_eval)

    # Accuracy is computed in passengers, so the unscaled capacity is used.
    accuracy = mean_forecast_accuracy(
        loadfactor_forecasted=predictions,
        loadfactor_true=y_v_te.to_numpy(),
        seatcapacity=capacity_eval.to_numpy(),
    )

    return accuracy, model

# Part 1: Find best hyperparameters using val set

# Gradient boost

In [15]:
# Search space for the gradient-boosting sweep. The sweep uses random search
# over the value lists below and maximises the "accuracy" metric, which is the
# key that train() logs to W&B.
sweep_config = {
    "method": "random",  # search strategy; "grid" is the alternative tried
    "metric": {
      "name": "accuracy",
      "goal": "maximize"
    },
    "parameters": {
        "n_estimators": {
            "values": [400, 800, 1200, 1600, 2000, 2500, 3000]
        },
        "max_depth": {
            "values": [3,4,5,6,7,9,11]
        },
        "learning_rate": {
            "values": [0.05, 0.1, 0.15, 0.2]
        },
        "subsample": {
            "values": [0.25, 0.5, 0.75, 1]
        }
    }
}

# Register the sweep with W&B; the returned id is what the agent polls for jobs.
sweep_id = wandb.sweep(sweep_config, project='02582_case1_boost_sweep', entity='tgml')

def train():
    """Run one gradient-boosting sweep trial and log its validation accuracy.

    Reads the trial's hyperparameters from ``wandb.config``, fits a
    ``GradientBoostingRegressor`` on the global ``X_train``/``y_train`` and
    logs the forecast accuracy on the global ``X_val``/``y_val`` under the key
    ``"accuracy"`` (the metric the sweep maximises).
    """
    # Fallback values; the sweep overrides every key it defines. "seed" is not
    # part of the sweep, so it stays fixed across trials.
    config_defaults = {
        "seed": 0,
        "n_estimators": 100,
        'max_depth': 3,
        "learning_rate": 0.1,
        "subsample": 0.5,
    }

    wandb.init(project='02582_case1_boost_sweep', entity='tgml', config=config_defaults)
    config = wandb.config

    # random_state makes trials with subsample < 1 reproducible; previously the
    # declared seed was never handed to the estimator.
    model = GradientBoostingRegressor(n_estimators=config.n_estimators,
                                      learning_rate=config.learning_rate,
                                      max_depth=config.max_depth,
                                      subsample=config.subsample,
                                      random_state=config.seed)

    train_on_val_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=model)

    wandb.log({"accuracy": train_on_val_acc})


# Start a sweep agent in this kernel: it calls train() once per trial it
# receives, for at most 100 trials.
wandb.agent(sweep_id, train, count=100)

Create sweep with ID: hgdu1h14
Sweep URL: https://wandb.ai/tgml/02582_case1_boost_sweep/sweeps/hgdu1h14


wandb: Agent Starting Run: cs73rtar with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 5
wandb: 	n_estimators: 2000
wandb: 	subsample: 1





VBox(children=(Label(value='3.793 MB of 3.793 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.18344


wandb: Agent Starting Run: bp2ojany with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 9
wandb: 	n_estimators: 2500
wandb: 	subsample: 0.75





VBox(children=(Label(value='3.793 MB of 3.793 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,22.1518


wandb: Agent Starting Run: 4v13wkte with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 3
wandb: 	n_estimators: 1200
wandb: 	subsample: 1





VBox(children=(Label(value='3.793 MB of 3.793 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,21.38431


wandb: Agent Starting Run: 1l6d0h1f with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 9
wandb: 	n_estimators: 1600
wandb: 	subsample: 0.75





VBox(children=(Label(value='3.793 MB of 3.793 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,21.27911


wandb: Agent Starting Run: rj9injlj with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 7
wandb: 	n_estimators: 1200
wandb: 	subsample: 1





VBox(children=(Label(value='3.793 MB of 3.793 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.97162


wandb: Agent Starting Run: kawoga02 with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 11
wandb: 	n_estimators: 1600
wandb: 	subsample: 1


# Random Forest

In [2]:
# Search space for the random-forest sweep. The sweep uses random search over
# the value lists below and maximises the "accuracy" metric, which is the key
# that train() logs to W&B.
# NOTE: this rebinds `sweep_config` and `sweep_id` from the gradient-boosting
# section above.
sweep_config = {
    "method": "random",  # search strategy; "grid" is the alternative tried
    "metric": {
      "name": "accuracy",
      "goal": "maximize"
    },
    "parameters": {
        "n_estimators": {
            "values": [1800, 2000, 2400, 2800]
        },
        "max_depth": {
            "values": [20, 30, 40, 50]
        },
        "min_samples_split": {
            "values": [2,4, 6,8]
        },
        "min_samples_leaf": {
            "values": [1,2,3,4]
        },
        "bootstrap": {
            "values": [True, False]
        }
    }
}

# Register the sweep with W&B; the returned id is what the agent polls for jobs.
sweep_id = wandb.sweep(sweep_config, project='02582_case1_rf_sweep', entity='tgml')

def train():
    """Run one random-forest sweep trial and log its validation accuracy.

    Reads the trial's hyperparameters from ``wandb.config``, fits a
    ``RandomForestRegressor`` on the global ``X_train``/``y_train`` and logs
    the forecast accuracy on the global ``X_val``/``y_val`` under the key
    ``"accuracy"``.

    Note: this definition replaces the gradient-boosting ``train`` defined
    earlier in the notebook.
    """
    # Fallback values; the sweep overrides every key it defines.
    # 'max_features' and 'max_leaf_nodes' are recorded in the run config but
    # are not passed to the estimator below.
    fallback_params = {
        'max_features': 'auto',
        'max_leaf_nodes': None,
        'bootstrap': True,
        'max_depth': 4,
        'min_samples_leaf': 1,
        'min_samples_split': 2,
        'n_estimators': 10,
    }

    wandb.init(project='02582_case1_rf_sweep', entity='tgml', config=fallback_params)
    params = wandb.config

    # Build the forest from the trial's hyperparameters.
    forest = RandomForestRegressor(
        n_estimators=params.n_estimators,
        bootstrap=params.bootstrap,
        max_depth=params.max_depth,
        min_samples_split=params.min_samples_split,
        min_samples_leaf=params.min_samples_leaf,
    )

    val_accuracy, _ = fit_evaluate_model(
        X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=forest
    )

    wandb.log({"accuracy": val_accuracy})


# Start a sweep agent in this kernel: it calls train() once per trial it
# receives, for at most 100 trials.
wandb.agent(sweep_id, train, count=100)

Create sweep with ID: 6hzp74en
Sweep URL: https://wandb.ai/tgml/02582_case1_rf_sweep/sweeps/6hzp74en


wandb: Agent Starting Run: bkjyuld7 with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 3
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 2400
wandb: Currently logged in as: augustsemrau (use `wandb login --relogin` to force relogin)





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.27222


wandb: Agent Starting Run: ld9ivxdi with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 20
wandb: 	min_samples_leaf: 3
wandb: 	min_samples_split: 8
wandb: 	n_estimators: 2000





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.13359


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: p9hgpbk4 with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 4
wandb: 	min_samples_split: 8
wandb: 	n_estimators: 2400
wandb: W&B API key is configured (use `wandb login --relogin` to force relogin)





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.59773


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: fd3tyiqs with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 20
wandb: 	min_samples_leaf: 3
wandb: 	min_samples_split: 6
wandb: 	n_estimators: 1800





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.14928


wandb: Agent Starting Run: os748dwi with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 4
wandb: 	n_estimators: 2400





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.41613


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: y00c9g3k with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 3
wandb: 	min_samples_split: 8
wandb: 	n_estimators: 1800





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.44057


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: u13lo2md with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 50
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 6
wandb: 	n_estimators: 2000





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.46779


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: tmnwxm61 with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 20
wandb: 	min_samples_leaf: 1
wandb: 	min_samples_split: 6
wandb: 	n_estimators: 1800





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.21702


wandb: Agent Starting Run: xin8odrp with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 20
wandb: 	min_samples_leaf: 4
wandb: 	min_samples_split: 4
wandb: 	n_estimators: 1800





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.53373


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: m32134sp with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 20
wandb: 	min_samples_leaf: 3
wandb: 	min_samples_split: 6
wandb: 	n_estimators: 2400





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.2183


wandb: Agent Starting Run: vgg27iuv with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 20
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 2000





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 0wbo2hef with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 4
wandb: 	min_samples_split: 8
wandb: 	n_estimators: 2000





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.35404


wandb: Agent Starting Run: 3c95fxi0 with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 30
wandb: 	min_samples_leaf: 3
wandb: 	min_samples_split: 4
wandb: 	n_estimators: 1800





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.40925


wandb: Agent Starting Run: o3xjkcig with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 8
wandb: 	n_estimators: 2800





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.22623


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: ungzkt89 with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 50
wandb: 	min_samples_leaf: 1
wandb: 	min_samples_split: 4
wandb: 	n_estimators: 1800





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.26924


wandb: Agent Starting Run: 4c22l3y4 with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 30
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 4
wandb: 	n_estimators: 2400





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.38833


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: coes72at with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 50
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 6
wandb: 	n_estimators: 2800





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.48598


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 1vfmfbwu with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 30
wandb: 	min_samples_leaf: 3
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 2800





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.36561


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 74qd63pf with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 20
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 4
wandb: 	n_estimators: 2000





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.1669


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: wps9un4z with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 30
wandb: 	min_samples_leaf: 4
wandb: 	min_samples_split: 4
wandb: 	n_estimators: 2400





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.66764


wandb: Agent Starting Run: 4yhp7e30 with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 20
wandb: 	min_samples_leaf: 1
wandb: 	min_samples_split: 6
wandb: 	n_estimators: 2400





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.18904


wandb: Agent Starting Run: 1bdw299l with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 50
wandb: 	min_samples_leaf: 1
wandb: 	min_samples_split: 8
wandb: 	n_estimators: 2400





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,22.46352


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: o1s5kmr9 with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 20
wandb: 	min_samples_leaf: 4
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 2800





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.53383


wandb: Agent Starting Run: n7qkjrin with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 30
wandb: 	min_samples_leaf: 1
wandb: 	min_samples_split: 8
wandb: 	n_estimators: 2000





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,22.82968


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: obql7zry with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 20
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 6
wandb: 	n_estimators: 2400





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.1975


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 4ywjl8g9 with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 30
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 6
wandb: 	n_estimators: 2400





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.45526


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: r1w60cdc with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 2000





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: t0d1n5hy with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 3
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 2800





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.41633


wandb: Agent Starting Run: qsopmks7 with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 50
wandb: 	min_samples_leaf: 1
wandb: 	min_samples_split: 4
wandb: 	n_estimators: 2800





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.2922


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: ir84w3a0 with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 30
wandb: 	min_samples_leaf: 3
wandb: 	min_samples_split: 6
wandb: 	n_estimators: 2400





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.36573


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: t0ip6cr9 with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 50
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 6
wandb: 	n_estimators: 1800





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.44315


wandb: Agent Starting Run: t1pbz53w with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 4
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 2000





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.34696


wandb: Agent Starting Run: 5e52d5mz with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 3
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 2400





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.43093


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: aunvj7qh with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 20
wandb: 	min_samples_leaf: 4
wandb: 	min_samples_split: 4
wandb: 	n_estimators: 2800





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.09187


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: atml8f7m with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 50
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 6
wandb: 	n_estimators: 2400





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.48179


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: mtzi41p6 with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 30
wandb: 	min_samples_leaf: 4
wandb: 	min_samples_split: 8
wandb: 	n_estimators: 2000





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.66686


wandb: Agent Starting Run: sw9i5s7z with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 1
wandb: 	min_samples_split: 8
wandb: 	n_estimators: 2400





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.55962


wandb: Agent Starting Run: 086tkpgf with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 4
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 2400





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.59795


wandb: Agent Starting Run: b5vwvagc with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 8
wandb: 	n_estimators: 2800





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.47528


wandb: Agent Starting Run: p7hpb1zt with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 50
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 6
wandb: 	n_estimators: 2400





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,22.92798


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: vmgu79oy with config:
wandb: 	bootstrap: True
wandb: 	max_depth: 40
wandb: 	min_samples_leaf: 4
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 2000





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.34861


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: cc3s8o02 with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 50
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 2800





VBox(children=(Label(value='3.813 MB of 3.813 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: fuo3bxld with config:
wandb: 	bootstrap: False
wandb: 	max_depth: 30
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 2400





VBox(children=(Label(value='4.215 MB of 4.215 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

wandb: Ctrl + C detected. Stopping sweep.


# Now estimate the forecast accuracy on the test set

# Gradient boost

In [None]:
# Bootstrap estimate of the gradient-boosting test accuracy: the test set is
# resampled with replacement M times and the model is scored on each resample.
shuffle = True
M = 1000
config = {
    'n_estimators': 10,
    'max_depth': 6,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    # The estimator call below reads this key; it was missing and raised a
    # KeyError. None is GradientBoostingRegressor's default (all features).
    'max_features': None,
    }

np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='SKLEARN_GB')

for m in range(M):

    ## Draw a bootstrap resample of the test set (same size, with replacement)
    seed = np.random.randint(10000)
    real_test_sampled = real_test.sample(n=len(real_test), replace=True, random_state=seed)
    y_test = real_test_sampled.LoadFactor
    X_test = real_test_sampled.loc[:, real_test_sampled.columns != 'LoadFactor']

    ## Build the model with the chosen hyperparameters
    # NOTE(review): fit_evaluate_model refits on the same training data in
    # every iteration; fitting once outside the loop would be much cheaper.
    model = GradientBoostingRegressor(n_estimators=config['n_estimators'],
                                    max_features=config['max_features'],
                                    max_depth=config['max_depth'],
                                    min_samples_split=config['min_samples_split'],
                                    min_samples_leaf=config['min_samples_leaf'])

    ## Fit on the training data and score on the resampled test data
    test_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=model)
    forecast_acc.append(test_acc)
    wandb.log({"accuracy": test_acc})


# The second value is labelled "Std.", so report the standard deviation
# (the variance was printed here before).
print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.std(forecast_acc)}')

# Random Forest

In [None]:
# Bootstrap estimate of the random-forest test accuracy: the test set is
# resampled with replacement M times and the model is scored on each resample.
shuffle = True
M = 1000
config = {
    # 1.0 means "use all features", which is what 'auto' meant for
    # RandomForestRegressor; the string 'auto' is rejected by scikit-learn >= 1.3.
    'max_features': 1.0,
    'n_estimators': 2000,
    'max_depth': 30,
    'min_samples_leaf': 4,
    'min_samples_split': 4,
    }

np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='RF')

for m in range(M):

    ## Draw a bootstrap resample of the test set (same size, with replacement)
    seed = np.random.randint(10000)
    real_test_sampled = real_test.sample(n=len(real_test), replace=True, random_state=seed)
    y_test = real_test_sampled.LoadFactor
    X_test = real_test_sampled.loc[:, real_test_sampled.columns != 'LoadFactor']

    ## Build the model with the chosen hyperparameters
    # NOTE(review): fit_evaluate_model refits a 2000-tree forest on the same
    # training data in every iteration; fitting once outside the loop would be
    # much cheaper.
    model = RandomForestRegressor(n_estimators=config['n_estimators'],
                                    max_features=config['max_features'],
                                    max_depth=config['max_depth'],
                                    min_samples_split=config['min_samples_split'],
                                    min_samples_leaf=config['min_samples_leaf'])

    ## Fit on the training data and score on the resampled test data
    test_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=model)
    forecast_acc.append(test_acc)
    wandb.log({"accuracy": test_acc})


# The second value is labelled "Std.", so report the standard deviation
# (the variance was printed here before).
print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.std(forecast_acc)}')

# We now have the best model; make predictions and save them to output.txt

In [None]:
# Refit the chosen model on all realized data and forecast the future flights.
realized = pd.read_csv('data/realized_preprocessed_data.csv', index_col=0)
# NOTE(review): the data-loading cell reads 'data/future_preprocessed.csv';
# confirm which of the two file names is the right one.
future = pd.read_csv('data/future_preprocessed_data.csv', index_col=0)

# Remove target from data
y = realized.LoadFactor
realized = realized.loc[:, realized.columns != 'LoadFactor']

X = realized

# Fit the SeatCapacity scaler on the full realized data. The column is scaled
# in place, so X_tr_m and X refer to the same (scaled) frame.
X_tr_m, fitted_scaler = normalize_seatcapacity_fit(X_train=X)

## Fit model to the training data
# NOTE(review): `model` is whatever estimator the previous cell left behind;
# it is not constructed in this cell.
# The original call was missing the comma between the two keyword arguments,
# which is a syntax error.
model.fit(X=X_tr_m, y=y)

## Normalize the future data's SeatCapacity with the same scaler (in place)
X_future = normalize_seatcapacity(X_val=future, scaler=fitted_scaler)
## Make predictions
pred = model.predict(X_future)

# prediction_file is the same object as `future`, so its SeatCapacity column
# holds the scaled values.
prediction_file = future
prediction_file['LoadFactor'] = pred



In [None]:
# Display the future-flight features together with the predicted LoadFactor.
prediction_file

In [None]:
# Write the predictions as comma-separated text. pandas DataFrames have no
# `to_txt` method; `to_csv` accepts the same sep/decimal arguments and writes
# to any file name.
prediction_file.to_csv('output.txt', sep=',', decimal='.')