# Case One: Project Notebook
By August and William

In [4]:
### Imports
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import wandb
from sklearn.model_selection import train_test_split

# 1. Data Loading
# `train.csv`
# `validation.csv`
# `test.csv`
# `future_preprocessed.csv`
real_train = pd.read_csv('data/train.csv', index_col=0)
real_val = pd.read_csv('data/validation.csv', index_col=0)
real_test = pd.read_csv('data/test.csv', index_col=0)

future = pd.read_csv('data/future_preprocessed.csv', index_col=0)

# Remove target from data
y_train = real_train.LoadFactor
real_train = real_train.loc[:, real_train.columns != 'LoadFactor']
y_val = real_val.LoadFactor
real_val = real_val.loc[:, real_val.columns != 'LoadFactor']
y_test = real_test.LoadFactor
real_test = real_test.loc[:, real_test.columns != 'LoadFactor']



## Make copy of **SeatCapacity** for computing forecast accuracy
real_train['SeatCapacityOriginal'] = real_train.SeatCapacity
real_val['SeatCapacityOriginal'] = real_val.SeatCapacity
real_test['SeatCapacityOriginal'] = real_test.SeatCapacity
# future['SeatCapacityOriginal'] = future.SeatCapacity

X_train = real_train
X_val = real_val
X_test = real_test

# # 3. Data splitting
# ## Split data into modeling data (will be training and validation) and test data

# ### Make train/val set *0.8 and test *0.2
# def split_model_test(X, y, seed=0, shuffle=False, stratify=False):
#     if stratify:
#         X_model, X_test, y_model, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=shuffle, stratify=y)
#     else:
#         X_model, X_test, y_model, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=shuffle)
#     return X_model, X_test, y_model, y_test

# def split_train_val(X_m, y_m, seed=0, shuffle=False, stratify=False):
#     if stratify:
#         X_train, X_val, y_train, y_val = train_test_split(X_m, y_m, test_size=0.25, random_state=seed, shuffle=shuffle, stratify=y_m)
#     else:
#         X_train, X_val, y_train, y_val = train_test_split(X_m, y_m, test_size=0.25, random_state=seed, shuffle=shuffle)
#     return X_train, X_val, y_train, y_val

def seperate_SCO(X_train_model, X_val_test):
    X_train_model_SCO, X_val_test_SCO = X_train_model.SeatCapacityOriginal, X_val_test.SeatCapacityOriginal

    X_train_model = X_train_model.loc[:, ~X_train_model.columns.isin(['SeatCapacityOriginal'])]
    X_val_test = X_val_test.loc[:, ~X_val_test.columns.isin(['SeatCapacityOriginal'])]

    return X_train_model, X_val_test, X_train_model_SCO, X_val_test_SCO


# 4. Define validation setup for different models
## Define forecast accuracy function
def mean_forecast_accuracy(loadfactor_forecasted, loadfactor_true, seatcapacity):

    passengers_true = loadfactor_true * seatcapacity
    passengers_forecasted = loadfactor_forecasted * seatcapacity
    
    abs_deviation_per_flight = np.abs((passengers_true-passengers_forecasted) / passengers_true)
    abs_deviation_per_flight[abs_deviation_per_flight >= 10000] = 100

    mean_forecast_acc = np.mean(1 - abs_deviation_per_flight*1)*100
    return mean_forecast_acc

## Define normalizer for training on **SeatCapacity**
def normalize_seatcapacity_fit(X_train):
    scaler = MinMaxScaler()
    scaler.fit(X_train.SeatCapacity.values.reshape(-1, 1))
    X_train.SeatCapacity = scaler.transform(X_train.SeatCapacity.values.reshape(-1, 1))
    return X_train, scaler

def normalize_seatcapacity(X_val, scaler):
    X_val.SeatCapacity = scaler.transform(X_val.SeatCapacity.values.reshape(-1, 1))
    return X_val

## Functions for fitting+validating models, as well as testing models
### Make function for fitting and validating model
def fit_evaluate_model(X_tr_m, X_v_te, y_tr_m, y_v_te, model):
    
    ## Remove original seatcapacity
    X_tr_m, X_v_te, X_tr_m_SCO, X_v_te_SCO = seperate_SCO(X_train_model=X_tr_m, X_val_test=X_v_te)

    ## Normalize seatcapacity
    X_tr_m, fitted_scaler = normalize_seatcapacity_fit(X_train=X_tr_m)
    ## Fit model to the training data
    model.fit(X=X_tr_m, y=y_tr_m)

    ## Normalize validation data SeatCapacity for predictions
    X_v_te = normalize_seatcapacity(X_val=X_v_te, scaler=fitted_scaler)
    ## Make predictions
    pred = model.predict(X_v_te)

    ## Compute forecasting accuracy
    acc = mean_forecast_accuracy(loadfactor_forecasted=pred, loadfactor_true=y_v_te.to_numpy(), seatcapacity=X_v_te_SCO.to_numpy())

    return acc, model

def evaluate_model(X_tr_m, X_v_te, y_tr_m, y_v_te, model):
    
    ## Remove original seatcapacity
    X_tr_m, X_v_te, X_tr_m_SCO, X_v_te_SCO = seperate_SCO(X_train_model=X_tr_m, X_val_test=X_v_te)

    # ## Normalize seatcapacity
    # X_tr_m, fitted_scaler = normalize_seatcapacity_fit(X_train=X_tr_m)

    # ## Normalize validation data SeatCapacity for predictions
    # X_v_te = normalize_seatcapacity(X_val=X_v_te, scaler=fitted_scaler)
    ## Make predictions
    pred = model.predict(X_v_te)

    ## Compute forecasting accuracy
    acc = mean_forecast_accuracy(loadfactor_forecasted=pred, loadfactor_true=y_v_te.to_numpy(), seatcapacity=X_v_te_SCO.to_numpy())

    return acc

In [10]:
X_train

Unnamed: 0,SeatCapacity,Month,WeekNumber,Weekday,HourOfDay,AircraftType_319,AircraftType_320,AircraftType_333,AircraftType_359,AircraftType_32A,...,Destination_YXU,Destination_YXX,Destination_YYJ,Destination_YYZ,Destination_YZF,Season_Fall,Season_Spring,Season_Summer,Season_Winter,SeatCapacityOriginal
0,142,1,53,4,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,142
1,74,1,53,4,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,74
2,142,1,53,4,12,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,142
3,72,1,53,4,13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,72
4,186,1,53,4,14,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33385,186,12,52,4,18,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,186
33386,189,12,52,4,19,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,189
33387,186,12,52,4,19,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,186
33388,186,12,52,4,19,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,186


In [11]:
X_val

Unnamed: 0,SeatCapacity,Month,WeekNumber,Weekday,HourOfDay,AircraftType_319,AircraftType_320,AircraftType_333,AircraftType_359,AircraftType_32A,...,Destination_YXU,Destination_YXX,Destination_YYJ,Destination_YYZ,Destination_YZF,Season_Fall,Season_Spring,Season_Summer,Season_Winter,SeatCapacityOriginal
23865,186,1,52,5,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,186
23866,138,1,52,5,5,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,138
23867,180,1,52,5,5,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,180
23868,189,1,52,5,7,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,189
23869,90,1,52,5,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36486,98,1,5,0,17,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,98
36487,156,1,5,0,17,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,156
36488,189,1,5,0,18,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,189
36489,156,1,5,0,18,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,156


In [9]:
X_test

Unnamed: 0,SeatCapacity,Month,WeekNumber,Weekday,HourOfDay,AircraftType_319,AircraftType_320,AircraftType_333,AircraftType_359,AircraftType_32A,...,Destination_YXU,Destination_YXX,Destination_YYJ,Destination_YYZ,Destination_YZF,Season_Fall,Season_Spring,Season_Summer,Season_Winter,SeatCapacityOriginal
35345,138,2,5,1,6,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,138
35346,168,2,5,1,6,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,168
35347,189,2,5,1,6,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,189
35348,189,2,5,1,7,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,189
35349,180,2,5,1,8,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39444,144,2,9,0,18,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,144
39445,156,2,9,0,19,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,156
39446,98,2,9,0,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,98
39447,186,2,9,0,19,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,186


# Part 1: Find best hyperparameters using val set

# Gradient boost

In [3]:
sweep_config = {
    "method": "random", # try grid or random
    "metric": {
      "name": "accuracy",
      "goal": "maximize"   
    },
    "parameters": {
        "n_estimators": {
            "values": [400, 800, 1200, 1600, 2000, 2500, 3000]
        },
        "max_depth": {
            "values": [3,4,5,6,7]
            # "distribution": "uniform",
            # "min": 0.00001,
            # "max": 1
        },
        "learning_rate": {
            "values": [0.05, 0.1, 0.15]
        }
        # ,
        # "subsample": {
        #     "values": [0.8, 1]
        # }
        ,
        "max_features": {
            "values": ['auto', 10, 20, 30]
        }

    }
}

sweep_id = wandb.sweep(sweep_config, project='02582_case1_boost_sweep', entity='tgml')

def train():
    config_defaults = {
    # 'bootstrap': True,
    # 'criterion': 'mse',
    # 'max_features': 'auto',
    # 'max_leaf_nodes': None,
    # 'min_impurity_decrease': 0.0,
    # 'min_impurity_split': None,
    # 'min_samples_leaf': 1,
    # 'min_samples_split': 2,
    # 'min_weight_fraction_leaf': 0.0,
    # 'n_estimators': 10,
    # 'n_jobs': 1,
    # 'oob_score': False,
    # 'random_state': 42,
    # 'verbose': 0,
    # 'warm_start': False,
    "seed": 0,
    "n_estimators": 1000,
    'max_depth': 3,
    "learning_rate": 0.1,
    "subsample": 1.,
    }

    wandb.init(project='02582_case1_boost_sweep', entity='tgml', config=config_defaults)  # defaults are over-ridden during the sweep
    config = wandb.config

    # fit model on train
    model = GradientBoostingRegressor(n_estimators=config.n_estimators, 
                                        learning_rate=config.learning_rate,
                                        max_depth=config.max_depth,
                                        subsample=config.subsample)

    train_on_val_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=model)

    wandb.log({"accuracy": train_on_val_acc})


wandb.agent(sweep_id, train, count=200)

Create sweep with ID: u8xoj2k4
Sweep URL: https://wandb.ai/tgml/02582_case1_boost_sweep/sweeps/u8xoj2k4


wandb: Agent Starting Run: fvy5og70 with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 4
wandb: 	max_features: auto
wandb: 	n_estimators: 800
wandb: Currently logged in as: augustsemrau (use `wandb login --relogin` to force relogin)





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,27.7516


wandb: Agent Starting Run: r5s36x2c with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 4
wandb: 	max_features: auto
wandb: 	n_estimators: 3000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,19.70288


wandb: Agent Starting Run: a65flyd5 with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 5
wandb: 	max_features: 20
wandb: 	n_estimators: 400





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,28.36987


wandb: Agent Starting Run: b858zbhd with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 6
wandb: 	max_features: 10
wandb: 	n_estimators: 1200





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,19.80661


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 0unsdois with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 6
wandb: 	max_features: 20
wandb: 	n_estimators: 2500





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,15.96299


wandb: Agent Starting Run: 7oyq4qba with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 6
wandb: 	max_features: auto
wandb: 	n_estimators: 2500





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,15.65632


wandb: Agent Starting Run: iq6gd8in with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	max_features: 20
wandb: 	n_estimators: 2500





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,16.39413


wandb: Agent Starting Run: 28v5wq6j with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	max_features: 20
wandb: 	n_estimators: 2000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,17.5248


wandb: Agent Starting Run: wdifh48p with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 6
wandb: 	max_features: 30
wandb: 	n_estimators: 800





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,28.06077


wandb: Agent Starting Run: 13jxvwb0 with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 5
wandb: 	max_features: auto
wandb: 	n_estimators: 1600





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.82704


wandb: Agent Starting Run: z0sujzu3 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 6
wandb: 	max_features: 30
wandb: 	n_estimators: 400





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.91044


wandb: Agent Starting Run: 88hi1t16 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 3
wandb: 	max_features: 20
wandb: 	n_estimators: 1600





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.64552


wandb: Agent Starting Run: q40gb59d with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 6
wandb: 	max_features: auto
wandb: 	n_estimators: 3000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,15.03169


wandb: Agent Starting Run: ezkzjy28 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 4
wandb: 	max_features: auto
wandb: 	n_estimators: 400





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,27.01173


wandb: Agent Starting Run: dhqdxwpa with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 6
wandb: 	max_features: auto
wandb: 	n_estimators: 2000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,16.72957


wandb: Agent Starting Run: 0r0tvuc3 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 6
wandb: 	max_features: 30
wandb: 	n_estimators: 800





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,24.49237


wandb: Agent Starting Run: xxwt2uh2 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	max_features: 20
wandb: 	n_estimators: 400





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,24.47208


wandb: Agent Starting Run: vs4bu56z with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	max_features: auto
wandb: 	n_estimators: 2000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,17.92578


wandb: Agent Starting Run: x95r3suy with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 6
wandb: 	max_features: 30
wandb: 	n_estimators: 400





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.86696


wandb: Agent Starting Run: igwa994f with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 6
wandb: 	max_features: 10
wandb: 	n_estimators: 1600





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.06795


wandb: Agent Starting Run: 4i9x2u3c with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 6
wandb: 	max_features: 10
wandb: 	n_estimators: 3000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,15.53341


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 4j1ey8y5 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 3
wandb: 	max_features: auto
wandb: 	n_estimators: 800





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,24.32351


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: fm8aix3r with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 4
wandb: 	max_features: 20
wandb: 	n_estimators: 800





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,27.7551


wandb: Agent Starting Run: 3ahetdjx with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 5
wandb: 	max_features: 20
wandb: 	n_estimators: 1600





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.83497


wandb: Agent Starting Run: xniqg2vq with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 7
wandb: 	max_features: 10
wandb: 	n_estimators: 1600





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,24.35758


wandb: Agent Starting Run: iqadg0ex with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 5
wandb: 	max_features: 10
wandb: 	n_estimators: 1600





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.81067


wandb: Agent Starting Run: 22ak2ib5 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 4
wandb: 	max_features: 20
wandb: 	n_estimators: 1200





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,24.37463


wandb: Agent Starting Run: rqu2ne3l with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 3
wandb: 	max_features: 10
wandb: 	n_estimators: 800





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,23.93398


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: o7d8wtrc with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	max_features: auto
wandb: 	n_estimators: 400





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,26.96018


wandb: Agent Starting Run: lqch6rpx with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 3
wandb: 	max_features: auto
wandb: 	n_estimators: 400





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,24.10738


wandb: Agent Starting Run: k2za71bd with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 5
wandb: 	max_features: auto
wandb: 	n_estimators: 3000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,16.55622


wandb: Agent Starting Run: 5nrakq7a with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 3
wandb: 	max_features: auto
wandb: 	n_estimators: 1600





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,24.65722


wandb: Agent Starting Run: 4sf65fmx with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 4
wandb: 	max_features: 20
wandb: 	n_estimators: 2000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,21.54335


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: 2umkvbwh with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 4
wandb: 	max_features: 10
wandb: 	n_estimators: 2000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,21.56012


wandb: Agent Starting Run: kd8uik2d with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 6
wandb: 	max_features: auto
wandb: 	n_estimators: 1600





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.00413


wandb: Agent Starting Run: glrgm3zs with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 6
wandb: 	max_features: 30
wandb: 	n_estimators: 2000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,20.06988


wandb: Agent Starting Run: rsz1rkqb with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 4
wandb: 	max_features: 30
wandb: 	n_estimators: 1200





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,27.04042


wandb: Agent Starting Run: h9lykuyz with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 4
wandb: 	max_features: 20
wandb: 	n_estimators: 1200





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,24.22301


wandb: Agent Starting Run: wqqv9p8u with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	max_features: 20
wandb: 	n_estimators: 800





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,24.74096


wandb: Agent Starting Run: lp2q8qh1 with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 4
wandb: 	max_features: 30
wandb: 	n_estimators: 2500





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.33163


wandb: Agent Starting Run: yj06h9vv with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 3
wandb: 	max_features: 10
wandb: 	n_estimators: 2500





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,20.06938


wandb: Agent Starting Run: rpj6a3oo with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	max_features: 10
wandb: 	n_estimators: 3000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,15.94251


wandb: Agent Starting Run: 8ophhrrw with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 6
wandb: 	max_features: 30
wandb: 	n_estimators: 3000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,22.4986


wandb: Agent Starting Run: aw5h9m3w with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 6
wandb: 	max_features: 10
wandb: 	n_estimators: 2500





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,19.20893


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: zd87mo45 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 7
wandb: 	max_features: 30
wandb: 	n_estimators: 800





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,24.45147


wandb: Agent Starting Run: q40hkik9 with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 4
wandb: 	max_features: auto
wandb: 	n_estimators: 1200





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,24.25245


wandb: Agent Starting Run: cmp5dj2k with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	max_features: 10
wandb: 	n_estimators: 2500





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,20.87481


wandb: Agent Starting Run: u8gbgiqu with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	max_features: 20
wandb: 	n_estimators: 1600





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,22.38115


wandb: Agent Starting Run: 3c0wx88c with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	max_features: 10
wandb: 	n_estimators: 2000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,17.89198


wandb: Agent Starting Run: i14b0lva with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 7
wandb: 	max_features: 30
wandb: 	n_estimators: 1200





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.53524


wandb: Agent Starting Run: 56eu6m5l with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 4
wandb: 	max_features: 20
wandb: 	n_estimators: 1600





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,26.64605


wandb: Agent Starting Run: w98zjalq with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 5
wandb: 	max_features: 30
wandb: 	n_estimators: 800





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,27.58791


wandb: Agent Starting Run: ea1ngs0s with config:
wandb: 	learning_rate: 0.05
wandb: 	max_depth: 4
wandb: 	max_features: 30
wandb: 	n_estimators: 2500





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.30489


wandb: Agent Starting Run: 59tx87me with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	max_features: 10
wandb: 	n_estimators: 2000





VBox(children=(Label(value='4.290 MB of 4.290 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,17.74402


wandb: Agent Starting Run: sece2wes with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 6
wandb: 	max_features: 20
wandb: 	n_estimators: 1600





VBox(children=(Label(value='4.502 MB of 4.502 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,21.39011


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: xsx1vgxu with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 5
wandb: 	max_features: 20
wandb: 	n_estimators: 1600





VBox(children=(Label(value='4.502 MB of 4.502 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,22.50459


wandb: Agent Starting Run: x0rjnjqt with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 7
wandb: 	max_features: 10
wandb: 	n_estimators: 800





VBox(children=(Label(value='4.502 MB of 4.502 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,25.05342


wandb: Agent Starting Run: ez0lccko with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 4
wandb: 	max_features: 10
wandb: 	n_estimators: 3000





VBox(children=(Label(value='4.502 MB of 4.502 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,19.57182


wandb: Agent Starting Run: 7fqipnh6 with config:
wandb: 	learning_rate: 0.15
wandb: 	max_depth: 7
wandb: 	max_features: auto
wandb: 	n_estimators: 1600


wandb: Ctrl + C detected. Stopping sweep.





VBox(children=(Label(value='4.518 MB of 4.518 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

# Random Forest

In [None]:
sweep_config = {
    "method": "random", # try grid or random
    "metric": {
      "name": "accuracy",
      "goal": "maximize"   
    },
    "parameters": {
        "n_estimators": {
            "values": [1800, 2000, 2400, 2800]
        },
        "max_depth": {
            "values": [20, 30, 40, 50]
        },
        "min_samples_split": {
            "values": [2,5,10]
        },
        "min_samples_leaf": {
            "values": [1,2,4]
        }
    }
}

sweep_id = wandb.sweep(sweep_config, project='02582_case1_rf_sweep', entity='tgml')

def train():
    config_defaults = {
    'bootstrap': True,
    'n_estimators': 1000,
    'max_depth': None,
    'max_features': 'auto',
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    # 'max_leaf_nodes': None,
    # 'criterion': 'mse',
    # 'min_impurity_decrease': 0.0,
    # 'min_impurity_split': None,
    # 'min_weight_fraction_leaf': 0.0,
    # 'n_jobs': 1,
    # 'oob_score': False,
    # 'random_state': 42,
    # 'verbose': 0,
    # 'warm_start': False,
    # "seed": 0,
    # "shuffle": True,
    }

    wandb.init(project='02582_case1_rf_sweep', entity='tgml', config=config_defaults)  # defaults are over-ridden during the sweep
    config = wandb.config


    # fit model on train
    model = RandomForestRegressor(n_estimators=config.n_estimators, bootstrap=config.bootstrap,
                            max_depth=config.max_depth, min_samples_split=config.min_samples_split, min_samples_leaf=config.min_samples_leaf)

    # model_on_test_acc, _ = fit_evaluate_model(X_tr_m=X_model, X_v_te=X_test, y_tr_m=y_model, y_v_te=y_test, model=model)
    # train_on_test_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=model)
    train_on_val_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=model)

    wandb.log({"accuracy": train_on_val_acc})


wandb.agent(sweep_id, train, count=100)

# Now make accuracy prediction using test set

# Gradient boost

In [None]:
shuffle = True
M = 1000
config = {
    'n_estimators': 10,
    'max_depth': 6,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    }

np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='SKLEARN_GB')

## Train and evaluate model
model = GradientBoostingRegressor(n_estimators=config['n_estimators'], 
                                max_features=config['max_features'],
                                max_depth=config['max_depth'], 
                                min_samples_split=config['min_samples_split'], 
                                min_samples_leaf=config['min_samples_leaf'])

## Evaluate best model on test data
test_acc, fitted_model = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=model)
for m in range(M):

    ## Sample from test set
    seed = np.random.randint(10000)
    real_test_sampled = real_test
    real_test_sampled = real_test_sampled.sample(n=len(real_test), replace=True, random_state=seed)
    y_test = real_test_sampled.LoadFactor
    X_test = real_test_sampled.loc[:, real_test.columns != 'LoadFactor']

    test_acc = evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=fitted_model)
    forecast_acc.append(test_acc)
    wandb.log({"accuracy": test_acc})


print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.var(forecast_acc)}')

print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.var(forecast_acc)}')

# Random Forest

In [None]:
shuffle = True
M = 1000
config = {
    # 'bootstrap': True,
    # 'criterion': 'mse',
    # 'max_leaf_nodes': None,
    # 'min_impurity_decrease': 0.0,
    # 'min_impurity_split': None,
    # 'min_weight_fraction_leaf': 0.0,
    # 'n_jobs': 1,
    # 'oob_score': False,
    # 'random_state': 42,
    # 'verbose': 0,
    # 'warm_start': False,
    # "shuffle": True,
    'max_features': 'auto',
    'n_estimators': 2000,
    'max_depth': 30,
    'min_samples_leaf': 4,
    'min_samples_split': 4,
    }

np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='RF')

## Train and evaluate model
model = RandomForestRegressor(n_estimators=config['n_estimators'], 
                                max_features=config['max_features'],
                                max_depth=config['max_depth'], 
                                min_samples_split=config['min_samples_split'], 
                                min_samples_leaf=config['min_samples_leaf'])

## Evaluate best model on test data
test_acc, fitted_model = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=model)

for m in range(M):

    ## Sample from test set
    seed = np.random.randint(10000)
    real_test_sampled = real_test
    real_test_sampled = real_test_sampled.sample(n=len(real_test), replace=True, random_state=seed)
    y_test = real_test_sampled.LoadFactor
    X_test = real_test_sampled.loc[:, real_test.columns != 'LoadFactor']

    test_acc = evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=fitted_model)
    forecast_acc.append(test_acc)
    wandb.log({"accuracy": test_acc})


print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.var(forecast_acc)}')

# We now have the best model, make predictions and save to output.txt

In [None]:
realized = pd.read_csv('data/realized_preprocessed_data.csv', index_col=0)
future = pd.read_csv('data/future_preprocessed_data.csv', index_col=0)

# Remove target from data
y = realized.LoadFactor
realized = realized.loc[:, realized.columns != 'LoadFactor']

X = realized

# Fit model
X_tr_m, fitted_scaler = normalize_seatcapacity_fit(X_train=X)

## Fit model to the training data
model.fit(X=X y=y)

## Normalize validation data SeatCapacity for predictions
X_future = normalize_seatcapacity(X_val=future, scaler=fitted_scaler)
## Make predictions
pred = model.predict(X_future)

prediction_file = future
prediction_file['LoadFactor'] = pred



In [None]:
prediction_file

In [None]:
prediction_file.to_txt('output.txt', sep=',', decimal='.')