# Case One: Project Notebook
By August and William

In [1]:
### Imports
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import wandb
from sklearn.model_selection import train_test_split

# 1. Data Loading
# `train.csv`
# `validation.csv`
# `test.csv`
# `future_preprocessed.csv`
real_train = pd.read_csv('data/train.csv', index_col=0)
real_val = pd.read_csv('data/validation.csv', index_col=0)
real_test = pd.read_csv('data/test.csv', index_col=0)

future = pd.read_csv('data/future_preprocessed.csv', index_col=0)

# Departure-count columns that are excluded from the model features of every split.
dropped_features = ['TotalDeparturesDay', 'TotalDeparturesWeek', 'TotalDeparturesMonth']

# Remove target from data
y_train = real_train.LoadFactor
y_val = real_val.LoadFactor
y_test = real_test.LoadFactor

# errors='ignore' mirrors the original boolean-mask filtering, which silently
# tolerated a column that was not present.
real_train = real_train.drop(columns=['LoadFactor'] + dropped_features, errors='ignore')
real_val = real_val.drop(columns=['LoadFactor'] + dropped_features, errors='ignore')
# real_test keeps LoadFactor on purpose: the bootstrap cells further down resample
# real_test row-wise and split the target off each resample themselves.
real_test = real_test.drop(columns=dropped_features, errors='ignore')

## Make copy of **SeatCapacity** for computing forecast accuracy
## (SeatCapacity itself is min-max scaled before fitting, so the raw value is kept aside)
real_train['SeatCapacityOriginal'] = real_train.SeatCapacity
real_val['SeatCapacityOriginal'] = real_val.SeatCapacity
real_test['SeatCapacityOriginal'] = real_test.SeatCapacity

X_train = real_train
X_val = real_val
# Test features/target are defined here so the evaluation cells can run on a fresh kernel.
X_test = real_test.drop(columns=['LoadFactor'])


# # 3. Data splitting
# ## Split data into modeling data (will be training and validation) and test data

# ### Make train/val set *0.8 and test *0.2
# def split_model_test(X, y, seed=0, shuffle=False, stratify=False):
#     if stratify:
#         X_model, X_test, y_model, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=shuffle, stratify=y)
#     else:
#         X_model, X_test, y_model, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=shuffle)
#     return X_model, X_test, y_model, y_test

# def split_train_val(X_m, y_m, seed=0, shuffle=False, stratify=False):
#     if stratify:
#         X_train, X_val, y_train, y_val = train_test_split(X_m, y_m, test_size=0.25, random_state=seed, shuffle=shuffle, stratify=y_m)
#     else:
#         X_train, X_val, y_train, y_val = train_test_split(X_m, y_m, test_size=0.25, random_state=seed, shuffle=shuffle)
#     return X_train, X_val, y_train, y_val

def seperate_SCO(X_train_model, X_val_test):
    """Split the ``SeatCapacityOriginal`` column off two feature frames.

    Parameters
    ----------
    X_train_model, X_val_test
        Frames that each expose a ``SeatCapacityOriginal`` column.

    Returns
    -------
    tuple
        ``(X_train_model, X_val_test, X_train_model_SCO, X_val_test_SCO)``:
        both frames without ``SeatCapacityOriginal``, followed by the two
        removed columns.
    """
    sco_column = 'SeatCapacityOriginal'

    # Pull both columns out before filtering anything.
    train_sco = X_train_model.SeatCapacityOriginal
    val_sco = X_val_test.SeatCapacityOriginal

    frames_without_sco = tuple(
        frame.loc[:, frame.columns != sco_column]
        for frame in (X_train_model, X_val_test)
    )
    return frames_without_sco + (train_sco, val_sco)


# 4. Define validation setup for different models
## Define forecast accuracy function
def mean_forecast_accuracy(loadfactor_forecasted, loadfactor_true, seatcapacity):
    """Mean forecast accuracy, in percent, measured on passenger counts.

    Load factors are multiplied by ``seatcapacity`` to obtain passenger counts,
    and each flight's absolute deviation is taken relative to the true count.
    Deviations of 10000 or more (this includes ``inf`` from a true count of
    zero) are replaced by 100 before averaging.

    Returns
    -------
    ``mean(1 - relative_deviation) * 100``.
    """
    actual_pax = loadfactor_true * seatcapacity
    predicted_pax = loadfactor_forecasted * seatcapacity

    rel_deviation = np.abs((actual_pax - predicted_pax) / actual_pax)

    # Cap runaway deviations so one flight cannot dominate the mean.
    runaway = rel_deviation >= 10000
    rel_deviation[runaway] = 100

    return np.mean(1 - rel_deviation * 1) * 100

## Define normalizer for training on **SeatCapacity**
def normalize_seatcapacity_fit(X_train):
    """Fit a ``MinMaxScaler`` on ``SeatCapacity`` and scale that column.

    The ``SeatCapacity`` column of ``X_train`` is overwritten in place.

    Returns
    -------
    tuple
        ``(X_train, scaler)``: the same frame object and the fitted scaler,
        to be reused on other data via ``normalize_seatcapacity``.
    """
    # The scaler expects a 2-D array, hence the single-column reshape.
    seat_capacity = X_train.SeatCapacity.values.reshape(-1, 1)

    scaler = MinMaxScaler()
    scaler.fit(seat_capacity)

    X_train['SeatCapacity'] = scaler.transform(seat_capacity)
    return X_train, scaler

def normalize_seatcapacity(X_val, scaler):
    """Scale ``SeatCapacity`` of ``X_val`` with an already fitted scaler.

    The column is overwritten in place and the same frame object is returned.
    ``scaler`` only needs a ``transform`` method that accepts a 2-D array.
    """
    raw_capacity = X_val.SeatCapacity.values.reshape(-1, 1)
    X_val['SeatCapacity'] = scaler.transform(raw_capacity)
    return X_val

## Functions for fitting+validating models, as well as testing models
### Make function for fitting and validating model
def fit_evaluate_model(X_tr_m, X_v_te, y_tr_m, y_v_te, model):
    """Fit ``model`` on the training data and score it on the evaluation data.

    Parameters
    ----------
    X_tr_m, X_v_te
        Training and evaluation features; both must contain ``SeatCapacity``
        and ``SeatCapacityOriginal``.
    y_tr_m, y_v_te
        Training and evaluation targets (``y_v_te`` must support ``to_numpy()``).
    model
        Estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    tuple
        ``(accuracy, model)``: the mean forecast accuracy on the evaluation
        data and the fitted estimator.
    """
    # Set the raw seat capacities aside; only the evaluation one is needed for scoring.
    train_features, eval_features, _, eval_seat_capacity = seperate_SCO(
        X_train_model=X_tr_m, X_val_test=X_v_te
    )

    # Scale SeatCapacity on the training data, then fit.
    train_features, fitted_scaler = normalize_seatcapacity_fit(X_train=train_features)
    model.fit(X=train_features, y=y_tr_m)

    # Apply the training scaler to the evaluation data before predicting.
    eval_features = normalize_seatcapacity(X_val=eval_features, scaler=fitted_scaler)
    predictions = model.predict(eval_features)

    accuracy = mean_forecast_accuracy(
        loadfactor_forecasted=predictions,
        loadfactor_true=y_v_te.to_numpy(),
        seatcapacity=eval_seat_capacity.to_numpy(),
    )
    return accuracy, model

def evaluate_model(X_tr_m, X_v_te, y_tr_m, y_v_te, model):
    """Score an already fitted model on evaluation data.

    ``fit_evaluate_model`` trains on min-max-scaled ``SeatCapacity``, so the
    evaluation features must be scaled the same way before predicting. The
    scaler is re-fitted on ``X_tr_m``; min-max scaling is deterministic, so
    this reproduces the scaler used at fit time as long as the same training
    frame is passed.

    Parameters
    ----------
    X_tr_m
        Training features (with ``SeatCapacity`` and ``SeatCapacityOriginal``);
        used only to re-fit the scaler.
    X_v_te
        Evaluation features (with ``SeatCapacity`` and ``SeatCapacityOriginal``).
    y_tr_m
        Unused; kept so the signature matches ``fit_evaluate_model``.
    y_v_te
        Evaluation targets (must support ``to_numpy()``).
    model
        Fitted estimator exposing ``predict``.

    Returns
    -------
    The mean forecast accuracy on the evaluation data.
    """
    ## Remove original seatcapacity (seperate_SCO returns new frames, so the
    ## in-place scaling below does not touch the caller's data)
    X_tr_m, X_v_te, X_tr_m_SCO, X_v_te_SCO = seperate_SCO(X_train_model=X_tr_m, X_val_test=X_v_te)

    ## Re-fit the scaler on the training data
    _, fitted_scaler = normalize_seatcapacity_fit(X_train=X_tr_m)

    ## Normalize evaluation data SeatCapacity for predictions
    X_v_te = normalize_seatcapacity(X_val=X_v_te, scaler=fitted_scaler)

    ## Make predictions
    pred = model.predict(X_v_te)

    ## Compute forecasting accuracy
    acc = mean_forecast_accuracy(loadfactor_forecasted=pred, loadfactor_true=y_v_te.to_numpy(), seatcapacity=X_v_te_SCO.to_numpy())

    return acc

In [4]:
# Display the training feature matrix; it still carries the SeatCapacityOriginal helper column.
X_train

Unnamed: 0,SeatCapacity,Year,Month,WeekNumber,Weekday,HourOfDay,AircraftType_319,AircraftType_320,AircraftType_333,AircraftType_359,...,Destination_YXU,Destination_YXX,Destination_YYJ,Destination_YYZ,Destination_YZF,Season_Fall,Season_Spring,Season_Summer,Season_Winter,SeatCapacityOriginal
0,142,2021,1,53,4,6,0,0,0,0,...,0,0,0,0,0,0,0,0,1,142
1,74,2021,1,53,4,10,0,0,0,0,...,0,0,0,0,0,0,0,0,1,74
2,142,2021,1,53,4,12,0,0,0,0,...,0,0,0,0,0,0,0,0,1,142
3,72,2021,1,53,4,13,0,0,0,0,...,0,0,0,0,0,0,0,0,1,72
4,186,2021,1,53,4,14,0,0,0,0,...,0,0,0,0,0,0,0,0,1,186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33385,186,2021,12,52,4,18,0,0,0,0,...,0,0,0,0,0,0,0,0,1,186
33386,189,2021,12,52,4,19,0,0,0,0,...,0,0,0,0,0,0,0,0,1,189
33387,186,2021,12,52,4,19,0,1,0,0,...,0,0,0,0,0,0,0,0,1,186
33388,186,2021,12,52,4,19,0,0,0,0,...,0,0,0,0,0,0,0,0,1,186


# Part 1: Find the best hyperparameters using the validation set

# Gradient boost

In [7]:
# Sweep definition for the gradient-boosting model: hyperparameter values are
# drawn at random from the lists below and the logged "accuracy" is maximised.
# ("grid" is the alternative search method; subsample and max_features are not swept.)
sweep_config = dict(
    method="random",
    metric=dict(name="accuracy", goal="maximize"),
    parameters=dict(
        n_estimators=dict(values=[400, 800, 1200, 1600, 2000, 2500, 3000]),
        max_depth=dict(values=[3, 4, 5, 6, 7, 9, 11]),
        learning_rate=dict(values=[0.05, 0.1, 0.15, 0.2]),
    ),
)

sweep_id = wandb.sweep(
    sweep_config,
    project='02582_case1_boost_sweep',
    entity='tgml',
)

def train():
    """Run one gradient-boosting sweep trial and log its validation accuracy.

    Starts a wandb run, builds a ``GradientBoostingRegressor`` from the run
    configuration, fits it on the training split, scores it on the validation
    split and logs the result under the ``"accuracy"`` key.
    """
    config_defaults = {
        "seed": 0,
        "n_estimators": 1000,
        'max_depth': 3,
        "learning_rate": 0.1,
        "subsample": 1.,
    }

    wandb.init(project='02582_case1_boost_sweep', entity='tgml', config=config_defaults)  # defaults are over-ridden during the sweep
    config = wandb.config

    # fit model on train
    # random_state is taken from the configured seed so that trials differ only
    # in their hyperparameters (the seed was previously declared but never used).
    model = GradientBoostingRegressor(n_estimators=config.n_estimators,
                                      learning_rate=config.learning_rate,
                                      max_depth=config.max_depth,
                                      subsample=config.subsample,
                                      random_state=config.seed)

    train_on_val_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=model)

    wandb.log({"accuracy": train_on_val_acc})


# Launch the sweep agent: it calls train() once per trial, for at most 20 trials.
wandb.agent(sweep_id, train, count=20)

Create sweep with ID: 01nh2p42
Sweep URL: https://wandb.ai/tgml/02582_case1_boost_sweep/sweeps/01nh2p42


wandb: Agent Starting Run: ib5naymh with config:
wandb: 	learning_rate: 0.1
wandb: 	max_depth: 9
wandb: 	n_estimators: 1600





VBox(children=(Label(value='2.590 MB of 2.590 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁

0,1
accuracy,19.14147


wandb: Agent Starting Run: cj8lf2xi with config:
wandb: 	learning_rate: 0.2
wandb: 	max_depth: 6
wandb: 	n_estimators: 3000


wandb: Ctrl + C detected. Stopping sweep.
wandb: ERROR Problem finishing run





Traceback (most recent call last):
  File "/home/augustsemrau/.local/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 1711, in _atexit_cleanup
    self._on_finish()
  File "/home/augustsemrau/.local/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 1835, in _on_finish
    time.sleep(0.1)
Exception


VBox(children=(Label(value='2.590 MB of 2.590 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

# Random Forest

In [10]:
# Sweep definition for the random-forest model: hyperparameter values are
# drawn at random from the lists below and the logged "accuracy" is maximised.
# ("grid" is the alternative search method.)
sweep_config = dict(
    method="random",
    metric=dict(name="accuracy", goal="maximize"),
    parameters=dict(
        n_estimators=dict(values=[1800, 2000, 2400, 2800]),
        max_depth=dict(values=[20, 30, 40, 50]),
        min_samples_split=dict(values=[2, 5, 10]),
        min_samples_leaf=dict(values=[1, 2, 4]),
    ),
)

sweep_id = wandb.sweep(
    sweep_config,
    project='02582_case1_rf_sweep',
    entity='tgml',
)

def train():
    """Run one random-forest sweep trial and log its validation accuracy.

    Starts a wandb run, builds a ``RandomForestRegressor`` from the run
    configuration, fits it on the training split, scores it on the validation
    split and logs the result under the ``"accuracy"`` key.
    """
    config_defaults = {
        # Recorded in the run config only; these are not passed to the estimator.
        'criterion': 'mse',
        'max_features': 'auto',
        'max_leaf_nodes': None,
        'min_impurity_decrease': 0.0,
        'min_impurity_split': None,
        # Passed to the estimator below (sweep values override these defaults).
        'bootstrap': True,
        'max_depth': None,
        'min_samples_leaf': 1,
        'min_samples_split': 2,
        # Default so train() also works outside a sweep, where n_estimators is not injected.
        'n_estimators': 100,
        # Fixed seed so trials differ only in their hyperparameters.
        'seed': 0,
    }

    wandb.init(project='02582_case1_rf_sweep', entity='tgml', config=config_defaults)  # defaults are over-ridden during the sweep
    config = wandb.config

    # fit model on train
    model = RandomForestRegressor(n_estimators=config.n_estimators, bootstrap=config.bootstrap,
                                  max_depth=config.max_depth, min_samples_split=config.min_samples_split,
                                  min_samples_leaf=config.min_samples_leaf, random_state=config.seed)

    train_on_val_acc, _ = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=model)

    wandb.log({"accuracy": train_on_val_acc})


# Launch the sweep agent: it calls train() once per trial, for at most 5 trials.
wandb.agent(sweep_id, train, count=5)

Create sweep with ID: dzznjb43
Sweep URL: https://wandb.ai/tgml/02582_case1_rf_sweep/sweeps/dzznjb43


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: i7y2035a with config:
wandb: 	max_depth: 50
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 5
wandb: 	n_estimators: 2800


wandb: Ctrl + C detected. Stopping sweep.





VBox(children=(Label(value='2.553 MB of 2.553 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

# Now estimate the forecast accuracy on the test set

# Gradient boost

In [None]:
shuffle = True
M = 1000  # number of bootstrap resamples of the test set
config = {
    'n_estimators': 10,
    'max_features': None,  # None = consider every feature; the key was missing and raised KeyError below
    'max_depth': 6,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    }

np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='SKLEARN_GB')

## Build the test frame with the same feature columns as the training data.
## LoadFactor stays in test_frame so that features and target are resampled together.
test_frame = real_test.drop(
    columns=['TotalDeparturesDay', 'TotalDeparturesWeek', 'TotalDeparturesMonth'],
    errors='ignore',
)
y_test = test_frame.LoadFactor
X_test = test_frame.drop(columns=['LoadFactor'])

## Train and evaluate model
model = GradientBoostingRegressor(n_estimators=config['n_estimators'],
                                max_features=config['max_features'],
                                max_depth=config['max_depth'],
                                min_samples_split=config['min_samples_split'],
                                min_samples_leaf=config['min_samples_leaf'])

## Evaluate best model on test data
test_acc, fitted_model = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=model)

## Bootstrap the test set to estimate the spread of the accuracy
for m in range(M):

    ## Sample from test set (with replacement, same size as the test set)
    seed = np.random.randint(10000)
    test_sampled = test_frame.sample(n=len(test_frame), replace=True, random_state=seed)
    y_test_sampled = test_sampled.LoadFactor
    X_test_sampled = test_sampled.drop(columns=['LoadFactor'])

    sampled_acc = evaluate_model(X_tr_m=X_train, X_v_te=X_test_sampled, y_tr_m=y_train, y_v_te=y_test_sampled, model=fitted_model)
    forecast_acc.append(sampled_acc)
    wandb.log({"accuracy": sampled_acc})


# np.std (not np.var) so the number matches its "Std." label
print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.std(forecast_acc)}')

# Random Forest

In [None]:
shuffle = True
M = 1000  # number of bootstrap resamples of the test set
config = {
    # 1.0 = use all features, which is what 'auto' meant for RandomForestRegressor;
    # 'auto' itself is rejected by recent scikit-learn releases.
    'max_features': 1.0,
    'n_estimators': 2000,
    'max_depth': 30,
    'min_samples_leaf': 4,
    'min_samples_split': 4,
    }

np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='RF')

## Build the test frame with the same feature columns as the training data.
## LoadFactor stays in test_frame so that features and target are resampled together.
test_frame = real_test.drop(
    columns=['TotalDeparturesDay', 'TotalDeparturesWeek', 'TotalDeparturesMonth'],
    errors='ignore',
)
y_test = test_frame.LoadFactor
X_test = test_frame.drop(columns=['LoadFactor'])

## Train and evaluate model
model = RandomForestRegressor(n_estimators=config['n_estimators'],
                                max_features=config['max_features'],
                                max_depth=config['max_depth'],
                                min_samples_split=config['min_samples_split'],
                                min_samples_leaf=config['min_samples_leaf'])

## Evaluate best model on test data
test_acc, fitted_model = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_test, y_tr_m=y_train, y_v_te=y_test, model=model)

## Bootstrap the test set to estimate the spread of the accuracy
for m in range(M):

    ## Sample from test set (with replacement, same size as the test set)
    seed = np.random.randint(10000)
    test_sampled = test_frame.sample(n=len(test_frame), replace=True, random_state=seed)
    y_test_sampled = test_sampled.LoadFactor
    X_test_sampled = test_sampled.drop(columns=['LoadFactor'])

    sampled_acc = evaluate_model(X_tr_m=X_train, X_v_te=X_test_sampled, y_tr_m=y_train, y_v_te=y_test_sampled, model=fitted_model)
    forecast_acc.append(sampled_acc)
    wandb.log({"accuracy": sampled_acc})


# np.std (not np.var) so the number matches its "Std." label
print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.std(forecast_acc)}')

# We now have the best model: refit it on all realized data, make predictions for the future data, and save them to output.txt

In [None]:
realized = pd.read_csv('data/realized_preprocessed_data.csv', index_col=0)
future = pd.read_csv('data/future_preprocessed_data.csv', index_col=0)

# Remove target from data
y = realized.LoadFactor
realized = realized.drop(columns=['LoadFactor'])

X = realized

# Scale SeatCapacity on all realized data (X is modified in place and returned)
X_tr_m, fitted_scaler = normalize_seatcapacity_fit(X_train=X)

## Refit the model on all realized data.
## NOTE: `model` is the estimator left behind by the most recently run evaluation cell.
model.fit(X=X_tr_m, y=y)

## Normalize SeatCapacity of the future data for predictions.
## A copy is scaled so that `future` (and the saved file) keeps the raw SeatCapacity.
X_future = normalize_seatcapacity(X_val=future.copy(), scaler=fitted_scaler)
## Make predictions
pred = model.predict(X_future)

## Future features plus the predicted load factor
prediction_file = future.assign(LoadFactor=pred)



In [None]:
# Display the prediction table (future features plus the predicted LoadFactor column).
prediction_file

In [None]:
# DataFrame has no to_txt method; to_csv writes the same comma-separated text file.
prediction_file.to_csv('output.txt', sep=',', decimal='.')