# Case One: Project Notebook
By August and William

In [4]:
### Imports
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import wandb
from sklearn.model_selection import train_test_split

import tqdm

# 1. Data Loading
# `train.csv`
# `validation.csv`
# `test.csv`
# `future_preprocessed.csv`
real_train = pd.read_csv('data/train.csv', index_col=0)
real_val = pd.read_csv('data/validation.csv', index_col=0)
real_test = pd.read_csv('data/test.csv', index_col=0)

real_model = pd.concat([real_train, real_val])
real_all = pd.concat([real_model, real_test])

future = pd.read_csv('data/future_preprocessed.csv', index_col=0)

# Remove target from data
y_train = real_train.LoadFactor
real_train = real_train.loc[:, real_train.columns != 'LoadFactor']
y_val = real_val.LoadFactor
real_val = real_val.loc[:, real_val.columns != 'LoadFactor']
y_model = real_model.LoadFactor
real_model = real_model.loc[:, real_model.columns != 'LoadFactor']
y_all = real_all.LoadFactor
real_all = real_all.loc[:, real_all.columns != 'LoadFactor']

# y_test = real_test.LoadFactor
# real_test = real_test.loc[:, real_test.columns != 'LoadFactor']



## Make copy of **SeatCapacity** for computing forecast accuracy
real_train['SeatCapacityOriginal'] = real_train.SeatCapacity
real_val['SeatCapacityOriginal'] = real_val.SeatCapacity
real_test['SeatCapacityOriginal'] = real_test.SeatCapacity
real_model['SeatCapacityOriginal'] = real_model.SeatCapacity
real_all_SCO = real_all.SeatCapacity
# future['SeatCapacityOriginal'] = future.SeatCapacity

X_train = real_train
X_val = real_val
X_model = real_model
X_all = real_all

def seperate_SCO(X_train_model, X_val_test):
    X_train_model_SCO, X_val_test_SCO = X_train_model.SeatCapacityOriginal, X_val_test.SeatCapacityOriginal

    X_train_model = X_train_model.loc[:, ~X_train_model.columns.isin(['SeatCapacityOriginal'])]
    X_val_test = X_val_test.loc[:, ~X_val_test.columns.isin(['SeatCapacityOriginal'])]

    return X_train_model, X_val_test, X_train_model_SCO, X_val_test_SCO


# 4. Define validation setup for different models
## Define forecast accuracy function
def mean_forecast_accuracy(loadfactor_forecasted, loadfactor_true, seatcapacity):

    passengers_true = loadfactor_true * seatcapacity
    passengers_forecasted = loadfactor_forecasted * seatcapacity
    
    abs_deviation_per_flight = np.abs((passengers_true-passengers_forecasted) / passengers_true)
    abs_deviation_per_flight[abs_deviation_per_flight >= 10000] = 100

    mean_forecast_acc = np.mean(1 - abs_deviation_per_flight*1)*100
    return mean_forecast_acc

## Define normalizer for training on **SeatCapacity**
def normalize_seatcapacity_fit(X_train):
    scaler = MinMaxScaler()
    scaler.fit(X_train.SeatCapacity.values.reshape(-1, 1))
    X_train_norm = X_train
    X_train_norm.SeatCapacity = scaler.transform(X_train_norm.SeatCapacity.values.reshape(-1, 1))
    return X_train_norm, scaler

def normalize_seatcapacity(X_val, scaler):
    X_val_norm = X_val
    X_val_norm.SeatCapacity = scaler.transform(X_val.SeatCapacity.values.reshape(-1, 1))
    return X_val_norm

## Functions for fitting+validating models, as well as testing models
### Make function for fitting and validating model
def fit_evaluate_model(X_tr_m, X_v_te, y_tr_m, y_v_te, model):
    
    ## Remove original seatcapacity
    X_tr_m, X_v_te, X_tr_m_SCO, X_v_te_SCO = seperate_SCO(X_train_model=X_tr_m, X_val_test=X_v_te)

    ## Normalize seatcapacity
    X_tr_m, fitted_scaler = normalize_seatcapacity_fit(X_train=X_tr_m)

    ## Fit model to the training data
    model.fit(X=X_tr_m, y=y_tr_m)

    X_v_te['LoadFactor'] = y_v_te
    X_v_te['SeatCapacityOriginal'] = X_v_te.SeatCapacity

    ## Normalize validation data SeatCapacity for predictions
    X_v_te = normalize_seatcapacity(X_val=X_v_te, scaler=fitted_scaler)

    sample_accs = []
    for m in range(100):

        ## Sample from test set
        seed = np.random.randint(10000)
        X_v_te_sampled = X_v_te
        # X_v_te_sampled['LoadFactor'] = y_v_te
        X_v_te_sampled = X_v_te_sampled.sample(n=len(X_v_te), replace=True, random_state=seed)
        y_sampled = X_v_te_sampled.LoadFactor
        X_sampled = X_v_te_sampled.loc[:, X_v_te_sampled.columns != 'LoadFactor']
        SCO_sampled = X_v_te_sampled.SeatCapacityOriginal
        X_sampled = X_sampled.loc[:, X_sampled.columns != 'SeatCapacityOriginal']

        ## Make predictions
        sample_pred = model.predict(X_sampled)

        ## Compute forecasting accuracy
        # test_acc = evaluate_model(X_v_te=X_test, y_v_te=y_test, model=fitted_model, fitted_scaler=fitted_scaler)
        acc = mean_forecast_accuracy(loadfactor_forecasted=sample_pred, loadfactor_true=y_sampled.to_numpy(), seatcapacity=SCO_sampled.to_numpy())
        sample_accs.append(acc)
        # print(acc)
    # wandb.log({"accuracy": np.mean(sample_accs)})

    # pred = model.predict(X_v_te)
    # ## Compute forecasting accuracy
    # acc = mean_forecast_accuracy(loadfactor_forecasted=pred, loadfactor_true=y_v_te.to_numpy(), seatcapacity=X_v_te_SCO.to_numpy())
    
    

    return sample_accs, model, fitted_scaler

def fit_model(X_tr_m, y_tr_m, model):
    
    ## Remove original seatcapacity
    # X_tr_m, X_v_te, X_tr_m_SCO, X_v_te_SCO = seperate_SCO(X_train_model=X_tr_m, X_val_test=X_v_te)
    # X_train_model_SCO= X_tr_m.SeatCapacityOriginal
    X_tr_m = X_tr_m.loc[:, ~X_tr_m.columns.isin(['SeatCapacityOriginal'])]

    ## Normalize seatcapacity
    X_tr_m, fitted_scaler = normalize_seatcapacity_fit(X_train=X_tr_m)
    
    ## Fit model to the training data
    model.fit(X=X_tr_m, y=y_tr_m)

    return model, fitted_scaler
    
def evaluate_model(X_v_te, y_v_te, model, fitted_scaler):
    
    ## Remove original seatcapacity
    X_test_SCO= X_v_te.SeatCapacityOriginal
    X_v_te = X_v_te.loc[:, ~X_v_te.columns.isin(['SeatCapacityOriginal'])]
    
    ## Normalize validation data SeatCapacity for predictions
    X_v_te = normalize_seatcapacity(X_val=X_v_te, scaler=fitted_scaler)
    # ## Normalize seatcapacity
    # X_tr_m, fitted_scaler = normalize_seatcapacity_fit(X_train=X_tr_m)

    # ## Normalize validation data SeatCapacity for predictions
    # X_v_te = normalize_seatcapacity(X_val=X_v_te, scaler=fitted_scaler)
    ## Make predictions
    pred = model.predict(X_v_te)

    ## Compute forecasting accuracy
    acc = mean_forecast_accuracy(loadfactor_forecasted=pred, loadfactor_true=y_v_te.to_numpy(), seatcapacity=X_test_SCO.to_numpy())

    return acc

In [10]:
y_all

0        0.408451
1        0.189189
2        0.570423
3        0.333333
4        0.204301
           ...   
39444    0.847222
39445    0.871795
39446    0.857143
39447    0.682796
39448    0.820513
Name: LoadFactor, Length: 39449, dtype: float64

# Part 1: Find best hyperparameters using val set

# Gradient boost

In [None]:
sweep_config = {
    "method": "random", # try grid or random
    "metric": {
      "name": "accuracy",
      "goal": "maximize"   
    },
    "parameters": {
        "n_estimators": {
            # "values": [400, 800, 1200, 1600, 2000, 2500, 3000]
            "values": [2000, 3000, 4000, 5000]
        },
        "max_depth": {
            "values": [5,6,7]
            # "distribution": "uniform",
            # "min": 0.00001,
            # "max": 1
        },
        "learning_rate": {
            "values": [0.001]#, 0.01, 0.05]#, 0.1, 0.15]
        }
        ,
        "subsample": {
            "values": [0.8, 1]
        }
        ,
        "max_features": {
            "values": [10, 20, 30]
        }

    }
}

sweep_id = wandb.sweep(sweep_config, project='02582_case1_boost_sweep', entity='tgml')

def train():
    config_defaults = {
    # 'bootstrap': True,
    # 'criterion': 'mse',
    # 'max_features': 'auto',
    # 'max_leaf_nodes': None,
    # 'min_impurity_decrease': 0.0,
    # 'min_impurity_split': None,
    # 'min_samples_leaf': 1,
    # 'min_samples_split': 2,
    # 'min_weight_fraction_leaf': 0.0,
    # 'n_estimators': 10,
    # 'n_jobs': 1,
    # 'oob_score': False,
    # 'random_state': 42,
    # 'verbose': 0,
    # 'warm_start': False,
    "seed": 0,
    "n_estimators": 1000,
    'max_depth': 3,
    "learning_rate": 0.05,
    "subsample": 1.,
    }

    wandb.init(project='02582_case1_boost_sweep', entity='tgml', config=config_defaults)  # defaults are over-ridden during the sweep
    config = wandb.config

    # fit model on train
    model = GradientBoostingRegressor(n_estimators=config.n_estimators, 
                                        learning_rate=config.learning_rate,
                                        max_depth=config.max_depth,
                                        subsample=config.subsample)
    np.random.seed(0)
    train_on_val_accs, fitted_model, fitted_scaler = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=model)

    wandb.log({"accuracy": np.mean(train_on_val_accs), "std_accuracy": np.std(train_on_val_accs)})

    forecast_acc = []
    np.random.seed(0)

    M = 1000
    for m in range(M):

        ## Sample from test set
        seed = np.random.randint(10000)
        real_test_sampled = real_test
        real_test_sampled = real_test_sampled.sample(n=len(real_test), replace=True, random_state=seed)
        y_test = real_test_sampled.LoadFactor
        X_test = real_test_sampled.loc[:, real_test.columns != 'LoadFactor']

        test_acc = evaluate_model(X_v_te=X_test, y_v_te=y_test, model=fitted_model, fitted_scaler=fitted_scaler)
        forecast_acc.append(test_acc)
    # wandb.log({"accuracy": test_acc})

    wandb.log({"mean_test_accuracy": np.mean(forecast_acc), "std_test_accuracy": np.std(forecast_acc), 'abs_difference': np.abs(np.mean(forecast_acc)-np.mean(train_on_val_accs))})
    print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.std(forecast_acc)}\n And relative difference: {np.abs(np.mean(forecast_acc)-np.mean(train_on_val_accs))}')


wandb.agent(sweep_id, train, count=100)

# Random Forest

In [None]:
sweep_config = {
    "method": "random", # try grid or random
    "metric": {
      "name": "accuracy",
      "goal": "maximize"   
    },
    "parameters": {
        "n_estimators": {
            "values": [1800, 2000, 2400, 2800]
        },
        "max_depth": {
            "values": [20, 30, 40, 50]
        },
        "min_samples_split": {
            "values": [2,5,10]
        },
        "min_samples_leaf": {
            "values": [1,2,4]
        }
    }
}

sweep_id = wandb.sweep(sweep_config, project='02582_case1_rf_sweep', entity='tgml')

def train():
    config_defaults = {
    'bootstrap': True,
    'n_estimators': 1000,
    'max_depth': None,
    'max_features': 'auto',
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    # 'max_leaf_nodes': None,
    # 'criterion': 'mse',
    # 'min_impurity_decrease': 0.0,
    # 'min_impurity_split': None,
    # 'min_weight_fraction_leaf': 0.0,
    # 'n_jobs': 1,
    # 'oob_score': False,
    # 'random_state': 42,
    # 'verbose': 0,
    # 'warm_start': False,
    # "seed": 0,
    # "shuffle": True,
    }

    wandb.init(project='02582_case1_rf_sweep', entity='tgml', config=config_defaults)  # defaults are over-ridden during the sweep
    config = wandb.config


    # fit model on train
    model = RandomForestRegressor(n_estimators=config.n_estimators, bootstrap=config.bootstrap,
                            max_depth=config.max_depth, min_samples_split=config.min_samples_split, min_samples_leaf=config.min_samples_leaf)

    np.random.seed(0)
    train_on_val_accs, fitted_model, fitted_scaler = fit_evaluate_model(X_tr_m=X_train, X_v_te=X_val, y_tr_m=y_train, y_v_te=y_val, model=model)

    wandb.log({"accuracy": np.mean(train_on_val_accs), "std_accuracy": np.std(train_on_val_accs)})

    forecast_acc = []
    np.random.seed(0)

    M = 100
    for m in range(M):

        ## Sample from test set
        seed = np.random.randint(10000)
        real_test_sampled = real_test
        real_test_sampled = real_test_sampled.sample(n=len(real_test), replace=True, random_state=seed)
        y_test = real_test_sampled.LoadFactor
        X_test = real_test_sampled.loc[:, real_test.columns != 'LoadFactor']

        test_acc = evaluate_model(X_v_te=X_test, y_v_te=y_test, model=fitted_model, fitted_scaler=fitted_scaler)
        forecast_acc.append(test_acc)
    # wandb.log({"accuracy": test_acc})

    wandb.log({"mean_test_accuracy": np.mean(forecast_acc), "std_test_accuracy": np.std(forecast_acc), 'abs_difference': np.abs(np.mean(forecast_acc)-np.mean(train_on_val_accs))})
    print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.std(forecast_acc)}\n And relative difference: {np.abs(np.mean(forecast_acc)-np.mean(train_on_val_accs))}')


wandb.agent(sweep_id, train, count=100)

# Now make accuracy prediction using test set

# Gradient boost

In [None]:
shuffle = True
M = 1000
config = {
    'n_estimators': 3000,
    'max_depth': 6,
    # 'min_samples_leaf': 1,
    # 'min_samples_split': 2,
    'learning_rate': 0.001,
    'max_features': 10,
    }

np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='SKLEARN_GB')

## Train and evaluate model
model = GradientBoostingRegressor(n_estimators=config['n_estimators'], 
                                max_features=config['max_features'],
                                max_depth=config['max_depth'], 
                                # min_samples_split=config['min_samples_split'], 
                                # min_samples_leaf=config['min_samples_leaf'],
                                learning_rate=config['learning_rate'])

## Evaluate best model on test data
fitted_model, fitted_scaler = fit_model(X_tr_m=X_train, y_tr_m=y_train, model=model)
for m in range(M):

    ## Sample from test set
    seed = np.random.randint(10000)
    real_test_sampled = real_test
    real_test_sampled = real_test_sampled.sample(n=len(real_test), replace=True, random_state=seed)
    y_test = real_test_sampled.LoadFactor
    X_test = real_test_sampled.loc[:, real_test.columns != 'LoadFactor']

    test_acc = evaluate_model(X_v_te=X_test, y_v_te=y_test, model=fitted_model, fitted_scaler=fitted_scaler)
    forecast_acc.append(test_acc)
    wandb.log({"accuracy": test_acc})

wandb.log({"mean_accuracy": np.mean(forecast_acc), "std_accuracy": np.std(forecast_acc)})
print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.std(forecast_acc)}')


In [6]:
shuffle = True
M = 1000
config = {
    'n_estimators': 3000,
    'max_depth': 6,
    'learning_rate': 0.001,
    'max_features': 10,
    }

np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='SKLEARN_GB')

## Train and evaluate model
model = GradientBoostingRegressor(n_estimators=config['n_estimators'], 
                                max_features=config['max_features'],
                                max_depth=config['max_depth'], 
                                # min_samples_split=config['min_samples_split'], 
                                # min_samples_leaf=config['min_samples_leaf'],
                                learning_rate=config['learning_rate'])

## Evaluate best model on test data
fitted_model, fitted_scaler = fit_model(X_tr_m=X_model, y_tr_m=y_model, model=model)
for m in range(M):

    ## Sample from test set
    seed = np.random.randint(10000)
    real_test_sampled = real_test
    real_test_sampled = real_test_sampled.sample(n=len(real_test), replace=True, random_state=seed)
    y_test = real_test_sampled.LoadFactor
    X_test = real_test_sampled.loc[:, real_test.columns != 'LoadFactor']

    test_acc = evaluate_model(X_v_te=X_test, y_v_te=y_test, model=fitted_model, fitted_scaler=fitted_scaler)
    forecast_acc.append(test_acc)
    wandb.log({"accuracy": test_acc})

wandb.log({"mean_accuracy": np.mean(forecast_acc), "std_accuracy": np.std(forecast_acc)})
print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.std(forecast_acc)}')


wandb: Currently logged in as: augustsemrau (use `wandb login --relogin` to force relogin)


Mean of test accuracies: 17.163884275728634
Std. of test accuracies: 10.040060353989276


# Random Forest

In [None]:
shuffle = True
M = 1000
config = {
    # 'bootstrap': True,
    # 'criterion': 'mse',
    # 'max_leaf_nodes': None,
    # 'min_impurity_decrease': 0.0,
    # 'min_impurity_split': None,
    # 'min_weight_fraction_leaf': 0.0,
    # 'n_jobs': 1,
    # 'oob_score': False,
    # 'random_state': 42,
    # 'verbose': 0,
    # 'warm_start': False,
    # "shuffle": True,
    'max_features': 'auto',
    'n_estimators': 2000,
    'max_depth': 30,
    'min_samples_leaf': 4,
    'min_samples_split': 4,
    }

np.random.seed(0)
forecast_acc = []

wandb.init(project='02582_case1_final', entity='tgml', config=config, group='RF')

## Train and evaluate model
model = RandomForestRegressor(n_estimators=config['n_estimators'], 
                                max_features=config['max_features'],
                                max_depth=config['max_depth'], 
                                min_samples_split=config['min_samples_split'], 
                                min_samples_leaf=config['min_samples_leaf'])


## Evaluate best model on test data
fitted_model, fitted_scaler = fit_model(X_tr_m=X_train, y_tr_m=y_train, model=model)
for m in tqdm(range(M)):

    ## Sample from test set
    seed = np.random.randint(10000)
    real_test_sampled = real_test
    real_test_sampled = real_test_sampled.sample(n=len(real_test), replace=True, random_state=seed)
    y_test = real_test_sampled.LoadFactor
    X_test = real_test_sampled.loc[:, real_test.columns != 'LoadFactor']

    test_acc = evaluate_model(X_v_te=X_test, y_v_te=y_test, model=fitted_model, fitted_scaler=fitted_scaler)
    forecast_acc.append(test_acc)
    wandb.log({"accuracy": test_acc})

wandb.log({"mean_accuracy": np.mean(forecast_acc), "std_accuracy": np.std(forecast_acc)})
print(f'Mean of test accuracies: {np.mean(forecast_acc)}\nStd. of test accuracies: {np.std(forecast_acc)}')


# We now have the best model, make predictions and save to output.txt

In [5]:
# Fit model
X_all_normalized, fitted_scaler = normalize_seatcapacity_fit(X_train=X_all)
X_all_normalized

Unnamed: 0,SeatCapacity,Month,WeekNumber,Weekday,HourOfDay,AircraftType_319,AircraftType_320,AircraftType_333,AircraftType_359,AircraftType_32A,...,Destination_YXJ,Destination_YXU,Destination_YXX,Destination_YYJ,Destination_YYZ,Destination_YZF,Season_Fall,Season_Spring,Season_Summer,Season_Winter
0,0.299320,1,53,4,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0.145125,1,53,4,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0.299320,1,53,4,12,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0.140590,1,53,4,13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0.399093,1,53,4,14,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39444,0.303855,2,9,0,18,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
39445,0.331066,2,9,0,19,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
39446,0.199546,2,9,0,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
39447,0.399093,2,9,0,19,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
y_all

0        0.408451
1        0.189189
2        0.570423
3        0.333333
4        0.204301
           ...   
39444    0.847222
39445    0.871795
39446    0.857143
39447    0.682796
39448    0.820513
Name: LoadFactor, Length: 39449, dtype: float64

In [9]:
future_copy = future
future_copy

Unnamed: 0,SeatCapacity,Month,WeekNumber,Weekday,HourOfDay,AircraftType_319,AircraftType_320,AircraftType_333,AircraftType_359,AircraftType_32A,...,Destination_YXJ,Destination_YXU,Destination_YXX,Destination_YYJ,Destination_YYZ,Destination_YZF,Season_Fall,Season_Spring,Season_Summer,Season_Winter
0,131,3,9,1,5,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,143,3,9,1,7,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,220,3,9,1,6,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,180,3,9,1,6,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,174,3,9,1,7,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4808,186,3,13,3,21,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4809,230,3,13,3,20,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4810,186,3,13,3,21,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4811,186,3,13,3,20,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:

config = {
    'n_estimators': 3000,
    'max_depth': 6,
    'learning_rate': 0.001,
    'max_features': 10,
    }
## Train and evaluate model
model = GradientBoostingRegressor(n_estimators=config['n_estimators'], 
                                max_features=config['max_features'],
                                max_depth=config['max_depth'],
                                learning_rate=config['learning_rate'])
                                
## Fit model to the training data
model.fit(X=X_all_normalized, y=y_all)

## Normalize validation data SeatCapacity for predictions
X_future = normalize_seatcapacity(X_val=future, scaler=fitted_scaler)
## Make predictions
pred = model.predict(X_future)
pred

array([0.55485304, 0.58638387, 0.53573335, ..., 0.51679311, 0.5335857 ,
       0.5531989 ])

In [16]:
pred[-10:]


array([0.59688954, 0.53753126, 0.49988509, 0.50877797, 0.40756355,
       0.37156914, 0.5341701 , 0.51679311, 0.5335857 , 0.5531989 ])

In [3]:
prediction_file = future
prediction_file['LoadFactor'] = pred

Unnamed: 0,SeatCapacity,Month,WeekNumber,Weekday,HourOfDay,AircraftType_319,AircraftType_320,AircraftType_333,AircraftType_359,AircraftType_32A,...,Destination_YXU,Destination_YXX,Destination_YYJ,Destination_YYZ,Destination_YZF,Season_Fall,Season_Spring,Season_Summer,Season_Winter,LoadFactor
0,0.274376,3,9,1,5,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0.554891
1,0.301587,3,9,1,7,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.587106
2,0.476190,3,9,1,6,0,1,0,0,0,...,0,0,0,1,0,0,1,0,0,0.536515
3,0.385488,3,9,1,6,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0.576709
4,0.371882,3,9,1,7,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0.527518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4808,0.399093,3,13,3,21,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0.375904
4809,0.498866,3,13,3,20,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.535492
4810,0.399093,3,13,3,21,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0.522240
4811,0.399093,3,13,3,20,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0.537306


In [13]:
FUTURE_DATA_PATH = 'future_data.xls'
df_future = pd.read_excel(FUTURE_DATA_PATH)
df_future

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity
0,2022-03-01 05:45:00,UK,1315,YXX,319,J,CA,131
1,2022-03-01 07:15:00,PW,950,YLW,319,J,CA,143
2,2022-03-01 06:45:00,VW,770,YYZ,320,J,CA,220
3,2022-03-01 06:40:00,DO,2568,YXU,32N,J,CA,180
4,2022-03-01 07:30:00,AY,984,YYG,320,J,CA,174
...,...,...,...,...,...,...,...,...
4808,2022-03-31 21:05:00,SV,1988,YYJ,73H,J,CA,186
4809,2022-03-31 20:55:00,MD,1242,YBR,321,J,CA,230
4810,2022-03-31 21:50:00,LJ,506,YUL,320,J,CA,186
4811,2022-03-31 20:30:00,LJ,772,YEG,320,J,CA,186


In [14]:
df_future['LoadFactor'] = pred
df_future

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor
0,2022-03-01 05:45:00,UK,1315,YXX,319,J,CA,131,0.554853
1,2022-03-01 07:15:00,PW,950,YLW,319,J,CA,143,0.586384
2,2022-03-01 06:45:00,VW,770,YYZ,320,J,CA,220,0.535733
3,2022-03-01 06:40:00,DO,2568,YXU,32N,J,CA,180,0.575609
4,2022-03-01 07:30:00,AY,984,YYG,320,J,CA,174,0.526869
...,...,...,...,...,...,...,...,...,...
4808,2022-03-31 21:05:00,SV,1988,YYJ,73H,J,CA,186,0.371569
4809,2022-03-31 20:55:00,MD,1242,YBR,321,J,CA,230,0.534170
4810,2022-03-31 21:50:00,LJ,506,YUL,320,J,CA,186,0.516793
4811,2022-03-31 20:30:00,LJ,772,YEG,320,J,CA,186,0.533586


In [19]:
df_future.to_csv('output.txt', sep=',',decimal='.', index=False)


In [21]:
dd = pd.read_csv('output.txt')
dd

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor
0,2022-03-01 05:45:00,UK,1315,YXX,319,J,CA,131,0.554853
1,2022-03-01 07:15:00,PW,950,YLW,319,J,CA,143,0.586384
2,2022-03-01 06:45:00,VW,770,YYZ,320,J,CA,220,0.535733
3,2022-03-01 06:40:00,DO,2568,YXU,32N,J,CA,180,0.575609
4,2022-03-01 07:30:00,AY,984,YYG,320,J,CA,174,0.526869
...,...,...,...,...,...,...,...,...,...
4808,2022-03-31 21:05:00,SV,1988,YYJ,73H,J,CA,186,0.371569
4809,2022-03-31 20:55:00,MD,1242,YBR,321,J,CA,230,0.534170
4810,2022-03-31 21:50:00,LJ,506,YUL,320,J,CA,186,0.516793
4811,2022-03-31 20:30:00,LJ,772,YEG,320,J,CA,186,0.533586
