# Case One: Project Notebook
By August and William

In [1]:
### Imports
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

from sklearn import linear_model
import numpy as np
import matplotlib.pyplot as plt

from xgboost import XGBRegressor

# 1. Data Preprocessing
## Load data and remove nan's

In [2]:
df_full = pd.read_excel('dataset_full.xls')
df_full.FlightNumber = df_full.FlightNumber.astype(object)
df = df_full.dropna()
df = df.loc[df['FlightType'].isin(['C', 'J'])]

## Augment **ScheduleTime** column

In [3]:
df['Year'] = df.ScheduleTime.dt.year.astype(object);
df['Month'] = df['ScheduleTime'].dt.month.astype(object);
df['WeekNumber'] = df['ScheduleTime'].dt.isocalendar().week % 52;
df['Weekday'] = df['ScheduleTime'].dt.dayofweek;
df.WeekNumber = df.WeekNumber.astype(object);
df.Weekday = df.Weekday.astype(object);
df['HourOfDay'] = df['ScheduleTime'].dt.hour.astype(object);
df['MinuteOfHour'] = df['ScheduleTime'].dt.minute.astype(object);

## One-Hot-Encode nominal variables

In [4]:
### Define feature columns
# feature_cols = ['ScheduleTime', 'Airline', 'FlightNumber', 'Destination', 'AircraftType', 'FlightType', 'Sector', 'SeatCapacity', 'Year', 'Month', 'WeekNumber', 'Weekday', 'HourOfDay', 'MinuteOfHour']
feature_cols = ['Airline', 'FlightNumber', 'Destination', 'AircraftType', 'FlightType', 'Sector', 'SeatCapacity', 'Year', 'Month', 'WeekNumber', 'Weekday', 'HourOfDay', 'MinuteOfHour']
under_15_cols = ['FlightType', 'Sector', 'Year', 'Month', 'Weekday', 'MinuteOfHour']
over_15_cols = ['Airline', 'FlightNumber', 'Destination', 'AircraftType', 'WeekNumber', 'HourOfDay']
nominal_cols = ['Airline', 'FlightNumber', 'Destination', 'AircraftType', 'FlightType', 'Sector', 'Year', 'Month', 'WeekNumber', 'Weekday', 'HourOfDay', 'MinuteOfHour']
ordinal_cols = ['SeatCapacity']

### Split target from feature data
X_full = df[feature_cols]
y = df['LoadFactor']

### Encode features with one-hot-encoding
# X = pd.get_dummies(data=X, columns=under_15_cols)
X_full = pd.get_dummies(data=X_full, columns=nominal_cols);

### Print dataframe
X_full

Unnamed: 0,SeatCapacity,Airline_5M,Airline_AY,Airline_BJ,Airline_BT,Airline_BZ,Airline_CD,Airline_CL,Airline_CN,Airline_DO,...,MinuteOfHour_15,MinuteOfHour_20,MinuteOfHour_25,MinuteOfHour_30,MinuteOfHour_35,MinuteOfHour_40,MinuteOfHour_45,MinuteOfHour_50,MinuteOfHour_54,MinuteOfHour_55
0,142,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,74,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,142,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,72,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
4,186,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39444,144,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
39445,156,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
39446,98,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39447,186,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [5]:
### Print shape of target
y.shape

(39446,)

We now have the following objects:  
- **X_full** containing all training data, 1312 features (without ScheduleTime).  
- **y** with target data.  
- **X_SeatCapacity** with original seat numbers.  

# 2. Feature Selection
## Berid training data of insignificant features

In [6]:
### We use the variance threshold method for removing features
from sklearn.feature_selection import VarianceThreshold, SelectFdr
FEATURE_SELECTION_VARIANCE_THRESHOLD = 0.005
FEATURE_SELECTION_BH_ALPHA = 0.00005

selector = VarianceThreshold(FEATURE_SELECTION_VARIANCE_THRESHOLD)
X = pd.DataFrame(selector.fit_transform(X_full, y), columns=selector.get_feature_names_out())
X

Unnamed: 0,SeatCapacity,Airline_AY,Airline_CL,Airline_CN,Airline_DO,Airline_EM,Airline_GQ,Airline_IA,Airline_IK,Airline_IR,...,MinuteOfHour_10,MinuteOfHour_15,MinuteOfHour_20,MinuteOfHour_25,MinuteOfHour_30,MinuteOfHour_35,MinuteOfHour_40,MinuteOfHour_45,MinuteOfHour_50,MinuteOfHour_55
0,142,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,74,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,142,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,72,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,186,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39441,144,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
39442,156,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
39443,98,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39444,186,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [7]:
y

0        0.408451
1        0.189189
2        0.570423
3        0.333333
4        0.204301
           ...   
39444    0.847222
39445    0.871795
39446    0.857143
39447    0.682796
39448    0.820513
Name: LoadFactor, Length: 39446, dtype: float64

### Now, the training data has been reduced from1312 features to 254 features.

## Make copy of **SeatCapacity** for computing forecast accuracy

In [8]:
# SeatCapacity = df.SeatCapacity
X['SeatCapacityOriginal'] = X.SeatCapacity
X

Unnamed: 0,SeatCapacity,Airline_AY,Airline_CL,Airline_CN,Airline_DO,Airline_EM,Airline_GQ,Airline_IA,Airline_IK,Airline_IR,...,MinuteOfHour_15,MinuteOfHour_20,MinuteOfHour_25,MinuteOfHour_30,MinuteOfHour_35,MinuteOfHour_40,MinuteOfHour_45,MinuteOfHour_50,MinuteOfHour_55,SeatCapacityOriginal
0,142,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,142
1,74,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,74
2,142,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,142
3,72,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,72
4,186,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39441,144,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,144
39442,156,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,156
39443,98,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,98
39444,186,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,186


# 3. Data splitting
## Split data into modeling data (will be training and validation) and test data

In [19]:
from sklearn.model_selection import train_test_split

### Make train/val set *0.8 and test *0.2
def split_model_test(X, y, seed=0, shuffle=False):
    X_model, X_test, y_model, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, shuffle=shuffle);
    return X_model, X_test, y_model, y_test

def split_train_val(X_m, y_m, seed=0, shuffle=False):
    X_train, X_val, y_train, y_val = train_test_split(X_m, y_m, test_size=0.25, random_state=seed, shuffle=shuffle)
    return X_train, X_val, y_train, y_val

def seperate_SCO(X_train_model, X_val_test):
    X_train_model_SCO, X_val_test_SCO = X_train_model.SeatCapacityOriginal, X_val_test.SeatCapacityOriginal

    X_train_model = X_train_model.loc[:, ~X_train_model.columns.isin(['SeatCapacityOriginal'])]
    X_val_test = X_val_test.loc[:, ~X_val_test.columns.isin(['SeatCapacityOriginal'])]

    return X_train_model, X_val_test, X_train_model_SCO, X_val_test_SCO


# 4. Define validation setup for different models
## Define forecast accuracy function

In [10]:
def mean_forecast_accuracy(loadfactor_forecasted, loadfactor_true, seatcapacity):

    passengers_true = loadfactor_true * seatcapacity
    passengers_forecasted = loadfactor_forecasted * seatcapacity
    # eps = np.finfo(float).eps
    
    abs_deviation_per_flight = np.abs((passengers_true-passengers_forecasted) / passengers_true)
    abs_deviation_per_flight[abs_deviation_per_flight >= 10000] = 100

    mean_forecast_acc = np.mean(100 - abs_deviation_per_flight)
    print(f'Mean forecast accuracy = {mean_forecast_acc}')
    return mean_forecast_acc

## Define nomralizer for training on **SeatCapacity**

In [11]:
def normalize_seatcapacity_train(X_train):
    scaler = MinMaxScaler()
    scaler.fit(X_train.SeatCapacity.values.reshape(-1, 1))
    X_train.SeatCapacity = scaler.transform(X_train.SeatCapacity.values.reshape(-1, 1))
    return X_train, scaler

def normalize_seatcapacity_val(X_val, scaler):
    X_val.SeatCapacity = scaler.transform(X_val.SeatCapacity.values.reshape(-1, 1))
    return X_val

## Functions for fitting+validating models, as well as testing models

In [12]:
### Make function for fitting and validating model
def train_validate_model(X_train, X_val, y_train, y_val, model):
    
    ## Remove original seatcapacity
    X_train, X_val, X_train_SCO, X_val_SCO = seperate_SCO(X_train_model=X_train, X_val_test=X_val)

    ## Normalize seatcapacity
    X_train, fitted_scaler = normalize_seatcapacity_train(X_train=X_train)

    ## Fit model to the training data
    model.fit(X=X_train, y=y_train)

    ## Normalize validation data SeatCapacity for predictions
    X_val = normalize_seatcapacity_val(X_val=X_val, scaler=fitted_scaler)

    
    ## Make predictions
    val_pred = model.predict(X_val)

    ## Compute forecasting accuracy
    val_acc = mean_forecast_accuracy(loadfactor_forecasted=val_pred, loadfactor_true=y_val.to_numpy(), seatcapacity=X_val_SCO.to_numpy())

    return val_acc, model


### Make function for fitting model to all modeling data at validating on test set
def train_test_model(X_model, y_model, X_test, y_test, model):
    
    ## Remove original seatcapacity
    X_model, X_test, X_model_SCO, X_test_SCO = seperate_SCO(X_train_model=X_model, X_val_test=X_test)

    ## Normalize seatcapacity
    X_model, fitted_scaler = normalize_seatcapacity_train(X_train=X_model)

    ## Fit model to the training data
    model.fit(X=X_model, y=y_model)

    ## Normalize validation data SeatCapacity for predictions
    X_test = normalize_seatcapacity_val(X_val=X_test, scaler=fitted_scaler)

    
    ## Make predictions
    test_pred = model.predict(X_test)

    ## Compute forecasting accuracy
    test_acc = mean_forecast_accuracy(loadfactor_forecasted=test_pred, loadfactor_true=y_test.to_numpy(), seatcapacity=X_test_SCO.to_numpy())

    return test_acc, model

## Linear models

In [18]:
### Train-Validation run

X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=1, shuffle=False)
X_train, X_val, y_train, y_val = split_train_val(X_m=X_model, y_m=y_model, seed=1, shuffle=False)

### Linear Regression
linreg_model = linear_model.LinearRegression()
linreg_val_acc, linreg_trained_model = train_validate_model(X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val, model=linreg_model)

print(f'Linear Regresion Forecast Accuracy: {linreg_val_acc}')

Mean forecast accuracy = 54.34156780635255
Linear Regresion Forecast Accuracy: 54.34156780635255


In [14]:
### Model-test run

X_model, X_test, y_model, y_test = split_model_test(X=X, y=y, seed=0, shuffle=False)

### Linear Regression
linreg_model = linear_model.LinearRegression()
linreg_test_acc, linreg_fully_trained_model = train_test_model(X_model=X_model, y_model=y_model, X_test=X_test, y_test=y_test, model=linreg_model)

print(f'Linear Regresion Forecast Accuracy: {linreg_test_acc}')

Mean forecast accuracy = 96.03164764998851
Linear Regresion Forecast Accuracy: 96.03164764998851


In [15]:
def 
np.random.randint(10000)

5471

In [16]:
### Ridge Regression

ridge_acc = []
alphas = np.linspace(start=0, stop=1, num=50)[1:]
for i in alphas:
    print(f'alpha = {i}')
    ridge_model = linear_model.Ridge(alpha=float(i))
    ridge_val_acc, ridge_trained_model = train_validate_model(X_model=X_model, y_model=y_model, model=ridge_model)

    ridge_acc.append(ridge_val_acc)

plt.plot(alphas, ridge_acc)

alpha = 0.02040816326530612


TypeError: train_validate_model() got an unexpected keyword argument 'X_model'

# Tree Models

In [None]:
X_model, X_test, y_model, y_test = split_model_test(X, y, seed=0)

### XGBoost Regression
xgb_model = XGBRegressor()
xgb_val_acc, xgb_trained_model = train_validate_model(X_model=X_model, y_model=y_model, model=xgb_model)

print(f'XGBoost Regresion Forecast Accuracy on training data: {xgb_val_acc}')

Mean forecast accuracy = 99.35487740825978
XGBoost Regresion Forecast Accuracy on training data: 99.35487740825978


In [None]:
X_model, X_test, y_model, y_test = split_model_test(X, y, seed=0)

### XGBoost Regression
xgb_model = XGBRegressor()
xgb_test_acc, xgb_fully_trained_model = train_test_model(X_model=X_model, y_model=y_model, X_test=X_test, y_test=y_test, model=xgb_model)

print(f'XGBoost Regresion Forecast Accuracy on modeling data: {xgb_test_acc}')

Mean forecast accuracy = 99.36444654776558
XGBoost Regresion Forecast Accuracy on modeling data: 99.36444654776558
