In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold

# For calculating feature wise distribution similarity
from scipy.stats import ks_2samp

# Suppressing warnings
import warnings
warnings.filterwarnings('ignore')

# Global Parameters

In [2]:
# For loading data
REALIZED_DATA_PATH = 'Realized Schedule 20210101-20220228.xlsx'
FUTURE_DATA_PATH = 'Future Schedule 20220301-20220331.xlsx'

# For saving data
LINEAR_REALIZED_PATH = 'data/linear_realized_preprocessed_data.csv'
LINEAR_FUTURE_PATH = 'data/linear_future_preprocessed_data.csv'
TREE_REALIZED_PATH = 'data/tree_realized_preprocessed_data.csv'
TREE_FUTURE_PATH = 'data/tree_future_preprocessed_data.csv'

# Data Preprocessing of Case 1
### By August Semrau and William Marstrand
This notebook consists of data preprocessing, analysis and modelling is found in seperate notebooks

## Data

In [3]:
### Load data
df_realized = pd.read_excel(REALIZED_DATA_PATH)
df_realized

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor
0,2021-01-01 06:35:00,IA,874,DEN,73W,J,US,142,0.408451
1,2021-01-01 10:35:00,JZ,818,YHM,AT7,J,CA,74,0.189189
2,2021-01-01 12:05:00,IA,876,DEN,73W,J,US,142,0.570423
3,2021-01-01 13:20:00,CN,514,EST,AT7,J,US,72,0.333333
4,2021-01-01 14:20:00,LJ,3140,DEN,32A,J,US,186,0.204301
...,...,...,...,...,...,...,...,...,...
39444,2022-02-28 18:45:00,VW,986,YYZ,319,J,CA,144,0.847222
39445,2022-02-28 19:25:00,LJ,667,YDQ,319,J,CA,156,0.871795
39446,2022-02-28 20:00:00,VW,3406,MYV,E90,J,US,98,0.857143
39447,2022-02-28 19:30:00,LJ,663,RUT,32N,J,US,186,0.682796


In [4]:
df_future = pd.read_excel(FUTURE_DATA_PATH)
df_future

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity
0,2022-03-01 05:45:00,UK,1315,YXX,319,J,CA,131
1,2022-03-01 07:15:00,PW,950,YLW,319,J,CA,143
2,2022-03-01 06:45:00,VW,770,YYZ,320,J,CA,220
3,2022-03-01 06:40:00,DO,2568,YXU,32N,J,CA,180
4,2022-03-01 07:30:00,AY,984,YYG,320,J,CA,174
...,...,...,...,...,...,...,...,...
4808,2022-03-31 21:05:00,SV,1988,YYJ,73H,J,CA,186
4809,2022-03-31 20:55:00,MD,1242,YBR,321,J,CA,230
4810,2022-03-31 21:50:00,LJ,506,YUL,320,J,CA,186
4811,2022-03-31 20:30:00,LJ,772,YEG,320,J,CA,186


First we convert the flight number to an object type, since it is really a categorical variable, but Pandas thinks it's numerical

In [5]:
## Convert FlightNumber to object, it is not a numerical value
df_realized.FlightNumber = df_realized.FlightNumber.astype(object)
df_future.FlightNumber = df_realized.FlightNumber.astype(object)
## Investigate data for dtypes and stuff
print('\ndtypes of the datasets columns:')
df_realized.dtypes


dtypes of the datasets columns:


ScheduleTime    datetime64[ns]
Airline                 object
FlightNumber            object
Destination             object
AircraftType            object
FlightType              object
Sector                  object
SeatCapacity             int64
LoadFactor             float64
dtype: object

## Feature Engineering
Based on the different data types, we already know we want to do a couple of things to each column/feature, and these are described below:
<br><br>
For **ScheduleTime**, we want to do two things; Firstly, we want to convert the datetime64[ns] format to a more simple datetime format. We further want to create new object columns; one which encodes which year it is, one for the month of the year the flight is, which week, weekday, hour of day and minute of hour. These will also be encoded in the next step.
These new features extrated from the timestamp will be categorical features.
<br><br>
For **Airline**, **Flightnumber** (infact categorical), **Destination**, **Aircrafttype**, **Flighttype** and **Sector**, columns which are encoded as objects (they are categorical), we need to make an alternative encoding. If they hold few different classes, this is true for Flighttype which is either *J* or *C*, we will one-hot-encode them, else they will be label encoded. 
<br><br>
SeatCapacity is made categorical, though it differently from the rest, is in fact ordinal.
<br><br>
First, we remove rows with empty values

In [6]:
def remove_empty(df):
    missing_values_count = df.isnull().sum()
    print(f'List of null data counts for each column: \n{missing_values_count}')

    ## As there are so few missing values, a total of five rows, we simply remove these
    df_no_nan = df.dropna()
    # rea_df = rea_df.loc[rea_df['FlightType'].isin(['C', 'J'])]
    print(f'\nData had {df.shape[0]} rows pre-nan-removal, now has {df_no_nan.shape[0]} rows.')

    return df_no_nan

def seperate_target(df, target):
    ## Seperate data from target
    X, y = df.loc[:, df.columns != target], df.loc[:, df.columns == target]
    return X, y


In [7]:
target = 'LoadFactor'
print('###### Realized Data #######')
no_nan_rea_df = remove_empty(df_realized)
rea_X, rea_y = seperate_target(no_nan_rea_df, target)

print('###### Future Data #######')
no_nan_fut_df = remove_empty(df_future)
fut_X, fut_y = seperate_target(no_nan_fut_df, target)

###### Realized Data #######
List of null data counts for each column: 
ScheduleTime    0
Airline         0
FlightNumber    0
Destination     0
AircraftType    0
FlightType      0
Sector          0
SeatCapacity    0
LoadFactor      0
dtype: int64

Data had 39449 rows pre-nan-removal, now has 39449 rows.
###### Future Data #######
List of null data counts for each column: 
ScheduleTime    0
Airline         0
FlightNumber    0
Destination     0
AircraftType    0
FlightType      0
Sector          0
SeatCapacity    0
dtype: int64

Data had 4813 rows pre-nan-removal, now has 4813 rows.


## Preprocess **ScheduleTime**
The `ScheduleTime` feature is a `datetime64` dtype. To be able to use it in a meaningful way for the modelling, we extract certain time features from it e.g. `Year`, `Month`, `Day`, and `WeekNumber`.

In [8]:
def extract_scheduletime_features(df):
    df = df.copy()
    ### Preprosses time for getting year, month, week number, weekday, hour of day and minute of hour
    df['Year'] = df['ScheduleTime'].dt.year
    df['Month'] = df['ScheduleTime'].dt.month
    df['WeekNumber'] = df['ScheduleTime'].dt.isocalendar().week % 52
    df['Weekday'] = df['ScheduleTime'].dt.dayofweek
    df['HourOfDay'] = df['ScheduleTime'].dt.hour
    df['MinuteOfHour'] = df['ScheduleTime'].dt.minute

    df.Year = df.Year.astype(object)
    df.Month = df.Month.astype(object)
    df.WeekNumber = df.WeekNumber.astype(object)
    df.Weekday = df.Weekday.astype(object)
    df.HourOfDay = df.HourOfDay.astype(object)
    df.MinuteOfHour = df.MinuteOfHour.astype(object)

    df = df.drop('ScheduleTime', axis=1)

    return df

In [9]:
rea_X_schedule_time = extract_scheduletime_features(rea_X)
fut_X_schedule_time = extract_scheduletime_features(fut_X)

In [10]:
print(f'Shape of rea_X_schedule_time: {rea_X_schedule_time.shape}\nShape of fut_X_schedule_time: {fut_X_schedule_time.shape}')

Shape of rea_X_schedule_time: (39449, 13)
Shape of fut_X_schedule_time: (4813, 13)


## Preprocess categorical features
The categorical features are not numerical types, but we need them to be for the model to handle them.
<br>
Therefore we perform feature encoding on them. The type of encoding which is more suitable depends on the different types of models we train.
<br>
Therefore we create two types of datasets. One encoded with `one-hot-encoding` for linear regression and one encoded with `label-encoding` for tree based models.

In [11]:
### We need to find out how many categorical values exist for each feature
rea_X_schedule_time.describe(include = ['object'])

Unnamed: 0,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,Year,Month,WeekNumber,Weekday,HourOfDay,MinuteOfHour
count,39449,39449,39449,39449,39449,39449,39449,39449,39449,39449,39449,39449
unique,105,825,230,58,3,12,2,12,52,7,23,13
top,DO,771,CKE,73H,J,CA,2021,10,1,6,8,40
freq,10483,366,2706,8970,38229,19311,32348,4705,1427,6495,3004,3368


In [12]:
fut_X_schedule_time.describe(include = ['object'])

Unnamed: 0,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,Year,Month,WeekNumber,Weekday,HourOfDay,MinuteOfHour
count,4813,4813,4813,4813,4813,4813,4813,4813,4813,4813,4813,4813
unique,71,260,172,38,4,10,1,1,5,7,24,12
top,DO,556,DEN,73H,J,CA,2022,3,12,3,8,40
freq,1256,124,255,1169,4693,2434,4813,4813,1086,791,414,435


### Encoding

In [13]:
def label_encode(df, cols):
    le = LabelEncoder()
    df = df.copy()

    for c in tqdm(cols):
        df[c] = le.fit_transform(df[c].apply(str))
    
    return df

In [14]:
nominal_cols = ['AircraftType', 'FlightType', 'Sector', 'Year', 'Month', 'WeekNumber', 'Weekday', 'HourOfDay',
                'MinuteOfHour','Airline', 'FlightNumber', 'Destination'] 
linear_rea_X = pd.get_dummies(rea_X_schedule_time, columns=nominal_cols, drop_first=True)
linear_fut_X = pd.get_dummies(fut_X_schedule_time, columns=nominal_cols, drop_first=True)

tree_rea_X = label_encode(rea_X_schedule_time, nominal_cols)
tree_fut_X = label_encode(fut_X_schedule_time, nominal_cols)

100%|██████████| 12/12 [00:00<00:00, 88.10it/s]
100%|██████████| 12/12 [00:00<00:00, 516.82it/s]


In [15]:
print(f'Shape of linear_fut_X: {linear_fut_X.shape}\nShape of linear_rea_X: {linear_rea_X.shape}')
print(f'Shape of tree_fut_X: {tree_fut_X.shape}\nShape of tree_rea_X: {tree_rea_X.shape}')

Shape of linear_fut_X: (4813, 594)
Shape of linear_rea_X: (39449, 1331)
Shape of tree_fut_X: (4813, 13)
Shape of tree_rea_X: (39449, 13)


Now we remove any columns that holds identical values for all rows i.e. has no information to differentiate data points

In [16]:
def remove_zero_variance_cols(df):
    selector = VarianceThreshold(0)
    new_df = pd.DataFrame(selector.fit_transform(df), columns=selector.get_feature_names_out())
    print(f'New Shape: {new_df.shape}')

In [17]:
print('######## linear_rea_X ##########')
selected_linear_rea_X = remove_zero_variance_cols(linear_rea_X)
print('######## linear_fut_X ##########')
selected_linear_fut_X = remove_zero_variance_cols(linear_fut_X)
print('######## tree_rea_X ##########')
selected_tree_rea_X = remove_zero_variance_cols(tree_rea_X)
print('######## tree_fut_X ##########')
selected_tree_fut_X = remove_zero_variance_cols(tree_fut_X)

######## linear_rea_X ##########
New Shape: (39449, 1331)
######## linear_fut_X ##########
New Shape: (4813, 594)
######## tree_rea_X ##########
New Shape: (39449, 13)
######## tree_fut_X ##########
New Shape: (4813, 11)


We now only choose the columns that are both in the future and realized data

In [18]:
def choose_intersection(rea_df, fut_df):
    rea_fut_cols = list(set(rea_df.columns).intersection(set(fut_df.columns)))
    chosen_rea = rea_df[rea_fut_cols]
    chosen_fut = rea_df[rea_fut_cols]
    print(f'Number of cols: {len(rea_fut_cols)}')
    return chosen_rea, chosen_fut

In [19]:
print('######### linear_rea_X #########')
chosen_linear_rea_X, chosen_linear_fut_X = choose_intersection(linear_rea_X, linear_fut_X)
print('######### tree_rea_X #########')
chosen_tree_rea_X, chosen_tree_fut_X = choose_intersection(tree_rea_X, tree_fut_X)

######### linear_rea_X #########
Number of cols: 586
######### tree_rea_X #########
Number of cols: 13


## Distribution Similarity
It is important that features in the realized training dataset and features in the future prediction dataset have similar distributions.
<br>
Therefore we perform measurements of the featurewise distribution similarities between the features in the realized and future datasets.
<br> 
The similarity is computed using `scipy.stats.ks_2samp`. If the pairwise distribution similarity is **very low** between a set of features we drop the feature. This is done to avoid the model (over)fitting to the feature in the training data and then performing poorly on the future data, because it is different. Two distirbutions are deemed different if the result from the `Kolmogorov-Smirnov test` returns a pvalue below 1%, which means the null hypothesis (that the two distributions are identical) can be rejected.

In [20]:
def find_and_plot_different_feature_dist(df_rea, df_fut):
    features = df_rea.columns
    diff_features = []

    # Find features with significantly different distributions
    for f in tqdm(features):
        f_rea = df_rea[f]
        f_fut = df_fut[f]
        result = ks_2samp(f_rea, f_fut)
        if result.pvalue < 0.01:
            diff_features.append((f, result.pvalue, result.statistic))

    if len(diff_features) > 0:
        # Plot significantly different features
        fig, ax = plt.subplots(len(diff_features), 2, figsize=(15, 50))

        for i, (feature, pvalue, prob) in enumerate(tqdm(diff_features)):
            ax[i,0].set_title(f'Realized {feature} (pvalue={pvalue}, prob={prob})')
            sns.histplot(df_rea[feature], ax=ax[i,0])
            ax[i,1].set_title(f'Future {feature} (pvalue={pvalue}, prob={prob})')
            sns.histplot(df_fut[feature], ax=ax[i,1])

        plt.tight_layout()
        plt.show()
    else:
        print('No significantly diffrent feature distributions found')

### Linear Model Dataset

In [21]:
find_and_plot_different_feature_dist(chosen_linear_rea_X, chosen_linear_fut_X)

100%|██████████| 586/586 [00:02<00:00, 225.16it/s]

No significantly diffrent feature distributions found





### Tree Based Model Dataset

In [22]:
find_and_plot_different_feature_dist(chosen_tree_rea_X, chosen_tree_fut_X)

100%|██████████| 13/13 [00:00<00:00, 141.07it/s]

No significantly diffrent feature distributions found





## Saving the data
We now save the preprocessed realized and future datasets.
<br>
We produce 4 datasets: 
* `linear_realized_preprocessed_data.csv`
* `linear_future_preprocessed_data.csv`
* `tree_realized_preprocessed_data.csv`
* `tree_future_preprocessed_data.csv`

In [26]:
chosen_linear_rea = chosen_linear_rea_X
chosen_linear_rea['LoadFactor'] = rea_y
chosen_tree_rea = chosen_tree_rea_X
chosen_tree_rea['LoadFactor'] = rea_y
chosen_tree_rea

Unnamed: 0,Year,FlightNumber,Sector,FlightType,Airline,SeatCapacity,Weekday,HourOfDay,MinuteOfHour,WeekNumber,Month,AircraftType,Destination,LoadFactor
0,0,750,11,2,23,142,4,19,6,1,0,24,46,0.408451
1,0,727,1,2,32,74,4,1,6,1,0,40,177,0.189189
2,0,752,11,2,23,142,4,3,9,1,0,24,46,0.570423
3,0,497,11,2,7,72,4,4,3,1,0,40,59,0.333333
4,0,422,11,2,40,186,4,5,3,1,0,8,46,0.204301
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39444,1,818,1,2,83,144,0,9,8,51,4,4,222,0.847222
39445,1,614,1,2,40,156,0,10,4,51,4,4,172,0.871795
39446,1,452,11,2,83,98,0,12,0,51,4,48,118,0.857143
39447,1,610,11,2,40,186,0,10,5,51,4,10,146,0.682796


In [27]:
chosen_linear_rea_X.to_csv(LINEAR_REALIZED_PATH, sep=',', decimal='.')
chosen_linear_fut_X.to_csv(LINEAR_FUTURE_PATH, sep=',', decimal='.')
chosen_tree_rea_X.to_csv(TREE_REALIZED_PATH, sep=',', decimal='.')
chosen_tree_fut_X.to_csv(TREE_FUTURE_PATH, sep=',', decimal='.')