In [67]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold

# For calculating feature wise distribution similarity
from scipy.spatial.distance import jensenshannon as js_dist

# Suppressing warnings
import warnings
warnings.filterwarnings('ignore')

# Global Parameters

In [68]:
TARGET = 'LoadFactor'

# For loading data
REALIZED_DATA_PATH = 'Realized Schedule 20210101-20220228.xlsx'
FUTURE_DATA_PATH = 'Future Schedule 20220301-20220331.xlsx'

# For saving data
LINEAR_REALIZED_PATH = 'data/linear_realized_preprocessed_data.csv'
LINEAR_FUTURE_PATH = 'data/linear_future_preprocessed_data.csv'
TREE_REALIZED_PATH = 'data/tree_realized_preprocessed_data.csv'
TREE_FUTURE_PATH = 'data/tree_future_preprocessed_data.csv'

# Data Preprocessing of Case 1
### By August Semrau and William Marstrand
This notebook consists of data preprocessing, analysis and modelling is found in seperate notebooks

## Data

In [69]:
### Load data
df_realized = pd.read_excel(REALIZED_DATA_PATH)
df_realized

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor
0,2021-01-01 06:35:00,IA,874,DEN,73W,J,US,142,0.408451
1,2021-01-01 10:35:00,JZ,818,YHM,AT7,J,CA,74,0.189189
2,2021-01-01 12:05:00,IA,876,DEN,73W,J,US,142,0.570423
3,2021-01-01 13:20:00,CN,514,EST,AT7,J,US,72,0.333333
4,2021-01-01 14:20:00,LJ,3140,DEN,32A,J,US,186,0.204301
...,...,...,...,...,...,...,...,...,...
39444,2022-02-28 18:45:00,VW,986,YYZ,319,J,CA,144,0.847222
39445,2022-02-28 19:25:00,LJ,667,YDQ,319,J,CA,156,0.871795
39446,2022-02-28 20:00:00,VW,3406,MYV,E90,J,US,98,0.857143
39447,2022-02-28 19:30:00,LJ,663,RUT,32N,J,US,186,0.682796


In [70]:
df_future = pd.read_excel(FUTURE_DATA_PATH)
df_future

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity
0,2022-03-01 05:45:00,UK,1315,YXX,319,J,CA,131
1,2022-03-01 07:15:00,PW,950,YLW,319,J,CA,143
2,2022-03-01 06:45:00,VW,770,YYZ,320,J,CA,220
3,2022-03-01 06:40:00,DO,2568,YXU,32N,J,CA,180
4,2022-03-01 07:30:00,AY,984,YYG,320,J,CA,174
...,...,...,...,...,...,...,...,...
4808,2022-03-31 21:05:00,SV,1988,YYJ,73H,J,CA,186
4809,2022-03-31 20:55:00,MD,1242,YBR,321,J,CA,230
4810,2022-03-31 21:50:00,LJ,506,YUL,320,J,CA,186
4811,2022-03-31 20:30:00,LJ,772,YEG,320,J,CA,186


First we convert the flight number to an object type, since it is really a categorical variable, but Pandas thinks it's numerical

In [71]:
## Convert FlightNumber to object, it is not a numerical value
df_realized.FlightNumber = df_realized.FlightNumber.astype(object)
df_future.FlightNumber = df_realized.FlightNumber.astype(object)
## Investigate data for dtypes and stuff
print('\ndtypes of the datasets columns:')
df_realized.dtypes


dtypes of the datasets columns:


ScheduleTime    datetime64[ns]
Airline                 object
FlightNumber            object
Destination             object
AircraftType            object
FlightType              object
Sector                  object
SeatCapacity             int64
LoadFactor             float64
dtype: object

## Data Cleaning
We want to first make sure that the data itself is sane before we go any further and start extracting new features from the data.
<br>
The checks we do are:
* Checking for and cleaning null values
* Checking for illegal values e.g. zero or negative seat capacity or impossible dates in ScheduleTime
* Checking for and possibly removing outliers

### Null Values

In [72]:
def remove_empty(df):
    nan_values_count = df.isnull().sum()
    empty_values_count = df.apply(lambda df: df == '', axis=1).sum()
    print(f'List of null/empty data counts for each column: \n{nan_values_count+empty_values_count}')

    ## As there are so few missing values, a total of five rows, we simply remove these
    df_no_nan = df.dropna()[df.apply(lambda df: df != '')]
    # rea_df = rea_df.loc[rea_df['FlightType'].isin(['C', 'J'])]
    print(f'\nData had {df.shape[0]} rows pre-nan-removal, now has {df_no_nan.shape[0]} rows.')

    return df_no_nan

In [73]:
print('###### Realized Data #######')
no_nan_rea_df = remove_empty(df_realized)
print('###### Future Data #######')
no_nan_fut_df = remove_empty(df_future)

###### Realized Data #######
List of null/empty data counts for each column: 
ScheduleTime    0
Airline         0
FlightNumber    0
Destination     0
AircraftType    0
FlightType      0
Sector          0
SeatCapacity    0
LoadFactor      0
dtype: int64

Data had 39449 rows pre-nan-removal, now has 39449 rows.
###### Future Data #######
List of null/empty data counts for each column: 
ScheduleTime    0
Airline         0
FlightNumber    0
Destination     0
AircraftType    0
FlightType      0
Sector          0
SeatCapacity    0
dtype: int64

Data had 4813 rows pre-nan-removal, now has 4813 rows.


### Illegal or Extremely Unlikly Values

In [74]:
def is_before_2021_or_in_the_future(df):
    min_st = pd.Timestamp(2021, 1, 1)
    max_st = pd.Timestamp(2022, 4, 1)
    return df['ScheduleTime'].apply(lambda st: st < min_st or st >= max_st)

In [75]:
# Check if any ScheduleTime in Realized dataset is before 2021 or into the future
df_realized[is_before_2021_or_in_the_future(df_realized)]

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor


In [76]:
# Check if any ScheduleTime in Future dataset is before 2021 or into the future
df_future[is_before_2021_or_in_the_future(df_future)]

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity


In [77]:
def is_negative_or_zero(df):
    return df['SeatCapacity'] <= 0

In [78]:
# Check if Realized SeatCapacity is zero or negative
df_realized[is_negative_or_zero(df_realized)]

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor


In [79]:
# Check if Future SeatCapacity is zero or negative
df_future[is_negative_or_zero(df_future)]

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity


In [80]:
def mask_too_high_passenger_count(df, limit=10):
    passenger_count = (df['LoadFactor']*df['SeatCapacity']).astype(int)
    mask = (df['LoadFactor'] > 1)
    return mask, passenger_count[mask]

In [81]:
# Specifically for Realized Data
# Check if LoadFactor and SeatCapacity gives extremely low passenger counts
mask, passenger_count = mask_too_high_passenger_count(df_realized)
unlikly_passenger_counts = df_realized[mask]
unlikly_passenger_counts['PassengerCount'] = passenger_count
unlikly_passenger_counts

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor,PassengerCount
764,2021-01-28 16:45:00,CN,515,EST,AT7,J,US,72,1.013889,72
1056,2021-02-13 11:35:00,CN,512,EST,AT7,J,US,72,1.013889,72
1096,2021-02-15 11:00:00,CN,513,EST,AT4,J,US,46,1.043478,47
1443,2021-03-05 16:40:00,CN,556,CKE,AT4,J,US,46,1.021739,46
1488,2021-03-07 18:35:00,OF,575,OTH,32Q,J,US,160,1.006250,161
...,...,...,...,...,...,...,...,...,...,...
38402,2022-02-26 18:30:00,FZ,609,AMN,73H,C,US,186,1.021505,189
38784,2022-02-13 09:55:00,ZD,742,YAD,73H,J,CA,189,1.021164,192
38799,2022-02-13 15:00:00,LJ,769,YEG,320,J,CA,186,1.016129,188
39311,2022-02-26 08:55:00,ZD,742,YAD,73H,J,CA,189,1.015873,191


So no negative load factors.
And we have some load factors which are greater than 1 indicating that the flight has more passengers than seats. While it might be tempting to remove these records on the basis of illegal data, we are keeping them due to the fact, that we understand the description of the LoadFactor variable to mean the share of occupied seats meaning that a load factor greater than 1 just indicates an overbooking for the flight. Overbooking is a generally used practice as described in this [article](https://www.aeroclass.org/why-are-airlines-allowed-to-overbook/) and as such we accept the data.
<br>
Also, som flights are empty or close to empty, but as explained in this [article](https://eu.usatoday.com/story/travel/columnist/cox/2019/08/14/ask-captain-why-fly-plane-just-one-passenger/2006241001/) this is also a perfectly valid scenario.

## Feature Engineering
Based on the different data types, we already know we want to do a couple of things to each column/feature, and these are described below:
<br><br>
For **ScheduleTime**, we want to do two things; Firstly, we want to convert the datetime64[ns] format to a more simple datetime format. We further want to create new object columns; one which encodes which year it is, one for the month of the year the flight is, which week, weekday, hour of day and minute of hour. These will also be encoded in the next step.
These new features extrated from the timestamp will be categorical features.
<br><br>
For **Airline**, **Flightnumber** (infact categorical), **Destination**, **Aircrafttype**, **Flighttype** and **Sector**, columns which are encoded as objects (they are categorical), we need to make an alternative encoding.
<br><br>
SeatCapacity is made categorical, though it differently from the rest, is in fact ordinal.

### Preprocess **ScheduleTime**
The `ScheduleTime` feature is a `datetime64` dtype. To be able to use it in a meaningful way for the modelling, we extract certain time features from it e.g. `Year`, `Month`, `Day`, and `WeekNumber`.

In [82]:
def extract_scheduletime_features(df):
    df = df.copy()
    ### Preprosses time for getting year, month, week number, weekday, hour of day and minute of hour
    df['Weekday'] = df['ScheduleTime'].dt.dayofweek
    df['HourOfDay'] = df['ScheduleTime'].dt.hour
    df['MinuteOfHour'] = df['ScheduleTime'].dt.minute

    df.Weekday = df.Weekday.astype(object)
    df.HourOfDay = df.HourOfDay.astype(object)
    df.MinuteOfHour = df.MinuteOfHour.astype(object)

    df = df.drop('ScheduleTime', axis=1)

    return df

In [83]:
rea_X_schedule_time = extract_scheduletime_features(no_nan_rea_df)
fut_X_schedule_time = extract_scheduletime_features(no_nan_fut_df)

In [84]:
print(f'Shape of rea_X_schedule_time: {rea_X_schedule_time.shape}\nShape of fut_X_schedule_time: {fut_X_schedule_time.shape}')

Shape of rea_X_schedule_time: (39449, 11)
Shape of fut_X_schedule_time: (4813, 10)


### Preprocess categorical features
The categorical features are not numerical types, but we need them to be for the model to handle them.
<br>
Therefore we perform feature encoding on them. The type of encoding which is more suitable depends on the different types of models we train.
<br>
Therefore we create two types of datasets. One encoded with `one-hot-encoding` for linear regression and one encoded with `label-encoding` for tree based models.

In [85]:
### We need to find out how many categorical values exist for each feature
rea_X_schedule_time.describe(include = ['object'])

Unnamed: 0,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,Weekday,HourOfDay,MinuteOfHour
count,39449,39449,39449,39449,39449,39449,39449,39449,39449
unique,105,825,230,58,3,12,7,23,13
top,DO,771,CKE,73H,J,CA,6,8,40
freq,10483,366,2706,8970,38229,19311,6495,3004,3368


In [86]:
fut_X_schedule_time.describe(include = ['object'])

Unnamed: 0,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,Weekday,HourOfDay,MinuteOfHour
count,4813,4813,4813,4813,4813,4813,4813,4813,4813
unique,71,260,172,38,4,10,7,24,12
top,DO,556,DEN,73H,J,CA,3,8,40
freq,1256,124,255,1169,4693,2434,791,414,435


### Encoding

In [87]:
def label_encode(df, cols):
    le = LabelEncoder()
    df = df.copy()

    for c in tqdm(cols):
        df[c] = le.fit_transform(df[c].apply(str))
    
    return df

In [88]:
linear_nominal_cols = ['AircraftType', 'FlightType', 'Sector', 'Weekday', 'HourOfDay',
                'MinuteOfHour','Airline', 'FlightNumber', 'Destination'] 
linear_rea_X = pd.get_dummies(rea_X_schedule_time, columns=linear_nominal_cols, drop_first=True)
linear_fut_X = pd.get_dummies(fut_X_schedule_time, columns=linear_nominal_cols, drop_first=True)

# Need to combine datasets during the labelling, so similar label id means the same
tree_nominal_cols = ['AircraftType', 'FlightType', 'Sector', 'Airline', 'FlightNumber', 'Destination'] 
rea_y = rea_X_schedule_time[TARGET]
combined_X = pd.concat([rea_X_schedule_time.drop(TARGET, axis=1), fut_X_schedule_time], axis=0)
labeled_combined_X = label_encode(combined_X, tree_nominal_cols)
tree_rea_X = labeled_combined_X.iloc[:rea_X_schedule_time.shape[0],:]
tree_rea_X[TARGET] = rea_y
tree_fut_X = labeled_combined_X.iloc[rea_X_schedule_time.shape[0]:,:]

100%|██████████| 6/6 [00:00<00:00, 81.75it/s]


In [89]:
print(f'Shape of linear_fut_X: {linear_fut_X.shape}\nShape of linear_rea_X: {linear_rea_X.shape}')
print(f'Shape of tree_fut_X: {tree_fut_X.shape}\nShape of tree_rea_X: {tree_rea_X.shape}')

Shape of linear_fut_X: (4813, 590)
Shape of linear_rea_X: (39449, 1269)
Shape of tree_fut_X: (4813, 10)
Shape of tree_rea_X: (39449, 11)


Now we remove any columns that holds identical values for all rows i.e. has no information to differentiate data points

In [90]:
def remove_zero_variance_cols(df):
    selector = VarianceThreshold(0)
    new_df = pd.DataFrame(selector.fit_transform(df), columns=selector.get_feature_names_out())
    print(f'New Shape: {new_df.shape}')

In [91]:
print('######## linear_rea_X ##########')
selected_linear_rea_X = remove_zero_variance_cols(linear_rea_X)
print('######## linear_fut_X ##########')
selected_linear_fut_X = remove_zero_variance_cols(linear_fut_X)
print('######## tree_rea_X ##########')
selected_tree_rea_X = remove_zero_variance_cols(tree_rea_X)
print('######## tree_fut_X ##########')
selected_tree_fut_X = remove_zero_variance_cols(tree_fut_X)

######## linear_rea_X ##########
New Shape: (39449, 1269)
######## linear_fut_X ##########
New Shape: (4813, 590)
######## tree_rea_X ##########
New Shape: (39449, 11)
######## tree_fut_X ##########
New Shape: (4813, 10)


We now only choose the columns that are both in the future and realized data

In [92]:
def choose_intersection(rea_df, fut_df, target):
    rea_fut_cols = list(set(rea_df.columns).intersection(set(fut_df.columns)))
    print(f'Number of shared cols: {len(rea_fut_cols)}')

    chosen_fut = fut_df[rea_fut_cols]
    rea_fut_cols.append(target)
    chosen_rea = rea_df[rea_fut_cols]
    return chosen_rea, chosen_fut

In [93]:
print('######### linear_rea_X #########')
chosen_linear_rea_X, chosen_linear_fut_X = choose_intersection(linear_rea_X, linear_fut_X, TARGET)
print('######### tree_rea_X #########')
chosen_tree_rea_X, chosen_tree_fut_X = choose_intersection(tree_rea_X, tree_fut_X, TARGET)

######### linear_rea_X #########
Number of shared cols: 582
######### tree_rea_X #########
Number of shared cols: 10


## Distribution Similarity
It is important that features in the realized training dataset and features in the future prediction dataset have similar distributions.
<br>
Therefore we perform measurements of the featurewise distribution similarities between the features in the realized and future datasets.
<br> 
The similarity is computed using `scipy.spatial.distance.jensenshannon`. If the pairwise distribution similarity is **very low** (i.e. large JS distance) between a set of features we drop the feature-pair. This is done to avoid the model (over)fitting to the feature in the training data and then performing poorly on the future data, because it is different. Two distirbutions are deemed different if they have a JS distance larger than 0.7. This measurement was chosen emperically, but no feature pairs had a distance larger than 0.4, so they were all deemed similar enough to work with.

In [94]:
# Calculation of the Jensen Shannon Distance
def compute_probs(data, n=10): 
    h, e = np.histogram(data, n)
    p = h/data.shape[0]
    return e, p

def support_intersection(p, q): 
    sup_int = (
        list(
            filter(
                lambda x: (x[0]!=0) & (x[1]!=0), zip(p, q)
            )
        )
    )
    return sup_int

def get_probs(list_of_tuples): 
    p = np.array([p[0] for p in list_of_tuples])
    q = np.array([p[1] for p in list_of_tuples])
    return p, q

def compute_js_dist(f_rea, f_fut):
    f_rea_len = f_rea.nunique()
    e, p = compute_probs(f_rea, n=f_rea_len)
    _, q = compute_probs(f_fut, n=e)

    list_of_tuples = support_intersection(p, q)
    p, q = get_probs(list_of_tuples)
    
    return js_dist(p, q, base=2)


In [95]:
def find_and_plot_different_feature_dist(df_rea, df_fut):
    features = df_fut.columns
    diff_features = []

    # Find features with different distributions
    for f in tqdm(features):
        f_rea = df_rea[f]
        f_fut = df_fut[f]
        js_dist = compute_js_dist(f_rea, f_fut)
        if js_dist >= 0.4:
            diff_features.append(f)

    if len(diff_features) > 0:
        # Plot significantly different features
        fig, ax = plt.subplots(len(diff_features), 2, figsize=(15, int(5*len(diff_features))))

        for i, f in enumerate(tqdm(diff_features)):
            f_rea_len = df_rea[f].nunique()
            f_fut_len = df_fut[f].nunique()
            bins = f_rea_len if f_rea_len > f_fut_len else f_fut_len
            f_values = df_rea[f].unique().tolist() + df_fut[f].unique().tolist()
            brange = (min(f_values), max(f_values))
            if len(diff_features) > 1:
                ax[i,0].set_title(f'Realized {f}')
                sns.histplot(df_rea[f], ax=ax[i,0], bins=bins, binrange=brange)
                ax[i,1].set_title(f'Future {f}')
                sns.histplot(df_fut[f], ax=ax[i,1], bins=bins, binrange=brange)
            else:
                ax[0].set_title(f'Realized {f}')
                sns.histplot(df_rea[f], ax=ax[0], bins=bins, binrange=brange)
                ax[1].set_title(f'Future {f}')
                sns.histplot(df_fut[f], ax=ax[1], bins=bins, binrange=brange)

        plt.tight_layout()
        plt.show()
    else:
        print('No significantly diffrent feature distributions found')

### Linear Model Dataset

In [96]:
find_and_plot_different_feature_dist(chosen_linear_rea_X, chosen_linear_fut_X)

100%|██████████| 582/582 [00:00<00:00, 1094.97it/s]

No significantly diffrent feature distributions found





### Tree Based Model Dataset

In [97]:
find_and_plot_different_feature_dist(chosen_tree_rea_X, chosen_tree_fut_X)

100%|██████████| 10/10 [00:00<00:00, 210.05it/s]

No significantly diffrent feature distributions found





## Saving the data
We now save the preprocessed realized and future datasets.
<br>
We produce 4 datasets: 
* `linear_realized_preprocessed_data.csv`
* `linear_future_preprocessed_data.csv`
* `tree_realized_preprocessed_data.csv`
* `tree_future_preprocessed_data.csv`

In [98]:
def set_target_last(df, target):
    df = df.copy()
    tmp_target = df[target]
    df.drop(target, axis=1)
    df[target] = tmp_target
    return df

In [99]:
set_target_last(chosen_linear_rea_X, TARGET).to_csv(LINEAR_REALIZED_PATH, sep=',', decimal='.')
chosen_linear_fut_X.to_csv(LINEAR_FUTURE_PATH, sep=',', decimal='.')
set_target_last(chosen_tree_rea_X, TARGET).to_csv(TREE_REALIZED_PATH, sep=',', decimal='.')
chosen_tree_fut_X.to_csv(TREE_FUTURE_PATH, sep=',', decimal='.')