In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tabulate import tabulate


# Data analysis of Case 1
### By August Semrau and William Marstrand
This notebook consists of data preprocessing and analysis

In [4]:
### Load data
df_full = pd.read_excel('data/dataset.xls')
print('First 10 columns of the dataset:')
print(tabulate(df_full.head(20), headers='keys', tablefmt='plain'))

## Convert FlightNumber to object, it is not a numerical value
df_full.FlightNumber = df_full.FlightNumber.astype(object)

## Investigate data for dtypes and stuff
print('\ndtypes of the datasets columns:')
df_full.dtypes


First 10 columns of the dataset:
    ScheduleTime         Airline      FlightNumber  Destination    AircraftType    FlightType    Sector      SeatCapacity    LoadFactor
 0  2021-01-01 06:35:00  IA                    874  DEN            73W             J             US                   142     0.408451
 1  2021-01-01 10:35:00  JZ                    818  YHM            AT7             J             CA                    74     0.189189
 2  2021-01-01 12:05:00  IA                    876  DEN            73W             J             US                   142     0.570423
 3  2021-01-01 13:20:00  CN                    514  EST            AT7             J             US                    72     0.333333
 4  2021-01-01 14:20:00  LJ                   3140  DEN            32A             J             US                   186     0.204301
 5  2021-01-01 13:50:00  FP                    550  DXB            77W             J             AE                   428     0.231308
 6  2021-01-01 14:35:

ScheduleTime    datetime64[ns]
Airline                 object
FlightNumber            object
Destination             object
AircraftType            object
FlightType              object
Sector                  object
SeatCapacity             int64
LoadFactor             float64
dtype: object

#### Based on the different data types, we already know we want to do a couple of things to each column/feature, and these are described below:

#### For **ScheduleTime**, we want to do two things; Firstly, we want to convert the datetime64[ns] format to a more simple datetime format. We further want to create five new object columns, one which encodes which month of the year the flight is, which week, weekday, hour of day and minute of hour. These will also be encoded in the next step.

#### For **Airline**, **Flightnumber** (infact categorical), **Destination**, **Aircrafttype**, **Flighttype** and **Sector**, columns which are encoded as objects (they are categorical), we need to make an alternative encoding. If they hold few different classes, this is true for Flighttype which is either *J* or *C*, we will one-hot-encode them, else they will be label encoded. 

#### SeatCapacity remains the same.

#### First, we remove rows with empty values

In [5]:
### Investigate for missing data
missing_values_count = df_full.isnull().sum()
print(f'List of null data counts for each column: \n{missing_values_count}')

## As there are so few missing values, a total of five rows, we simply remove these
df = df_full.dropna()
print(f'\nData had {df_full.shape[0]} rows pre-nan-removal, now has {df.shape[0]} rows.')

## Seperate data from target
X, y = df.loc[:, df.columns != 'LoadFactor'], df.loc[:, df.columns == 'LoadFactor']

List of null data counts for each column: 
ScheduleTime    0
Airline         1
FlightNumber    0
Destination     2
AircraftType    0
FlightType      0
Sector          2
SeatCapacity    0
LoadFactor      0
dtype: int64

Data had 36770 rows pre-nan-removal, now has 36768 rows.


## Preprocess **ScheduleTime**

In [6]:
### Preprosses time for getting year, month, week number, weekday, hour of day and minute of hour
X['Year'] = X['ScheduleTime'].dt.year
X['Month'] = X['ScheduleTime'].dt.month
X['WeekNumber'] = X['ScheduleTime'].dt.week % 52;
X['Weekday'] = X['ScheduleTime'].dt.dayofweek + 1;
X['HourOfDay'] = X['ScheduleTime'].dt.hour
X['MinuteOfHour'] = X['ScheduleTime'].dt.minute

X.Year = X.Year.astype(object)
X.Month = X.Month.astype(object)
X.WeekNumber = X.WeekNumber.astype(object)
X.Weekday = X.Weekday.astype(object)
X.HourOfDay = X.HourOfDay.astype(object)
X.MinuteOfHour = X.MinuteOfHour.astype(object)

X[1000:]

  X['WeekNumber'] = X['ScheduleTime'].dt.week % 52;


Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,Year,Month,WeekNumber,Weekday,HourOfDay,MinuteOfHour
1000,2021-02-10 15:35:00,TN,553,DOH,359,J,QA,283,2021,2,6,3,15,35
1001,2021-02-10 15:20:00,CN,515,EST,AT4,J,US,48,2021,2,6,3,15,20
1002,2021-02-10 18:05:00,JZ,818,YHM,AT7,J,CA,74,2021,2,6,3,18,5
1003,2021-02-10 17:30:00,CN,516,EST,AT4,J,US,48,2021,2,6,3,17,30
1004,2021-02-10 18:00:00,OF,575,OTH,32Q,J,US,185,2021,2,6,3,18,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36765,2022-02-08 18:30:00,VW,986,YYZ,320,J,CA,180,2022,2,6,2,18,30
36766,2022-02-08 19:30:00,LJ,579,YQU,319,J,CA,156,2022,2,6,2,19,30
36767,2022-02-08 19:40:00,LJ,506,YUL,319,J,CA,156,2022,2,6,2,19,40
36768,2022-02-08 19:05:00,CL,2708,DAN,73H,J,US,189,2022,2,6,2,19,5


## Preprocess categorical features

In [7]:
### We need to find out how many categorical values exist for each feature
X.describe(include = ['int64', 'object'])


Unnamed: 0,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,Year,Month,WeekNumber,Weekday,HourOfDay,MinuteOfHour
count,36768,36768.0,36768,36768,36768,36768,36768.0,36768.0,36768.0,36768.0,36768.0,36768.0,36768.0
unique,95,807.0,230,56,3,12,,2.0,12.0,52.0,7.0,23.0,13.0
top,DO,771.0,CKE,73H,J,CA,,2021.0,10.0,1.0,7.0,7.0,40.0
freq,9808,341.0,2570,8316,35642,17906,,32348.0,4705.0,1427.0,6025.0,2774.0,3133.0
mean,,,,,,,155.286091,,,,,,
std,,,,,,,58.852431,,,,,,
min,,,,,,,10.0,,,,,,
25%,,,,,,,98.0,,,,,,
50%,,,,,,,180.0,,,,,,
75%,,,,,,,189.0,,,,,,


In [10]:
### Make one-hot-encodings for the columns which have few categories

## All categorical features
X_dummy = pd.get_dummies(data=X)
X_dummy

# ## Only up to 15 different values per category
# X_dummy = pd.get_dummies(data=X, columns=['FlightType', 'Sector', 'Year', 'Month', 'Weekday', 'MinuteOfHour'])
# X_dummy

Unnamed: 0,ScheduleTime,SeatCapacity,Airline_AY,Airline_BJ,Airline_BT,Airline_BZ,Airline_CL,Airline_CN,Airline_DO,Airline_DT,...,MinuteOfHour_15,MinuteOfHour_20,MinuteOfHour_25,MinuteOfHour_30,MinuteOfHour_35,MinuteOfHour_40,MinuteOfHour_45,MinuteOfHour_50,MinuteOfHour_54,MinuteOfHour_55
0,2021-01-01 06:35:00,142,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2021-01-01 10:35:00,74,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2021-01-01 12:05:00,142,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2021-01-01 13:20:00,72,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
4,2021-01-01 14:20:00,186,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36765,2022-02-08 18:30:00,180,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
36766,2022-02-08 19:30:00,156,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
36767,2022-02-08 19:40:00,156,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
36768,2022-02-08 19:05:00,189,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Extra

In [None]:
from category_encoders import TargetEncoder
encoder = TargetEncoder()
X_dummy2 = X
X_dummy2['Animal Encoded'] = encoder.fit_transform(X_dummy2['FlightNumber'], df['LoadFactor'])
X_dummy2