In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
### Load data
df_full = pd.read_excel('data/dataset.xls')
df_full

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,LoadFactor
0,2021-01-01 06:35:00,IA,874,DEN,73W,J,US,142,0.408451
1,2021-01-01 10:35:00,JZ,818,YHM,AT7,J,CA,74,0.189189
2,2021-01-01 12:05:00,IA,876,DEN,73W,J,US,142,0.570423
3,2021-01-01 13:20:00,CN,514,EST,AT7,J,US,72,0.333333
4,2021-01-01 14:20:00,LJ,3140,DEN,32A,J,US,186,0.204301
...,...,...,...,...,...,...,...,...,...
36765,2022-02-08 18:30:00,VW,986,YYZ,320,J,CA,180,0.522222
36766,2022-02-08 19:30:00,LJ,579,YQU,319,J,CA,156,0.532051
36767,2022-02-08 19:40:00,LJ,506,YUL,319,J,CA,156,0.602564
36768,2022-02-08 19:05:00,CL,2708,DAN,73H,J,US,189,0.417989


In [4]:
## Convert FlightNumber to object, it is not a numerical value
df_full.FlightNumber = df_full.FlightNumber.astype(object)
## Investigate data for dtypes and stuff
print('\ndtypes of the datasets columns:')
df_full.dtypes


dtypes of the datasets columns:


ScheduleTime    datetime64[ns]
Airline                 object
FlightNumber            object
Destination             object
AircraftType            object
FlightType              object
Sector                  object
SeatCapacity             int64
LoadFactor             float64
dtype: object

In [5]:
### Investigate for missing data
missing_values_count = df_full.isnull().sum()
print(f'List of null data counts for each column: \n{missing_values_count}')

## As there are so few missing values, a total of five rows, we simply remove these
df = df_full.dropna()
df = df.loc[df['FlightType'].isin(['C', 'J'])]
print(f'\nData had {df_full.shape[0]} rows pre-nan-removal, now has {df.shape[0]} rows.')

## Seperate data from target
X, y = df.loc[:, df.columns != 'LoadFactor'], df.loc[:, df.columns == 'LoadFactor']

List of null data counts for each column: 
ScheduleTime    0
Airline         1
FlightNumber    0
Destination     2
AircraftType    0
FlightType      0
Sector          2
SeatCapacity    0
LoadFactor      0
dtype: int64

Data had 36770 rows pre-nan-removal, now has 36767 rows.


In [6]:
### Preprosses time for getting year, month, week number, weekday, hour of day and minute of hour
X['Year'] = X['ScheduleTime'].dt.year
X['Month'] = X['ScheduleTime'].dt.month
X['WeekNumber'] = X['ScheduleTime'].dt.isocalendar().week % 52;
X['Weekday'] = X['ScheduleTime'].dt.dayofweek;
X['HourOfDay'] = X['ScheduleTime'].dt.hour
X['MinuteOfHour'] = X['ScheduleTime'].dt.minute

X.Year = X.Year.astype(object)
X.Month = X.Month.astype(object)
X.WeekNumber = X.WeekNumber.astype(object)
X.Weekday = X.Weekday.astype(object)
X.HourOfDay = X.HourOfDay.astype(object)
X.MinuteOfHour = X.MinuteOfHour.astype(object)

X

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,Year,Month,WeekNumber,Weekday,HourOfDay,MinuteOfHour
0,2021-01-01 06:35:00,IA,874,DEN,73W,J,US,142,2021,1,1,4,6,35
1,2021-01-01 10:35:00,JZ,818,YHM,AT7,J,CA,74,2021,1,1,4,10,35
2,2021-01-01 12:05:00,IA,876,DEN,73W,J,US,142,2021,1,1,4,12,5
3,2021-01-01 13:20:00,CN,514,EST,AT7,J,US,72,2021,1,1,4,13,20
4,2021-01-01 14:20:00,LJ,3140,DEN,32A,J,US,186,2021,1,1,4,14,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36765,2022-02-08 18:30:00,VW,986,YYZ,320,J,CA,180,2022,2,6,1,18,30
36766,2022-02-08 19:30:00,LJ,579,YQU,319,J,CA,156,2022,2,6,1,19,30
36767,2022-02-08 19:40:00,LJ,506,YUL,319,J,CA,156,2022,2,6,1,19,40
36768,2022-02-08 19:05:00,CL,2708,DAN,73H,J,US,189,2022,2,6,1,19,5


In [12]:
weekly_departures = X.groupby(['Year', 'WeekNumber'])['ScheduleTime'].count()
weekly_departures

Year  WeekNumber
2021  0              680
      1              457
      2              217
      3              183
      4              153
      5              147
      6              154
      7              157
      8              146
      9              170
      10             167
      11             168
      12             202
      13             220
      14             230
      15             223
      16             221
      17             217
      18             251
      19             277
      20             288
      21             329
      22             412
      23             422
      24             491
      25             553
      26             726
      27             783
      28             814
      29             864
      30             875
      31             886
      32             875
      33             909
      34             921
      35             978
      36            1016
      37            1022
      38            1046
      39

In [28]:
week_diffs = weekly_departures.diff().rename('d_departure')
week_diffs

Year  WeekNumber
2021  0               NaN
      1            -223.0
      2            -240.0
      3             -34.0
      4             -30.0
      5              -6.0
      6               7.0
      7               3.0
      8             -11.0
      9              24.0
      10             -3.0
      11              1.0
      12             34.0
      13             18.0
      14             10.0
      15             -7.0
      16             -2.0
      17             -4.0
      18             34.0
      19             26.0
      20             11.0
      21             41.0
      22             83.0
      23             10.0
      24             69.0
      25             62.0
      26            173.0
      27             57.0
      28             31.0
      29             50.0
      30             11.0
      31             11.0
      32            -11.0
      33             34.0
      34             12.0
      35             57.0
      36             38.0
      37             

In [29]:
week_diffs.diff()

Year  WeekNumber
2021  0                NaN
      1                NaN
      2              -17.0
      3              206.0
      4                4.0
      5               24.0
      6               13.0
      7               -4.0
      8              -14.0
      9               35.0
      10             -27.0
      11               4.0
      12              33.0
      13             -16.0
      14              -8.0
      15             -17.0
      16               5.0
      17              -2.0
      18              38.0
      19              -8.0
      20             -15.0
      21              30.0
      22              42.0
      23             -73.0
      24              59.0
      25              -7.0
      26             111.0
      27            -116.0
      28             -26.0
      29              19.0
      30             -39.0
      31               0.0
      32             -22.0
      33              45.0
      34             -22.0
      35              45.0
      36   

In [30]:
X.join(week_diffs, on=['Year', 'WeekNumber'], how='left')

Unnamed: 0,ScheduleTime,Airline,FlightNumber,Destination,AircraftType,FlightType,Sector,SeatCapacity,Year,Month,WeekNumber,Weekday,HourOfDay,MinuteOfHour,d_departure
0,2021-01-01 06:35:00,IA,874,DEN,73W,J,US,142,2021,1,1,4,6,35,-223.0
1,2021-01-01 10:35:00,JZ,818,YHM,AT7,J,CA,74,2021,1,1,4,10,35,-223.0
2,2021-01-01 12:05:00,IA,876,DEN,73W,J,US,142,2021,1,1,4,12,5,-223.0
3,2021-01-01 13:20:00,CN,514,EST,AT7,J,US,72,2021,1,1,4,13,20,-223.0
4,2021-01-01 14:20:00,LJ,3140,DEN,32A,J,US,186,2021,1,1,4,14,20,-223.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36765,2022-02-08 18:30:00,VW,986,YYZ,320,J,CA,180,2022,2,6,1,18,30,-579.0
36766,2022-02-08 19:30:00,LJ,579,YQU,319,J,CA,156,2022,2,6,1,19,30,-579.0
36767,2022-02-08 19:40:00,LJ,506,YUL,319,J,CA,156,2022,2,6,1,19,40,-579.0
36768,2022-02-08 19:05:00,CL,2708,DAN,73H,J,US,189,2022,2,6,1,19,5,-579.0
