In [65]:
import pandas as pd
import numpy as np
import time
import os
from os import listdir
from os.path import isfile, join, basename

import sys

from lag_features import *
from other_functions import *

import datetime
from sklearn.preprocessing import LabelEncoder

In [66]:
DIR = '/Users/carlosperezricardo/Desktop/TFM'

CARRIER = 19393
years_to_load = ['2017','2018','2019']

In [67]:
folder = os.path.join(DIR, 'datasets')
files = [f for f in listdir(folder) if isfile(join(folder, f))]
files = sorted(files)

In [68]:
df = pd.DataFrame()
for file in files:
    loading = []
    if '.zip' in file:
        for year in years_to_load:
            loading.append(year in file)
        output = any(loading)
        if output:
            add_df = pd.read_csv(os.path.join(DIR,'datasets',file), parse_dates=['FL_DATE'])
            add_df = add_df[add_df.OP_CARRIER_AIRLINE_ID == CARRIER]
            df = pd.concat([df, add_df], axis=0)
df.reset_index(drop=True, inplace=True)

In [69]:
df.head()

Unnamed: 0,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,...,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 29
0,2017-10-01,WN,19393,WN,N789SW,2937,10140,1014003,30140,ABQ,...,1204.0,-11.0,0.0,,0.0,,,,,
1,2017-10-01,WN,19393,WN,N7825A,2736,10140,1014003,30140,ABQ,...,1532.0,-8.0,0.0,,0.0,,,,,
2,2017-10-01,WN,19393,WN,N464WN,5108,10140,1014003,30140,ABQ,...,2113.0,-22.0,0.0,,0.0,,,,,
3,2017-10-01,WN,19393,WN,N462WN,2934,10140,1014003,30140,ABQ,...,923.0,-12.0,0.0,,0.0,,,,,
4,2017-10-01,WN,19393,WN,N7878A,3315,10140,1014003,30140,ABQ,...,2049.0,4.0,0.0,,0.0,,,,,


In [70]:
ini_date = datetime.datetime(2017, 10, 1)

train_ini_date = datetime.datetime(2018, 1, 1)
train_fin_date = datetime.datetime(2019, 9, 1)

test_ini_date = datetime.datetime(2019, 9, 1)
test_fin_date = datetime.datetime(2019, 9, 30)

In [71]:
df = df[(df.FL_DATE > ini_date) & (df.FL_DATE <= test_fin_date)]

In [72]:
df = df[df.OP_CARRIER_AIRLINE_ID == CARRIER]

In [73]:
df.shape

(2592098, 30)

In [74]:
# Feature Generation 
df = date_features(df, 'FL_DATE')

In [75]:
df.groupby('TAIL_NUM')['ARR_DELAY'].apply(lambda x: x.shift(1))

3505        NaN
3506        NaN
3507        NaN
3508        NaN
3509        NaN
           ... 
2595598   -13.0
2595599     2.0
2595600   -14.0
2595601    -8.0
2595602     1.0
Name: ARR_DELAY, Length: 2592098, dtype: float64

In [76]:
df.groupby('TAIL_NUM')['ARR_DELAY'].apply(lambda x: x.shift(1).rolling(5).apply(np.mean))

3505        NaN
3506        NaN
3507        NaN
3508        NaN
3509        NaN
           ... 
2595598   -11.4
2595599     4.0
2595600    -5.2
2595601   -10.0
2595602     9.8
Name: ARR_DELAY, Length: 2592098, dtype: float64

In [77]:
df.shape

(2592098, 35)

In [78]:
df.head()

Unnamed: 0,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,...,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 29,month,day,year,year_month,weekday
3505,2017-10-02,WN,19393,WN,N8693A,938,12191,1219102,31453,HOU,...,,,,,,10,2,2017,2017_010,0
3506,2017-10-02,WN,19393,WN,N8683D,4825,12191,1219102,31453,HOU,...,,,,,,10,2,2017,2017_010,0
3507,2017-10-02,WN,19393,WN,N8511K,4373,12191,1219102,31453,HOU,...,,,,,,10,2,2017,2017_010,0
3508,2017-10-02,WN,19393,WN,N7815L,4205,12191,1219102,31453,HOU,...,,,,,,10,2,2017,2017_010,0
3509,2017-10-02,WN,19393,WN,N7720F,47,12191,1219102,31453,HOU,...,,,,,,10,2,2017,2017_010,0


In [79]:
df.columns

Index(['FL_DATE', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER',
       'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID',
       'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN_CITY_MARKET_ID', 'ORIGIN',
       'ORIGIN_CITY_NAME', 'ORIGIN_STATE_ABR', 'DEST_AIRPORT_ID',
       'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME',
       'DEST_STATE_ABR', 'DEP_TIME', 'DEP_DELAY', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'LATE_AIRCRAFT_DELAY', 'Unnamed: 29',
       'month', 'day', 'year', 'year_month', 'weekday'],
      dtype='object')

In [80]:
df['OP_CARRIER_AIRLINE_ID'].value_counts()

19393    2592098
Name: OP_CARRIER_AIRLINE_ID, dtype: int64

In [81]:
train_df = df[(df.FL_DATE > datetime.datetime(2018, 1, 1)) & (df.FL_DATE <= datetime.datetime(2019, 9, 1))]
test_df = df[(df.FL_DATE > datetime.datetime(2019, 9, 1)) & (df.FL_DATE <= datetime.datetime(2019, 9, 30))]

In [82]:
print(train_df.shape)
print(test_df.shape)

(2266109, 35)
(106432, 35)


In [83]:
df_ = df[df.OP_CARRIER_AIRLINE_ID == CARRIER]

In [84]:
df_ = df_[(df_.CANCELLED != 1) & (df_.DIVERTED != 1)]

In [85]:
# Lag features by: TAIL_NUM
calculations = {}
#calculations['calc1'] = {'gb_list':['TAIL_NUM','FL_DATE'],'target':'ARR_DELAY','shifts':[30,45,60], 'windows':[10], 'funs':['mean','std']}
#calculations['calc2'] = {'gb_list':['TAIL_NUM','FL_DATE'],'target':'ARR_DELAY','shifts':[365], 'windows':[5,10], 'funs':['mean']}

#df_ = apply_calc(df_, calculations)

In [86]:
# Lag features by: DEST_AIRPORT_ID
calculations = {}
calculations['calc3'] = {'gb_list':['DEST_AIRPORT_ID','FL_DATE'],'target':'ARR_DELAY','shifts':[30,45,60], 'windows':[10,30], 'funs':['mean']}
calculations['calc4'] = {'gb_list':['DEST_AIRPORT_ID','FL_DATE'],'target':'ARR_DELAY','shifts':[30], 'windows':[5,10,30], 'funs':['median','std']}
calculations['calc5'] = {'gb_list':['DEST_AIRPORT_ID','FL_DATE'],'target':'ARR_DELAY','shifts':[365], 'windows':[10], 'funs':['mean','std']}

df_ = apply_calc(df_, calculations)

Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r10_mean
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s45_r10_mean
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s60_r10_mean
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r30_mean
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s45_r30_mean
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s60_r30_mean
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r5_median
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r10_median
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r30_median
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r5_std
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r10_std
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r30_std
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s365_r10_mean
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s365_r10_std


In [87]:
# Lag features by: ORIGIN_AIRPORT_ID
calculations = {}
calculations['calc6'] = {'gb_list':['ORIGIN_AIRPORT_ID','FL_DATE'],'target':'DEP_DELAY','shifts':[30,45,60], 'windows':[5,10,30], 'funs':['mean']}
calculations['calc7'] = {'gb_list':['DEST_AIRPORT_ID','FL_DATE'],'target':'ARR_DELAY','shifts':[30], 'windows':[5,10,30], 'funs':['median','std']}
calculations['calc8'] = {'gb_list':['ORIGIN_AIRPORT_ID','FL_DATE'],'target':'DEP_DELAY','shifts':[365], 'windows':[10], 'funs':['mean']}

df_ = apply_calc(df_, calculations)

Generating DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s30_r5_mean
Generating DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s45_r5_mean
Generating DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s60_r5_mean
Generating DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s30_r10_mean
Generating DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s45_r10_mean
Generating DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s60_r10_mean
Generating DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s30_r30_mean
Generating DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s45_r30_mean
Generating DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s60_r30_mean
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r5_median
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r10_median
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r30_median
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r5_std
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r10_std
Generating ARR_DELAY_DEST_AIRPORT_ID_FL_DATE_s30_r30_std
Generating DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s365_r10_mean


In [88]:
# Convert times 
# [ARR_TIME, DEP_TIME]

df_['DEP_TIME'] = df_['DEP_TIME']/100
df_['ARR_TIME'] = df_['ARR_TIME']/100

df_['DEP_TIME'] = df_['DEP_TIME'].astype(int)
df_['ARR_TIME'] = df_['ARR_TIME'].astype(int)

In [89]:
drop_cols = ['CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'LATE_AIRCRAFT_DELAY', 'Unnamed: 29']
for col in drop_cols:
    del df_[col]

In [90]:
keep = list(df_.select_dtypes(include=np.number).columns)

for col in df_.select_dtypes(exclude=np.number).columns:
    print(col, len(df_[col].unique()))
    if len(df_[col].unique()) < 100:
        keep.append(col)
        le = LabelEncoder()
        df_[col] = le.fit_transform(df_[col])
        df_[col] = df_[col].astype('category')

FL_DATE 698
OP_UNIQUE_CARRIER 1
OP_CARRIER 1
TAIL_NUM 754
ORIGIN 89
ORIGIN_CITY_NAME 88
ORIGIN_STATE_ABR 42
DEST 89
DEST_CITY_NAME 88
DEST_STATE_ABR 42
year_month 23


In [91]:
keep.remove('ORIGIN_CITY_NAME')
keep.remove('DEST_CITY_NAME')

In [92]:
train_df = df_.loc[(df_.FL_DATE > datetime.datetime(2018, 1, 1)) & (df_.FL_DATE <= datetime.datetime(2019, 9, 1)), keep]
test_df = df_.loc[(df_.FL_DATE > datetime.datetime(2019, 9, 1)) & (df_.FL_DATE <= datetime.datetime(2019, 9, 30)), keep]

In [93]:
print(train_df.shape)
print(test_df.shape)

(2214976, 47)
(104484, 47)


In [94]:
TARGET = 'ARR_DELAY'
drop = ['DEP_DELAY']

In [95]:
features = [ col for col in train_df.columns if col != TARGET and col not in drop ]

X_train = train_df[features]
y_train = train_df[TARGET]

X_test = test_df[features]
y_test = test_df[TARGET]

In [96]:
train_df.head()

Unnamed: 0,OP_CARRIER_AIRLINE_ID,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEP_TIME,DEP_DELAY,...,DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s45_r30_mean,DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s60_r30_mean,DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s365_r10_mean,OP_UNIQUE_CARRIER,OP_CARRIER,ORIGIN,ORIGIN_STATE_ABR,DEST,DEST_STATE_ABR,year_month
223062,19393,992,13232,1323202,30977,14679,1467903,33570,20,50.0,...,,,,0,0,50,11,75,3,5
223063,19393,4972,13232,1323202,30977,14679,1467903,33570,11,-2.0,...,,,,0,0,50,11,75,3,5
223064,19393,5103,13232,1323202,30977,14679,1467903,33570,8,19.0,...,,,,0,0,50,11,75,3,5
223065,19393,2681,13232,1323202,30977,14683,1468305,33214,16,52.0,...,,,,0,0,50,11,76,37,5
223066,19393,5213,13232,1323202,30977,14683,1468305,33214,8,29.0,...,,,,0,0,50,11,76,37,5


In [97]:
import lightgbm as lgbm
reg = lgbm.LGBMRegressor(metric='rmse', n_estimators=400)

In [98]:
reg.fit(X_train, y_train)

LGBMRegressor(metric='rmse', n_estimators=400)

In [99]:
pd.DataFrame({'columns': X_train.columns,'feature_importance':reg.feature_importances_}).sort_values('feature_importance',ascending=False)

Unnamed: 0,columns,feature_importance
40,ORIGIN,2461
42,DEST,2379
44,year_month,1234
9,ARR_TIME,987
8,DEP_TIME,870
1,OP_CARRIER_FL_NUM,531
13,weekday,249
11,day,238
31,DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s30_r10_mean,218
28,DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s30_r5_mean,193


In [100]:
train_df_ = train_df.copy(deep=True)
train_df_['pred'] = reg.predict(X_train)

In [101]:
train_df_[['ARR_DELAY','pred']].head(20)

Unnamed: 0,ARR_DELAY,pred
223062,16.0,7.755956
223063,-39.0,3.761844
223064,-4.0,-3.828278
223065,31.0,5.878802
223066,12.0,-4.486181
223067,14.0,0.544601
223068,-3.0,-3.9235
223069,-8.0,-5.297008
223070,17.0,1.559596
223071,34.0,17.671832


In [102]:
from sklearn.metrics import mean_squared_error

rmse = mean_squared_error(train_df_['ARR_DELAY'], train_df_['pred'], squared=True)
rmse

690.7567059248379

In [103]:
rmse = mean_squared_error(test_df['ARR_DELAY'], reg.predict(X_test), squared=True)
rmse

474.9687419970279

In [104]:
train_df_[['ARR_DELAY','pred']].sort_values('ARR_DELAY', ascending=False).head(20)

Unnamed: 0,ARR_DELAY,pred
1944033,809.0,194.971408
665786,698.0,351.445547
1973484,631.0,105.43406
1254786,625.0,264.852674
1527731,616.0,65.479488
991289,614.0,102.415637
711116,608.0,407.942468
1281755,595.0,69.118075
1256577,587.0,78.717336
2445308,579.0,35.953002


In [105]:
# Feature Generation

In [106]:
train_df.head()

Unnamed: 0,OP_CARRIER_AIRLINE_ID,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEP_TIME,DEP_DELAY,...,DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s45_r30_mean,DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s60_r30_mean,DEP_DELAY_ORIGIN_AIRPORT_ID_FL_DATE_s365_r10_mean,OP_UNIQUE_CARRIER,OP_CARRIER,ORIGIN,ORIGIN_STATE_ABR,DEST,DEST_STATE_ABR,year_month
223062,19393,992,13232,1323202,30977,14679,1467903,33570,20,50.0,...,,,,0,0,50,11,75,3,5
223063,19393,4972,13232,1323202,30977,14679,1467903,33570,11,-2.0,...,,,,0,0,50,11,75,3,5
223064,19393,5103,13232,1323202,30977,14679,1467903,33570,8,19.0,...,,,,0,0,50,11,75,3,5
223065,19393,2681,13232,1323202,30977,14683,1468305,33214,16,52.0,...,,,,0,0,50,11,76,37,5
223066,19393,5213,13232,1323202,30977,14683,1468305,33214,8,29.0,...,,,,0,0,50,11,76,37,5


In [107]:
keep = list()
for col in train_df.select_dtypes(exclude=np.number).columns:
    print(col, len(train_df[col].unique()))
    if len(train_df[col].unique()) < 100:
        keep.append(col)
    #le = LabelEncoder()
    #articles_df[col] = le.fit_transform(articles_df[col])
    #articles_df[col] = articles_df[col].astype('category')

OP_UNIQUE_CARRIER 1
OP_CARRIER 1
ORIGIN 89
ORIGIN_STATE_ABR 42
DEST 89
DEST_STATE_ABR 42
year_month 21


In [108]:
train_df = train_df[keep]