In [1]:
# Basics
import pandas as pd
import numpy as np
import time
import os
from os import listdir
from os.path import isfile, join, basename
from tqdm import tqdm

import math
import sys
import datetime

# Helper functions
from lag_features import *
from other_functions import *

# Preprocessing
from sklearn.preprocessing import LabelEncoder

# Models
from sklearn.ensemble import GradientBoostingRegressor as gbr
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Metrics
from sklearn.metrics import mean_squared_error


In [2]:
DIR = '/Users/carlosperezricardo/Desktop/TFM'

#[10397, 13930, 11298, 11292, 12892]
AIRPORT = 10397
CARRIER = 19393
years_to_load = ['2016','2017','2018','2019']

In [3]:
folder = os.path.join(DIR, 'datasets')
files = [f for f in listdir(folder) if isfile(join(folder, f))]
files = sorted(files)

In [4]:
df = pd.DataFrame()
for file in files:
    loading = []
    if '.zip' in file:
        for year in years_to_load:
            loading.append(year in file)
        output = any(loading)
        if output:
            add_df = pd.read_csv(os.path.join(DIR,'datasets',file), parse_dates=['FL_DATE'])
            add_df = add_df[(add_df.DEST_AIRPORT_ID == AIRPORT)]
            df = pd.concat([df, add_df], axis=0)
df.reset_index(drop=True, inplace=True)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
df.head()

Unnamed: 0,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,...,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 29
0,2016-01-01,AA,19805,AA,N3KVAA,1178,11298,1129804,30194,DFW,...,24.0,-3.0,0.0,,0.0,,,,,
1,2016-01-01,AA,19805,AA,N3AUAA,1427,11298,1129804,30194,DFW,...,1205.0,-10.0,0.0,,0.0,,,,,
2,2016-01-01,AA,19805,AA,N479AA,1442,11298,1129804,30194,DFW,...,1439.0,42.0,0.0,,0.0,1.0,0.0,0.0,41.0,
3,2016-01-01,AS,19930,AS,N477AS,746,14747,1474703,30559,SEA,...,1736.0,-2.0,0.0,,0.0,,,,,
4,2016-01-01,AS,19930,AS,N462AS,750,14747,1474703,30559,SEA,...,2354.0,-4.0,0.0,,0.0,,,,,


In [6]:
df['DEST'].value_counts()

ATL    1500685
Name: DEST, dtype: int64

In [7]:
ini_date = datetime.datetime(2016, 1, 1)

train_ini_date = datetime.datetime(2018, 1, 1)
train_fin_date = datetime.datetime(2019, 9, 1)

test_ini_date = datetime.datetime(2019, 9, 1)
test_fin_date = datetime.datetime(2019, 9, 30)

In [8]:
df = df[(df.FL_DATE > ini_date) & (df.FL_DATE <= test_fin_date)]

In [9]:
df.shape

(1402736, 30)

In [10]:
# Feature Generation 
df = date_features(df, 'FL_DATE')

In [11]:
# Add flight distance in km
def flight_distance(lat1, lon1, lat2, lon2):
    R = 6371
    phi1 = lat1 * math.pi/180
    phi2 = lat2 * math.pi/180
    delta_phi = (lat2-lat1) * math.pi/180
    delta_lambda = (lon2-lon1) * math.pi/180
    
    a = np.sin(delta_phi/2) * np.sin(delta_lambda/2) + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2) * np.sin(delta_lambda/2);
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

    return R * c # in kilometres

In [12]:
airports_df = pd.read_csv(os.path.join('/Users/carlosperezricardo/Desktop/TFM/airlines_airports', 'L_AIRPORT_ID.csv'))
airports_df = airports_df[['Code','Description','City_State','City','State_Country','lat','lon']]

In [13]:
df = pd.merge(df, airports_df[['Code','lat','lon']], left_on='ORIGIN_AIRPORT_ID', right_on='Code')
df.rename({'lat':'lat_ORIGIN','lon':'lon_ORIGIN'}, axis=1, inplace=True)

df = pd.merge(df, airports_df[['Code','lat','lon']], left_on='DEST_AIRPORT_ID', right_on='Code')
df.rename({'lat':'lat_DEST','lon':'lon_DEST'}, axis=1, inplace=True)

In [14]:
df['flight_distance'] = flight_distance(df['lat_ORIGIN'], df['lon_ORIGIN'], df['lat_DEST'], df['lon_DEST'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [15]:
# Add weather
WEATHER_DIR = '/Users/carlosperezricardo/Desktop/TFM/weather'
weather_df = pd.read_csv(os.path.join(WEATHER_DIR, 'weather.csv'))

In [16]:
iata_code = list(df['DEST'].unique())[0]
iata_code

'ATL'

In [17]:
airports_df = pd.read_csv(os.path.join('/Users/carlosperezricardo/Desktop/TFM/airlines_airports', 'airports.csv'))

airports_code = pd.read_excel(os.path.join('/Users/carlosperezricardo/Desktop/TFM/airlines_airports', 'US_airports.xlsx'), sheet_name='Airports')
airports_code.columns = airports_code.iloc[0]
airports_code = airports_code.iloc[1:]
    
states = pd.read_excel(os.path.join('/Users/carlosperezricardo/Desktop/TFM/airlines_airports', 'US_airports.xlsx'), sheet_name='States')
states.columns = ['state_name','state_acronym']

airports = pd.merge(airports_code, states, left_on='City', right_on='state_name', how='left')

airports[['state_name','state_acronym']] = airports[['state_name','state_acronym']].ffill(axis = 0)
airports = airports[~airports['FAA'].isnull()]

In [18]:
icao_code = list(airports[airports.IATA == iata_code]['ICAO'])[0]
icao_code

'KATL'

In [19]:
weather_df = weather_df[weather_df.ICAO == icao_code]
weather_df['Day'] = pd.to_datetime(weather_df['Day'])
weather_df.head()

Unnamed: 0.1,Unnamed: 0,Day,Temperature_Max,Temperature_Avg,Temperature_Min,Dew Point_Max,Dew Point_Avg,Dew Point_Min,Humidity_Max,Humidity_Avg,Humidity_Min,Wind Speed_Max,Wind Speed_Avg,Wind Speed_Min,Pressure_Max,Pressure_Avg,Pressure_Min,Precipitation,ICAO
85792,85792,2019-01-01,59,53.8,49,53,49.0,44,93,84.3,69,9.0,5.4,0.0,29.1,29.1,28.9,0.0,KATL
85793,85793,2019-01-02,55,53.1,50,53,50.6,49,96,91.3,86,14.0,4.6,0.0,29.0,28.9,28.9,0.6,KATL
85794,85794,2019-01-03,66,55.8,45,58,50.7,37,97,83.8,58,25.0,10.9,3.0,28.9,28.7,28.6,1.0,KATL
85795,85795,2019-01-04,59,49.0,44,38,36.4,34,79,63.3,41,21.0,14.2,7.0,29.0,28.9,28.7,0.56,KATL
85796,85796,2019-01-05,69,54.5,43,41,38.2,36,80,56.7,33,10.0,6.6,0.0,29.1,29.1,29.0,0.0,KATL


In [20]:
df = pd.merge(df, weather_df, left_on='FL_DATE', right_on='Day', how='left')

In [21]:
df.head()

Unnamed: 0,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,...,Humidity_Avg,Humidity_Min,Wind Speed_Max,Wind Speed_Avg,Wind Speed_Min,Pressure_Max,Pressure_Avg,Pressure_Min,Precipitation,ICAO
0,2016-01-02,AA,19805,AA,N471AA,232,11298,1129804,30194,DFW,...,,,,,,,,,,
1,2016-01-02,AA,19805,AA,N3JXAA,1487,11298,1129804,30194,DFW,...,,,,,,,,,,
2,2016-01-02,AA,19805,AA,N472AA,1641,11298,1129804,30194,DFW,...,,,,,,,,,,
3,2016-01-02,AA,19805,AA,N3CHAA,2408,11298,1129804,30194,DFW,...,,,,,,,,,,
4,2016-01-02,DL,19790,DL,N952DN,30,11298,1129804,30194,DFW,...,,,,,,,,,,


In [22]:
df = df[(df.CANCELLED != 1) & (df.DIVERTED != 1)]

In [23]:
df['DEP_TIME'] = df['DEP_TIME'].astype(int)/100
df['ARR_TIME'] = df['ARR_TIME'].astype(int)/100

df['airport_DEP_n_flights_per_hour'] = df.groupby(['FL_DATE','DEP_TIME'])['ARR_TIME'].transform( lambda x: x.count())
df['airport_ARR_n_flights_per_hour'] = df.groupby(['FL_DATE','ARR_TIME'])['DEP_TIME'].transform( lambda x: x.count())

df['airport_n_flights_per_hour'] = df['airport_DEP_n_flights_per_hour'] + df['airport_ARR_n_flights_per_hour'] 

df['airport_DEP_n_flights_cumsum'] = df.groupby(['FL_DATE'])['airport_DEP_n_flights_per_hour'].transform( lambda x: x.cumsum())
df['airport_ARR_n_flights_cumsum'] = df.groupby(['FL_DATE'])['airport_ARR_n_flights_per_hour'].transform( lambda x: x.cumsum())
df['airport_n_flights_cumsum'] = df.groupby(['FL_DATE'])['airport_n_flights_per_hour'].transform( lambda x: x.cumsum())

In [24]:
df = df[df.OP_CARRIER_AIRLINE_ID == CARRIER]

In [25]:
df.shape

(150937, 68)

In [26]:
df.head()

Unnamed: 0,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,...,Pressure_Avg,Pressure_Min,Precipitation,ICAO,airport_DEP_n_flights_per_hour,airport_ARR_n_flights_per_hour,airport_n_flights_per_hour,airport_DEP_n_flights_cumsum,airport_ARR_n_flights_cumsum,airport_n_flights_cumsum
25420,2016-01-02,WN,19393,WN,N240WN,5270,12892,1289203,32575,LAX,...,,,,,3.0,2.0,5.0,61.0,55.0,116.0
25421,2016-01-02,WN,19393,WN,N446WN,5863,12892,1289203,32575,LAX,...,,,,,12.0,2.0,14.0,73.0,57.0,130.0
25422,2016-01-02,WN,19393,WN,N402WN,5981,12892,1289203,32575,LAX,...,,,,,1.0,2.0,3.0,74.0,59.0,133.0
25425,2016-01-03,WN,19393,WN,N950WN,4097,12892,1289203,32575,LAX,...,,,,,3.0,3.0,6.0,42.0,50.0,92.0
25426,2016-01-03,WN,19393,WN,N283WN,4557,12892,1289203,32575,LAX,...,,,,,2.0,2.0,4.0,44.0,52.0,96.0


In [27]:
df.columns

Index(['FL_DATE', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER',
       'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID',
       'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN_CITY_MARKET_ID', 'ORIGIN',
       'ORIGIN_CITY_NAME', 'ORIGIN_STATE_ABR', 'DEST_AIRPORT_ID',
       'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME',
       'DEST_STATE_ABR', 'DEP_TIME', 'DEP_DELAY', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'LATE_AIRCRAFT_DELAY', 'Unnamed: 29',
       'month', 'day', 'year', 'FL_DATE_quarter', 'weekday', 'year_month',
       'Code_x', 'lat_ORIGIN', 'lon_ORIGIN', 'Code_y', 'lat_DEST', 'lon_DEST',
       'flight_distance', 'Unnamed: 0', 'Day', 'Temperature_Max',
       'Temperature_Avg', 'Temperature_Min', 'Dew Point_Max', 'Dew Point_Avg',
       'Dew Point_Min', 'Humidity_Max', 'Humidity_Avg', 'Humidity_Min',
       'Wind Speed_Max', 'Wind Speed_Avg', 'Wind Speed_Min

In [28]:
df_ = df.copy(deep=True)

In [29]:
#df_['ARR_DELAY_'] = df_['ARR_DELAY'] > 15
#df_['DEP_DELAY_'] = df_['DEP_DELAY'] > 15

In [30]:
s1 = 7
s2 = 14

In [31]:
# Lag features by: TAIL_NUM
calculations = {}
calculations['calc1'] = {'gb_list':['TAIL_NUM','FL_DATE'],'target':'DEP_DELAY','shifts':[s1], 'windows':[3,5,10], 'funs':['mean']}

df_ = apply_calc(df_, calculations)

Generating DEP_DELAY_TAIL_NUM_FL_DATE_s7_r3_mean
Generating DEP_DELAY_TAIL_NUM_FL_DATE_s7_r5_mean
Generating DEP_DELAY_TAIL_NUM_FL_DATE_s7_r10_mean


In [32]:
# Lag features by: DEST_AIRPORT_ID
calculations = {}
calculations['calc2'] = {'gb_list':['FL_DATE'],'target':'DEP_DELAY','shifts':[s1,s2], 'windows':[1,5,10], 'funs':['mean','std']}
calculations['calc3'] = {'gb_list':['FL_DATE'],'target':'DEP_DELAY','shifts':[s1,s2], 'windows':[1,15,30], 'funs':['median','std']}

df_ = apply_calc(df_, calculations)

Generating DEP_DELAY_FL_DATE_s7_r1_mean
Generating DEP_DELAY_FL_DATE_s14_r1_mean
Generating DEP_DELAY_FL_DATE_s7_r5_mean
Generating DEP_DELAY_FL_DATE_s14_r5_mean
Generating DEP_DELAY_FL_DATE_s7_r10_mean
Generating DEP_DELAY_FL_DATE_s14_r10_mean
Generating DEP_DELAY_FL_DATE_s7_r1_std
Generating DEP_DELAY_FL_DATE_s14_r1_std
Generating DEP_DELAY_FL_DATE_s7_r5_std
Generating DEP_DELAY_FL_DATE_s14_r5_std
Generating DEP_DELAY_FL_DATE_s7_r10_std
Generating DEP_DELAY_FL_DATE_s14_r10_std
Generating DEP_DELAY_FL_DATE_s7_r1_median
Generating DEP_DELAY_FL_DATE_s14_r1_median
Generating DEP_DELAY_FL_DATE_s7_r15_median
Generating DEP_DELAY_FL_DATE_s14_r15_median
Generating DEP_DELAY_FL_DATE_s7_r30_median
Generating DEP_DELAY_FL_DATE_s14_r30_median
Generating DEP_DELAY_FL_DATE_s7_r1_std
Generating DEP_DELAY_FL_DATE_s14_r1_std
Generating DEP_DELAY_FL_DATE_s7_r15_std
Generating DEP_DELAY_FL_DATE_s14_r15_std
Generating DEP_DELAY_FL_DATE_s7_r30_std
Generating DEP_DELAY_FL_DATE_s14_r30_std


In [33]:
# Lag features by: OP_CARRIER_AIRLINE_ID
calculations = {}
calculations['calc4'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'DEP_DELAY','shifts':[s1,s2], 'windows':[1,5,10], 'funs':['mean']}
calculations['calc5'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'DEP_DELAY','shifts':[s2], 'windows':[10], 'funs':['median','std']}

df_ = apply_calc(df_, calculations)

Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_r1_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r1_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_r5_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r5_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_r10_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r10_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r10_median
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r10_std


In [34]:
calculations = {}
calculations['calc4'] = {'gb_list':['ORIGIN','FL_DATE'],'target':'DEP_DELAY','shifts':[s1,s2], 'windows':[1,5,10], 'funs':['mean']}
calculations['calc5'] = {'gb_list':['ORIGIN','FL_DATE'],'target':'DEP_DELAY','shifts':[s1,s2], 'windows':[10], 'funs':['median','std']}

df_ = apply_calc(df_, calculations)

Generating DEP_DELAY_ORIGIN_FL_DATE_s7_r1_mean
Generating DEP_DELAY_ORIGIN_FL_DATE_s14_r1_mean
Generating DEP_DELAY_ORIGIN_FL_DATE_s7_r5_mean
Generating DEP_DELAY_ORIGIN_FL_DATE_s14_r5_mean
Generating DEP_DELAY_ORIGIN_FL_DATE_s7_r10_mean
Generating DEP_DELAY_ORIGIN_FL_DATE_s14_r10_mean
Generating DEP_DELAY_ORIGIN_FL_DATE_s7_r10_median
Generating DEP_DELAY_ORIGIN_FL_DATE_s14_r10_median
Generating DEP_DELAY_ORIGIN_FL_DATE_s7_r10_std
Generating DEP_DELAY_ORIGIN_FL_DATE_s14_r10_std


In [35]:
# Convert times 
# [ARR_TIME, DEP_TIME]

df_['DEP_TIME'] = df_['DEP_TIME']/100
df_['ARR_TIME'] = df_['ARR_TIME']/100
df_['DEP_TIME'] = df_['DEP_TIME'].astype(int)
df_['ARR_TIME'] = df_['ARR_TIME'].astype(int)

In [36]:
df_['DEP_n_flights_per_hour'] = df_.groupby(['FL_DATE','DEP_TIME'])['ARR_TIME'].transform( lambda x: x.count())
df_['ARR_n_flights_per_hour'] = df_.groupby(['FL_DATE','ARR_TIME'])['DEP_TIME'].transform( lambda x: x.count())

df_['n_flights_per_hour'] = df_['DEP_n_flights_per_hour'] + df_['ARR_n_flights_per_hour'] 

df_['DEP_n_flights_cumsum'] = df_.groupby(['FL_DATE'])['DEP_n_flights_per_hour'].transform( lambda x: x.cumsum())
df_['ARR_n_flights_cumsum'] = df_.groupby(['FL_DATE'])['ARR_n_flights_per_hour'].transform( lambda x: x.cumsum())
df_['n_flights_cumsum'] = df_.groupby(['FL_DATE'])['n_flights_per_hour'].transform( lambda x: x.cumsum())

In [38]:
# Not Lagged
calculations = {}
calculations['calc6'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'DEP_DELAY','shifts':[s1], 'funs':['mean','std']}
calculations['calc7'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'DEP_DELAY','shifts':[s1], 'funs':['mean','std']}

df_ = apply_calc(df_, calculations)

Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_std
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_std


In [39]:
# Type of delay
calculations = {}
calculations['calc1'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'CARRIER_DELAY','shifts':[s1], 'funs':['mean']}
calculations['calc2'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'LATE_AIRCRAFT_DELAY','shifts':[s1], 'funs':['median']}
calculations['calc3'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'NAS_DELAY','shifts':[s1], 'funs':['mean']}
calculations['calc4'] = {'gb_list':['FL_DATE'],'target':'WEATHER_DELAY','shifts':[s1], 'funs':['mean']}
calculations['calc5'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'LATE_AIRCRAFT_DELAY','shifts':[s1], 'funs':['mean']}

calculations['calc6'] = {'gb_list':['ORIGIN_AIRPORT_ID'],'target':'WEATHER_DELAY','shifts':[s1], 'funs':['mean']}
calculations['calc7'] = {'gb_list':['ORIGIN_AIRPORT_ID'],'target':'NAS_DELAY','shifts':[s1], 'funs':['mean']}

df_ = apply_calc(df_, calculations)

Generating CARRIER_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean
Generating LATE_AIRCRAFT_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_median
Generating NAS_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean
Generating WEATHER_DELAY_FL_DATE_s7_mean
Generating LATE_AIRCRAFT_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean
Generating WEATHER_DELAY_ORIGIN_AIRPORT_ID_s7_mean
Generating NAS_DELAY_ORIGIN_AIRPORT_ID_s7_mean


In [40]:
calculations = {}
calculations['calc'] = {'gb_list':['FL_DATE'],'target':'DEP_DELAY','shifts':[s1, s2], 'windows':[1,7,14], 'funs':['mean','std']}

df_ = apply_calc(df_, calculations)

Generating DEP_DELAY_FL_DATE_s7_r1_mean
Generating DEP_DELAY_FL_DATE_s14_r1_mean
Generating DEP_DELAY_FL_DATE_s7_r7_mean
Generating DEP_DELAY_FL_DATE_s14_r7_mean
Generating DEP_DELAY_FL_DATE_s7_r14_mean
Generating DEP_DELAY_FL_DATE_s14_r14_mean
Generating DEP_DELAY_FL_DATE_s7_r1_std
Generating DEP_DELAY_FL_DATE_s14_r1_std
Generating DEP_DELAY_FL_DATE_s7_r7_std
Generating DEP_DELAY_FL_DATE_s14_r7_std
Generating DEP_DELAY_FL_DATE_s7_r14_std
Generating DEP_DELAY_FL_DATE_s14_r14_std


In [41]:
drop_cols = ['CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'LATE_AIRCRAFT_DELAY', 'Unnamed: 29']
for col in drop_cols:
    del df_[col]

In [42]:
keep = list(df_.select_dtypes(include=np.number).columns)

for col in df_.select_dtypes(exclude=np.number).columns:
    print(col, len(df_[col].unique()))
    if len(df_[col].unique()) < 100:
        keep.append(col)
        le = LabelEncoder()
        df_[col] = le.fit_transform(df_[col])
        df_[col] = df_[col].astype('category')

FL_DATE 1338
OP_UNIQUE_CARRIER 1
OP_CARRIER 1
TAIL_NUM 873
ORIGIN 40
ORIGIN_CITY_NAME 39
ORIGIN_STATE_ABR 24
DEST 1
DEST_CITY_NAME 1
DEST_STATE_ABR 1
year_month 44
Day 274
ICAO 2


In [43]:
drop_cols = ['DEST_CITY_NAME','ORIGIN_CITY_NAME', 'Unnamed: 0','year_month']

for col in drop_cols:
    try:
        keep.remove(col)
    except:
        pass


In [44]:
df_.fillna(0, inplace=True)

In [45]:
df_.head()

Unnamed: 0,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,...,WEATHER_DELAY_ORIGIN_AIRPORT_ID_s7_mean,NAS_DELAY_ORIGIN_AIRPORT_ID_s7_mean,DEP_DELAY_FL_DATE_s7_r7_mean,DEP_DELAY_FL_DATE_s14_r7_mean,DEP_DELAY_FL_DATE_s7_r14_mean,DEP_DELAY_FL_DATE_s14_r14_mean,DEP_DELAY_FL_DATE_s7_r7_std,DEP_DELAY_FL_DATE_s14_r7_std,DEP_DELAY_FL_DATE_s7_r14_std,DEP_DELAY_FL_DATE_s14_r14_std
25420,2016-01-02,0,19393,0,N240WN,5270,12892,1289203,32575,18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25421,2016-01-02,0,19393,0,N446WN,5863,12892,1289203,32575,18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25422,2016-01-02,0,19393,0,N402WN,5981,12892,1289203,32575,18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25425,2016-01-03,0,19393,0,N950WN,4097,12892,1289203,32575,18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25426,2016-01-03,0,19393,0,N283WN,4557,12892,1289203,32575,18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
train_df = df_.loc[(df_.FL_DATE > datetime.datetime(2018, 1, 1)) & (df_.FL_DATE < datetime.datetime(2019, 9, 1)), keep]
test_df = df_.loc[(df_.FL_DATE >= datetime.datetime(2019, 9, 1)) & (df_.FL_DATE <= datetime.datetime(2019, 9, 30)), keep]

In [47]:
print(train_df.shape)
print(test_df.shape)

(67793, 119)
(3061, 119)


In [48]:
TARGET = 'DEP_DELAY'
drop = ['DEP_DELAY','ARR_DELAY']

In [49]:
features = [ col for col in train_df.columns if col != TARGET and col not in drop ]

X_train = train_df[features]
y_train = train_df[TARGET]

X_test = test_df[features]
y_test = test_df[TARGET]

In [53]:
reg = gbr(loss='absolute_error', n_estimators=200)
reg.fit(X_train, y_train)
x = model_metrics(X_train, y_train, X_test, y_test, reg)

 --- TRAIN --- 
     - RMSE:  981.8515420307573
     - MAE:  12.709101136029734
     - R2:  -0.06018788874450709
 --- TEST --- 
     - RMSE:  362.03194373813284
     - MAE:  7.72874612056339
     - R2:  -0.03423082433766811


(981.8515420307573,
 362.03194373813284,
 12.709101136029734,
 7.72874612056339,
 -0.06018788874450709,
 -0.06018788874450709)

In [55]:
reg = gbr(loss='squared_error', n_estimators=200)
reg.fit(X_train, y_train)
x = model_metrics(X_train, y_train, X_test, y_test, reg)

 --- TRAIN --- 
     - RMSE:  816.723273626254
     - MAE:  15.254494805451184
     - R2:  0.11811502443289312
 --- TEST --- 
     - RMSE:  373.23778797597043
     - MAE:  11.50268781213721
     - R2:  -0.06624299819125823


(816.723273626254,
 373.23778797597043,
 15.254494805451184,
 11.50268781213721,
 0.11811502443289312,
 0.11811502443289312)

In [59]:
model = reg

In [58]:
model = gbr(random_state=42)

gridParams = {
    'n_estimators': [200, 250, 300],
    'max_depth': [10, 20, 30],
    'num_leaves':[20, 30, 40]#,
    #'loss':['squared_error','absolute_error']
}

grid = GridSearchCV(model, gridParams, verbose=1)
# Run the grid
grid.fit(X_train, y_train)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

model = grid.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits


ValueError: Invalid parameter num_leaves for estimator GradientBoostingRegressor(max_depth=10, n_estimators=200, random_state=42). Check the list of available parameters with `estimator.get_params().keys()`.

In [60]:
pd.DataFrame({'columns': X_train.columns,'feature_importance':model.feature_importances_}).sort_values('feature_importance',ascending=False).head(25)

Unnamed: 0,columns,feature_importance
93,DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean,0.205442
40,airport_n_flights_per_hour,0.135731
97,NAS_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean,0.055544
1,OP_CARRIER_FL_NUM,0.035892
101,NAS_DELAY_ORIGIN_AIRPORT_ID_s7_mean,0.034134
100,WEATHER_DELAY_ORIGIN_AIRPORT_ID_s7_mean,0.028717
105,DEP_DELAY_FL_DATE_s14_r14_mean,0.028485
63,DEP_DELAY_FL_DATE_s7_r30_median,0.028252
38,airport_DEP_n_flights_per_hour,0.022758
61,DEP_DELAY_FL_DATE_s7_r15_median,0.019362


In [62]:
x = model_metrics(X_train, y_train, X_test, y_test, model)

 --- TRAIN --- 
     - RMSE:  816.723273626254
     - MAE:  15.254494805451184
     - R2:  0.11811502443289312
 --- TEST --- 
     - RMSE:  373.23778797597043
     - MAE:  11.50268781213721
     - R2:  -0.06624299819125823


In [63]:
train_df_ = train_df.copy(deep=True)
train_df_['pred'] = reg.predict(X_train)
train_df_[['DEP_DELAY','pred']].sort_values('DEP_DELAY', ascending=False).head(30)

Unnamed: 0,DEP_DELAY,pred
167736,500.0,10.801282
43693,486.0,10.192161
39143,479.0,12.124447
429083,470.0,2.695511
1019599,462.0,2.147632
298109,453.0,25.924867
554615,433.0,8.683569
1075239,431.0,14.060411
499468,416.0,35.100722
1032656,412.0,56.651794


In [64]:
train_df_[['DEP_DELAY','pred']].sort_values('DEP_DELAY', ascending=True).head(30)

Unnamed: 0,DEP_DELAY,pred
461599,-20.0,4.716314
762510,-18.0,6.322034
428413,-15.0,5.039362
629179,-15.0,10.595571
924235,-14.0,10.58855
463026,-13.0,2.464375
462040,-13.0,3.690988
924060,-13.0,2.943405
697454,-13.0,7.202932
760863,-13.0,-3.615882


In [65]:
# ARR_DELAY

In [66]:
df_ = df.copy(deep=True)

In [67]:
df_ = df_[(df_.CANCELLED != 1) & (df_.DIVERTED != 1)]

In [68]:
s1 = 7
s2 = 14
s3 = 365

In [69]:
# Lag features by: TAIL_NUM
calculations = {}
calculations['calc1'] = {'gb_list':['TAIL_NUM','FL_DATE'],'target':'ARR_DELAY','shifts':[s1], 'windows':[3,5,10], 'funs':['mean']}

df_ = apply_calc(df_, calculations)

Generating ARR_DELAY_TAIL_NUM_FL_DATE_s7_r3_mean
Generating ARR_DELAY_TAIL_NUM_FL_DATE_s7_r5_mean
Generating ARR_DELAY_TAIL_NUM_FL_DATE_s7_r10_mean


In [70]:
# Lag features by: DEST_AIRPORT_ID
calculations = {}
calculations['calc2'] = {'gb_list':['FL_DATE'],'target':'ARR_DELAY','shifts':[s1,s2], 'windows':[1,5,10], 'funs':['mean','std']}
calculations['calc3'] = {'gb_list':['FL_DATE'],'target':'ARR_DELAY','shifts':[s1,s2], 'windows':[1,15,30], 'funs':['median','std']}

df_ = apply_calc(df_, calculations)

Generating ARR_DELAY_FL_DATE_s7_r1_mean
Generating ARR_DELAY_FL_DATE_s14_r1_mean
Generating ARR_DELAY_FL_DATE_s7_r5_mean
Generating ARR_DELAY_FL_DATE_s14_r5_mean
Generating ARR_DELAY_FL_DATE_s7_r10_mean
Generating ARR_DELAY_FL_DATE_s14_r10_mean
Generating ARR_DELAY_FL_DATE_s7_r1_std
Generating ARR_DELAY_FL_DATE_s14_r1_std
Generating ARR_DELAY_FL_DATE_s7_r5_std
Generating ARR_DELAY_FL_DATE_s14_r5_std
Generating ARR_DELAY_FL_DATE_s7_r10_std
Generating ARR_DELAY_FL_DATE_s14_r10_std
Generating ARR_DELAY_FL_DATE_s7_r1_median
Generating ARR_DELAY_FL_DATE_s14_r1_median
Generating ARR_DELAY_FL_DATE_s7_r15_median
Generating ARR_DELAY_FL_DATE_s14_r15_median
Generating ARR_DELAY_FL_DATE_s7_r30_median
Generating ARR_DELAY_FL_DATE_s14_r30_median
Generating ARR_DELAY_FL_DATE_s7_r1_std
Generating ARR_DELAY_FL_DATE_s14_r1_std
Generating ARR_DELAY_FL_DATE_s7_r15_std
Generating ARR_DELAY_FL_DATE_s14_r15_std
Generating ARR_DELAY_FL_DATE_s7_r30_std
Generating ARR_DELAY_FL_DATE_s14_r30_std


In [71]:
# Lag features by: ORIGIN
calculations = {}
calculations['calc4'] = {'gb_list':['ORIGIN'],'target':'ARR_DELAY','shifts':[s1,s2], 'windows':[1,5,10], 'funs':['mean','std']}
calculations['calc5'] = {'gb_list':['ORIGIN'],'target':'ARR_DELAY','shifts':[s1,s2], 'windows':[1,15,30], 'funs':['median','std']}

df_ = apply_calc(df_, calculations)

Generating ARR_DELAY_ORIGIN_s7_r1_mean
Generating ARR_DELAY_ORIGIN_s14_r1_mean
Generating ARR_DELAY_ORIGIN_s7_r5_mean
Generating ARR_DELAY_ORIGIN_s14_r5_mean
Generating ARR_DELAY_ORIGIN_s7_r10_mean
Generating ARR_DELAY_ORIGIN_s14_r10_mean
Generating ARR_DELAY_ORIGIN_s7_r1_std
Generating ARR_DELAY_ORIGIN_s14_r1_std
Generating ARR_DELAY_ORIGIN_s7_r5_std
Generating ARR_DELAY_ORIGIN_s14_r5_std
Generating ARR_DELAY_ORIGIN_s7_r10_std
Generating ARR_DELAY_ORIGIN_s14_r10_std
Generating ARR_DELAY_ORIGIN_s7_r1_median
Generating ARR_DELAY_ORIGIN_s14_r1_median
Generating ARR_DELAY_ORIGIN_s7_r15_median
Generating ARR_DELAY_ORIGIN_s14_r15_median
Generating ARR_DELAY_ORIGIN_s7_r30_median
Generating ARR_DELAY_ORIGIN_s14_r30_median
Generating ARR_DELAY_ORIGIN_s7_r1_std
Generating ARR_DELAY_ORIGIN_s14_r1_std
Generating ARR_DELAY_ORIGIN_s7_r15_std
Generating ARR_DELAY_ORIGIN_s14_r15_std
Generating ARR_DELAY_ORIGIN_s7_r30_std
Generating ARR_DELAY_ORIGIN_s14_r30_std
Generating ARR_DELAY_ORIGIN_s7_r1_mean
G

In [72]:
# Lag features by: OP_CARRIER_AIRLINE_ID
calculations = {}
calculations['calc8'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'ARR_DELAY','shifts':[s1,s2], 'windows':[1,5,10], 'funs':['mean']}
calculations['calc9'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'ARR_DELAY','shifts':[s2], 'windows':[10], 'funs':['median','std']}

df_ = apply_calc(df_, calculations)

Generating ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_r1_mean
Generating ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r1_mean
Generating ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_r5_mean
Generating ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r5_mean
Generating ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_r10_mean
Generating ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r10_mean
Generating ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r10_median
Generating ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r10_std
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_r1_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r1_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_r5_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r5_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_r10_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r10_mean
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_r10_median
Generating DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s

In [73]:
# Convert times 
# [ARR_TIME, DEP_TIME]

df_['DEP_TIME'] = df_['DEP_TIME']/100
df_['ARR_TIME'] = df_['ARR_TIME']/100

df_['DEP_TIME'] = df_['DEP_TIME'].astype(int)
df_['ARR_TIME'] = df_['ARR_TIME'].astype(int)

In [74]:
df_['DEP_n_flights_per_hour'] = df_.groupby(['FL_DATE','DEP_TIME'])['ARR_TIME'].transform( lambda x: x.count())
df_['ARR_n_flights_per_hour'] = df_.groupby(['FL_DATE','ARR_TIME'])['DEP_TIME'].transform( lambda x: x.count())

df_['n_flights_per_hour'] = df_['DEP_n_flights_per_hour'] + df_['ARR_n_flights_per_hour'] 

df_['DEP_n_flights_cumsum'] = df_.groupby(['FL_DATE'])['DEP_n_flights_per_hour'].transform( lambda x: x.cumsum())
df_['ARR_n_flights_cumsum'] = df_.groupby(['FL_DATE'])['ARR_n_flights_per_hour'].transform( lambda x: x.cumsum())
df_['n_flights_cumsum'] = df_.groupby(['FL_DATE'])['n_flights_per_hour'].transform( lambda x: x.cumsum())

In [75]:
# Not Lagged
calculations = {}

calculations['calc12'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'ARR_DELAY','shifts':[s1], 'funs':['mean','std']}
calculations['calc13'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'ARR_DELAY','shifts':[s1], 'funs':['mean','std']}

df_ = apply_calc(df_, calculations)

Generating ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean
Generating ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_std
Generating ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean
Generating ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_std


In [76]:
# Type of delay
calculations = {}
calculations['calc1'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'CARRIER_DELAY','shifts':[s1], 'funs':['mean']}
calculations['calc2'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'LATE_AIRCRAFT_DELAY','shifts':[s1], 'funs':['median']}
calculations['calc3'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'NAS_DELAY','shifts':[s1], 'funs':['mean']}
calculations['calc4'] = {'gb_list':['FL_DATE'],'target':'WEATHER_DELAY','shifts':[s1], 'funs':['mean']}
calculations['calc5'] = {'gb_list':['OP_CARRIER_AIRLINE_ID','FL_DATE'],'target':'LATE_AIRCRAFT_DELAY','shifts':[s1], 'funs':['mean']}

calculations['calc6'] = {'gb_list':['ORIGIN_AIRPORT_ID'],'target':'WEATHER_DELAY','shifts':[s1], 'funs':['mean']}
calculations['calc7'] = {'gb_list':['ORIGIN_AIRPORT_ID'],'target':'NAS_DELAY','shifts':[s1], 'funs':['mean']}

df_ = apply_calc(df_, calculations)

Generating CARRIER_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean
Generating LATE_AIRCRAFT_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_median
Generating NAS_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean
Generating WEATHER_DELAY_FL_DATE_s7_mean
Generating LATE_AIRCRAFT_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean
Generating WEATHER_DELAY_ORIGIN_AIRPORT_ID_s7_mean
Generating NAS_DELAY_ORIGIN_AIRPORT_ID_s7_mean


In [77]:
drop_cols = ['CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'LATE_AIRCRAFT_DELAY', 'Unnamed: 29']
for col in drop_cols:
    del df_[col]

In [78]:
keep = list(df_.select_dtypes(include=np.number).columns)

for col in df_.select_dtypes(exclude=np.number).columns:
    print(col, len(df_[col].unique()))
    if len(df_[col].unique()) < 100:
        keep.append(col)
        le = LabelEncoder()
        df_[col] = le.fit_transform(df_[col])
        df_[col] = df_[col].astype('category')

FL_DATE 1338
OP_UNIQUE_CARRIER 1
OP_CARRIER 1
TAIL_NUM 873
ORIGIN 40
ORIGIN_CITY_NAME 39
ORIGIN_STATE_ABR 24
DEST 1
DEST_CITY_NAME 1
DEST_STATE_ABR 1
year_month 44
Day 274
ICAO 2


In [79]:
drop_cols = ['DEST_CITY_NAME','ORIGIN_CITY_NAME', 'Unnamed: 0']

for col in drop_cols:
    try:
        keep.remove(col)
    except:
        pass

In [91]:
df_.fillna(0, inplace=True)

In [92]:
train_df = df_.loc[(df_.FL_DATE > datetime.datetime(2018, 1, 1)) & (df_.FL_DATE < datetime.datetime(2019, 9, 1)), keep]
test_df = df_.loc[(df_.FL_DATE >= datetime.datetime(2019, 9, 1)) & (df_.FL_DATE <= datetime.datetime(2019, 9, 30)), keep]

In [93]:
X_train_ = X_train.copy(deep=True)
X_test_ = X_test.copy(deep=True)

In [94]:
train_df['DEP_DELAY_pred'] = reg.predict(X_train_)
test_df['DEP_DELAY_pred'] = reg.predict(X_test_)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
train_df['DEP_DELAY_pred'] = train_df['DEP_DELAY_pred'].astype(int)
test_df['DEP_DELAY_pred'] = test_df['DEP_DELAY_pred'].astype(int)

In [None]:
TARGET = 'ARR_DELAY'
drop = ['DEP_DELAY','ARR_DELAY']

In [None]:
features = [ col for col in train_df.columns if col != TARGET and col not in drop ]

X_train = train_df[features]
y_train = train_df[TARGET]

X_test = test_df[features]
y_test = test_df[TARGET]

In [None]:
X_train.isnull().sum()[ X_train.isnull().sum() != 0]

flight_distance                                              23915
Temperature_Max                                              41222
Temperature_Avg                                              41222
Temperature_Min                                              41222
Dew Point_Max                                                41222
                                                             ...  
NAS_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean               8739
WEATHER_DELAY_FL_DATE_s7_mean                                 8739
LATE_AIRCRAFT_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean     8739
WEATHER_DELAY_ORIGIN_AIRPORT_ID_s7_mean                          9
NAS_DELAY_ORIGIN_AIRPORT_ID_s7_mean                              9
Length: 89, dtype: int64

In [None]:
reg = gbr(loss='squared_error', n_estimators=400)
reg.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:


rmse = mean_squared_error(y_train, reg.predict(X_train), squared=True)
rmse

271.7644649169565

In [None]:
pd.DataFrame({'columns': X_train.columns,'feature_importance':reg.feature_importances_}).sort_values('feature_importance',ascending=False).head(25)

Unnamed: 0,columns,feature_importance
114,DEP_DELAY_pred,1453
108,ORIGIN,990
112,year_month,764
1,OP_CARRIER_FL_NUM,457
96,ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_std,281
95,ARR_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s7_mean,257
47,ARR_DELAY_FL_DATE_s7_r1_mean,256
50,ARR_DELAY_FL_DATE_s14_r5_mean,234
48,ARR_DELAY_FL_DATE_s14_r1_mean,233
85,DEP_DELAY_OP_CARRIER_AIRLINE_ID_FL_DATE_s14_r1...,231


In [None]:
train_df_ = train_df.copy(deep=True)
train_df_['ARR_DELAY_pred'] = reg.predict(X_train)
train_df_[['DEP_DELAY','ARR_DELAY','DEP_DELAY_pred','ARR_DELAY_pred']].sort_values('ARR_DELAY', ascending=False).head(30)

Unnamed: 0,DEP_DELAY,ARR_DELAY,DEP_DELAY_pred,ARR_DELAY_pred
167736,500.0,485.0,111,428.714494
43693,486.0,474.0,51,372.258579
429083,470.0,466.0,43,379.864609
701391,396.0,466.0,183,425.874878
39143,479.0,459.0,46,246.29184
1019599,462.0,455.0,62,375.035751
590400,351.0,439.0,217,389.854618
298109,453.0,433.0,184,417.077641
1032656,412.0,431.0,96,374.771382
554615,433.0,425.0,82,406.606813


In [None]:
train_df_[['DEP_DELAY','ARR_DELAY','DEP_DELAY_pred','ARR_DELAY_pred']].sort_values('ARR_DELAY').head(30)

Unnamed: 0,DEP_DELAY,ARR_DELAY,DEP_DELAY_pred,ARR_DELAY_pred
561822,-4.0,-60.0,5,-14.40296
290537,-4.0,-53.0,3,-11.163201
45274,-2.0,-53.0,9,-23.263001
299480,0.0,-52.0,3,-18.416832
290402,-4.0,-52.0,0,-15.900856
39557,1.0,-52.0,8,-13.154056
43800,-1.0,-51.0,2,-14.805244
290354,-4.0,-51.0,2,-14.533337
295834,-8.0,-51.0,3,-15.570259
295761,-3.0,-50.0,6,-14.770181


In [None]:
# Big error 
train_df_['ARR_DELAY_error'] = train_df_['ARR_DELAY'] - train_df_['ARR_DELAY_pred']
train_df_[['DEP_DELAY','ARR_DELAY','DEP_DELAY_pred','ARR_DELAY_pred','ARR_DELAY_error']].sort_values('ARR_DELAY_error', ascending=False).head(30)

Unnamed: 0,DEP_DELAY,ARR_DELAY,DEP_DELAY_pred,ARR_DELAY_pred,ARR_DELAY_error
121951,215.0,218.0,5,1.49232,216.50768
39143,479.0,459.0,46,246.29184,212.70816
254357,210.0,252.0,10,59.38187,192.61813
41345,283.0,272.0,24,102.000055,169.999945
146185,338.0,317.0,29,157.391588,159.608412
622025,193.0,179.0,13,20.504755,158.495245
121517,178.0,167.0,7,8.818293,158.181707
170442,226.0,217.0,20,62.458088,154.541912
426639,206.0,197.0,11,44.96377,152.03623
441687,237.0,241.0,17,90.420702,150.579298


In [None]:
train_df_[['DEP_DELAY','ARR_DELAY','DEP_DELAY_pred','ARR_DELAY_pred']].sample(20)

Unnamed: 0,DEP_DELAY,ARR_DELAY,DEP_DELAY_pred,ARR_DELAY_pred
465122,-5.0,-1.0,19,15.348732
698664,-1.0,-5.0,3,-4.047008
496243,-3.0,-11.0,3,-8.557668
589102,58.0,49.0,25,40.399887
139537,2.0,2.0,17,2.591011
498528,0.0,-20.0,5,-8.578396
1021283,-5.0,21.0,2,-3.891298
579279,-8.0,0.0,6,5.632687
210518,0.0,-21.0,8,-6.935301
164935,2.0,-3.0,14,17.949745


In [None]:
perc =[0.05, .10, .25, 0.5, 0.75, .90, 0.95]
train_df_['ARR_DELAY_error'].describe(percentiles = perc)

count    67793.000000
mean        -0.000171
std         16.485402
min       -121.924152
5%         -20.059078
10%        -15.864614
25%         -9.618434
50%         -2.567241
75%          6.206356
90%         18.570944
95%         29.199538
max        216.507680
Name: ARR_DELAY_error, dtype: float64

In [None]:
finish

NameError: name 'finish' is not defined

In [None]:
reg = lgbm.LGBMRegressor(objective='rmse', n_estimators=400)
reg.fit(X_train, y_train)

In [None]:
train_df_ = train_df.copy(deep=True)
train_df_['pred'] = reg.predict(X_train)
train_df_[['ARR_DELAY','pred']].sort_values('ARR_DELAY', ascending=False).head(30)

Unnamed: 0,ARR_DELAY,pred
49775,485.0,415.040911
54432,474.0,437.624527
27961,466.0,440.571741
121878,466.0,424.033499
53728,459.0,424.567348
107426,455.0,431.884757
96953,439.0,403.761763
61193,433.0,395.227514
146185,431.0,375.421558
18727,425.0,350.585559


In [None]:
stop

NameError: name 'stop' is not defined

In [None]:
# With SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import GridSearchCV


In [None]:
oversample = SMOTE()
X_train_ov, y_train_ov = oversample.fit_resample(X_train, y_train)

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 1, n_neighbors = 6

In [None]:
print('Before Sampling')
counter = Counter(y_train)
for k,v in counter.items():
    per = v / len(y_train) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

print('\n After Sampling')
counter = Counter(y_train_ov)
for k,v in counter.items():
    per = v / len(y_train) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Before Sampling
Class=1, n=96856 (14.902%)
Class=0, n=553079 (85.098%)

 After Sampling
Class=1, n=553079 (85.098%)
Class=0, n=553079 (85.098%)


In [None]:
rf = lgbm.LGBMClassifier(metric='roc_auc')

param_dist = {"max_depth": [30, 45, 50],
                  "n_estimators": [150, 200, 250, 300]
                 }
grid_search = GridSearchCV(rf, n_jobs=-1, param_grid=param_dist, cv = 5, scoring='roc_auc', verbose=20)
grid_search.fit(X_train_ov, y_train_ov)
smote_model = grid_search.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits




LGBMClassifier(max_depth=30, metric='roc_auc', n_estimators=300)
TRAIN (oversample): 0.9319798799086567
TRAIN: 0.6536715539369677
TEST: 0.5717710544939844


In [None]:
smote_model = lgbm.LGBMClassifier(metric='roc_auc', n_estimators=300, max_depth=50)
smote_model.fit(X_train_ov, y_train_ov)



LGBMClassifier(max_depth=50, metric='roc_auc', n_estimators=300)

In [None]:
print(smote_model)
y_pred_train_ = smote_model.predict_proba(X_train_ov)[:, 1]
y_pred_train = smote_model.predict_proba(X_train)[:, 1]
y_pred_test = smote_model.predict_proba(X_test)[:, 1]

print("TRAIN (oversample):", roc_auc_score(y_train_ov, y_pred_train_))
print("TRAIN:", roc_auc_score(y_train, y_pred_train))
print("TEST:", roc_auc_score(y_test, y_pred_test))

LGBMClassifier(max_depth=50, metric='roc_auc', n_estimators=300)
TRAIN (oversample): 0.9741408816810191
TRAIN: 0.8538919943665835
TEST: 0.7909392453378772


In [None]:
confusion_matrix(y_train, smote_model.predict(X_train))

array([[545714,   7365],
       [ 65573,  31283]])