<a href="https://colab.research.google.com/github/carlosperez1997/TFM_FlightDelayPrediction/blob/main/benchmarks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#from imputing_functions import *
import numpy as np

def q25(x):
    """Return the 25th percentile (first quartile) of ``x``.

    ``x`` is any object exposing a pandas-style ``quantile`` method.
    """
    first_quartile = x.quantile(q=0.25)
    return first_quartile


def q75(x):
    """Return the 75th percentile (third quartile) of ``x``.

    ``x`` is any object exposing a pandas-style ``quantile`` method.
    """
    third_quartile = x.quantile(q=0.75)
    return third_quartile


def TS_shift_FG(df, gb_list, time_col, target, shift, funs):
    """Add lagged (shifted) group statistics of ``target`` to ``df``.

    ``target`` is aggregated per ``gb_list`` combination, the aggregates are
    shifted by ``shift`` rows inside each group formed by ``gb_list`` minus
    ``time_col``, and the requested statistics are left-merged back onto ``df``.

    Parameters
    ----------
    df : DataFrame holding the ``gb_list`` columns and ``target``.
    gb_list : list of column names to group by; must contain ``time_col``.
    time_col : name of the time column inside ``gb_list``.
    target : column to aggregate.
    shift : number of rows to shift the aggregates by.
    funs : statistics to keep, among ``'min'``, ``'median'``, ``'mean'``,
        ``'max'``, ``'std'``, ``'count'``, ``'q25'``, ``'q75'``.

    Returns
    -------
    tuple ``(df_, names_)``: the merged frame and the new column names.

    Raises
    ------
    ValueError
        If ``time_col`` is not in ``gb_list``.
    """
    # Grouping keys without the time column define the series that get shifted.
    series_keys = list(gb_list)
    series_keys.remove(time_col)

    names_ = ['_'.join(gb_list) + '_' + target + '_s' + str(shift) + '_' + fun for fun in funs]

    stats = df.groupby(gb_list).agg({target: ['min', 'median', 'mean', 'max', 'std', 'count', q25, q75]})
    # Flatten the (target, statistic) column MultiIndex down to the statistic name.
    stats.columns = [col[1] for col in stats.columns]

    if series_keys:
        shifted = stats.groupby(series_keys).shift(shift)
    else:
        # Only the time column was given: there is a single series, and
        # groupby([]) would raise, so shift the whole frame instead.
        shifted = stats.shift(shift)

    shifted = shifted[funs]
    shifted.columns = names_
    shifted = shifted.reset_index()

    df_ = pd.merge(df, shifted, how='left', on=gb_list)

    return (df_, names_)


def TS_rolling_shift_FG(df, gb_list, time_col, target, shift, window, funs):
    """Add shifted, rolling-window statistics of ``target`` to ``df``.

    ``target`` is aggregated per ``gb_list`` combination, the aggregates are
    shifted by ``shift`` rows within the groups formed by ``gb_list`` minus
    ``time_col``, a rolling window of ``window`` rows is applied, and the
    selected statistics are left-merged back onto ``df``.

    Parameters
    ----------
    df : DataFrame holding the ``gb_list`` columns and ``target``.
    gb_list : list of column names to group by; must contain ``time_col``
        (``list.remove`` raises ``ValueError`` otherwise).
    time_col : name of the time column inside ``gb_list``.
    target : column to aggregate.
    shift : number of rows passed to ``shift``.
    window : window size passed to ``rolling``.
    funs : statistics to keep, named after the flattened rolling columns
        (``'<base>_<rolling>'`` such as ``'mean_mean'``, ``'min_median'``,
        ``'count_sum'``) or the derived ``'avg'``.

    Returns
    -------
    tuple ``(df_, names_)``: the merged frame and the new column names.
    """
    # Grouping keys without the time column.
    gb_list_ = gb_list.copy()
    gb_list_.remove(time_col)

    names_ =  [ '_'.join(gb_list)+'_'+target+'_s'+str(shift)+'_r'+str(window)+'_'+fun for fun in funs]

    # Base statistics per gb_list combination ('std' is computed here but not
    # used by the rolling aggregation below).
    x = df.groupby(gb_list).agg({target:['min','median','mean','max','std','count',q25,q75,'sum']})
    # Flatten the (target, statistic) column MultiIndex down to the statistic name.
    x.columns = [col[1] for col in x.columns]

    # One grouping key with a single distinct value: skip the groupby and shift the whole frame.
    if len(gb_list_) == 1 and len( list(df[gb_list_[0]].unique()) ) == 1:
        x_ = x.shift(shift).rolling(window)
    else:
        # rolling() is called on the frame returned by groupby(...).shift(...),
        # i.e. over all rows in index order rather than per group.
        # NOTE(review): windows can therefore span adjacent groups - confirm this is intended.
        x_ = x.groupby(gb_list_).shift(shift).rolling(window)
        
    # Rolling aggregations applied to each base statistic.
    x_ = x_.agg({'median':['mean','median'], 'mean':['mean','median'],\
                 'min':['min','median','max'], 'max':['min','median','max'], \
                 'q25':['mean','median'], 'q75':['mean','median'], \
                 'sum':['sum'], 'count':['sum','mean']})
    
    # Flatten (base, rolling) column pairs into 'base_rolling' names.
    x_.columns = x_.columns.map('_'.join)

    # Window average: total of the sums divided by total of the counts.
    x_['avg'] = x_['sum_sum'] / x_['count_sum']

    x_ = x_[funs]   
    x_.columns = names_
    x_.reset_index(inplace=True)

    df_ = pd.merge(df, x_, how='left', on=gb_list)

    return(df_, names_) 


def TS_Feature_Generator_processing(df, target, time_col, shift, gb_list = None, window = None, funs = None):
    """Dispatch to the plain-shift or the rolling-shift feature generator.

    When ``window`` is ``None`` the features come from ``TS_shift_FG``;
    otherwise they come from ``TS_rolling_shift_FG`` using that window.

    Returns the ``(dataframe, new_column_names)`` pair produced by the
    selected generator.
    """
    if window is not None:
        enriched, names_ = TS_rolling_shift_FG(df, gb_list, time_col, target, shift, window, funs)
        return (enriched, names_)

    enriched, names_ = TS_shift_FG(df, gb_list, time_col, target, shift, funs)
    return (enriched, names_)


def TS_Feature_Generator(df, target, shift, time_col, gb_list = None, window = None, funs = None, fillna_strategy = None):
    """Return ``df`` extended with the generated time-series features.

    Thin wrapper around ``TS_Feature_Generator_processing`` that drops the
    list of generated column names and returns only the dataframe.

    ``fillna_strategy`` is accepted but currently unused: the imputation step
    that would consume it is disabled.
    """
    enriched, _generated_names = TS_Feature_Generator_processing(
        df, target, time_col, shift, gb_list, window, funs
    )

    # Imputation (fill-na strategy) is switched off for now.

    return enriched

In [None]:
import pandas as pd
import numpy as np

#from lag_features import *
#from imputing_functions import *

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

def apply_calc(df_,calculations):
    """Apply every feature calculation in ``calculations`` to ``df_``.

    ``calculations`` maps an arbitrary label to a spec dict. Required spec
    keys: ``'time_col'``, ``'target'``, ``'shifts'``. Optional keys (default
    ``None``): ``'gb_list'``, ``'funs'``, ``'windows'``, ``'fillna_strat'``.

    For each spec, ``TS_Feature_Generator`` is called once per shift, or once
    per (window, shift) pair when ``'windows'`` is given, always on a copy of
    the current frame. The frame with all generated features is returned.
    """
    for spec in calculations.values():
        gb_list = spec.get('gb_list')
        funs = spec.get('funs')
        windows = spec.get('windows')

        time_col = spec['time_col']
        target = spec['target']
        shifts = spec['shifts']

        fillna_strat = spec.get('fillna_strat')

        # No windows configured -> a single pass with window=None (plain shift).
        window_options = [None] if windows is None else windows
        for window in window_options:
            for shift in shifts:
                df_ = TS_Feature_Generator(
                    df=df_.copy(),
                    gb_list=gb_list,
                    target=target,
                    time_col=time_col,
                    shift=shift,
                    window=window,
                    funs=funs,
                    fillna_strategy=fillna_strat,
                )

    return df_


def date_features(df, col):
    """Add calendar features derived from the date column ``col`` (in place).

    ``df[col]`` is converted to datetime and the following columns are added:
    ``month``, ``day``, ``year``, ``FL_DATE_quarter``, ``weekday``,
    ``year_month`` (``'YYYY_MM'``) and ``year_week`` (``'%Y%V'``).

    Fixes over the previous version:
    * the quarter is read from ``col`` instead of a hard-coded ``'FL_DATE'``
      column (the output column keeps its ``FL_DATE_quarter`` name);
    * ``year_month`` is zero-padded for months 1-9; the padding condition was
      inverted and produced ``'2020_1'`` / ``'2020_011'``.

    Parameters
    ----------
    df : DataFrame to extend; it is modified in place.
    col : name of the column holding the dates.

    Returns
    -------
    The same ``df`` object with the new columns.
    """
    df[col] = pd.to_datetime(df[col])
    dates = df[col].dt

    df['month'] = dates.month
    df['day'] = dates.day
    df['year'] = dates.year
    df['FL_DATE_quarter'] = dates.quarter
    df['weekday'] = dates.weekday
    df['year_month'] = dates.strftime('%Y_%m')
    # NOTE(review): %V is the ISO week number but %Y is the calendar year, so
    # dates near a year boundary can get a mismatched label; '%G%V' would pair
    # the ISO year with the ISO week. Left unchanged to keep existing keys.
    df['year_week'] = dates.strftime('%Y%V')

    return df


def delete_time_features(df):
    """Remove the helper calendar columns from ``df`` in place and return it.

    The columns ``month``, ``year``, ``day``, ``weekday`` and ``weekday_type``
    are removed one after another; a missing column raises ``KeyError``.
    """
    for name in ('month', 'year', 'day', 'weekday', 'weekday_type'):
        df.pop(name)

    return df


def model_metrics(X_train, y_train, X_test, y_test, model):
    """Print and return RMSE, MAE and R2 of ``model`` on train and test data.

    Parameters
    ----------
    X_train, X_test : feature sets passed to ``model.predict``.
    y_train, y_test : true target values for the two sets.
    model : fitted estimator exposing ``predict``.

    Returns
    -------
    tuple ``(train_rmse, test_rmse, train_mae, test_mae, train_r2, test_r2)``.
    The last element used to be ``train_r2`` a second time; it is now the
    test R2.
    """
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # RMSE is taken as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print(' --- TRAIN --- ')
    print('     - RMSE: ', train_rmse)
    print('     - MAE: ', train_mae)
    print('     - R2: ', train_r2)
    print(' --- TEST --- ')
    print('     - RMSE: ', test_rmse)
    print('     - MAE: ', test_mae)
    print('     - R2: ', test_r2)

    return train_rmse, test_rmse, train_mae, test_mae, train_r2, test_r2

In [None]:
# Add flight distance in km
def flight_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance in whole kilometres.

    Coordinates are given in decimal degrees; scalars and numpy/pandas arrays
    are both accepted.

    The first haversine term is now ``sin(dphi / 2) ** 2``. The previous
    version multiplied ``sin(dphi / 2)`` by ``sin(dlambda / 2)``, which gave
    wrong distances whenever the two latitudes differ.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.

    Returns
    -------
    Distance in kilometres, rounded to the nearest integer value.
    """
    R = 6371  # Earth radius in km

    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    # Guard against tiny floating-point overshoot that would make sqrt(1 - a) NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return np.round(R * c)  # in kilometres

In [None]:
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    * Columns whose dtype name starts with ``int`` are cast to the smallest of
      int8/int16/int32/int64 whose range contains the column's min and max
      (bounds are inclusive).
    * Columns whose dtype name starts with ``float`` are cast to float16 or
      float32 when their values lie strictly inside that type's range, else
      to float64. Note that float16/float32 reduce precision.
    * Every other column (object, bool, unsigned, datetime, category, ...) is
      left untouched. Previously bool and unsigned columns fell into the float
      branch and were cast to float16, while other failures were hidden by a
      bare ``except``.

    Parameters
    ----------
    df : DataFrame to shrink; it is modified in place.

    Returns
    -------
    The same ``df`` object.
    """
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    small_floats = (np.float16, np.float32)

    for col in df.columns:
        dtype_name = str(df[col].dtype)
        is_int = dtype_name.startswith('int')
        is_float = dtype_name.startswith('float')
        if not (is_int or is_float):
            continue

        try:
            c_min = df[col].min()
            c_max = df[col].max()

            if is_int:
                for candidate in signed_ints:
                    limits = np.iinfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                for candidate in small_floats:
                    limits = np.finfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    # Out of float32 range, infinite or all-NaN: keep full precision.
                    df[col] = df[col].astype(np.float64)
        except (TypeError, ValueError):
            # Best effort: a column that cannot be compared or cast stays as it is.
            continue

    return df

In [None]:
# Mount Google Drive at /content/drive (Colab runtime only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Basics
import pandas as pd
import numpy as np
import time
import os
from os import listdir
from os.path import isfile, join, basename
from tqdm import tqdm
from timeit import timeit
import gc

import math
import sys
import datetime
from dateutil.relativedelta import relativedelta

# Helper functions
#from lag_features import *
#from other_functions import *

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Models
import lightgbm as lgbm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import pickle
import joblib

# Metrics
from sklearn.metrics import mean_squared_error

In [None]:
# Project data directory. The first assignment (local path) is immediately
# overridden by the second (Google Drive path), so only the Drive path is used.
DIR = '/Users/carlosperezricardo/Desktop/TFM'
DIR = '/content/drive/MyDrive/TFM'

In [None]:
# Load the dataset from the pickle file stored in DIR.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df_ = pd.read_pickle(os.path.join(DIR,'df_DEP.pkl'))

In [None]:
# Target column predicted by the baselines.
TARGET = 'ARR_DELAY'
# Every column is kept as a feature, so X still contains the TARGET column;
# the baseline cells read X_train['ARR_DELAY'] directly.
features = df_.columns
X = df_[features]
y = df_[TARGET]

In [None]:
# Hold out 20% of the rows as the test set, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def regression_metrics(y_train, y_test, y_train_pred, y_test_pred):
    """Print RMSE, MAE and R2 for the train and test predictions.

    Parameters
    ----------
    y_train, y_test : true target values of the two splits.
    y_train_pred, y_test_pred : predictions for the two splits.

    Returns
    -------
    None; the metrics are only printed.
    """
    # RMSE is taken as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print(' --- TRAIN --- ')
    print('     - RMSE: ', train_rmse)
    print('     - MAE: ', train_mae)
    print('     - R2: ', train_r2)
    print(' --- TEST --- ')
    print('     - RMSE: ', test_rmse)
    print('     - MAE: ', test_mae)
    print('     - R2: ', test_r2)

Baseline: DEP_DELAY used directly as the ARR_DELAY prediction

In [None]:
# Baseline: use DEP_DELAY unchanged as the prediction for the target.
y_train_pred = X_train['DEP_DELAY']
y_test_pred = X_test['DEP_DELAY']
regression_metrics(y_train, y_test, y_train_pred, y_test_pred)

 --- TRAIN --- 
     - RMSE:  14.06
     - MAE:  11.08
     - R2:  0.6509250953867874
 --- TEST --- 
     - RMSE:  14.02
     - MAE:  11.055
     - R2:  0.651936442510187


Baseline: DEP_DELAY + mean difference between arrival and departure delay

In [None]:
# Mean gap between ARR_DELAY and DEP_DELAY on the training rows
# (per-row gaps are cast to int before averaging).
diff = (X_train['ARR_DELAY'] - X_train['DEP_DELAY']).astype(int).mean()

# Baseline: DEP_DELAY shifted by that constant gap, for both splits.
y_train_pred, y_test_pred = X_train['DEP_DELAY'] + diff, X_test['DEP_DELAY'] + diff
regression_metrics(y_train, y_test, y_train_pred, y_test_pred)

 --- TRAIN --- 
     - RMSE:  12.35
     - MAE:  8.72
     - R2:  0.7306284840448183
 --- TEST --- 
     - RMSE:  12.305
     - MAE:  8.7
     - R2:  0.7322412161786553


In [None]:
# Display the mean ARR_DELAY - DEP_DELAY gap computed on the training rows.
diff

-6.721534090492226

Baseline: constant prediction, delay = 0

In [None]:
# Baseline: predict zero for every row (multiplying by 0 keeps the index of y).
y_train_pred = 0*y_train
y_test_pred = 0*y_test
regression_metrics(y_train, y_test, y_train_pred, y_test_pred)

 --- TRAIN --- 
     - RMSE:  24.0
     - MAE:  16.44
     - R2:  -0.01637097332482229
 --- TEST --- 
     - RMSE:  23.97
     - MAE:  16.42
     - R2:  -0.016493097725372552


Baseline: constant prediction, training mean (then training median) of the delay

In [None]:
# Baseline: every row gets the training-set mean of the (int-cast) target.
y_train_pred = np.full(len(y_train), np.mean(y_train.astype(int)))
y_test_pred = np.full(len(y_test), np.mean(y_train.astype(int)))
regression_metrics(y_train, y_test, y_train_pred, y_test_pred)

 --- TRAIN --- 
     - RMSE:  23.80684873115022
     - MAE:  15.271060520512881
     - R2:  -0.0002733721555456814
 --- TEST --- 
     - RMSE:  23.77640088567632
     - MAE:  15.260118530885782
     - R2:  -0.000339292710750394


In [None]:
# Baseline: every row gets the training-set median of the (int-cast) target.
y_train_pred = np.full(len(y_train), np.median(y_train.astype(int)))
y_test_pred = np.full(len(y_test), np.median(y_train.astype(int)))
regression_metrics(y_train, y_test, y_train_pred, y_test_pred)

 --- TRAIN --- 
     - RMSE:  24.545118772582914
     - MAE:  14.308819213640598
     - R2:  -0.0632739131829696
 --- TEST --- 
     - RMSE:  24.51529640622845
     - MAE:  14.307771883872956
     - R2:  -0.06348016859434868


In [None]:
# Baseline "prev_week": per-year_week mean of ARR_DELAY, with inf/NaN replaced
# by the training mean of the target.
# The cleaning steps are chained on the computed Series and assigned once:
# calling replace/fillna with inplace=True on `X[col]` is a chained in-place
# operation that does not update the frame under pandas Copy-on-Write.
# NOTE(review): rows are grouped by their own year_week, so the value is the
# mean of that same week (all rows of the group except its last), not of the
# previous week; the test feature is computed from the test rows' own ARR_DELAY.
X_train['prev_week'] = (
    X_train.groupby('year_week')['ARR_DELAY']
    .transform(lambda x: x.shift(1).mean())
    .replace([np.inf, -np.inf], np.nan)
    .fillna(np.mean(y_train.astype(int)))
)

X_test['prev_week'] = (
    X_test.groupby('year_week')['ARR_DELAY']
    .transform(lambda x: x.shift(1).mean())
    .replace([np.inf, -np.inf], np.nan)
    .fillna(np.mean(y_train.astype(int)))
)

y_train_pred = X_train['prev_week']
y_test_pred = X_test['prev_week']

regression_metrics(y_train, y_test, y_train_pred, y_test_pred)

 --- TRAIN --- 
     - RMSE:  23.364168099618432
     - MAE:  14.966322903223165
     - R2:  0.03658028928634072
 --- TEST --- 
     - RMSE:  23.3
     - MAE:  14.96
     - R2:  0.03895248276100183


In [None]:
# Baseline "prev_month": per-year_month mean of ARR_DELAY, with inf/NaN
# replaced by the training mean of the target.
# The cleaning steps are chained on the computed Series and assigned once:
# calling replace/fillna with inplace=True on `X[col]` is a chained in-place
# operation that does not update the frame under pandas Copy-on-Write.
# NOTE(review): rows are grouped by their own year_month, so the value is the
# mean of that same month (all rows of the group except its last), not of the
# previous month; the test feature is computed from the test rows' own ARR_DELAY.
X_train['prev_month'] = (
    X_train.groupby('year_month')['ARR_DELAY']
    .transform(lambda x: x.shift(1).mean())
    .replace([np.inf, -np.inf], np.nan)
    .fillna(np.mean(y_train.astype(int)))
)

X_test['prev_month'] = (
    X_test.groupby('year_month')['ARR_DELAY']
    .transform(lambda x: x.shift(1).mean())
    .replace([np.inf, -np.inf], np.nan)
    .fillna(np.mean(y_train.astype(int)))
)

y_train_pred = X_train['prev_month']
y_test_pred = X_test['prev_month']

regression_metrics(y_train, y_test, y_train_pred, y_test_pred)

 --- TRAIN --- 
     - RMSE:  23.74741594082517
     - MAE:  15.531858559720787
     - R2:  0.004714674121588214
 --- TEST --- 
     - RMSE:  23.6
     - MAE:  15.15
     - R2:  0.01501906010983134


In [None]:
# Baseline "month_weekday": mean ARR_DELAY of each (month, weekday) group, with
# inf/NaN replaced by the training mean of the target.
# The cleaning steps are chained on the computed Series and assigned once:
# calling replace/fillna with inplace=True on `X[col]` is a chained in-place
# operation that does not update the frame under pandas Copy-on-Write.
# NOTE(review): each group mean includes the row's own ARR_DELAY, and the test
# feature is computed from the test rows' own ARR_DELAY.
X_train['month_weekday'] = (
    X_train.groupby(['month','weekday'])['ARR_DELAY']
    .transform(lambda x: x.mean())
    .replace([np.inf, -np.inf], np.nan)
    .fillna(np.mean(y_train.astype(int)))
)

X_test['month_weekday'] = (
    X_test.groupby(['month','weekday'])['ARR_DELAY']
    .transform(lambda x: x.mean())
    .replace([np.inf, -np.inf], np.nan)
    .fillna(np.mean(y_train.astype(int)))
)

y_train_pred = X_train['month_weekday']
y_test_pred = X_test['month_weekday']

regression_metrics(y_train, y_test, y_train_pred, y_test_pred)

 --- TRAIN --- 
     - RMSE:  23.701929002270234
     - MAE:  15.489866767866657
     - R2:  0.008523856900405136
 --- TEST --- 
     - RMSE:  23.62
     - MAE:  15.195
     - R2:  0.012441124216747701


In [None]:
# Baseline "same_day": mean ARR_DELAY of all rows sharing the same FL_DATE, with
# inf/NaN replaced by the training mean of the target.
# The cleaning steps are chained on the computed Series and assigned once:
# calling replace/fillna with inplace=True on `X[col]` is a chained in-place
# operation that does not update the frame under pandas Copy-on-Write.
# NOTE(review): each group mean includes the row's own ARR_DELAY, and the test
# feature is computed from the test rows' own ARR_DELAY.
X_train['same_day'] = (
    X_train.groupby(['FL_DATE'])['ARR_DELAY']
    .transform(lambda x: x.mean())
    .replace([np.inf, -np.inf], np.nan)
    .fillna(np.mean(y_train.astype(int)))
)

X_test['same_day'] = (
    X_test.groupby(['FL_DATE'])['ARR_DELAY']
    .transform(lambda x: x.mean())
    .replace([np.inf, -np.inf], np.nan)
    .fillna(np.mean(y_train.astype(int)))
)

y_train_pred = X_train['same_day']
y_test_pred = X_test['same_day']

regression_metrics(y_train, y_test, y_train_pred, y_test_pred)

 --- TRAIN --- 
     - RMSE:  22.5
     - MAE:  14.39
     - R2:  0.10668464311330805
 --- TEST --- 
     - RMSE:  22.44
     - MAE:  14.36
     - R2:  0.10894863322866877


In [None]:
# Baseline "same day and hour": mean ARR_DELAY of all rows sharing the same
# FL_DATE and ARR_TIME_ bucket, with inf/NaN replaced by the training mean.
# Fixes:
# * predictions now read the 'same_day_hour' column built in this cell; they
#   used 'same_day' before, so this cell repeated the previous cell's metrics
#   (any stored output below predates the fix and must be re-run);
# * replace/fillna are chained on the computed Series instead of being called
#   with inplace=True on `X[col]`, which does not update the frame under
#   pandas Copy-on-Write.
# NOTE(review): round(ARR_TIME/100) rounds rather than truncates - if ARR_TIME
# is an HHMM value, minutes 50-59 land in the next hour's bucket; confirm.
# NOTE(review): each group mean includes the row's own ARR_DELAY, and the test
# feature is computed from the test rows' own ARR_DELAY.
X_train['ARR_TIME_'] = round(X_train['ARR_TIME']/100).astype(int)
X_train['same_day_hour'] = (
    X_train.groupby(['FL_DATE','ARR_TIME_'])['ARR_DELAY']
    .transform(lambda x: x.mean())
    .replace([np.inf, -np.inf], np.nan)
    .fillna(np.mean(y_train.astype(int)))
)

X_test['ARR_TIME_'] = round(X_test['ARR_TIME']/100).astype(int)
X_test['same_day_hour'] = (
    X_test.groupby(['FL_DATE','ARR_TIME_'])['ARR_DELAY']
    .transform(lambda x: x.mean())
    .replace([np.inf, -np.inf], np.nan)
    .fillna(np.mean(y_train.astype(int)))
)

y_train_pred = X_train['same_day_hour']
y_test_pred = X_test['same_day_hour']

regression_metrics(y_train, y_test, y_train_pred, y_test_pred)

 --- TRAIN --- 
     - RMSE:  22.5
     - MAE:  14.39
     - R2:  0.10668464311330805
 --- TEST --- 
     - RMSE:  22.44
     - MAE:  14.36
     - R2:  0.10894863322866877
