# Modelling and Evaluation

In [16]:
# Basics
import pandas as pd
import numpy as np
import time
import os
from os import listdir
from os.path import isfile, join, basename
from tqdm import tqdm
from timeit import timeit
import gc

# Models
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import pickle
import joblib

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
#import lightgbm as lgbm

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

In [17]:
DIR = '/Users/carlosperezricardo/Desktop/TFM'
#DIR = '/content/drive/MyDrive/TFM'

TARGET = 'ARR_DELAY'

In [18]:
df = pd.read_pickle(os.path.join(DIR,'df_preprocessed.pkl'))
df.shape

(1110432, 67)

In [19]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        try:
            if col_type != object:
                
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
            else:
                #df[col] = df[col].astype('category')
                pass
        except:
            pass
            
    end_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    #print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [20]:
df = reduce_mem_usage(df)

df.isnull().sum()[df.isnull().sum()!=0]

Series([], dtype: int64)

In [21]:
def model_metrics(X_train, y_train, X_test, y_test, model):

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
    test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
    
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print(' --- TRAIN --- ')
    print('     - RMSE: ', train_rmse)
    print('     - MAE: ', train_mae)
    print('     - R2: ', train_r2)
    print(' --- TEST --- ')
    print('     - RMSE: ', test_rmse)
    print('     - MAE: ', test_mae)
    print('     - R2: ', test_r2)

    return train_rmse, test_rmse, train_mae, test_mae, train_r2, train_r2

In [22]:
x = pd.DataFrame(df.dtypes).reset_index()
for k,r in x.iterrows():
    if r[0] == 'float64':
        print(r['index'])

## Train test split

In [23]:
features = list(df.select_dtypes(include=np.number).columns)
features.remove(TARGET)
X = df[features]
y = df[TARGET]

In [24]:
for col in X.columns:
    X[col] = X[col].astype('float32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('float32')


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model selection

In [26]:
# Decision Tree
st = time.time()
dt = DecisionTreeRegressor(max_depth=10, criterion='squared_error')
dt.fit(X_train, y_train)

x = model_metrics(X_train, y_train, X_test, y_test, dt)
et = time.time()
print((et - st)/60)

 --- TRAIN --- 
     - RMSE:  11.605813366641813
     - MAE:  8.222267272028239
     - R2:  0.7714671248684782
 --- TEST --- 
     - RMSE:  11.778888121344524
     - MAE:  8.30350675649742
     - R2:  0.7641907744394252
0.1959684689839681


In [27]:
pd.DataFrame({'columns': X_train.columns,'feature_importance':dt.feature_importances_}).sort_values('feature_importance',ascending=False).head(15)

Unnamed: 0,columns,feature_importance
8,DEP_DELAY,0.961455
50,ARR_DELAY_ORIGIN_year_week_mean,0.023734
9,ARR_TIME,0.003077
47,ARR_DELAY_OP_CARRIER_AIRLINE_ID_month_SCH_ARR_...,0.002247
7,DEP_TIME,0.002231
21,flight_distance,0.001555
49,ARR_DELAY_OP_CARRIER_AIRLINE_ID_year_week_mean,0.000693
18,lon_ORIGIN,0.000679
48,ARR_DELAY_OP_CARRIER_AIRLINE_ID_month_UPD_ARR_...,0.000661
0,OP_CARRIER_AIRLINE_ID,0.000387


In [28]:
# Random Forest
st = time.time()
rf = RandomForestRegressor(n_estimators = 50, max_depth=10, criterion='squared_error')
rf.fit(X_train, y_train)

x = model_metrics(X_train, y_train, X_test, y_test, rf)
et = time.time()
print((et - st)/60)

 --- TRAIN --- 
     - RMSE:  11.44994678569133
     - MAE:  8.10866030902467
     - R2:  0.777564318301063
 --- TEST --- 
     - RMSE:  11.592444322186738
     - MAE:  8.170478166654235
     - R2:  0.7715967728658866
7.002809898058573


In [29]:
pd.DataFrame({'columns': X_train.columns,'feature_importance':rf.feature_importances_}).sort_values('feature_importance',ascending=False).head(15)

Unnamed: 0,columns,feature_importance
8,DEP_DELAY,0.957911
50,ARR_DELAY_ORIGIN_year_week_mean,0.02376
9,ARR_TIME,0.003565
47,ARR_DELAY_OP_CARRIER_AIRLINE_ID_month_SCH_ARR_...,0.002556
7,DEP_TIME,0.002379
21,flight_distance,0.001489
49,ARR_DELAY_OP_CARRIER_AIRLINE_ID_year_week_mean,0.000972
18,lon_ORIGIN,0.000761
48,ARR_DELAY_OP_CARRIER_AIRLINE_ID_month_UPD_ARR_...,0.000708
40,airport_sum_ops,0.000653


In [31]:
# Gradient Boosting
st = time.time()
gb = GradientBoostingRegressor(max_depth=10, criterion='squared_error', min_samples_leaf=30)
gb.fit(X_train, y_train)

x = model_metrics(X_train, y_train, X_test, y_test, gb)
et = time.time()
print((et - st)/60)

 --- TRAIN --- 
     - RMSE:  7.372849736350132
     - MAE:  5.39291102701333
     - R2:  0.9077708334112301
 --- TEST --- 
     - RMSE:  7.923927345057668
     - MAE:  5.718421231430888
     - R2:  0.8932831080940937
96.11595522960027


In [32]:
pd.DataFrame({'columns': X_train.columns,'feature_importance':gb.feature_importances_}).sort_values('feature_importance',ascending=False).head(15)

Unnamed: 0,columns,feature_importance
8,DEP_DELAY,0.81851
9,ARR_TIME,0.047365
7,DEP_TIME,0.038617
50,ARR_DELAY_ORIGIN_year_week_mean,0.021518
18,lon_ORIGIN,0.017625
17,lat_ORIGIN,0.006156
21,flight_distance,0.005768
47,ARR_DELAY_OP_CARRIER_AIRLINE_ID_month_SCH_ARR_...,0.004713
40,airport_sum_ops,0.003382
48,ARR_DELAY_OP_CARRIER_AIRLINE_ID_month_UPD_ARR_...,0.002766


In [None]:
# Lightgbm
st = time.time()
reg = lgbm.LGBMRegressor(n_estimators = 250, objective='rmse')
reg.fit(X_train, y_train)

x = model_metrics(X_train, y_train, X_test, y_test, reg)
et = time.time()
print((et - st)/60)

### Hyperparameter tunning

In [None]:
mdl = lgbm.LGBMRegressor(boosting_type= 'gbdt', 
        n_jobs = 5, 
        silent = True)

gridParams = {
    'n_estimators': [100,150,200,250,300],
    'num_leaves': [6,12,18], # large num_leaves helps improve accuracy but might lead to over-fitting
    'objective' : ['regression'],
    'metric' : ['rmse'],
    'random_state' : [42],
    }

#grid = GridSearchCV(mdl, gridParams, verbose=1, cv=5, n_jobs=-1)
grid = RandomizedSearchCV(mdl, gridParams, verbose=1, cv=5, n_jobs=-1, n_iter=20, random_state=42)
# Run the grid
grid.fit(X_train, y_train)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

In [None]:
model = grid.best_estimator_
x = model_metrics(X_train, y_train, X_test, y_test, model)

In [None]:
pd.DataFrame({'columns': X_train.columns,'feature_importance':model.feature_importances_}).sort_values('feature_importance',ascending=False).head(25)

## Evaluation

In [None]:
X_train_ = X_train.copy(deep=True)
X_train_['ARR_DELAY_pred'] = model.predict(X_train)
X_train_['ARR_DELAY_error'] = X_train_['ARR_DELAY'] - X_train_['ARR_DELAY_pred']

In [None]:
X_train_[['DEP_DELAY','ARR_DELAY','ARR_DELAY_pred']].sort_values('ARR_DELAY', ascending=False).head(30)

In [None]:
X_train_[['DEP_DELAY','ARR_DELAY','ARR_DELAY_pred']].sort_values('ARR_DELAY').head(20)

In [None]:
X_train_[['DEP_DELAY','ARR_DELAY','ARR_DELAY_pred']].sort_values('ARR_DELAY_error', ascending=False).head(20)

In [None]:
X_train_[['DEP_DELAY','ARR_DELAY','ARR_DELAY_pred']].head(30)

In [None]:
perc =[0.05, .10, .25, 0.5, 0.75, .90, 0.95]
X_train_['ARR_DELAY_error'].describe(percentiles = perc)

In [None]:
sns.set(style='white', rc={"grid.linewidth": 0.1})
plt.figure(figsize=(10,4))

ax = sns.histplot(data=df, x='ARR_DELAY_error', bins=100)
ax.axes.set_title("Histogram of prediction error",fontsize=20)
ax.set_xlabel("Error (Prediction - Actual) [min]",fontsize=16)
ax.set_ylabel("Count",fontsize=16)
plt.savefig('ARR_DELAY_error.png')
plt.show()