In [362]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import pickle
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [363]:
flights = pd.read_csv('data/flights_cleaned.csv').drop(columns=['Unnamed: 0'])

In [364]:
def time_row(row):
    row_int = int(row)
    row_str = str(row_int)
    len_row = len(row_str)
    if len_row == 1:
        minute = '0' + row_str
        hour = '00'
        row = hour + ':' + minute
    if len_row == 2:
        minute =  row_str[0] + row_str[1]
        hour = '00'
        row = hour + ':' + minute
    if len_row == 3:
        minute = row_str[1]+row_str[2]
        hour = '0' + row_str[0]
        row = hour + ':' + minute
    elif len_row == 4:
        minute = row_str[2] + row_str[3]
        hour = row_str[0] + row_str[1]
        row = hour + ':' + minute
        if row == '24:00':
            row = '23:59'
    return str(row)

In [365]:
flights['crs_dep_time'] = flights['crs_dep_time'].apply(time_row)
flights['dep_time'] = flights['dep_time'].apply(time_row)
flights['wheels_off'] = flights['wheels_off'].apply(time_row)
flights['wheels_on'] = flights['wheels_on'].apply(time_row)
flights['crs_arr_time'] = flights['crs_arr_time'].apply(time_row)
flights['arr_time'] = flights['arr_time'].apply(time_row)

In [366]:
#lets create year, month and day
flights['fl_date'] = pd.to_datetime(flights['fl_date'],format='%Y-%m-%d')
flights['year'] = pd.DatetimeIndex(flights['fl_date']).year
flights['month'] = pd.DatetimeIndex(flights['fl_date']).month
flights['day'] = pd.DatetimeIndex(flights['fl_date']).day

In [367]:
#lets change the time columns format to time
flights['crs_dep_time'] = pd.to_datetime(flights['crs_dep_time'],format="%H:%M").dt.time
flights['dep_time'] = pd.to_datetime(flights['dep_time'],format="%H:%M").dt.time
flights['wheels_off'] = pd.to_datetime(flights['wheels_off'],format="%H:%M").dt.time
flights['wheels_on'] = pd.to_datetime(flights['wheels_on'],format="%H:%M").dt.time
flights['crs_arr_time'] = pd.to_datetime(flights['crs_arr_time'],format='%H:%M').dt.time
flights['arr_time'] = pd.to_datetime(flights['arr_time'],format="%H:%M").dt.time

In [368]:
#dropping the large delays, they seem to be outlier
flights = flights[flights['dep_delay'] <= 15]
flights['crs_arr_hour'] = flights['crs_arr_time'].apply(lambda x:x.hour)

In [369]:
top_dest = flights.groupby('dest')['taxi_in'].count().sort_values(ascending=False).head(12)

In [370]:
list_dest = []
for i in range(0,12):
    list_dest.append(top_dest.index[i])
flights=flights.loc[flights['dest'].isin(list_dest)]

In [371]:
#arrival delay filter based on hour of the day
flights = flights.loc[(flights['crs_arr_hour'] >=6) & flights['crs_arr_hour'] <=18]

In [309]:
flights = flights[['dest','mkt_unique_carrier','crs_arr_hour','air_time',
                   'taxi_in','crs_elapsed_time','month',
                  'arr_delay']]
numeric =flights[['air_time','taxi_in','crs_elapsed_time','arr_delay']]
dummies = pd.get_dummies(flights[['dest','mkt_unique_carrier','crs_arr_hour','month']],drop_first=True)


In [310]:
all_in_one = pd.concat([numeric,dummies],axis=1)

In [311]:
X = all_in_one.loc[:,all_in_one.columns!='arr_delay']
y = all_in_one[['arr_delay']]

In [312]:
X_test,X_train,y_test,y_train = train_test_split(X,y,test_size=0.3,random_state=123)

In [313]:
#normalize test train
min_max = MinMaxScaler()

cols_train = X_train.columns
values_train = X_train.values
values_train_norm = min_max.fit_transform(values_train)
X_train = pd.DataFrame(values_train_norm,columns=cols_train)

cols_test = X_test.columns
values_test = X_test.values
values_test_norm = min_max.fit_transform(values_test)
X_test = pd.DataFrame(values_test_norm,columns=cols_test)

In [314]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 50)

xg_reg.fit(X_train,y_train)
filename = 'XGBoost_regressor_final.sav'
pickle.dump(xg_reg, open(filename, 'wb'))

In [315]:
y_pred = xg_reg.predict(X_test)

In [316]:
y_pred

array([56.421497, 61.49263 , 59.291824, ..., 64.75529 , 65.89438 ,
       60.833855], dtype=float32)

In [317]:
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
r2 = r2_score(y_test,y_pred)

print('rmse: ',rmse)
print('r2_score:', r2)

rmse:  13.179832009538854
r2_score: 0.16617139487308386


In [318]:
X_train.columns

Index(['air_time', 'taxi_in', 'crs_elapsed_time', 'dest', 'mkt_unique_carrier',
       'crs_arr_hour', 'month'],
      dtype='object')

In [319]:
#defining the model: Ridge Regression
rr = Ridge()

#parameters to be tuned for ridge regression: alpha
parameters = {'alpha':[0.01,0.1,0.5,1,5]}

#Definning the GridSearch
Ridge_tuned = GridSearchCV(rr,param_grid=parameters, scoring='r2', cv=10)
Ridge_tuned.fit(X_train,y_train)
bst_estimator = Ridge_tuned.best_estimator_

In [320]:
Ridge_tuned.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=Ridge(),
             param_grid={'alpha': [0.01, 0.1, 0.5, 1, 5]}, scoring='r2')

In [321]:
bst_estimator = Ridge_tuned.best_estimator_

In [322]:
bst_estimator.fit(X_train,y_train)
xg_reg.fit(X_train,y_train)
filename = 'Ridge_regressor_final.sav'
pickle.dump(xg_reg, open(filename, 'wb'))

In [323]:
y_pred_ridg = bst_estimator.predict(X_test)


In [325]:
r2_score(y_test,y_pred)
#np.sqrt(mean_squared_error(y_test,y_pred))

0.16617139487308386

### Ordinal Encoding

In [383]:
from sklearn.preprocessing import OrdinalEncoder

In [384]:
cols = flights.columns
enc = OrdinalEncoder()
flights = enc.fit_transform(flights)
flights = pd.DataFrame(flights,columns=cols)

In [374]:
X = all_in_one.loc[:,all_in_one.columns!='arr_delay']
y = all_in_one[['arr_delay']]l

In [375]:
X_test,X_train,y_test,y_train = train_test_split(X,y,test_size=0.3,random_state=123)

In [376]:
#normalize test train
min_max = MinMaxScaler()

cols_train = X_train.columns
values_train = X_train.values
values_train_norm = min_max.fit_transform(values_train)
X_train = pd.DataFrame(values_train_norm,columns=cols_train)

cols_test = X_test.columns
values_test = X_test.values
values_test_norm = min_max.fit_transform(values_test)
X_test = pd.DataFrame(values_test_norm,columns=cols_test)

In [377]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 100, alpha = 10, n_estimators = 100)

xg_reg.fit(X_train,y_train)
#filename = 'XGBoost_regressor_final.sav'
#pickle.dump(xg_reg, open(filename, 'wb'))

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=100,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [378]:
y_pred = xg_reg.predict(X_test)

In [379]:
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
r2 = r2_score(y_test,y_pred)

print('rmse: ',rmse)
print('r2_score:', r2)

rmse:  13.296371015113143
r2_score: 0.15136040616121804


In [440]:
flights_test = pd.read_csv('data/flights_test.csv')

In [441]:
flights_test['crs_dep_time'] = flights_test['crs_dep_time'].apply(time_row)
flights_test['crs_arr_time'] = flights_test['crs_arr_time'].apply(time_row)

In [442]:
#lets create year, month and day
flights_test['fl_date'] = pd.to_datetime(flights_test['fl_date'],format='%Y-%m-%d')
flights_test['year'] = pd.DatetimeIndex(flights_test['fl_date']).year
flights_test['month'] = pd.DatetimeIndex(flights_test['fl_date']).month
flights_test['day'] = pd.DatetimeIndex(flights_test['fl_date']).day

In [443]:
#lets change the time columns format to time
flights_test['crs_dep_time'] = pd.to_datetime(flights_test['crs_dep_time'],format="%H:%M").dt.time
flights_test['crs_arr_time'] = pd.to_datetime(flights_test['crs_arr_time'],format='%H:%M').dt.time

In [444]:
#dropping the large delays, they seem to be outlier
flights_test['crs_arr_hour'] = flights_test['crs_arr_time'].apply(lambda x:x.hour)

In [445]:
flights_concat = flights_test.copy()

In [447]:
flights_concat = flights_concat[['fl_date','mkt_carrier','mkt_carrier_fl_num','origin','dest']]

In [448]:
flights_test = flights_test[['dest','mkt_unique_carrier','crs_arr_hour',
                   'crs_elapsed_time','month']]

In [449]:
flights_test.head()

Unnamed: 0,dest,mkt_unique_carrier,crs_arr_hour,crs_elapsed_time,month
0,PHX,DL,11,211,1
1,MIA,AA,9,162,1
2,DEN,WN,21,165,1
3,TUS,AA,22,46,1
4,BOS,B6,21,89,1


In [450]:
flights_test['air_time'] = 0
flights_test['taxi_in'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flights_test['air_time'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flights_test['taxi_in'] = 0


In [451]:
cols = flights_test.columns
enc = OrdinalEncoder()
flights_test = enc.fit_transform(flights_test)
flights_test = pd.DataFrame(flights_test,columns=cols)

In [452]:
flights_test = flights_test[X_train.columns]

In [453]:
cols_train = flights_test.columns
values_train = flights_test.values
values_train_norm = min_max.fit_transform(values_train)
flights_test = pd.DataFrame(values_train_norm,columns=cols_train)

In [454]:
flights_test_delays = xg_reg.predict(flights_test)

In [455]:
flights_test_delays.min()

21.12887

In [460]:
columns = ['predicted_delay']
predicted_delays = pd.DataFrame(flights_test_delays,columns=columns)

In [461]:
predicted_delays

Unnamed: 0,predicted_delay
0,37.227093
1,36.298462
2,52.826542
3,53.517471
4,47.133911
...,...
299995,59.453117
299996,37.978970
299997,42.795376
299998,52.954189


In [456]:
flights_test.head()

Unnamed: 0,air_time,taxi_in,crs_elapsed_time,dest,mkt_unique_carrier,crs_arr_hour,month
0,0.0,0.0,0.398268,0.71978,0.333333,0.478261,0.0
1,0.0,0.0,0.292208,0.604396,0.0,0.391304,0.0
2,0.0,0.0,0.298701,0.25,1.0,0.913043,0.0
3,0.0,0.0,0.041126,0.956044,0.0,0.956522,0.0
4,0.0,0.0,0.134199,0.120879,0.222222,0.913043,0.0


In [463]:
final_table = pd.merge(flights_concat, predicted_delays, left_index=True, right_index=True)

In [466]:
final_one_week = final_table.loc[(final_table['fl_date']>='2020-01-01') & (final_table['fl_date']<='2020-01-07')]

In [470]:
table_to_submit = final_one_week.sort_values(by="fl_date").reset_index(drop=True)

In [475]:
table_to_submit.to_csv('final_predictions.csv',index=False)