In [187]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score

In [150]:
flights = pd.read_csv('data/flights_cleaned.csv').drop(columns=['Unnamed: 0'])

In [151]:
def time_row(row):
    row_int = int(row)
    row_str = str(row_int)
    len_row = len(row_str)
    if len_row == 1:
        minute = '0' + row_str
        hour = '00'
        row = hour + ':' + minute
    if len_row == 2:
        minute =  row_str[0] + row_str[1]
        hour = '00'
        row = hour + ':' + minute
    if len_row == 3:
        minute = row_str[1]+row_str[2]
        hour = '0' + row_str[0]
        row = hour + ':' + minute
    elif len_row == 4:
        minute = row_str[2] + row_str[3]
        hour = row_str[0] + row_str[1]
        row = hour + ':' + minute
        if row == '24:00':
            row = '23:59'
    return str(row)

In [152]:
flights['crs_dep_time'] = flights['crs_dep_time'].apply(time_row)
flights['dep_time'] = flights['dep_time'].apply(time_row)
flights['wheels_off'] = flights['wheels_off'].apply(time_row)
flights['wheels_on'] = flights['wheels_on'].apply(time_row)
flights['crs_arr_time'] = flights['crs_arr_time'].apply(time_row)
flights['arr_time'] = flights['arr_time'].apply(time_row)

In [153]:
#lets create year, month and day
flights['fl_date'] = pd.to_datetime(flights['fl_date'],format='%Y-%m-%d')
flights['year'] = pd.DatetimeIndex(flights['fl_date']).year
flights['month'] = pd.DatetimeIndex(flights['fl_date']).month
flights['day'] = pd.DatetimeIndex(flights['fl_date']).day

In [154]:
#lets change the time columns format to time
flights['crs_dep_time'] = pd.to_datetime(flights['crs_dep_time'],format="%H:%M").dt.time
flights['dep_time'] = pd.to_datetime(flights['dep_time'],format="%H:%M").dt.time
flights['wheels_off'] = pd.to_datetime(flights['wheels_off'],format="%H:%M").dt.time
flights['wheels_on'] = pd.to_datetime(flights['wheels_on'],format="%H:%M").dt.time
flights['crs_arr_time'] = pd.to_datetime(flights['crs_arr_time'],format='%H:%M').dt.time
flights['arr_time'] = pd.to_datetime(flights['arr_time'],format="%H:%M").dt.time

In [155]:
#dropping the large delays, they seem to be outlier
flights = flights[flights['dep_delay'] <= 15]
flights['crs_arr_hour'] = flights['crs_arr_time'].apply(lambda x:x.hour)

In [156]:
flights.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,...,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,year,month,day,crs_arr_hour
0,2019-05-22,DL,DL,DL,544,DL,544,11433,DTW,"Detroit, MI",...,N,115,101.0,82.0,1,632,2019,5,22,11
2,2019-06-27,AA,AA,AA,2036,AA,2036,12953,LGA,"New York, NY",...,N,133,119.0,81.0,1,544,2019,6,27,11
3,2018-04-05,UA,UA_CODESHARE,UA,5306,OO,5306,11292,DEN,"Denver, CO",...,N,135,120.0,105.0,1,649,2018,4,5,10
5,2019-02-23,AA,AA_CODESHARE,AA,5365,OH,5365,11146,CRW,"Charleston/Dunbar, WV",...,N,87,90.0,56.0,1,221,2019,2,23,10
6,2019-10-09,DL,DL,DL,758,DL,758,10423,AUS,"Austin, TX",...,N,193,196.0,178.0,1,1242,2019,10,9,19


In [157]:
top_dest = flights.groupby('dest')['taxi_in'].count().sort_values(ascending=False).head(12)

In [158]:
list_dest = []
for i in range(0,12):
    list_dest.append(top_dest.index[i])
flights=flights.loc[flights['dest'].isin(list_dest)]

In [159]:
#arrival delay filter based on hour of the day
flights = flights.loc[(flights['crs_arr_hour'] >=6) & flights['crs_arr_hour'] <=18]

In [161]:
flights = flights[['dest','mkt_unique_carrier','crs_arr_hour','air_time',
                   'taxi_in','crs_elapsed_time','month',
                  'arr_delay']]
numeric =flights[['air_time','taxi_in','crs_elapsed_time','arr_delay']]
dummies = pd.get_dummies(flights[['dest','mkt_unique_carrier','crs_arr_hour','month']],drop_first=True)


In [164]:
all_in_one = pd.concat([numeric,dummies],axis=1)

In [167]:
X = all_in_one.loc[:,all_in_one.columns!='arr_delay']
y = all_in_one[['air_time']]

In [169]:
X_test,X_train,y_test,y_train = train_test_split(X,y,test_size=0.3,random_state=123)

In [171]:
#normalize test train
min_max = MinMaxScaler()

cols_train = X_train.columns
values_train = X_train.values
values_train_norm = min_max.fit_transform(values_train)
X_train = pd.DataFrame(values_train_norm,columns=cols_train)

cols_test = X_test.columns
values_test = X_test.values
values_test_norm = min_max.fit_transform(values_test)
X_test = pd.DataFrame(values_test_norm,columns=cols_test)

In [179]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 50)

In [180]:
xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=10,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=50, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [181]:
y_pred = xg_reg.predict(X_test)

In [184]:
y_pred

array([159.12666 , 107.743355,  96.190605, ..., 116.23766 , 132.98116 ,
        34.34118 ], dtype=float32)

In [188]:
rmse = np.sqrt(mean_squared_error(y_test,y_pred))

In [190]:
r2_score(y_test,y_pred)

0.976721787425578

In [196]:
y_pred.mean()

124.58715