In [17]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
#from sklearn.model_selection import StratifiedKFold
#from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
import calendar
import xgboost as xgb
from xgboost import plot_importance
from sklearn.decomposition import PCA
import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import pickle

In [18]:
#Ridge_label
ridge_label = open('Ridge_label_final.sav','rb')
model_ridge = pickle.load(ridge_label)
ridge_label.close()
#Ridge onehot
ridge_hot = open('Ridge_onehot_final.sav','rb')
model_ridge_one = pickle.load(ridge_hot)
ridge_hot.close()
#XGBoost onehot
xgb_hot = open('XGBoost_onehot_reg.sav','rb')
model_xgb_one = pickle.load(xgb_hot)
xgb_hot.close()

### LABEL ENCODED DATA

In [19]:
#categorizing the time
def time_categorize(row):
    if (row.hour >= 00) and (row.hour<7):
        row = 1
    elif (row.hour >=7) and (row.hour <14):
        row = 10
    else:
        row=2   
    return row

In [20]:
df = pd.read_csv('data/flights_no_outlier_iqr_time.csv').drop(columns=['Unnamed: 0'])
df['fl_date'] = pd.to_datetime(df['fl_date'],format='%Y-%m-%d')
df['dep_time_format'] = pd.to_datetime(df['dep_time_format'],format='%H:%M:%S').dt.time
df['arrival_time_format'] = pd.to_datetime(df['arrival_time_format'],format='%H:%M:%S').dt.time
df = df.sort_values(by='fl_date')
df['month'] = pd.DatetimeIndex(df['fl_date']).month

In [21]:
#dropping the useless columns
df_filtered = df[['mkt_unique_carrier','distance','air_time','actual_elapsed_time',
                'taxi_in','taxi_out','arr_delay','origin','dest','dep_time_format','arrival_time_format','month']]

In [22]:
df_filtered['dep_time_format'] = df_filtered['dep_time_format'].apply(time_categorize)
df_filtered['arrival_time_format']= df_filtered['arrival_time_format'].apply(time_categorize)

In [23]:
#labeling the categorical data
label_encoder = preprocessing.LabelEncoder()
enc_cols = ['mkt_unique_carrier','origin','dest','dep_time_format','arrival_time_format']
for col in enc_cols:
    df_filtered[col] = label_encoder.fit_transform(df_filtered[col].astype(str))


In [24]:
X = df_filtered.loc[:, df_filtered.columns != 'arr_delay']
y = df_filtered[['arr_delay']]

In [25]:
#splitting to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=123)

In [26]:
#normalizing the train data
cols_1 = X_train.columns
x = X_train.values
min_max = preprocessing.MinMaxScaler()
x_scaled = min_max.fit_transform(x)
X_train = pd.DataFrame(x_scaled,columns=cols_1)

cols_2 = X_test.columns
x_test = X_test.values
x_test_scaled = min_max.fit_transform(x_test)
X_test = pd.DataFrame(x_test_scaled,columns=cols_2)

In [27]:
y_pred_label = model_ridge.predict(X_test)
print('r2_score for label encoded, ridge regression with tunned hyper-parameters: ',
     r2_score(y_test,y_pred_label))

print('root mean squared error for label encoded, ridge regression with tunned parameters: ',
     (np.sqrt(mean_squared_error(y_test,y_pred_label))))

r2_score for label encoded, ridge regression with tunned hyper-parameters:  0.316503793717516
root mean squared error for label encoded, ridge regression with tunned parameters:  12.554735496153507


### ONE HOT ENCODED

In [28]:
df_one = pd.read_csv('data/flights_no_outlier_iqr_time.csv').drop(columns=['Unnamed: 0'])
df_one['fl_date'] = pd.to_datetime(df_one['fl_date'],format='%Y-%m-%d')
df_one['dep_time_format'] = pd.to_datetime(df_one['dep_time_format'],format='%H:%M:%S').dt.time
df_one['arrival_time_format'] = pd.to_datetime(df_one['arrival_time_format'],format='%H:%M:%S').dt.time
df_one = df_one.sort_values(by='fl_date')
df_one['month'] = pd.DatetimeIndex(df_one['fl_date']).month
df_one['month'] = df_one['month'].apply(lambda x: calendar.month_abbr[x])

In [29]:
#dropping the useless columns
df_one_filt = df_one[['mkt_unique_carrier','distance','air_time','actual_elapsed_time',
                'taxi_in','taxi_out','arr_delay','origin','dest','dep_time_format','arrival_time_format','month']]

In [30]:
df_one_filt['dep_time_format'] = df_one_filt['dep_time_format'].apply(time_categorize)
df_one_filt['arrival_time_format']= df_one_filt['arrival_time_format'].apply(time_categorize)

In [31]:
dummies = pd.get_dummies(df_one_filt[['mkt_unique_carrier','origin','dest','month','dep_time_format','arrival_time_format']],drop_first=True)
df_one_num = df_one[['distance','air_time','actual_elapsed_time',
                'taxi_in','taxi_out','arr_delay']]
indep_dep_var = pd.concat([df_one_num,dummies],axis=1)

In [32]:
X_one = indep_dep_var.loc[:, indep_dep_var.columns != 'arr_delay']
y_one = indep_dep_var[['arr_delay']]

In [33]:
X_train_one, X_test_one, y_train_one, y_test_one = train_test_split(X_one, y_one,test_size=0.3,random_state=123)

In [34]:
#normalizing the train data
cols_1 = X_train_one.columns
x = X_train_one.values
min_max = preprocessing.MinMaxScaler()
x_scaled = min_max.fit_transform(x)
X_train_one = pd.DataFrame(x_scaled,columns=cols_1)

cols_2 = X_test_one.columns
x_test = X_test_one.values
x_test_scaled = min_max.fit_transform(x_test)
X_test_one = pd.DataFrame(x_test_scaled,columns=cols_2)

### RIDGE REG

In [35]:
y_ridge_one = model_ridge_one.predict(X_test_one)
print('r2_score for one-hot encoded, tunned hyperparameters:',
     r2_score(y_test_one,y_ridge_one))
print('root mean squared error for one-hot, ridge regression with tunned parameters: ',
     (np.sqrt(mean_squared_error(y_test_one,y_ridge_one))))

r2_score for one-hot encoded, tunned hyperparameters: 0.3780990486872633
root mean squared error for one-hot, ridge regression with tunned parameters:  11.97567837324488


### XGBOOST REG

In [36]:
y_xg_one = model_xgb_one.predict(X_test_one)
print('r2_score for one-hot encoded, xgboost:',
     r2_score(y_test_one,y_xg_one))
print('root mean squared error for one-hot, xgboost: ',
     (np.sqrt(mean_squared_error(y_test_one,y_xg_one))))

r2_score for one-hot encoded, xgboost: 0.4253397455129371
root mean squared error for one-hot, xgboost:  11.511849246006284
