In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
#from sklearn.model_selection import StratifiedKFold
#from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
import calendar
import xgboost as xgb
from xgboost import plot_importance
from sklearn.decomposition import PCA
import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [2]:
df = pd.read_csv('data/flights_no_outlier_iqr_time.csv').drop(columns=['Unnamed: 0'])
df['fl_date'] = pd.to_datetime(df['fl_date'],format='%Y-%m-%d')
df['dep_time_format'] = pd.to_datetime(df['dep_time_format'],format='%H:%M:%S').dt.time
df['arrival_time_format'] = pd.to_datetime(df['arrival_time_format'],format='%H:%M:%S').dt.time
df = df.sort_values(by='fl_date')
df['month'] = pd.DatetimeIndex(df['fl_date']).month

In [3]:
#dropping the useless columns
df_filtered = df[['mkt_unique_carrier','distance','air_time','actual_elapsed_time',
                'taxi_in','taxi_out','arr_delay','origin','dest','dep_time_format','arrival_time_format','month']]

### Trying label encoding

In [4]:
#categorizing the time
def time_categorize(row):
    if (row.hour >= 00) and (row.hour<7):
        row = 1
    elif (row.hour >=7) and (row.hour <14):
        row = 10
    else:
        row=2   
    return row

In [5]:
df_filtered['dep_time_format'] = df_filtered['dep_time_format'].apply(time_categorize)
df_filtered['arrival_time_format']= df_filtered['arrival_time_format'].apply(time_categorize)

In [6]:
#labeling the categorical data
label_encoder = preprocessing.LabelEncoder()
enc_cols = ['mkt_unique_carrier','origin','dest','dep_time_format','arrival_time_format']
for col in enc_cols:
    df_filtered[col] = label_encoder.fit_transform(df_filtered[col].astype(str))


In [7]:
X = df_filtered.loc[:, df_filtered.columns != 'arr_delay']
y = df_filtered[['arr_delay']]

In [8]:
#splitting to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=123)

In [9]:
#normalizing the train data
cols_1 = X_train.columns
x = X_train.values
min_max = preprocessing.MinMaxScaler()
x_scaled = min_max.fit_transform(x)
X_train = pd.DataFrame(x_scaled,columns=cols_1)

cols_2 = X_test.columns
x_test = X_test.values
x_test_scaled = min_max.fit_transform(x_test)
X_test = pd.DataFrame(x_test_scaled,columns=cols_2)

In [10]:
# #splitting the X and y
# X_train = indep_dep.loc[:,indep_dep.columns!='arr_delay']
# y_train = indep_dep.arr_delay

# X_test = indep_dep_test.loc[:,indep_dep_test.columns!='arr_delay']
# y_test = indep_dep_test.arr_delay

In [11]:
#defining the model: Ridge Regression
rr = Ridge()

#parameters to be tuned for ridge regression: alpha
parameters = {'alpha':[0.01,0.1,0.5,1,5]}

#Definning the GridSearch
Ridge_tuned = GridSearchCV(rr,param_grid=parameters, scoring='r2', cv=10)

In [12]:
#training the model with train data
Ridge_tuned.fit(X_train,y_train)
print(Ridge_tuned.best_estimator_)

Ridge(alpha=0.01)


In [13]:
y_pred = Ridge_tuned.best_estimator_.predict(X_test)

In [14]:
print(r2_score(y_test,y_pred))
print(np.sqrt(mean_squared_error(y_test,y_pred)))

0.316503793717516
12.554735496153507


### trying one hot encoding

In [15]:
df_one = pd.read_csv('data/flights_no_outlier_iqr_time.csv').drop(columns=['Unnamed: 0'])
df_one['fl_date'] = pd.to_datetime(df_one['fl_date'],format='%Y-%m-%d')
df_one['dep_time_format'] = pd.to_datetime(df_one['dep_time_format'],format='%H:%M:%S').dt.time
df_one['arrival_time_format'] = pd.to_datetime(df_one['arrival_time_format'],format='%H:%M:%S').dt.time
df_one = df_one.sort_values(by='fl_date')
df_one['month'] = pd.DatetimeIndex(df_one['fl_date']).month
df_one['month'] = df_one['month'].apply(lambda x: calendar.month_abbr[x])

In [16]:
#dropping the useless columns
df_one_filt = df_one[['mkt_unique_carrier','distance','air_time','actual_elapsed_time',
                'taxi_in','taxi_out','arr_delay','origin','dest','dep_time_format','arrival_time_format','month']]


In [17]:
df_one_filt['dep_time_format'] = df_one_filt['dep_time_format'].apply(time_categorize)
df_one_filt['arrival_time_format']= df_one_filt['arrival_time_format'].apply(time_categorize)

In [18]:
dummies = pd.get_dummies(df_one_filt[['mkt_unique_carrier','origin','dest','month','dep_time_format','arrival_time_format']],drop_first=True)
df_one_num = df_one[['distance','air_time','actual_elapsed_time',
                'taxi_in','taxi_out','arr_delay']]
indep_dep_var = pd.concat([df_one_num,dummies],axis=1)

In [19]:
X_one = indep_dep_var.loc[:, indep_dep_var.columns != 'arr_delay']
y_one = indep_dep_var[['arr_delay']]

In [20]:
X_train_one, X_test_one, y_train_one, y_test_one = train_test_split(X_one, y_one,test_size=0.3,random_state=123)

In [21]:
#normalizing the train data
cols_1 = X_train_one.columns
x = X_train_one.values
min_max = preprocessing.MinMaxScaler()
x_scaled = min_max.fit_transform(x)
X_train_one = pd.DataFrame(x_scaled,columns=cols_1)

cols_2 = X_test_one.columns
x_test = X_test_one.values
x_test_scaled = min_max.fit_transform(x_test)
X_test_one = pd.DataFrame(x_test_scaled,columns=cols_2)

In [22]:
#defining the model: Ridge Regression
rr = Ridge()

#parameters to be tuned for ridge regression: alpha
parameters = {'alpha':[0.01,0.1,0.5,1,5]}

#Definning the GridSearch
Ridge_tuned = GridSearchCV(rr,param_grid=parameters, scoring='r2', cv=10)

In [23]:
Ridge_tuned.fit(X_train_one,y_train_one)
filename = ''

In [24]:
y_pred = Ridge_tuned.best_estimator_.predict(X_test_one)

In [25]:
print(r2_score(y_test_one,y_pred))
print(np.sqrt(mean_squared_error(y_test_one,y_pred)))

0.3780990486872633
11.97567837324488


### XGboost for one hot enchoded data

In [74]:
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3,
                          learning_rate=0.1,
                          max_depth=100,
                          alpha=0.1,
                          n_estimators = 50)

In [75]:
xg_reg.fit(X_train_one,y_train_one)

XGBRegressor(alpha=0.1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=100,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=50, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0.100000001, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [78]:
y_pred_xg = xg_reg.predict(X_test_one)

In [79]:
print(r2_score(y_test_one,y_pred_xg))
print(np.sqrt(mean_squared_error(y_test_one,y_pred_xg)))

0.3892339731809782
11.867983918755613


### PCA Reduction

In [85]:
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(X_one)

In [90]:
pca_scaled = min_max.fit_transform(principalComponents)

In [92]:
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(pca_scaled, y_one,test_size=0.3,random_state=123)

In [95]:
xg_reg.fit(X_train_pca,y_train_pca)

XGBRegressor(alpha=0.1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=100,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=50, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0.100000001, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [97]:
y_pca_xg = xg_reg.predict(X_test_pca)

In [98]:
print(r2_score(y_test_one,y_pca_xg))
print(np.sqrt(mean_squared_error(y_test_one,y_pca_xg)))

0.08728610761419242
14.507976508636855


### polynomial regression

In [26]:
crossvalidation = KFold(n_splits=5, random_state=1, shuffle=False)

for i in range(1,7):
    poly = PolynomialFeatures(degree=i)
    X_current = poly.fit_transform(X_train)
    model = rr.fit(X_current, y_train)
    scores = cross_val_score(model, X_current, y_train, scoring="neg_mean_squared_error", cv=crossvalidation,
 n_jobs=1)
    
    print("Degree-"+str(i)+" polynomial MSE: " + str(np.mean(np.abs(scores))) + ", STD: " + str(np.std(scores)))



Degree-1 polynomial MSE: 142.71448558361828, STD: 0.7494073857743158
Degree-2 polynomial MSE: 131.37490744133453, STD: 0.4801941327335821
Degree-3 polynomial MSE: 126.64085418672785, STD: 0.32378787268633674
Degree-4 polynomial MSE: 123.88599376202879, STD: 0.3148647020917667


Traceback (most recent call last):
  File "C:\Users\ahmad\anaconda3\envs\bootcamp\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ahmad\anaconda3\envs\bootcamp\lib\site-packages\sklearn\linear_model\_ridge.py", line 762, in fit
    return super().fit(X, y, sample_weight=sample_weight)
  File "C:\Users\ahmad\anaconda3\envs\bootcamp\lib\site-packages\sklearn\linear_model\_ridge.py", line 572, in fit
    X, y, X_offset, y_offset, X_scale = self._preprocess_data(
  File "C:\Users\ahmad\anaconda3\envs\bootcamp\lib\site-packages\sklearn\linear_model\_base.py", line 128, in _preprocess_data
    X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
  File "C:\Users\ahmad\anaconda3\envs\bootcamp\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\ahmad\anaconda3\envs\bootcamp\lib\site-packages\sklearn\utils\validation.py", line 6

KeyboardInterrupt: 