In [25]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
#from sklearn.model_selection import StratifiedKFold
#from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
import calendar


In [26]:
df = pd.read_csv('data/flights_no_outlier_iqr_time.csv').drop(columns=['Unnamed: 0'])
df['fl_date'] = pd.to_datetime(df['fl_date'],format='%Y-%m-%d')
df['dep_time_format'] = pd.to_datetime(df['dep_time_format'],format='%H:%M:%S').dt.time
df['arrival_time_format'] = pd.to_datetime(df['arrival_time_format'],format='%H:%M:%S').dt.time
df = df.sort_values(by='fl_date')
df['month'] = pd.DatetimeIndex(df['fl_date']).month

In [27]:
#dropping the useless columns
df_filtered = df[['mkt_unique_carrier','distance','air_time','actual_elapsed_time',
                'taxi_in','taxi_out','arr_delay','origin','dest','dep_time_format','arrival_time_format','month']]

### Trying label encoding

In [28]:
#categorizing the time
def time_categorize(row):
    if (row.hour >= 00) and (row.hour<7):
        row = 1
    elif (row.hour >=7) and (row.hour <14):
        row = 10
    else:
        row=2   
    return row

In [29]:
df_filtered['dep_time_format'] = df_filtered['dep_time_format'].apply(time_categorize)
df_filtered['arrival_time_format']= df_filtered['arrival_time_format'].apply(time_categorize)

In [30]:
#labeling the categorical data
label_encoder = preprocessing.LabelEncoder()
enc_cols = ['mkt_unique_carrier','origin','dest','dep_time_format','arrival_time_format']
for col in enc_cols:
    df_filtered[col] = label_encoder.fit_transform(df_filtered[col].astype(str))


In [31]:
X = df_filtered.loc[:, df_filtered.columns != 'arr_delay']
y = df_filtered[['arr_delay']]

### Grabbing X and y

In [32]:
#splitting to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=123)

In [33]:
indep_dep = pd.concat([X_train,y_train],axis=1)
indep_dep_test = pd.concat([X_test,y_test],axis=1)

In [34]:
indep_dep

Unnamed: 0,mkt_unique_carrier,distance,air_time,actual_elapsed_time,taxi_in,taxi_out,origin,dest,dep_time_format,arrival_time_format,month,arr_delay
42279,7,787,114.0,133.0,8.0,11.0,218,58,2,2,3,-12.0
153017,10,405,65.0,79.0,4.0,10.0,217,219,1,1,10,-5.0
103628,0,526,79.0,104.0,6.0,19.0,73,235,1,1,6,-18.0
14253,8,605,92.0,132.0,6.0,34.0,118,143,1,1,11,20.0
10476,5,1142,173.0,196.0,6.0,17.0,251,320,2,2,1,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
62057,4,444,68.0,101.0,5.0,28.0,24,242,2,2,7,7.0
21966,8,316,43.0,69.0,5.0,21.0,256,71,2,2,2,-15.0
184740,8,1754,263.0,291.0,10.0,18.0,46,93,2,2,3,-1.0
227103,3,226,45.0,81.0,12.0,24.0,73,21,2,2,2,-7.0


In [35]:
#normalizing the train data
cols_1 = indep_dep.columns
x = indep_dep.values
min_max = preprocessing.MinMaxScaler()
x_scaled = min_max.fit_transform(x)
indep_dep = pd.DataFrame(x_scaled,columns=cols_1)

cols_2 = indep_dep_test.columns
x_test = indep_dep_test.values
x_test_scaled = min_max.fit_transform(x_test)
indep_dep_test = pd.DataFrame(x_test_scaled,columns=cols_2)

In [195]:
#splitting the X and y
X_train = indep_dep.loc[:,indep_dep.columns!='arr_delay']
y_train = indep_dep.arr_delay

X_test = indep_dep_test.loc[:,indep_dep_test.columns!='arr_delay']
y_test = indep_dep_test.arr_delay

In [196]:
#defining the model: Ridge Regression
rr = Ridge()

#parameters to be tuned for ridge regression: alpha
parameters = {'alpha':[0.01,0.1,0.5,1,5]}

#Definning the GridSearch
Ridge_tuned = GridSearchCV(rr,param_grid=parameters, scoring='r2', cv=10)

In [197]:
#training the model with train data
Ridge_tuned.fit(X_train,y_train)
print(Ridge_tuned.best_estimator_)

Ridge(alpha=0.01)


In [198]:
y_pred = Ridge_tuned.best_estimator_.predict(X_test)

In [202]:
print(r2_score(y_test,y_pred))
print(np.sqrt(mean_squared_error(y_test,y_pred)))

-0.33585007659148336
0.06268452535108554


### trying one hot encoding

In [4]:
df_one = pd.read_csv('data/flights_no_outlier_iqr_time.csv').drop(columns=['Unnamed: 0'])
df_one['fl_date'] = pd.to_datetime(df_one['fl_date'],format='%Y-%m-%d')
df_one['dep_time_format'] = pd.to_datetime(df_one['dep_time_format'],format='%H:%M:%S').dt.time
df_one['arrival_time_format'] = pd.to_datetime(df_one['arrival_time_format'],format='%H:%M:%S').dt.time
df_one = df_one.sort_values(by='fl_date')
df_one['month'] = pd.DatetimeIndex(df_one['fl_date']).month
df_one['month'] = df_one['month'].apply(lambda x: calendar.month_abbr[x])

In [5]:
#dropping the useless columns
df_one_filt = df_one[['mkt_unique_carrier','distance','air_time','actual_elapsed_time',
                'taxi_in','taxi_out','arr_delay','origin','dest','dep_time_format','arrival_time_format','month']]


In [8]:
df_one_filt['dep_time_format'] = df_one_filt['dep_time_format'].apply(time_categorize)
df_one_filt['arrival_time_format']= df_one_filt['arrival_time_format'].apply(time_categorize)

In [9]:
dummies = pd.get_dummies(df_one_filt[['mkt_unique_carrier','origin','dest','month','dep_time_format','arrival_time_format']],drop_first=True)
df_one_num = df_one[['distance','air_time','actual_elapsed_time',
                'taxi_in','taxi_out','arr_delay']]
indep_dep_var = pd.concat([df_one_num,dummies],axis=1)

In [10]:
X_one = indep_dep_var.loc[:, indep_dep_var.columns != 'arr_delay']
y_one = indep_dep_var[['arr_delay']]

In [11]:
X_train_one, X_test_one, y_train_one, y_test_one = train_test_split(X_one, y_one,test_size=0.3,random_state=123)

In [12]:
indep_dep_one = pd.concat([X_train_one,y_train_one],axis=1)
indep_dep_test_one = pd.concat([X_test_one,y_test_one],axis=1)

In [13]:
#normalizing the train data
cols_1 = indep_dep_one.columns
x = indep_dep_one.values
min_max = preprocessing.MinMaxScaler()
x_scaled = min_max.fit_transform(x)
indep_dep_one = pd.DataFrame(x_scaled,columns=cols_1)

cols_2 = indep_dep_test_one.columns
x_test = indep_dep_test_one.values
x_test_scaled = min_max.fit_transform(x_test)
indep_dep_test_one = pd.DataFrame(x_test_scaled,columns=cols_2)

In [16]:
X_train_one = indep_dep_one.loc[:,indep_dep_one.columns!='arr_delay']
y_train_one = indep_dep_one.arr_delay

In [17]:
X_test_one = indep_dep_test_one.loc[:,indep_dep_test_one.columns!='arr_delay']
y_test_one = indep_dep_test_one.arr_delay

In [18]:
#defining the model: Ridge Regression
rr = Ridge()

#parameters to be tuned for ridge regression: alpha
parameters = {'alpha':[0.01,0.1,0.5,1,5]}

#Definning the GridSearch
Ridge_tuned = GridSearchCV(rr,param_grid=parameters, scoring='r2', cv=10)

In [19]:
Ridge_tuned.fit(X_train_one,y_train_one)
print(Ridge_tuned.best_estimator_)

Ridge(alpha=0.01)


In [20]:
y_pred = Ridge_tuned.best_estimator_.predict(X_test_one)

In [22]:
print(r2_score(y_test_one,y_pred))
print(np.sqrt(mean_squared_error(y_test_one,y_pred)))

0.014887134793492351
0.05383001208182276
