In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
#from sklearn.model_selection import StratifiedKFold
#from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import preprocessing

In [2]:
# function to categorize the time
def time_categorize(row):
    """Bucket a time-of-day value by its hour.

    Parameters
    ----------
    row : object exposing an ``hour`` attribute (e.g. ``datetime.time``).

    Returns
    -------
    str
        'low' when 0 <= hour < 7, 'high' when 7 <= hour < 14,
        and 'medium' in every other case.
    """
    hour = row.hour
    if 0 <= hour < 7:
        return 'low'
    if 7 <= hour < 14:
        return 'high'
    return 'medium'

def label_encoder_time(row):
    """Translate a time-bucket label into its numeric code.

    'high' -> 10, 'medium' -> 5, 'low' -> 1.
    Any value that equals none of these labels is returned unchanged.
    """
    for label, code in (('high', 10), ('medium', 5), ('low', 1)):
        if row == label:
            return code
    return row

In [3]:
# Load the flights extract and drop the 'Unnamed: 0' column that comes with the CSV
# (presumably an index saved by an earlier export -- confirm).
flights_reg = pd.read_csv('data/flights_time_fixed.csv').drop(columns=['Unnamed: 0'])
flights_reg['fl_date'] = pd.to_datetime(flights_reg['fl_date'], format='%Y/%m/%d')

# Parse the HH:MM:SS strings into datetime.time objects; time_categorize reads their .hour.
flights_reg['dep_time_format'] = pd.to_datetime(flights_reg['dep_time_format'], format='%H:%M:%S').dt.time
flights_reg['arrival_time_format'] = pd.to_datetime(flights_reg['arrival_time_format'], format='%H:%M:%S').dt.time

# Split by flight date: 2019-12-25 through 2019-12-31 are the rows to be predicted,
# everything before 2019-12-25 is the data the model is built on.
# .copy() turns each filtered selection into an independent frame, so the column
# assignments made on flights_reg in later cells neither raise SettingWithCopyWarning
# nor risk writing into a temporary slice.
flights_to_predict = flights_reg.loc[
    (flights_reg['fl_date'] <= '2019-12-31') & (flights_reg['fl_date'] >= '2019-12-25')
].copy()  # data to be predicted
flights_reg = flights_reg[flights_reg['fl_date'] < '2019-12-25'].copy()  # data the model is built upon

In [4]:
# Replace each parsed clock time with its bucket label ('low' / 'high' / 'medium').
flights_reg = flights_reg.assign(
    dep_time_format=flights_reg['dep_time_format'].apply(time_categorize),
    arrival_time_format=flights_reg['arrival_time_format'].apply(time_categorize),
)

In [5]:
# Turn the bucket labels into their numeric codes (high=10, medium=5, low=1).
flights_reg = flights_reg.assign(
    dep_time_format=flights_reg['dep_time_format'].apply(label_encoder_time),
    arrival_time_format=flights_reg['arrival_time_format'].apply(label_encoder_time),
)

In [6]:
# Show the frame after the two time columns have been bucketed and numerically encoded.
flights_reg

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,...,cancelled,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,dep_time_format,arrival_time_format
0,2019-05-22,DL,DL,DL,544,DL,544,11433,DTW,"Detroit, MI",...,0,0,N,115,101.0,82.0,1,632,10,10
1,2019-06-27,AA,AA,AA,2036,AA,2036,12953,LGA,"New York, NY",...,0,0,N,133,119.0,81.0,1,544,10,10
2,2018-04-05,UA,UA_CODESHARE,UA,5306,OO,5306,11292,DEN,"Denver, CO",...,0,0,N,135,120.0,105.0,1,649,10,10
3,2019-02-08,WN,WN,WN,548,WN,548,10821,BWI,"Baltimore, MD",...,0,0,N,80,74.0,58.0,1,314,5,5
4,2019-02-23,AA,AA_CODESHARE,AA,5365,OH,5365,11146,CRW,"Charleston/Dunbar, WV",...,0,0,N,87,90.0,56.0,1,221,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255152,2019-08-13,AA,AA,AA,1356,AA,1356,13930,ORD,"Chicago, IL",...,0,0,N,183,182.0,151.0,1,1118,10,10
255153,2018-03-21,B6,B6,B6,1125,B6,1125,14492,RDU,"Raleigh/Durham, NC",...,0,0,N,127,123.0,104.0,1,680,1,10
255154,2019-04-29,UA,UA,UA,493,UA,493,11278,DCA,"Washington, DC",...,0,0,N,138,130.0,99.0,1,612,5,5
255155,2019-04-26,DL,DL,DL,1779,DL,1779,11433,DTW,"Detroit, MI",...,0,0,N,163,143.0,121.0,1,926,5,5


In [7]:
# Column groups used to assemble the modelling frame.
measure_cols = ['distance', 'air_time', 'actual_elapsed_time', 'taxi_in', 'taxi_out', 'arr_delay']
categorical_cols = ['mkt_unique_carrier', 'origin', 'dest']
time_code_cols = ['dep_time_format', 'arrival_time_format']

# Narrow the frame to the flight date plus the columns listed above.
flights_reg = flights_reg[['fl_date'] + measure_cols + categorical_cols + time_code_cols]

# The date plays no part in the one-hot encoding, so it is dropped first.
flights_date_dropped = flights_reg.drop('fl_date', axis=1)

# One-hot encode carrier, origin and destination; the first level of each is dropped.
dummies = pd.get_dummies(flights_date_dropped[categorical_cols], drop_first=True)

# Numeric part of the frame. It still contains arr_delay, the column that is
# separated out as the regression target in a later cell.
flights_numeric = flights_reg[measure_cols + time_code_cols]

# Numeric columns followed by the dummy columns, side by side.
flights_indep_dep_var = pd.concat([flights_numeric, dummies], axis=1)

In [8]:
# Min-max scale every column of the combined frame (features and arr_delay alike).
# NOTE(review): the scaler is fit here, before the train/test split made in a later cell,
# so the fitted min/max also reflect rows that end up in the test set, and the target
# is rescaled as well -- consider fitting on the training features only.
min_max_scaler = preprocessing.MinMaxScaler()
columns = flights_indep_dep_var.columns
x = flights_indep_dep_var.to_numpy()
x_scaled = min_max_scaler.fit_transform(x)

# Rebuild a DataFrame around the scaled array, reusing the original column labels.
flights_indep_dep_norm = pd.DataFrame(data=x_scaled, columns=columns)

In [9]:
# Separate the target (scaled arr_delay) from the features (every other column).
y_reg = flights_indep_dep_norm['arr_delay']
X_reg = flights_indep_dep_norm.drop(columns='arr_delay')

In [10]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.30,
    random_state=123,
)

# Candidate values for the only hyper-parameter being tuned: the ridge penalty alpha.
parameters = dict(alpha=[0.01, 0.1, 0.5, 1, 5])

# Base estimator: ridge regression with default settings.
rr = Ridge()

# Grid search over alpha, scored by R^2 with 10-fold cross-validation.
Ridge_tuned = GridSearchCV(estimator=rr, param_grid=parameters, scoring='r2', cv=10)

In [11]:
# Run the grid search on the training split, then report the winning estimator.
Ridge_tuned.fit(X=X_reg_train, y=y_reg_train)
print(Ridge_tuned.best_estimator_)

Ridge(alpha=0.01)


In [12]:
# Grab the best model from the grid search.
# GridSearchCV was created without overriding refit, and its default (refit=True)
# already retrains the best parameter setting on the full training data passed to
# .fit(). best_estimator_ is therefore ready to predict; fitting it again would only
# repeat the same work and yield the same model.
best_model_ridge = Ridge_tuned.best_estimator_
best_model_ridge

Ridge(alpha=0.01)

In [13]:
# Score the tuned ridge model on the held-out rows.
y_predicted_ridge = best_model_ridge.predict(X_reg_test)
test_mse = mean_squared_error(y_reg_test, y_predicted_ridge)
test_r2 = r2_score(y_reg_test, y_predicted_ridge)

# RMSE first, then R^2. The target was min-max scaled together with the features,
# so the RMSE is expressed in scaled units rather than in the original arr_delay units.
print(np.sqrt(test_mse))
print(test_r2)

0.024687391712751076
0.5653316720178718
