In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
#from sklearn.model_selection import StratifiedKFold
#from sklearn.model_selection import cross_validate

In [2]:
#Load the flights table and discard the 'Unnamed: 0' column
#(presumably an index column written out when the CSV was saved - confirm)
flights_reg = pd.read_csv('data/flights_no_outlier_iqr_time.csv')
flights_reg = flights_reg.drop(columns=['Unnamed: 0'])

#Parse fl_date (year/month/day separated by '/') so it can be compared as a date
flights_reg['fl_date'] = pd.to_datetime(flights_reg['fl_date'], format='%Y/%m/%d')

#Rows dated 2019-12-25 through 2019-12-31 (both ends included) are set aside as the data to be predicted
in_prediction_window = flights_reg['fl_date'].between('2019-12-25', '2019-12-31')
flights_to_predict = flights_reg.loc[in_prediction_window]

#Everything dated before 2019-12-25 is the data the model is built upon
flights_reg = flights_reg[flights_reg['fl_date'] < '2019-12-25']

In [3]:
#Columns used for modelling, grouped by how they are treated below
numeric_cols = ['distance', 'air_time', 'actual_elapsed_time', 'taxi_in', 'taxi_out', 'arr_delay']
categorical_cols = ['mkt_unique_carrier', 'origin', 'dest']

#Keep only the date plus the modelling columns
flights_reg = flights_reg[['fl_date'] + numeric_cols + categorical_cols]

#The date is not needed for one hot encoding
flights_date_dropped = flights_reg.drop(columns=['fl_date'])

#One hot encode carrier/origin/dest; drop_first=True omits the first level of each variable
dummies = pd.get_dummies(flights_date_dropped[categorical_cols], drop_first=True)

#Numeric columns only. Note that arr_delay (the target used later) is still part of this frame.
flights_numeric = flights_reg[numeric_cols]

#Numeric columns and dummy columns side by side: independent variables plus the dependent one
flights_indep_dep_var = pd.concat([flights_numeric, dummies], axis=1)

In [6]:
#Normalizing Independt variables
columns = flights_indep_dep_var.columns
x = flights_indep_dep_var.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
flights_indep_dep_norm = pd.DataFrame(x_scaled, columns=columns)

In [7]:
X_reg = flights_indep_dep_norm.loc[:, flights_indep_dep_norm.columns != 'arr_delay']
y_reg = flights_indep_dep_norm.arr_delay

In [8]:
#splitting into train and test
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg,y_reg, test_size=0.30, random_state=123)

#defining the model: Ridge Regression
rr = Ridge()

#parameters to be tuned for ridge regression: alpha
parameters = {'alpha':[0.01,0.1,0.5,1,5]}

#Definning the GridSearch
Ridge_tuned = GridSearchCV(rr,param_grid=parameters, scoring='r2', cv=10)

In [9]:
#Fitting the grid search on the training data, then showing the estimator it selected
Ridge_tuned.fit(X=X_reg_train, y=y_reg_train)
print(Ridge_tuned.best_estimator_)

Ridge(alpha=0.01)


In [10]:
#grabbing the best model from GridSearch
#GridSearchCV refits the winning configuration on the full data passed to fit()
#(refit=True is its default), so best_estimator_ is already trained on
#X_reg_train/y_reg_train; fitting it a second time would only repeat that work.
best_model_ridge = Ridge_tuned.best_estimator_
#bare last expression so the cell still displays the selected estimator
best_model_ridge

Ridge(alpha=0.01)

In [11]:
#Score the selected model on the held-out test split
y_predicted_ridge = best_model_ridge.predict(X_reg_test)

#First printed value: RMSE (square root of the mean squared error), in the units of y_reg_test
test_mse = mean_squared_error(y_reg_test, y_predicted_ridge)
print(np.sqrt(test_mse))

#Second printed value: R^2 on the test split
test_r2 = r2_score(y_reg_test, y_predicted_ridge)
print(test_r2)

0.024852082464787578
0.5595129422826639
