In [None]:
from google.colab import drive
# Mount Google Drive so the dataset under /content/drive/MyDrive is readable.
# force_remount=True re-mounts even when a mount from an earlier run still exists.
drive.mount("/content/drive", force_remount=True)
# Pre-requisites
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Regressors from scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Performance metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

Mounted at /content/drive


In [None]:
# Read the combined flight/weather CSV from the mounted Drive folder;
# index_col=0 uses the first CSV column as the DataFrame index.
df = pd.read_csv(
    "/content/drive/MyDrive/Data/flight_and_weather.csv",
    index_col=0,
)
print(df.shape)
df.info()

(1851436, 33)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1851436 entries, 0 to 1851435
Data columns (total 33 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Year             int64  
 1   Quarter          int64  
 2   Month            int64  
 3   DayofMonth       int64  
 4   FlightDate       object 
 5   OriginAirportID  int64  
 6   Origin           object 
 7   DestAirportID    int64  
 8   Dest             object 
 9   CRSDepTime       int64  
 10  DepTime          float64
 11  DepDelayMinutes  float64
 12  DepDel15         float64
 13  CRSArrTime       int64  
 14  ArrTime          float64
 15  ArrDelayMinutes  float64
 16  ArrDel15         float64
 17  Time_new         int64  
 18  windspeedKmph    int64  
 19  winddirDegree    int64  
 20  weatherCode      int64  
 21  precipMM         float64
 22  visibility       int64  
 23  pressure         int64  
 24  cloudcover       int64  
 25  DewPointF        int64  
 26  WindGustKmph     int64  
 27

In [None]:
# Dropping columns with redundant or duplicate data.
# `df` is rebound to the result instead of using inplace=True, so the drop
# reads as an explicit "new frame from old frame" step.
df = df.drop(columns=["FlightDate",
                      "OriginAirportID",
                      "DestAirportID",
                      "CRSArrTime",
                      "ArrTime",
                      "ArrDel15",
                      "Time_new",
                      "date",
                      "airport"])
print(df.shape)
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
df.info()

(1851436, 24)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1851436 entries, 0 to 1851435
Data columns (total 24 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Year             int64  
 1   Quarter          int64  
 2   Month            int64  
 3   DayofMonth       int64  
 4   Origin           object 
 5   Dest             object 
 6   CRSDepTime       int64  
 7   DepTime          float64
 8   DepDelayMinutes  float64
 9   DepDel15         float64
 10  ArrDelayMinutes  float64
 11  windspeedKmph    int64  
 12  winddirDegree    int64  
 13  weatherCode      int64  
 14  precipMM         float64
 15  visibility       int64  
 16  pressure         int64  
 17  cloudcover       int64  
 18  DewPointF        int64  
 19  WindGustKmph     int64  
 20  tempF            int64  
 21  WindChillF       int64  
 22  humidity         int64  
 23  time             int64  
dtypes: float64(5), int64(17), object(2)
memory usage: 353.1+ MB
None


In [None]:
# Integer-encode the airport code columns. Each column gets its own encoder:
# fitting a single shared LabelEncoder twice replaces the Origin classes with
# the Dest classes, after which the Origin codes can no longer be mapped back
# to airport codes with inverse_transform(). The encoded values themselves are
# the same as before.
origin_encoder = LabelEncoder()
dest_encoder = LabelEncoder()
df["Origin"] = origin_encoder.fit_transform(df["Origin"])
df["Dest"] = dest_encoder.fit_transform(df["Dest"])
# Backward compatibility: `labelEncoder` previously ended up fitted on Dest,
# so keep that name pointing at the Dest encoder.
labelEncoder = dest_encoder

# NOTE: the "delayed flights only" filter is currently disabled, so rows with
# ArrDelayMinutes <= 0 stay in the data. To re-enable it, filter with
# df[df["ArrDelayMinutes"] > 0] and reset the index before building
# features/labels below.

# Every column except the target is a feature; the target is ArrDelayMinutes.
features = df.drop(columns="ArrDelayMinutes")
labels = np.asarray(df["ArrDelayMinutes"])

In [None]:
# Number of samples/observations/rows is greater than 100,000
print(f"\nDataset shape: {df.shape}")

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
)

print(f"features_train shape: {features_train.shape} | features_test shape: {features_test.shape}")
print(f"labels_train shape: {labels_train.shape} | labels_test shape: {labels_test.shape}")

# The full frame is not referenced again in the cells below; drop the name so
# its memory can be reclaimed.
del df


Dataset shape: (1851436, 24)
features_train shape: (1481148, 23) | features_test shape: (370288, 23)
labels_train shape: (1481148,) | labels_test shape: (370288,)


In [None]:
# Results table: one row per evaluated regressor, filled in by print_metrics().
perf_df = pd.DataFrame(columns=["Regressors", "MSE", "RMSE", "MAE", "R2"])


def print_metrics(labels_test, model_pred, regressor_name, perf_df):
    """Print regression metrics for one model and add them to a results table.

    Parameters
    ----------
    labels_test : array-like
        Ground-truth target values.
    model_pred : array-like
        Model predictions, aligned element-wise with ``labels_test``.
    regressor_name : str
        Label stored in the "Regressors" column of the new row.
    perf_df : pandas.DataFrame
        Existing results table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``perf_df`` followed by one row with
        the MSE, RMSE, MAE and R2 of this model, with the index renumbered
        from 0.
    """
    mse = mean_squared_error(labels_test, model_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(labels_test, model_pred)
    r2 = r2_score(labels_test, model_pred)

    print(f"MSE      : {mse}", end="\n\n")
    print(f"RMSE     : {rmse}", end="\n\n")
    print(f"MAE      : {mae}", end="\n\n")
    print(f"R2 Score : {r2}", end="\n\n")

    # DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
    # build a one-row frame and concatenate instead.
    new_row = pd.DataFrame([{"Regressors": regressor_name,
                             "MSE": mse,
                             "RMSE": rmse,
                             "MAE": mae,
                             "R2": r2}])

    if len(perf_df) == 0:
        # Concatenating with an empty frame raises a FutureWarning on recent
        # pandas, so for the first row return the new row directly, keeping
        # the table's column order (plus any column only the new row has).
        all_columns = perf_df.columns.union(new_row.columns, sort=False)
        return new_row.reindex(columns=all_columns)

    return pd.concat([perf_df, new_row], ignore_index=True)

Linear Regression

In [None]:
# Baseline model: ordinary least squares. fit() returns the estimator itself,
# so construction and training are chained into one statement.
model = LinearRegression(n_jobs=-1).fit(features_train, labels_train)
model_pred = model.predict(features_test)
perf_df = print_metrics(labels_test, model_pred, "LinearRegression", perf_df)
# Release the fitted model and its predictions before the next regressor runs.
del model, model_pred

MSE      : 114.63278034514948

RMSE     : 10.70666989988715

MAE      : 5.681653701909743

R2 Score : 0.932584164103862



  perf_df = perf_df.append({"Regressors": regressor_name,


Decision Tree Regressor

In [None]:
# random_state is pinned (same seed as the train/test split) because the tree
# builder randomly permutes the candidate features at every split, so an
# unseeded fit can differ from run to run.
model = DecisionTreeRegressor(random_state=42)
model.fit(features_train, labels_train)
model_pred = model.predict(features_test)
perf_df = print_metrics(labels_test, model_pred, "DecisionTreeRegressor", perf_df)
# Release the fitted model and its predictions before the next regressor runs.
del model
del model_pred

MSE      : 227.13479777902606

RMSE     : 15.070991930826121

MAE      : 7.296010132653502

R2 Score : 0.8664214354108071



  perf_df = perf_df.append({"Regressors": regressor_name,


Random Forest

In [None]:
# random_state is pinned (same seed as the train/test split) so the bootstrap
# samples and per-split feature draws, and therefore the reported metrics,
# are repeatable. n_jobs=-1 asks scikit-learn to use all available processors.
model = RandomForestRegressor(n_jobs=-1, random_state=42)
model.fit(features_train, labels_train)
model_pred = model.predict(features_test)
perf_df = print_metrics(labels_test, model_pred, "RandomForestRegressor", perf_df)
# Release the fitted forest and its predictions before the next regressor runs.
del model
del model_pred

MSE      : 227.49311145277014

RMSE     : 15.082874774152643

MAE      : 10.405916738390292

R2 Score : 0.93681610627977



  perf_df = perf_df.append({"Regressors": regressor_name,


Gradient Boosting Regressor

In [None]:
# scikit-learn's gradient boosting (not the separate XGBoost library).
# random_state is pinned (same seed as the train/test split) because it seeds
# every tree built during boosting, so the metrics are repeatable.
model = GradientBoostingRegressor(random_state=42)
model.fit(features_train, labels_train)
model_pred = model.predict(features_test)
perf_df = print_metrics(labels_test, model_pred, "GradientBoostingRegressor", perf_df)
# Release the fitted model and its predictions before the next regressor runs.
del model
del model_pred

MSE      : 230.8549422166322

RMSE     : 15.193911353454455

MAE      : 10.331746623039692

R2 Score : 0.9358823920396648



  perf_df = perf_df.append({"Regressors": regressor_name,


Extra Trees Regressor

In [None]:
# random_state is pinned (same seed as the train/test split) because extra
# trees draw their split candidates at random, so an unseeded fit gives
# different metrics on every run. n_jobs=-1 asks scikit-learn to use all
# available processors.
model = ExtraTreesRegressor(n_jobs=-1, random_state=42)
model.fit(features_train, labels_train)
model_pred = model.predict(features_test)
# Unlike the earlier cells, `model` and `model_pred` are left bound after this cell.
perf_df = print_metrics(labels_test, model_pred, "ExtraTreesRegressor", perf_df)

MSE      : 225.07760312922798

RMSE     : 15.002586547966587

MAE      : 10.328728441830325

R2 Score : 0.9357704320642302



  perf_df = perf_df.append({"Regressors": regressor_name,
