In [1]:
# plt.figure(figsize=(20,14))
# ax = sns.heatmap(dfm.corr(), cmap='viridis', center=0, annot=True)
# bottom, top = ax.get_ylim()
# plt.text(0,-0.6, "df2 - Heat Map", fontsize = 30, color='Black', fontstyle='normal')
# ax.set_ylim(bottom + 0.5, top - 0.5)
# plt.yticks(rotation=0, fontsize=14)
# plt.xticks(rotation=90, fontsize=14)
# plt.show()

In [4]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
import csv
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
prediction_df = pd.read_csv("Resources/analysis_df.csv")

# Reshuffling the data for good measure
prediction_df = prediction_df.sample(frac = 1)
 
# Print the shuffled DataFrame
print("\nShuffled DataFrame:")
print(prediction_df)


Shuffled DataFrame:
      YEAR MONTH  DAY_OF_MONTH DAY_OF_WEEK          OP_CARRIER ORIGIN DEST  \
368   2023   AUG             6      Sunday    United Air Lines    DEN  IAH   
1729  2022   AUG            19      Friday   American Airlines    DFW  MCI   
4268  2022   OCT            29    Saturday  Southwest Airlines    BWI  ISP   
231   2023   MAY            18    Thursday   Frontier Airlines    RDU  ATL   
1919  2022   NOV            28      Monday        PSA Airlines    PHL  PVD   
...    ...   ...           ...         ...                 ...    ...  ...   
4437  2022   DEC            15    Thursday  Southwest Airlines    ATL  MCO   
255   2023   JAN            15      Sunday    SkyWest Airlines    STS  PDX   
3516  2023   FEB            17      Friday    SkyWest Airlines    BOI  SEA   
3260  2022   AUG            24   Wednesday        Endeavor Air    ACK  JFK   
2316  2023   MAR             1   Wednesday   American Airlines    MEM  DFW   

      DEP_DEL15  
368           0  
1729  

In [7]:
# Checkpoint
prediction_df.shape

(5002, 8)

In [8]:
# Review the DataFrame
display(prediction_df.head(2))
display(prediction_df.tail())

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN,DEST,DEP_DEL15
368,2023,AUG,6,Sunday,United Air Lines,DEN,IAH,0
1729,2022,AUG,19,Friday,American Airlines,DFW,MCI,0


Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN,DEST,DEP_DEL15
4437,2022,DEC,15,Thursday,Southwest Airlines,ATL,MCO,0
255,2023,JAN,15,Sunday,SkyWest Airlines,STS,PDX,1
3516,2023,FEB,17,Friday,SkyWest Airlines,BOI,SEA,0
3260,2022,AUG,24,Wednesday,Endeavor Air,ACK,JFK,0
2316,2023,MAR,1,Wednesday,American Airlines,MEM,DFW,1


### Column Definitions

* YEAR = Year of Flight
* MONTH = Month of Flight
* DAY_OF_MONTH = Day of Month
* DAY_OF_WEEK = Day of Week
* OP_CARRIER = Carrier Name
* ORIGIN = Origin Airport Code
* DEST = Destination Airport Code
* DEP_DEL15 = Departure Delay (0 = No, 1 = Yes)

In [9]:
# Transforming the data using get_dummies
month_dummies = pd.get_dummies(prediction_df["MONTH"])
day_dummies = pd.get_dummies(prediction_df["DAY_OF_WEEK"])
date_dummies = pd.get_dummies(prediction_df["DAY_OF_MONTH"])
carrier_dummies = pd.get_dummies(prediction_df["OP_CARRIER"])
origin_dummies = pd.get_dummies(prediction_df["ORIGIN"])
dest_dummies = pd.get_dummies(prediction_df["DEST"])

In [10]:
# Display the transformed data
month_dummies.tail()
#day_dummies.tail()
#date_dummies.tail()
#carrier_dummies.tail()
#origin_dummies.tail()
#dest_dummies.tail()

Unnamed: 0,APR,AUG,DEC,FEB,JAN,JUL,JUN,MAR,MAY,NOV,OCT,SEP
4437,0,0,1,0,0,0,0,0,0,0,0,0
255,0,0,0,0,1,0,0,0,0,0,0,0
3516,0,0,0,1,0,0,0,0,0,0,0,0
3260,0,1,0,0,0,0,0,0,0,0,0,0
2316,0,0,0,0,0,0,0,1,0,0,0,0


In [11]:
# Concatenate prediction_df and the created dummies DataFrames
tranformed_pred_df = pd.concat([prediction_df, 
                       month_dummies, 
                       day_dummies,
                       date_dummies, 
                       carrier_dummies,
                       origin_dummies,
                       dest_dummies                      
                      ], 
                      axis=1)

# Drop the originating dummies columns
tranformed_pred_df = tranformed_pred_df.drop(columns=["MONTH",
                                            "DAY_OF_MONTH", 
                                            "DAY_OF_WEEK", 
                                            "OP_CARRIER",
                                            "ORIGIN", 
                                            "DEST"
                                           ])

# Display the DataFrame
tranformed_pred_df.head()

Unnamed: 0,YEAR,DEP_DEL15,APR,AUG,DEC,FEB,JAN,JUL,JUN,MAR,...,TVC,TYR,TYS,USA,VEL,VPS,WRG,XNA,XWA,YUM
368,2023,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1729,2022,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4268,2022,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
231,2023,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1919,2022,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# # Encoding the YEAR column using a custom function
# def encode_year(YEAR):
#     """
#     This function encodes data set years by setting 2022 as 0 and 2023 as 1.
#     """
#     if YEAR == 2022:
#         return 0
#     else:
#         return 1

# # Call the encode_marriage function on the marriage column
# tranformed_pred_df["YEAR"] = tranformed_pred_df["YEAR"].apply(encode_year)

# # Review the DataFrame 
# tranformed_pred_df.tail()

In [13]:
tranformed_pred_df.value_counts('YEAR')

YEAR
2023    3075
2022    1927
dtype: int64

In [14]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = tranformed_pred_df['DEP_DEL15']

# Separate the X variable, the features
X = tranformed_pred_df.drop(columns=['DEP_DEL15'])

In [15]:
tranformed_pred_df.dtypes

YEAR         int64
DEP_DEL15    int64
APR          uint8
AUG          uint8
DEC          uint8
             ...  
VPS          uint8
WRG          uint8
XNA          uint8
XWA          uint8
YUM          uint8
Length: 577, dtype: object

In [26]:
# X =  X.astype(str)
# X.columns = X.columns.astype(str)
tranformed_pred_df = tranformed_pred_df.astype(str)

In [27]:
# Review the y variable Series
display(y.head())
display(y.tail())

368     0
1729    0
4268    1
231     0
1919    0
Name: DEP_DEL15, dtype: int64

4437    0
255     1
3516    0
3260    0
2316    1
Name: DEP_DEL15, dtype: int64

In [28]:
# Check the balance of our target values
y.value_counts()

0    3916
1    1086
Name: DEP_DEL15, dtype: int64

In [29]:
# Review the X variable DataFrame
display(X.head(2))
display(X.tail(2))

Unnamed: 0,YEAR,APR,AUG,DEC,FEB,JAN,JUL,JUN,MAR,MAY,...,TVC,TYR,TYS,USA,VEL,VPS,WRG,XNA,XWA,YUM
368,2023,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1729,2022,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,YEAR,APR,AUG,DEC,FEB,JAN,JUL,JUN,MAR,MAY,...,TVC,TYR,TYS,USA,VEL,VPS,WRG,XNA,XWA,YUM
3260,2022,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2316,2023,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [31]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state=1)

# Fit the model using training data
lr_model = logistic_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
# Make a prediction using the testing data
testing_predictions = logistic_regression_model.predict(X_test)

In [33]:
# Print the balanced_accuracy score of the model
acc_score = balanced_accuracy_score(y_test, testing_predictions)
acc_score

0.5164507869568491

In [34]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, testing_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual On Time", "Actual Delay"], columns=["Predicted On Time", "Predicted Delay"]
)


# plot_confusion_matrix(tree_clf, X, y, values_format='.3g')
# plt.show()

In [35]:
# Print the classification report for the model
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, testing_predictions))

Confusion Matrix


Unnamed: 0,Predicted On Time,Predicted Delay
Actual On Time,958,22
Actual Delay,256,15


Accuracy Score : 0.5164507869568491
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.98      0.87       980
           1       0.41      0.06      0.10       271

    accuracy                           0.78      1251
   macro avg       0.60      0.52      0.49      1251
weighted avg       0.71      0.78      0.71      1251



In [36]:

# Step 1: Use the RandomOverSampler module from the imbalanced-learn library to resample the data. 
# Be sure to confirm that the labels have an equal number of data points.


# Import the RandomOverSampler module form imbalanced-learn
# !pip install -U imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ros = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_res, y_res = ros.fit_resample(X_train, y_train)

In [37]:
# Count the distinct values of the resampled labels data
y_res.value_counts()

1    2936
0    2936
Name: DEP_DEL15, dtype: int64

In [38]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model_resampled = LogisticRegression(random_state=1)

# Fit the model using the resampled training data
resampled_lr_model = logistic_regression_model_resampled.fit(X_res, y_res)

# Make a prediction using the testing data
retesting_predictions = logistic_regression_model_resampled.predict(X_res)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
# Print the balanced_accuracy score of the model 
acc_rescore = balanced_accuracy_score(y_res, retesting_predictions)
acc_rescore

0.6600817438692098

In [40]:
# Generate a confusion matrix for the model
cm2 = confusion_matrix(y_res, retesting_predictions)
cm2_df = pd.DataFrame(
    cm2, index=["Actual On Time (OS)", "Actual Delay (OS)"], columns=["Predicted On Time (OS)", "Predicted Delay (OS)"]
)

# plot_confusion_matrix(tree_clf, X, y, values_format='.3g')
# plt.show()

In [41]:
# Print the classification report for the model
print("Resampled Confusion Matrix")
display(cm2_df)
print(f"Accuracy Score : {acc_rescore}")
print("Classification Report")
print(classification_report(y_res, retesting_predictions))

Resampled Confusion Matrix


Unnamed: 0,Predicted On Time (OS),Predicted Delay (OS)
Actual On Time (OS),1917,1019
Actual Delay (OS),977,1959


Accuracy Score : 0.6600817438692098
Classification Report
              precision    recall  f1-score   support

           0       0.66      0.65      0.66      2936
           1       0.66      0.67      0.66      2936

    accuracy                           0.66      5872
   macro avg       0.66      0.66      0.66      5872
weighted avg       0.66      0.66      0.66      5872

