In [1]:
# plt.figure(figsize=(20,14))
# ax = sns.heatmap(dfm.corr(), cmap='viridis', center=0, annot=True)
# bottom, top = ax.get_ylim()
# plt.text(0,-0.6, "df2 - Heat Map", fontsize = 30, color='Black', fontstyle='normal')
# ax.set_ylim(bottom + 0.5, top - 0.5)
# plt.yticks(rotation=0, fontsize=14)
# plt.xticks(rotation=90, fontsize=14)
# plt.show()

In [2]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import csv
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

In [3]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
log_reg_df = pd.read_csv("Resources/analysis_df.csv")

# Reshuffling the data for good measure
log_reg_df = log_reg_df.sample(frac = 1)
 
# Print the shuffled DataFrame
print("\nShuffled DataFrame:")
print(log_reg_df)


Shuffled DataFrame:
       MONTH  DAY_OF_MONTH DAY_OF_WEEK          OP_CARRIER ORIGIN DEST  \
78886    AUG            10    Thursday   American Airlines    PHL  SEA   
47902    MAR            18    Saturday     Delta Air Lines    ATL  JAX   
41432    AUG            30   Wednesday    SkyWest Airlines    DFW  GCK   
123743   OCT            16      Sunday    Republic Airline    RIC  JFK   
80568    DEC            30      Friday    Republic Airline    IND  BOS   
...      ...           ...         ...                 ...    ...  ...   
84967    JAN            23      Monday  Southwest Airlines    ATL  LGA   
45718    MAR            28     Tuesday    SkyWest Airlines    DFW  MLU   
1575     SEP            11      Sunday   American Airlines    MIA  TPA   
131476   JUL            12   Wednesday        Endeavor Air    LGA  CVG   
106940   DEC            19      Monday    Republic Airline    JFK  BOS   

        DEP_DEL15  
78886           0  
47902           0  
41432           0  
123743    

In [4]:
# Checkpoint
log_reg_df.shape

(150001, 7)

In [5]:
# Review the DataFrame
display(log_reg_df.head(2))
display(log_reg_df.tail())

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN,DEST,DEP_DEL15
78886,AUG,10,Thursday,American Airlines,PHL,SEA,0
47902,MAR,18,Saturday,Delta Air Lines,ATL,JAX,0


Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN,DEST,DEP_DEL15
84967,JAN,23,Monday,Southwest Airlines,ATL,LGA,1
45718,MAR,28,Tuesday,SkyWest Airlines,DFW,MLU,0
1575,SEP,11,Sunday,American Airlines,MIA,TPA,1
131476,JUL,12,Wednesday,Endeavor Air,LGA,CVG,0
106940,DEC,19,Monday,Republic Airline,JFK,BOS,0


### Column Definitions

* YEAR = Year of Flight
* MONTH = Month of Flight
* DAY_OF_MONTH = Day of Month
* DAY_OF_WEEK = Day of Week
* OP_CARRIER = Carrier Name
* ORIGIN = Origin Airport Code
* DEST = Destination Airport Code
* DEP_DEL15 = Departure Delay (0 = No, 1 = Yes)

In [20]:
# Changing data type to string to address regression analysis errors
log_reg_df = log_reg_df.astype(str)

In [21]:
# Checkpoint
# log_reg_df.dtypes

In [22]:
# Transforming the data using get_dummies
month_dummies = pd.get_dummies(log_reg_df["MONTH"])
day_dummies = pd.get_dummies(log_reg_df["DAY_OF_WEEK"])
date_dummies = pd.get_dummies(log_reg_df["DAY_OF_MONTH"])
carrier_dummies = pd.get_dummies(log_reg_df["OP_CARRIER"])
origin_dummies = pd.get_dummies(log_reg_df["ORIGIN"])
dest_dummies = pd.get_dummies(log_reg_df["DEST"])

In [23]:
# Display the transformed data
month_dummies.tail()
#day_dummies.tail()
#date_dummies.tail()
#carrier_dummies.tail()
#origin_dummies.tail()
#dest_dummies.tail()

Unnamed: 0,APR,AUG,DEC,FEB,JAN,JUL,JUN,MAR,MAY,NOV,OCT,SEP
84967,0,0,0,0,1,0,0,0,0,0,0,0
45718,0,0,0,0,0,0,0,1,0,0,0,0
1575,0,0,0,0,0,0,0,0,0,0,0,1
131476,0,0,0,0,0,1,0,0,0,0,0,0
106940,0,0,1,0,0,0,0,0,0,0,0,0


In [24]:
# Concatenate log_reg_df and the created dummies DataFrames
tranformed_logreg_df = pd.concat([log_reg_df, 
                       month_dummies, 
                       day_dummies,
                       date_dummies, 
                       carrier_dummies,
                       origin_dummies,
                       dest_dummies                      
                      ], 
                      axis=1)

# Drop the originating dummies columns
tranformed_logreg_df = tranformed_logreg_df.drop(columns=["MONTH",
                                            "DAY_OF_MONTH", 
                                            "DAY_OF_WEEK", 
                                            "OP_CARRIER",
                                            "ORIGIN", 
                                            "DEST"
                                           ])

# Display the DataFrame
tranformed_logreg_df.head()

Unnamed: 0,DEP_DEL15,APR,AUG,DEC,FEB,JAN,JUL,JUN,MAR,MAY,...,VEL,VLD,VPS,WRG,WYS,XNA,XWA,YAK,YKM,YUM
78886,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47902,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
41432,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
123743,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80568,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = tranformed_logreg_df['DEP_DEL15']

# Separate the X variable, the features
X = tranformed_logreg_df.drop(columns=['DEP_DEL15'])

In [26]:
# Review the y variable Series
display(y.head())
display(y.tail())

78886     0
47902     0
41432     0
123743    0
80568     0
Name: DEP_DEL15, dtype: object

84967     1
45718     0
1575      1
131476    0
106940    0
Name: DEP_DEL15, dtype: object

In [27]:
# Check the balance of our target values
y.value_counts()

0    117230
1     32771
Name: DEP_DEL15, dtype: int64

In [28]:
# Review the X variable DataFrame
display(X.head(2))
display(X.tail(2))

Unnamed: 0,APR,AUG,DEC,FEB,JAN,JUL,JUN,MAR,MAY,NOV,...,VEL,VLD,VPS,WRG,WYS,XNA,XWA,YAK,YKM,YUM
78886,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47902,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,APR,AUG,DEC,FEB,JAN,JUL,JUN,MAR,MAY,NOV,...,VEL,VLD,VPS,WRG,WYS,XNA,XWA,YAK,YKM,YUM
131476,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
106940,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [30]:
# X= X.rename(str,axis="columns") 
# X.columns = X.columns.astype(str)

In [31]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state=1)

# Fit the model using training data
lr_model = logistic_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
# Make a prediction using the testing data
testing_predictions = logistic_regression_model.predict(X_test)

In [33]:
# Print the balanced_accuracy score of the model
acc_score = balanced_accuracy_score(y_test, testing_predictions)
acc_score

0.5019443834015488

In [34]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, testing_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual On Time", "Actual Delay"], columns=["Predicted On Time", "Predicted Delay"]
)


# plot_confusion_matrix(tree_clf, X, y, values_format='.3g')
# plt.show()

In [35]:
# Print the classification report for the model
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, testing_predictions))

Confusion Matrix


Unnamed: 0,Predicted On Time,Predicted Delay
Actual On Time,29247,54
Actual Delay,8153,47


Accuracy Score : 0.5019443834015488
Classification Report
              precision    recall  f1-score   support

           0       0.78      1.00      0.88     29301
           1       0.47      0.01      0.01      8200

    accuracy                           0.78     37501
   macro avg       0.62      0.50      0.44     37501
weighted avg       0.71      0.78      0.69     37501



In [36]:

# Step 1: Use the RandomOverSampler module from the imbalanced-learn library to resample the data. 
# Be sure to confirm that the labels have an equal number of data points.


# Import the RandomOverSampler module form imbalanced-learn
# !pip install -U imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ros = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_res, y_res = ros.fit_resample(X_train, y_train)

In [37]:
# Count the distinct values of the resampled labels data
y_res.value_counts()

0    87929
1    87929
Name: DEP_DEL15, dtype: int64

In [38]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model_resampled = LogisticRegression(random_state=1)

# Fit the model using the resampled training data
resampled_lr_model = logistic_regression_model_resampled.fit(X_res, y_res)

# Make a prediction using the testing data
retesting_predictions = logistic_regression_model_resampled.predict(X_res)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
# Print the balanced_accuracy score of the model 
acc_rescore = balanced_accuracy_score(y_res, retesting_predictions)
acc_rescore

0.6040555448145664

In [40]:
# Generate a confusion matrix for the model
cm2 = confusion_matrix(y_res, retesting_predictions)
cm2_df = pd.DataFrame(
    cm2, index=["Actual On Time (OS)", "Actual Delay (OS)"], columns=["Predicted On Time (OS)", "Predicted Delay (OS)"]
)

# plot_confusion_matrix(tree_clf, X, y, values_format='.3g')
# plt.show()

In [41]:
# Print the classification report for the model
print("Resampled Confusion Matrix")
display(cm2_df)
print(f"Accuracy Score : {acc_rescore}")
print("Classification Report")
print(classification_report(y_res, retesting_predictions))

Resampled Confusion Matrix


Unnamed: 0,Predicted On Time (OS),Predicted Delay (OS)
Actual On Time (OS),52314,35615
Actual Delay (OS),34015,53914


Accuracy Score : 0.6040555448145664
Classification Report
              precision    recall  f1-score   support

           0       0.61      0.59      0.60     87929
           1       0.60      0.61      0.61     87929

    accuracy                           0.60    175858
   macro avg       0.60      0.60      0.60    175858
weighted avg       0.60      0.60      0.60    175858



## Random Forest

In [42]:
# Loading data
forest_df = pd.read_csv("Resources/analysis_df.csv")
forest_df.head()

# Reshuffling the data for good measure
forest_df = forest_df.sample(frac = 1)

In [38]:
forest_df.dtypes


MONTH           object
DAY_OF_MONTH     int64
DAY_OF_WEEK     object
OP_CARRIER      object
ORIGIN          object
DEST            object
DEP_DEL15        int64
dtype: object

In [58]:
# df['DataFrame Column'] = forest_df.astype(float)

In [44]:
# Define features set
X = forest_df.copy()
X.drop("DEP_DEL15", axis=1, inplace=True)
X.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN,DEST
102667,JUL,29,Saturday,Delta Air Lines,JFK,SLC
92677,MAR,19,Sunday,Delta Air Lines,MIA,ATL
23983,JUN,7,Wednesday,Southwest Airlines,MCI,BNA
21290,AUG,10,Thursday,Southwest Airlines,ATL,MDW
88608,MAR,12,Sunday,Delta Air Lines,AUS,ATL


In [45]:
# Define target vector
y = forest_df["DEP_DEL15"].ravel()
y[:5]

array(['0', '0', '0', '0', '0'], dtype=object)

In [46]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [47]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [54]:
# # Changing data type to string to address regression analysis errors
X_train = X_train.astype(float)

ValueError: could not convert string to float: 'APR'

In [52]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

ValueError: could not convert string to float: 'APR'

In [53]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

NameError: name 'X_scaler' is not defined

In [None]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [None]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix
rf_cm = confusion_matrix(y_test, rf_predictions)
rf_cm_df = pd.DataFrame(
    rf_cm, index=["Actual On Time", "Actual Delay"], columns=["Predicted On Time", "Predicted Delay"]
)

# Calculating the accuracy score
rf_acc_score = accuracy_score(y_test, rf_predictions)

In [None]:
# Displaying results
print("Confusion Matrix")
display(rf_cm_df)
print(f"Accuracy Score : {rf_acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_

# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

In [None]:
# Visualize the features by importance
importances_df = pd.DataFrame(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
importances_df.set_index(importances_df[1], inplace=True)
importances_df.drop(columns=1, inplace=True)
importances_df.rename(columns={0: 'Feature Importances'}, inplace=True)
importances_sorted = importances_df.sort_values(by='Feature Importances')
importances_sorted.plot(kind='barh', color='lightgreen', title= 'Features Importances', legend=False)