## Part 1: Cleaning dataset for initial analysis

In [1]:
# Dependencies
import pandas as pd
from pathlib import Path
import csv
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Files to load
sep_22 = pd.read_csv("Resources/SEP-22_ONTIME_REPORTING.csv")
oct_22 = pd.read_csv("Resources/OCT-22_ONTIME_REPORTING.csv")
nov_22 = pd.read_csv("Resources/NOV-22_ONTIME_REPORTING.csv")
dec_22 = pd.read_csv("Resources/DEC-22_ONTIME_REPORTING.csv")
jan_23 = pd.read_csv("Resources/JAN-23_ONTIME_REPORTING.csv")
feb_23 = pd.read_csv("Resources/FEB-23_ONTIME_REPORTING.csv")
mar_23 = pd.read_csv("Resources/MAR-23_ONTIME_REPORTING.csv")
apr_23 = pd.read_csv("Resources/APR-23_ONTIME_REPORTING.csv")
may_23 = pd.read_csv("Resources/MAY-23_ONTIME_REPORTING.csv")
jun_23 = pd.read_csv("Resources/JUN-23_ONTIME_REPORTING.csv")
jul_23 = pd.read_csv("Resources/JUL-23_ONTIME_REPORTING.csv")
aug_23 = pd.read_csv("Resources/AUG-23_ONTIME_REPORTING.csv")

In [3]:
# Joining data
frames = [
    sep_22,
    oct_22,
    nov_22,
    dec_22,
    jan_23,
    feb_23,
    mar_23,
    apr_23,
    may_23,
    jun_23,
    jul_23,
    aug_23
]

pd.set_option('display.max_columns', None)

combined_df = pd.concat(frames)

In [4]:
# Data checkpoint
combined_df.shape

(6778708, 28)

In [5]:
# Removing airports with fewer than 1000 flights
origin_adjust = combined_df['ORIGIN'].value_counts()

to_remove = origin_adjust[origin_adjust <= 1000].index

combined_df_adjust = combined_df[~combined_df.ORIGIN.isin(to_remove)]

combined_df_adjust['ORIGIN'].value_counts()

ATL    330168
DEN    283337
DFW    277696
ORD    258592
LAX    192111
        ...  
ROW      1032
AZO      1025
DEC      1022
BLV      1018
LAW      1009
Name: ORIGIN, Length: 241, dtype: int64

In [6]:
combined_df_adjust.shape

(6716245, 28)

In [7]:
combined_df_adjust['ORIGIN'].value_counts()

ATL    330168
DEN    283337
DFW    277696
ORD    258592
LAX    192111
        ...  
ROW      1032
AZO      1025
DEC      1022
BLV      1018
LAW      1009
Name: ORIGIN, Length: 241, dtype: int64

In [8]:
# Date checkpoint
combined_df_adjust.head(5)

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,DEST_AIRPORT_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CANCELLED,DIVERTED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,2022,9,1,4,9/1/2022 12:00:00 AM,9E,4628,15919,XNA,"Fayetteville, AR",AR,12953,LGA,"New York, NY",NY,-5.0,0.0,0.0,-17.0,0.0,0.0,0.0,0.0,,,,,
1,2022,9,1,4,9/1/2022 12:00:00 AM,9E,4630,13342,MKE,"Milwaukee, WI",WI,10721,BOS,"Boston, MA",MA,204.0,204.0,1.0,216.0,216.0,1.0,0.0,0.0,204.0,0.0,12.0,0.0,0.0
2,2022,9,1,4,9/1/2022 12:00:00 AM,9E,4631,14492,RDU,"Raleigh/Durham, NC",NC,12478,JFK,"New York, NY",NY,-8.0,0.0,0.0,-21.0,0.0,0.0,0.0,0.0,,,,,
3,2022,9,1,4,9/1/2022 12:00:00 AM,9E,4632,11042,CLE,"Cleveland, OH",OH,10721,BOS,"Boston, MA",MA,-4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,
4,2022,9,1,4,9/1/2022 12:00:00 AM,9E,4634,10397,ATL,"Atlanta, GA",GA,13422,MOB,"Mobile, AL",AL,-5.0,0.0,0.0,-8.0,0.0,0.0,0.0,0.0,,,,,


In [9]:
# Drop unneeded columns
updated_df = combined_df_adjust.drop(columns=['OP_CARRIER_FL_NUM', 
                                       'ORIGIN_AIRPORT_ID', 
                                       'ARR_DELAY', 
                                       'ARR_DELAY_NEW', 
                                       'ORIGIN_STATE_ABR', 
                                       'DEP_DELAY_NEW', 
                                       'DEST_AIRPORT_ID',
                                       'DEST_STATE_ABR',
                                       'DEP_DELAY',
                                       'CARRIER_DELAY',
                                       'LATE_AIRCRAFT_DELAY',
                                       'SECURITY_DELAY',
                                       'NAS_DELAY',
                                       'WEATHER_DELAY',
                                       'CARRIER_DELAY',
                                       'CANCELLED',
                                       'DIVERTED',
                                       'FL_DATE'
                                      ])

In [10]:
# Data checkpoint
updated_df.head(5)

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,DEP_DEL15,ARR_DEL15
0,2022,9,1,4,9E,XNA,"Fayetteville, AR",LGA,"New York, NY",0.0,0.0
1,2022,9,1,4,9E,MKE,"Milwaukee, WI",BOS,"Boston, MA",1.0,1.0
2,2022,9,1,4,9E,RDU,"Raleigh/Durham, NC",JFK,"New York, NY",0.0,0.0
3,2022,9,1,4,9E,CLE,"Cleveland, OH",BOS,"Boston, MA",0.0,0.0
4,2022,9,1,4,9E,ATL,"Atlanta, GA",MOB,"Mobile, AL",0.0,0.0


### Column Definitions

* YEAR = Year of Flight
* MONTH = Month of Flight
* DAY_OF_MONTH = Day of Month
* DAY_OF_WEEK = Day of Week
* OP_CARRIER = Carrier Name
* ORIGIN = Origin Airport Code
* ORIGIN_CITY_NAME = Origin City and State
* DEST = Destination Airport Code
* DEST_CITY_NAME = Destination City and State
* DEP_DEL15 = Departure Delay (0 = No, 1 = Yes)
* ARR_DEL15 = Arrival Delay (0 = No, 1 = Yes)

In [11]:
# Drop NA entries
cleaned_df = updated_df.dropna()

In [12]:
# Data checkpoint
cleaned_df.isna().sum()

YEAR                0
MONTH               0
DAY_OF_MONTH        0
DAY_OF_WEEK         0
OP_CARRIER          0
ORIGIN              0
ORIGIN_CITY_NAME    0
DEST                0
DEST_CITY_NAME      0
DEP_DEL15           0
ARR_DEL15           0
dtype: int64

In [13]:
# Data checkpoint
cleaned_df.shape

(6575257, 11)

In [14]:
# Remap DAY_OF_WEEK values
day_dict = {1 : 'Monday', 
            2 : 'Tuesday', 
            3 : 'Wednesday', 
            4 : 'Thursday', 
            5 : 'Friday', 
            6 : 'Saturday',
            7 : 'Sunday'
           }
 
cleaned_df = cleaned_df.replace({"DAY_OF_WEEK": day_dict})

In [15]:
# Remap OP_CARRIER values
carrier_dict = {'WN' : 'Southwest Airlines',
                'DL' : 'Delta Air Lines',
                'AA' : 'American Airlines',
                'UA' : 'United Air Lines',
                'OO' : 'SkyWest Airlines',
                'YX' : 'Republic Airline',
                'B6' : 'JetBlue Airways',
                'NK' : 'Spirit Air Lines',
                'AS' : 'Alaska Airlines',
                'MQ' : 'Envoy Air',
                '9E' : 'Endeavor Air',
                'OH' : 'PSA Airlines',
                'F9' : 'Frontier Airlines',
                'G4' : 'Allegiant Air',
                'HA' : 'Hawaiian Airlines',
                'YV' : 'Mesa Airlines',
                'QX' : 'Horizon Air'
               }

cleaned_df = cleaned_df.replace({"OP_CARRIER": carrier_dict})

In [16]:
# Convert data types
cleaned_df['ARR_DEL15'] = cleaned_df['ARR_DEL15'].astype(int) 
cleaned_df['DEP_DEL15'] = cleaned_df['DEP_DEL15'].astype(int) 
cleaned_df['MONTH'] = cleaned_df['MONTH'].astype(int) 
cleaned_df['DAY_OF_WEEK'] = cleaned_df['DAY_OF_WEEK'].astype('category') 
cleaned_df['OP_CARRIER'] = cleaned_df['OP_CARRIER'].astype('category') 
cleaned_df['ORIGIN'] = cleaned_df['ORIGIN'].astype('category') 
cleaned_df['ORIGIN_CITY_NAME'] = cleaned_df['ORIGIN_CITY_NAME'].astype('category') 
cleaned_df['DEST'] = cleaned_df['DEST'].astype('category') 
cleaned_df['DEST_CITY_NAME'] = cleaned_df['DEST_CITY_NAME'].astype('category') 

cleaned_df.dtypes

YEAR                   int64
MONTH                  int32
DAY_OF_MONTH           int64
DAY_OF_WEEK         category
OP_CARRIER          category
ORIGIN              category
ORIGIN_CITY_NAME    category
DEST                category
DEST_CITY_NAME      category
DEP_DEL15              int32
ARR_DEL15              int32
dtype: object

In [17]:
# Remap DEP_DEL15 values
dep_dict = {0.0 : 0, 
            1.0 : 1
           }
 
cleaned_df = cleaned_df.replace({"DEP_DEL15": dep_dict})

In [18]:
# Remap ARR_DEL15 values
arr_dict = {0.0 : 0, 
            1.0 : 1
           }
 
cleaned_df = cleaned_df.replace({"ARR_DEL15": arr_dict})

In [19]:
# Data checkpoint
cleaned_df.value_counts('OP_CARRIER')

OP_CARRIER
Southwest Airlines    1361355
Delta Air Lines        943994
American Airlines      914925
United Air Lines       690580
SkyWest Airlines       624045
Republic Airline       286677
JetBlue Airways        273596
Spirit Air Lines       250084
Alaska Airlines        231927
Envoy Air              218313
Endeavor Air           191719
PSA Airlines           185906
Frontier Airlines      162340
Allegiant Air          106004
Hawaiian Airlines       77908
Mesa Airlines           33343
Horizon Air             22541
dtype: int64

In [20]:
# Data checkpoint
cleaned_df.value_counts('DAY_OF_WEEK')

DAY_OF_WEEK
Thursday     988107
Friday       974321
Monday       973533
Sunday       944912
Wednesday    928477
Tuesday      920372
Saturday     845535
dtype: int64

In [21]:
# Data checkpoint
cleaned_df.value_counts('MONTH')

MONTH
8     587309
7     580054
5     570645
3     566347
10    560610
6     558026
4     545473
9     541661
11    534106
1     522397
12    520450
2     488179
dtype: int64

In [22]:
# Data checkpoint
cleaned_df.value_counts('DEP_DEL15')

DEP_DEL15
0    5147830
1    1427427
dtype: int64

In [23]:
# Data checkpoint
cleaned_df.value_counts('ARR_DEL15')

ARR_DEL15
0    5136898
1    1438359
dtype: int64

In [24]:
# Shuffle the dataset
cleaned_shuffled_df = cleaned_df.sample(frac=1).reset_index(drop=True)

# Print the shuffled DataFrame
print("\nShuffled DataFrame:")
print(cleaned_shuffled_df)


Shuffled DataFrame:
         YEAR  MONTH  DAY_OF_MONTH DAY_OF_WEEK          OP_CARRIER ORIGIN  \
0        2022      9             3    Saturday     Delta Air Lines    LIH   
1        2023      3             8   Wednesday   Frontier Airlines    MCO   
2        2023      4             8    Saturday   Hawaiian Airlines    KOA   
3        2023      6             4      Sunday        PSA Airlines    CLT   
4        2022     10             7      Friday    United Air Lines    LGA   
...       ...    ...           ...         ...                 ...    ...   
6575252  2022      9             3    Saturday     Delta Air Lines    ATL   
6575253  2023      7             8    Saturday    United Air Lines    ORD   
6575254  2023      2            19      Sunday        Endeavor Air    JFK   
6575255  2023      4             8    Saturday  Southwest Airlines    OAK   
6575256  2022     12            28   Wednesday   American Airlines    SRQ   

               ORIGIN_CITY_NAME DEST      DEST_CITY_NA

In [25]:
# Data checkpoint
cleaned_shuffled_df.head(2).append(cleaned_df.tail(2))

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,DEP_DEL15,ARR_DEL15
0,2022,9,3,Saturday,Delta Air Lines,LIH,"Lihue, HI",LAX,"Los Angeles, CA",0,0
1,2023,3,8,Wednesday,Frontier Airlines,MCO,"Orlando, FL",SJU,"San Juan, PR",0,0
602985,2023,8,31,Thursday,Republic Airline,ACK,"Nantucket, MA",JFK,"New York, NY",0,0
602986,2023,8,31,Thursday,Republic Airline,CMH,"Columbus, OH",JFK,"New York, NY",0,0


In [26]:
# Save new CSVs
# Selected arbitrary limit to reduce dataset for analysis
cleaned_shuffled_df.loc[0:150000].to_csv('Resources/prepared_dataset.csv', index= False)

# Keeping one full set for personal reference
#cleaned_df.to_csv("Resources/supplemental_dataset.csv", index= False)

## Part 2: Preparing dataset for modelling analysis

In [27]:
# Read the CSV file from the Resources folder into a Pandas DataFrame

analysis_df = pd.read_csv("Resources/prepared_dataset.csv")

# Remove additional columns
analysis_df = analysis_df.drop(columns=['ORIGIN_CITY_NAME', 
                                       'DEST_CITY_NAME', 
                                        'ARR_DEL15',
                                        "YEAR",
                                        "DEST"
                                       ])

In [28]:
# Data checkpoint
analysis_df.head(2).append(analysis_df.tail(2))

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN,DEP_DEL15
0,9,3,Saturday,Delta Air Lines,LIH,0
1,3,8,Wednesday,Frontier Airlines,MCO,0
149999,9,7,Wednesday,United Air Lines,IAH,1
150000,1,27,Friday,United Air Lines,SNA,0


In [29]:
analysis_df.shape

(150001, 6)

In [30]:
# Save to new CSV
analysis_df.to_csv("Resources/analysis_df.csv", index= False)