In [2]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import plotly.express as px

In [3]:
# compile the data files
file_paths = glob.glob('../data/20*')
file_paths

['../data\\2013.csv',
 '../data\\2014.csv',
 '../data\\2015.csv',
 '../data\\2016.csv',
 '../data\\2017.csv',
 '../data\\2018.csv',
 '../data\\2019.csv',
 '../data\\2020.csv',
 '../data\\2021.csv',
 '../data\\2022.csv',
 '../data\\2023.csv',
 '../data\\2024.csv']

In [4]:
# combine data files into one dataframe
df = pd.DataFrame()

for file in file_paths:
    df_air = pd.read_csv(file, index_col=0, parse_dates={'DATE':['MONTH','YEAR']}, date_format='%m/%Y') # combine 'YEAR' and 'MONTH' to 'DATE'
    df = pd.concat([df,df_air])

df = df.reset_index()

In [5]:
df.head()

Unnamed: 0,DATE,DEPARTURES_PERFORMED,SEATS,PASSENGERS,UNIQUE_CARRIER,CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN_CITY_NAME,ORIGIN_COUNTRY_NAME,DEST_AIRPORT_ID,DEST_CITY_NAME,DEST_COUNTRY_NAME,CLASS
0,10 2013,0.0,0.0,0.0,3M,Silver Airways,11624,"Key West, FL",United States,11534,"North Eleuthera, The Bahamas",The Bahamas,F
1,11 2013,0.0,0.0,0.0,3M,Silver Airways,11624,"Key West, FL",United States,11534,"North Eleuthera, The Bahamas",The Bahamas,F
2,2 2013,0.0,0.0,0.0,3M,Silver Airways,13289,"Marsh Harbour, The Bahamas",The Bahamas,15304,"Tampa, FL",United States,F
3,2 2013,0.0,0.0,0.0,3M,Silver Airways,15304,"Tampa, FL",United States,11534,"North Eleuthera, The Bahamas",The Bahamas,F
4,10 2013,0.0,0.0,0.0,3M,Silver Airways,15304,"Tampa, FL",United States,11534,"North Eleuthera, The Bahamas",The Bahamas,F


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 969506 entries, 0 to 969505
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   DATE                  969506 non-null  object 
 1   DEPARTURES_PERFORMED  969506 non-null  float64
 2   SEATS                 969506 non-null  float64
 3   PASSENGERS            969506 non-null  float64
 4   UNIQUE_CARRIER        969239 non-null  object 
 5   CARRIER_NAME          969331 non-null  object 
 6   ORIGIN_AIRPORT_ID     969506 non-null  int64  
 7   ORIGIN_CITY_NAME      969105 non-null  object 
 8   ORIGIN_COUNTRY_NAME   969105 non-null  object 
 9   DEST_AIRPORT_ID       969506 non-null  int64  
 10  DEST_CITY_NAME        969106 non-null  object 
 11  DEST_COUNTRY_NAME     969106 non-null  object 
 12  CLASS                 969506 non-null  object 
dtypes: float64(3), int64(2), object(8)
memory usage: 96.2+ MB


In [7]:
# convert to datetime datatype
df['DATE'] = pd.to_datetime(df['DATE'])

  df['DATE'] = pd.to_datetime(df['DATE'])


In [8]:
# determine top 3 airlines
df['UNIQUE_CARRIER'].value_counts()

UNIQUE_CARRIER
UA     95297
AA     77566
DL     74305
B6     28220
WS     24233
       ...  
NJQ        1
FK         1
MX         1
9X         1
1NQ        1
Name: count, Length: 395, dtype: int64

In [9]:
# locate top 3 airlines with completed flights
df_air = df[((df['UNIQUE_CARRIER'] == 'UA') 
            | (df['UNIQUE_CARRIER'] == 'AA')
            | (df['UNIQUE_CARRIER'] == 'DL')) 
            & (df['DEPARTURES_PERFORMED'] > 0)
            & (df['SEATS'] > 0)]

df_air = df_air.reset_index(drop=True)
df_air.head()

Unnamed: 0,DATE,DEPARTURES_PERFORMED,SEATS,PASSENGERS,UNIQUE_CARRIER,CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN_CITY_NAME,ORIGIN_COUNTRY_NAME,DEST_AIRPORT_ID,DEST_CITY_NAME,DEST_COUNTRY_NAME,CLASS
0,2013-01-01,1.0,54.0,0.0,DL,Delta Air Lines Inc.,13232,"Chicago, IL",United States,15147,"Treasure Cay, The Bahamas",The Bahamas,L
1,2013-11-01,1.0,54.0,24.0,DL,Delta Air Lines Inc.,12197,"White Plains, NY",United States,13289,"Marsh Harbour, The Bahamas",The Bahamas,L
2,2013-05-01,1.0,54.0,28.0,DL,Delta Air Lines Inc.,11618,"Newark, NJ",United States,13289,"Marsh Harbour, The Bahamas",The Bahamas,L
3,2013-02-01,1.0,54.0,30.0,DL,Delta Air Lines Inc.,12197,"White Plains, NY",United States,13289,"Marsh Harbour, The Bahamas",The Bahamas,L
4,2013-11-01,1.0,54.0,30.0,DL,Delta Air Lines Inc.,13289,"Marsh Harbour, The Bahamas",The Bahamas,11618,"Newark, NJ",United States,L


## DATA CLEANING

In [11]:
# check null values in the dataframe
df_air.isna().mean()*100

DATE                    0.000000
DEPARTURES_PERFORMED    0.000000
SEATS                   0.000000
PASSENGERS              0.000000
UNIQUE_CARRIER          0.000000
CARRIER_NAME            0.000000
ORIGIN_AIRPORT_ID       0.000000
ORIGIN_CITY_NAME        0.011620
ORIGIN_COUNTRY_NAME     0.011620
DEST_AIRPORT_ID         0.000000
DEST_CITY_NAME          0.010375
DEST_COUNTRY_NAME       0.010375
CLASS                   0.000000
dtype: float64

In [12]:
# view null values
df_air[df_air.isna().any(axis=1)]

Unnamed: 0,DATE,DEPARTURES_PERFORMED,SEATS,PASSENGERS,UNIQUE_CARRIER,CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN_CITY_NAME,ORIGIN_COUNTRY_NAME,DEST_AIRPORT_ID,DEST_CITY_NAME,DEST_COUNTRY_NAME,CLASS
189570,2022-11-01,1.0,271.0,158.0,AA,American Airlines Inc.,11369,,,10581,"Bangor, ME",United States,F
192309,2022-11-01,3.0,912.0,225.0,AA,American Airlines Inc.,11369,,,12478,"New York, NY",United States,F
192312,2022-11-01,3.0,912.0,863.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
193560,2022-12-01,4.0,1216.0,516.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
193561,2022-12-01,4.0,1216.0,1079.0,AA,American Airlines Inc.,11369,,,12478,"New York, NY",United States,F
194671,2022-09-01,6.0,1824.0,602.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
194944,2022-09-01,7.0,2128.0,1937.0,AA,American Airlines Inc.,11369,,,12478,"New York, NY",United States,F
198301,2022-09-01,23.0,6233.0,5114.0,AA,American Airlines Inc.,11369,,,12478,"New York, NY",United States,F
198548,2022-09-01,24.0,6504.0,3143.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
198895,2022-06-01,25.0,7600.0,5873.0,AA,American Airlines Inc.,11369,,,12478,"New York, NY",United States,F


In [13]:
# check unique null values
df_air['ORIGIN_AIRPORT_ID'][df_air['ORIGIN_CITY_NAME'].isna()].value_counts()

ORIGIN_AIRPORT_ID
11369    28
Name: count, dtype: int64

In [14]:
# check unique null values
df_air['DEST_AIRPORT_ID'][df_air['DEST_CITY_NAME'].isna()].value_counts()

DEST_AIRPORT_ID
11369    25
Name: count, dtype: int64

In [15]:
# determined from the lookup table 'L_AIRPORT_ID'
df_air.loc[df_air['ORIGIN_AIRPORT_ID'] == 11369, ['ORIGIN_CITY_NAME','ORIGIN_COUNTRY_NAME']] = ['Doha','Qatar']

In [16]:
# view remaining null values
df_air[df_air.isna().any(axis=1)]

Unnamed: 0,DATE,DEPARTURES_PERFORMED,SEATS,PASSENGERS,UNIQUE_CARRIER,CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN_CITY_NAME,ORIGIN_COUNTRY_NAME,DEST_AIRPORT_ID,DEST_CITY_NAME,DEST_COUNTRY_NAME,CLASS
192312,2022-11-01,3.0,912.0,863.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
193560,2022-12-01,4.0,1216.0,516.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
194671,2022-09-01,6.0,1824.0,602.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
198548,2022-09-01,24.0,6504.0,3143.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
199351,2022-06-01,26.0,7904.0,6539.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
199793,2022-11-01,27.0,7313.0,5965.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
199795,2022-12-01,27.0,7317.0,6198.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
201701,2022-07-01,29.0,8816.0,6938.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
203375,2022-08-01,30.0,9120.0,3507.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F
204681,2022-10-01,31.0,8403.0,6012.0,AA,American Airlines Inc.,12478,"New York, NY",United States,11369,,,F


In [17]:
# determined from the lookup table 'L_AIRPORT_ID'
df_air.loc[df_air['DEST_AIRPORT_ID'] == 11369, ['DEST_CITY_NAME','DEST_COUNTRY_NAME']] = ['Doha','Qatar']

In [18]:
# verify
df_air.isna().sum()

DATE                    0
DEPARTURES_PERFORMED    0
SEATS                   0
PASSENGERS              0
UNIQUE_CARRIER          0
CARRIER_NAME            0
ORIGIN_AIRPORT_ID       0
ORIGIN_CITY_NAME        0
ORIGIN_COUNTRY_NAME     0
DEST_AIRPORT_ID         0
DEST_CITY_NAME          0
DEST_COUNTRY_NAME       0
CLASS                   0
dtype: int64

In [19]:
# check duplicates
df_air.duplicated().sum()

5

In [20]:
# view duplicates
df_air[df_air.duplicated(keep=False)].head(10)

Unnamed: 0,DATE,DEPARTURES_PERFORMED,SEATS,PASSENGERS,UNIQUE_CARRIER,CARRIER_NAME,ORIGIN_AIRPORT_ID,ORIGIN_CITY_NAME,ORIGIN_COUNTRY_NAME,DEST_AIRPORT_ID,DEST_CITY_NAME,DEST_COUNTRY_NAME,CLASS
169411,2021-11-01,1.0,126.0,123.0,UA,United Air Lines Inc.,13605,"Nassau, The Bahamas",The Bahamas,11618,"Newark, NJ",United States,F
169412,2021-11-01,1.0,126.0,123.0,UA,United Air Lines Inc.,13605,"Nassau, The Bahamas",The Bahamas,11618,"Newark, NJ",United States,F
188361,2022-07-01,1.0,179.0,156.0,UA,United Air Lines Inc.,16229,"Vancouver, Canada",Canada,12266,"Houston, TX",United States,F
188362,2022-07-01,1.0,179.0,156.0,UA,United Air Lines Inc.,16229,"Vancouver, Canada",Canada,12266,"Houston, TX",United States,F
208731,2023-09-01,1.0,126.0,119.0,UA,United Air Lines Inc.,16229,"Vancouver, Canada",Canada,12266,"Houston, TX",United States,F
208732,2023-09-01,1.0,126.0,119.0,UA,United Air Lines Inc.,16229,"Vancouver, Canada",Canada,12266,"Houston, TX",United States,F
226783,2023-03-01,31.0,5332.0,4495.0,AA,American Airlines Inc.,13303,"Miami, FL",United States,10411,"Aruba, Aruba",Aruba,F
226784,2023-03-01,31.0,5332.0,4495.0,AA,American Airlines Inc.,13303,"Miami, FL",United States,10411,"Aruba, Aruba",Aruba,F
232775,2024-01-01,1.0,166.0,163.0,UA,United Air Lines Inc.,14677,"San Salvador, El Salvador",El Salvador,12266,"Houston, TX",United States,F
232776,2024-01-01,1.0,166.0,163.0,UA,United Air Lines Inc.,14677,"San Salvador, El Salvador",El Salvador,12266,"Houston, TX",United States,F


In [21]:
# drop duplicates
df_air = df_air.drop_duplicates()

In [22]:
# verify
df_air.duplicated().sum()

0

In [23]:
df_air1 = df_air.copy() # pandas preferred method to avoid SettingWithCopyWarning

In [24]:
# create new feature showing seats per flight
df_air1['CAPACITY'] = df_air1.loc[:, 'PASSENGERS'] / df_air1.loc[:, 'SEATS']

In [25]:
df_air1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 240968 entries, 0 to 240972
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   DATE                  240968 non-null  datetime64[ns]
 1   DEPARTURES_PERFORMED  240968 non-null  float64       
 2   SEATS                 240968 non-null  float64       
 3   PASSENGERS            240968 non-null  float64       
 4   UNIQUE_CARRIER        240968 non-null  object        
 5   CARRIER_NAME          240968 non-null  object        
 6   ORIGIN_AIRPORT_ID     240968 non-null  int64         
 7   ORIGIN_CITY_NAME      240968 non-null  object        
 8   ORIGIN_COUNTRY_NAME   240968 non-null  object        
 9   DEST_AIRPORT_ID       240968 non-null  int64         
 10  DEST_CITY_NAME        240968 non-null  object        
 11  DEST_COUNTRY_NAME     240968 non-null  object        
 12  CLASS                 240968 non-null  object        
 13  CAPA

In [26]:
# export cleaned dataset
df_air1.to_csv('capstone_data_cleaned.csv', index=False) # makes it not save an index col