In [None]:
# Download data from Kaggle
# !kaggle datasets download -d yuanyuwendymu/airline-delay-and-cancellation-data-2009-2018

In [None]:
# imports
import pandas as pd
import zipfile
import os
import requests
import airportsdata
from dotenv import dotenv_values

# Extract data for 2018 out of the kaggle dataset
KAGGLE_ZIP = 'airline-delay-and-cancellation-data-2009-2018.zip'
try:
    with zipfile.ZipFile(KAGGLE_ZIP, 'r') as zip_ref:
        zip_ref.extract('2018.csv', '../data/')
except FileNotFoundError:
    # The archive is not in the working directory (e.g. it was already
    # extracted and removed by a previous run of this cell).
    print('Zipped dataset missing from src.')
except (KeyError, zipfile.BadZipFile, OSError) as e:
    # KeyError: the archive has no '2018.csv' member; BadZipFile: corrupt
    # archive; OSError: the file could not be written under ../data/.
    print(f"Error while extracting: {e}")
else:
    # Delete zipped dataset from kaggle only once it was extracted successfully,
    # so a failed extraction never costs us the download.
    try:
        os.remove(KAGGLE_ZIP)  # Alternatively, you can use os.unlink(file_path)
    except OSError as e:
        print(f"Error while deleting: {e}")

# import variables
# Airport lookup table; later cells index it by IATA code.
airports = airportsdata.load('IATA')
# Read 'weather_key' from the values loaded by dotenv_values() (presumably a .env file).
weather_key = dotenv_values().get('weather_key')

In [None]:
# Load the extracted 2018 flight records into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/2018.csv')
# Preview the first five rows
df.head(n=5)

In [None]:
# Non-null entries per column of the raw data
df.notna().sum()

In [None]:
# Cleaning starts here: work on a copy of the raw frame and keep only the
# date, route, carrier and schedule/delay columns.
airline_df = df.copy()[[
    'FL_DATE',
    'ORIGIN',
    'DEST',
    'OP_CARRIER_FL_NUM',
    'OP_CARRIER',
    'CRS_ARR_TIME',
    'ARR_TIME',
    'ARR_DELAY',
    'CRS_DEP_TIME',
    'DEP_TIME',
    'DEP_DELAY',
]]
airline_df.head()

In [None]:
# Replace the raw dataset headers with descriptive column names.
# The rename is applied to airline_df in place.
airline_df.rename(
    mapper={
        # flight identity
        'FL_DATE': 'Flight_Date',
        'OP_CARRIER': 'Operating_Carrier',
        'OP_CARRIER_FL_NUM': 'Flight_Number',
        # route
        'ORIGIN': 'Origin_Airport',
        'DEST': 'Destination_Airport',
        # departure
        'CRS_DEP_TIME': 'Scheduled_Departure_Time',
        'DEP_TIME': 'Actual_Departure_Time',
        'DEP_DELAY': 'Departure_Delay',
        # arrival
        'CRS_ARR_TIME': 'Scheduled_Arrival_Time',
        'ARR_TIME': 'Actual_Arrival_Time',
        'ARR_DELAY': 'Arrival_Delay',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Discard every row that has a missing value in any column, then renumber
# the remaining rows from 0 (both steps modify airline_df in place).
airline_df.dropna(axis='index', how='any', inplace=True)
airline_df.reset_index(inplace=True, drop=True)
airline_df.head(n=5)

In [None]:
# Non-null entries per column of airline_df
airline_df.notna().sum()

In [None]:
# Print the column dtypes, non-null counts and memory usage of airline_df
airline_df.info()

In [None]:
# Number of distinct values in Flight_Date (a missing value would count as one)
airline_df['Flight_Date'].nunique(dropna=False)

In [None]:
# Flights per date, most frequent date first
airline_df['Flight_Date'].value_counts(sort=True, ascending=False)

In [None]:
# get month with most flights
monthly_df = airline_df.copy()

# Parse the date strings once so the calendar month can be extracted.
monthly_df['Flight_DateTime'] = pd.to_datetime(monthly_df['Flight_Date'])
monthly_df['Month'] = monthly_df['Flight_DateTime'].dt.month

# value_counts() gives flights per month; idxmax() picks the month number
# with the highest count.
month_counts = monthly_df['Month'].value_counts()
most_frequent_month = month_counts.idxmax()

# Build the result with non-inplace calls: the boolean filter returns a slice
# of monthly_df, and dropping columns on that slice in place raises
# SettingWithCopyWarning. drop()/reset_index() return a new, independent frame
# that later cells can safely add columns to.
month_most_flights = (
    monthly_df.loc[monthly_df['Month'] == most_frequent_month]
    .drop(columns=['Month', 'Flight_DateTime'])  # helper columns are not part of the output
    .reset_index(drop=True)
)
month_most_flights.head()


In [None]:
# Distinct flight dates present in the selected month
pd.unique(month_most_flights['Flight_Date'])

In [None]:
# Non-null entries per column of month_most_flights
month_most_flights.notna().sum()

In [None]:
# add airport cities to DF
# Airports that the `airports` lookup does not contain, labelled by hand.
MANUAL_CITY_LABELS = {'ISN': 'Williston, North Dakota'}


def city_label(iata_code):
    """Return "<city>, <subdivision>" for an IATA airport code.

    The `airports` lookup is consulted first; MANUAL_CITY_LABELS is the
    fallback. Returns '' when the code cannot be resolved by either.
    """
    try:
        entry = airports[iata_code]
        return f"{entry['city']}, {entry['subd']}"
    except KeyError:
        return MANUAL_CITY_LABELS.get(iata_code, '')


# Resolve each distinct airport code once instead of once per flight row.
airport_codes = pd.concat(
    [month_most_flights['Origin_Airport'], month_most_flights['Destination_Airport']]
).unique()
city_by_code = {code: city_label(code) for code in airport_codes}

# Report every code that could not be resolved (its city label stays '').
for code in sorted(code for code, label in city_by_code.items() if not label):
    print(f'Airport not found: {code}')

# A flight arrives at its destination airport and departs from its origin airport.
month_most_flights['Arrival City'] = month_most_flights['Destination_Airport'].map(city_by_code)
month_most_flights['Departure City'] = month_most_flights['Origin_Airport'].map(city_by_code)

month_most_flights.head()



In [None]:
# save month data as csv
# Written BEFORE the raw file is removed: if the save fails, the cell stops
# here and the raw 2018.csv is still on disk.
# NOTE(review): the file name hard-codes "July" while the month is selected
# dynamically in an earlier cell - confirm the two always agree.
month_most_flights.to_csv('../data/clean_2018_July.csv')

# clean out data folder of old data
try:
    os.remove('../data/2018.csv')
except OSError as e:
    print(f"Error while deleting: {e}")

In [None]:
# Flights per 'Arrival City' label, most frequent first
month_most_flights['Arrival City'].value_counts(sort=True, ascending=False)

In [None]:
# Flights per 'Departure City' label, most frequent first
month_most_flights['Departure City'].value_counts(sort=True, ascending=False)

In [None]:
# Count the flights with a positive arrival delay
delayed_arrival = month_most_flights["Arrival_Delay"]
# Vectorised comparison; summing the boolean mask counts the True entries.
# int() keeps the counter a plain Python int, as the original loop produced.
delayed_arrival_counter = int((delayed_arrival > 0).sum())
print(delayed_arrival_counter)

In [None]:
# Count the flights with a positive departure delay
delayed_departure = month_most_flights["Departure_Delay"]
# Vectorised comparison; summing the boolean mask counts the True entries.
# int() keeps the counter a plain Python int, as the original loop produced.
counter = int((delayed_departure > 0).sum())
print(counter)

In [None]:
# Keep only the flights with a positive arrival delay, then tally them per date
delayed_arrivals = month_most_flights.loc[month_most_flights['Arrival_Delay'].gt(0)]
delayed_arrivals.loc[:, "Flight_Date"].value_counts()

In [None]:
# Keep only the flights with a positive departure delay, then tally them per date
delayed_departures = month_most_flights.loc[month_most_flights['Departure_Delay'].gt(0)]
delayed_departures.loc[:, 'Flight_Date'].value_counts()