In [None]:

import pandas as pd
import numpy as np
import holidays
import os

#############################################################################################################
### FUNCTIONS
#############################################################################################################

### ---------------------------------------------------------------------------------------------------------
### Load parquet files (should have 12 months worth of data)
### ---------------------------------------------------------------------------------------------------------

def load_parquet_files(directory):
    files = [f for f in os.listdir(directory) if f.endswith('.parquet')]
    print("Available parquet files (" + str(len(files)) + "): ", files)
    df = pd.concat([pd.read_parquet(os.path.join(directory, f)) for f in files], ignore_index=True)
    df.columns = df.columns.str.strip().str.replace(' ', '_').str.lower()
    return df

### ---------------------------------------------------------------------------------------------------------
### General data cleaning (remove duplicates, fill NAs, etc)
### ---------------------------------------------------------------------------------------------------------

def clean_and_filter_columns(df, columns, delay_cols):
    df = df.dropna(axis=1, how='all')
    df = df[[col for col in columns if col in df.columns]].drop_duplicates()
    df[delay_cols] = df[delay_cols].fillna(0)
    df[delay_cols] = df[delay_cols].astype(int)
    return df

### ---------------------------------------------------------------------------------------------------------
### Extract hour (0–23) directly from HHMM-formatted time (e.g., 1420 → 14)
### ---------------------------------------------------------------------------------------------------------

def extract_hour_from_hhmm(df, colname, new_colname):
    df[colname] = pd.to_numeric(df[colname], errors='coerce')  # ensure numeric
    df[new_colname] = (df[colname] // 100).astype('Int64')     # supports NA
    return df

### ---------------------------------------------------------------------------------------------------------
### Filter to only 50 US states & DC (excludes Canadian and other US territories)
### ---------------------------------------------------------------------------------------------------------

def filter_valid_states(df, valid_states):
    return df[df['originstate'].isin(valid_states) & df['deststate'].isin(valid_states)].copy()

### ---------------------------------------------------------------------------------------------------------
### Filter to top 200 airports based on combined arrival and departures
### ---------------------------------------------------------------------------------------------------------

def get_top_airports(df, n=200):
    origin = df['origin'].value_counts()
    dest = df['dest'].value_counts()
    combined = origin.add(dest, fill_value=0)
    return combined.nlargest(n).index

def filter_by_top_airports(df, top_airports):
    return df[
        df['origin'].isin(top_airports) & df['dest'].isin(top_airports)
    ].copy()

### ---------------------------------------------------------------------------------------------------------
### Create categorical feature for proximity to major US holidays
### ---------------------------------------------------------------------------------------------------------

def add_holiday_features(df):

    # Step 1: Convert to datetime
    df['flight_date'] = pd.to_datetime(df[['year', 'month', 'dayofmonth']].rename(columns={'dayofmonth': 'day'}))

    # Step 2: Major holidays and codes
    us_holidays = holidays.US(years=df['year'].unique())

    major_holidays = {
        "New Year's Day": "A",
        "Memorial Day": "B",
        "Independence Day": "C",
        "Labor Day": "D",
        "Thanksgiving Day": "E",
        "Christmas Day": "F"
    }

    # Filter to relevant holiday dates and codes
    holiday_info = [
        (pd.Timestamp(date), code)
        for date, name in us_holidays.items()
        if name in major_holidays
        for code in [major_holidays[name]]
    ]

    if not holiday_info:
        df['holiday_proximity_bucket'] = 5
        df['holiday_code'] = 'NA'
        return df

    # Step 3: Build holiday date array
    holiday_dates = np.array([d[0] for d in holiday_info], dtype='datetime64[D]')
    holiday_codes = np.array([d[1] for d in holiday_info])

    # Step 4: Calculate days difference (vectorized)
    flight_dates = df['flight_date'].values.astype('datetime64[D]')
    date_diffs = flight_dates[:, None] - holiday_dates[None, :]  # shape (N_flights, N_holidays)
    delta_days = np.abs(date_diffs.astype('timedelta64[D]').astype(int))  # in days

    # Step 5: Find nearest holiday within 7 days
    min_diff = np.min(delta_days, axis=1)
    min_idx = np.argmin(delta_days, axis=1)

    # Step 6: Assign bucket based on delta
    bucket = np.full(len(df), 5)  # Default: 5 = not near holiday
    bucket[min_diff == 0] = 1
    bucket[(min_diff == 1)] = 2
    bucket[(min_diff >= 2) & (min_diff <= 3)] = 3
    bucket[(min_diff >= 4) & (min_diff <= 7)] = 4

    # Step 7: Assign holiday code (or NA if not within range)
    code = np.array(['NA'] * len(df), dtype=object)
    within_range = min_diff <= 7
    code[within_range] = holiday_codes[min_idx[within_range]]

    # Step 8: Assign to dataframe
    df['holiday_proximity_bucket'] = bucket
    df['holiday_code'] = code

    return df

#############################################################################################################
### CALL MAIN
#############################################################################################################

# delays defined as more than 15 minutes

if __name__ == "__main__":
    cols = ['year', 'month', 'dayofmonth', 'dayofweek', 'origin', 'dest', 'reporting_airline', 
        'originstate', 'deststate', 'crsdeptime', 'crsarrtime','carrierdelay', 'weatherdelay', 
        'nasdelay', 'securitydelay', 'lateaircraftdelay', 'arrdelayminutes', 'cancelled', 'diverted']
    delay_cols = ['carrierdelay', 'weatherdelay', 'nasdelay', 'securitydelay', 
        'lateaircraftdelay', 'arrdelayminutes']
    state_list = ['AK','AL','AR','AZ','CA','CO','CT','DC','DE','FL','GA','HI','IA','ID','IL','IN',
        'KS','KY','LA','MA','MD','ME','MI','MN','MO','MS','MT','NC','ND','NE','NH','NJ','NM','NV',
        'NY','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VA','VT','WA','WI','WV','WY']

    df_raw = load_parquet_files("../data/raw")
    df_clean = clean_and_filter_columns(df_raw, cols, delay_cols)
    df_filtered = filter_valid_states(df_clean, state_list)
    df_filtered = df_filtered.drop(columns=['originstate', 'deststate'])
    df_filtered = extract_hour_from_hhmm(df_filtered, 'crsdeptime', 'dep_hour')
    df_filtered = extract_hour_from_hhmm(df_filtered, 'crsarrtime', 'arr_hour')
    df_filtered = df_filtered.drop(columns=['crsdeptime', 'crsarrtime'])
    df_filtered = add_holiday_features(df_filtered)
    df_filtered = df_filtered.drop(columns=['year', 'flight_date'])
    df_filtered['if_delay'] = np.where(df_filtered['arrdelayminutes'] <= 15, '0', '1').astype(int)
    df_filtered['if_cancelled'] = np.where(df_filtered['cancelled'] == 0, '0', '1').astype(int)
    df_filtered['if_diverted'] = np.where(df_filtered['diverted'] == 0, '0', '1').astype(int)
    df_filtered = df_filtered.drop(columns=['cancelled', 'diverted'])
    df_filtered['nonweatherdelay'] = df_filtered['arrdelayminutes'] - df_filtered['weatherdelay']
    top_airports = get_top_airports(df_filtered)
    df_final = filter_by_top_airports(df_filtered, top_airports)
    
    print("✅ FINAL DATASET CREATED")

In [1]:
import pandas as pd
import numpy as np
import holidays
import os

#############################################################################################################
### FUNCTIONS
#############################################################################################################

### ---------------------------------------------------------------------------------------------------------
### Load parquet files (should have 12 months worth of data)
### ---------------------------------------------------------------------------------------------------------

def load_parquet_files(directory):
    files = [f for f in os.listdir(directory) if f.endswith('.parquet')]
    print("Available parquet files (" + str(len(files)) + "): ", files)
    df = pd.concat([pd.read_parquet(os.path.join(directory, f)) for f in files], ignore_index=True)
    df.columns = df.columns.str.strip().str.replace(' ', '_').str.lower()
    return df

df = load_parquet_files("../data/raw")

Available parquet files (12):  ['flight_data_2024_9.parquet', 'flight_data_2025_3.parquet', 'flight_data_2024_8.parquet', 'flight_data_2025_2.parquet', 'flight_data_2025_1.parquet', 'flight_data_2025_4.parquet', 'flight_data_2024_7.parquet', 'flight_data_2024_12.parquet', 'flight_data_2025_5.parquet', 'flight_data_2024_11.parquet', 'flight_data_2025_6.parquet', 'flight_data_2024_10.parquet']


In [6]:
df.iloc[1:5]

Unnamed: 0,year,quarter,month,dayofmonth,dayofweek,flightdate,reporting_airline,dot_id_reporting_airline,iata_code_reporting_airline,tail_number,...,div4tailnum,div5airport,div5airportid,div5airportseqid,div5wheelson,div5totalgtime,div5longestgtime,div5wheelsoff,div5tailnum,unnamed:_109
1,2024,3,9,1,7,2024-09-01,WN,19393,WN,N7889A,...,,,,,,,,,,
2,2024,3,9,1,7,2024-09-01,WN,19393,WN,N7857B,...,,,,,,,,,,
3,2024,3,9,1,7,2024-09-01,WN,19393,WN,N8567Z,...,,,,,,,,,,
4,2024,3,9,1,7,2024-09-01,WN,19393,WN,N8897K,...,,,,,,,,,,


In [None]:
#############################################################################################################
### SUMMARY STATISTICS DATASET
#############################################################################################################

summary_cols = ['holiday_code', 'dayofweek']

df_summary = df_final.copy().groupby(summary_cols).agg(
    total_flights = ('if_delay', 'count'),
    delayed_flights = ('if_delay', 'sum'),
    cancelled_flights = ('if_cancelled', 'sum'),
    diverted_flights = ('if_diverted', 'sum'),
    total_delay_minutes = ('arrdelayminutes', 'sum'),
    delay_minutes_75th = ('arrdelayminutes', lambda x: int(x.quantile(0.75))),
    delay_minutes_90th = ('arrdelayminutes', lambda x: int(x.quantile(0.90))),
    delay_minutes_95th = ('arrdelayminutes', lambda x: int(x.quantile(0.95))),
    delay_minutes_99th = ('arrdelayminutes', lambda x: int(x.quantile(0.99)))
).reset_index()

# df_summary.to_parquet('../data/processed/summary_dataset.parquet')

df_summary.sort_values(by="delay_minutes_95th")


In [None]:
#############################################################################################################
### MACHINE LEARNING DATASET
#############################################################################################################

ml_cols = ['month', 'dayofweek', 'origin', 'dest', 'reporting_airline', 'dep_hour', 'holiday_code', 'holiday_proximity_bucket', 'arrdelayminutes']

df_ml = df_final[ml_cols].copy()

df_ml.to_parquet('../data/processed/ml_dataset.parquet')



In [None]:
df_ml

In [None]:

summary_sandbox_cols = ['if_near_holiday']

df_summary_sandbox = df_final.copy().groupby(summary_sandbox_cols).agg(
    total_flights = ('if_delay', 'count'),
    delayed_flights = ('if_delay', 'sum'),
    cancelled_flights = ('if_cancelled', 'sum'),
    diverted_flights = ('if_diverted', 'sum'),
    total_delay_minutes = ('arrdelayminutes', 'sum'),
    delay_minutes_90th = ('arrdelayminutes', lambda x: int(x.quantile(0.90))),
    delay_minutes_95th = ('arrdelayminutes', lambda x: int(x.quantile(0.95))),
    delay_minutes_99th = ('arrdelayminutes', lambda x: int(x.quantile(0.99))),
).reset_index()

df_summary_sandbox['avg_delay'] = df_summary_sandbox['total_delay_minutes'] / df_summary_sandbox['delayed_flights']
df_summary_sandbox['delay_percent'] = (100 * df_summary_sandbox['delayed_flights'] / df_summary_sandbox['total_flights']).round(1)
df_summary_sandbox['delay_percent_str'] = (100 * df_summary_sandbox['delayed_flights'] / df_summary_sandbox['total_flights']).round(1).astype(str) + '%'
df_summary_sandbox['cancelled_percent'] = (100 * df_summary_sandbox['cancelled_flights'] / df_summary_sandbox['total_flights']).round(1).astype(str) + '%'
df_summary_sandbox['diverted_percent'] = (100 * df_summary_sandbox['diverted_flights'] / df_summary_sandbox['total_flights']).round(1).astype(str) + '%'

df_summary_sandbox.sort_values(by='delay_percent').tail(20)

#df_summary_sandbox[df_summary_sandbox['origin'] == "STS"].sort_values(by='delay_percent')


In [25]:
#MAPS_DATA_DIR = os.path.join(os.path.dirname(__file__), '..', 'data', 'maps')

df = pd.read_csv('../data/maps/L_AIRPORT.csv')

df['airport_ui'] = df['Description'].str.split(':').str[-1] + ' (' + df['Code'] + ')'

print(df[df['Code'] == 'DCA'])

     Code                                        Description  \
1467  DCA  Washington, DC: Ronald Reagan Washington National   

                                    airport_ui  
1467   Ronald Reagan Washington National (DCA)  


In [1]:
df = pd.read_csv('../data/maps/L_UNIQUE_CARRIERS.csv')

df

NameError: name 'pd' is not defined