In [1]:

import pandas as pd
import numpy as np
import holidays
import os
import itertools

# Load the processed dataset into `df`, the frame that the later cells transform and aggregate.
# NOTE(review): the path is relative, so it resolves against the kernel's working directory.
df = pd.read_parquet('../data/processed/summary_dataset.parquet')

In [2]:
df

Unnamed: 0,if_cancelled,if_diverted,arrdelayminutes,if_delay,airline_ui,origin_ui,destination_ui,month_ui,day_ui,hour_ui
0,0,0,0,0,Southwest Airlines,ATL (Hartsfield-Jackson Atlanta International),AUS (Austin - Bergstrom International),September,Sunday,10:00 AM
1,0,0,0,0,Southwest Airlines,ATL (Hartsfield-Jackson Atlanta International),AUS (Austin - Bergstrom International),September,Sunday,02:00 PM
2,0,0,0,0,Southwest Airlines,ATL (Hartsfield-Jackson Atlanta International),BNA (Nashville International),September,Sunday,12:00 PM
3,0,0,0,0,Southwest Airlines,ATL (Hartsfield-Jackson Atlanta International),BWI (Baltimore/Washington International Thurgo...,September,Sunday,07:00 PM
4,0,0,0,0,Southwest Airlines,ATL (Hartsfield-Jackson Atlanta International),BWI (Baltimore/Washington International Thurgo...,September,Sunday,11:00 AM
...,...,...,...,...,...,...,...,...,...,...
4143938,0,0,4,0,United Air Lines,IAD (Washington Dulles International),EWR (Newark Liberty International),October,Tuesday,06:00 AM
4143939,0,0,0,0,United Air Lines,LAX (Los Angeles International),EWR (Newark Liberty International),October,Tuesday,08:00 AM
4143940,0,0,0,0,United Air Lines,DEN (Denver International),DTW (Detroit Metro Wayne County),October,Tuesday,11:00 AM
4143941,0,0,0,0,United Air Lines,IAH (George Bush Intercontinental/Houston),IAD (Washington Dulles International),October,Tuesday,08:00 PM


In [3]:
# Normalise the 12-hour clock strings: trim both ends and remove inner spaces,
# so that e.g. "10:00 AM" becomes "10:00AM".
hour_text = df['hour_ui'].str.strip().str.replace(" ", "")
df['hour_ui_clean'] = hour_text

# Parse the compact text as a 12-hour time and keep only the hour in 24-hour form.
parsed_times = pd.to_datetime(df['hour_ui_clean'], format='%I:%M%p')
df['hour_parsed'] = parsed_times.dt.hour

# Define time block labeling function
def label_time_block(hour):
    """Return the display label of the three-hour block containing ``hour``.

    Parameters
    ----------
    hour : int or float
        Hour of day on a 24-hour clock (0 <= hour < 24).

    Returns
    -------
    str or None
        One of the eight block labels, or None when ``hour`` is missing (NaN)
        or lies outside 0-23. Such values previously fell through to the final
        branch and were silently labelled as "Night".
    """
    # NaN fails every comparison, so this single range check also rejects it.
    if not 0 <= hour < 24:
        return None

    block_labels = (
        "Late Night (12:00 AM - 3:00 AM)",         # 12AM–3AM
        "Early Morning (3:00 AM - 6:00 AM)",       # 3AM–6AM
        "Morning (6:00 AM - 9:00 AM)",             # 6AM–9AM
        "Late Morning (9:00 AM - 12:00 PM)",       # 9AM–12PM
        "Early Afternoon (12:00 PM - 3:00 PM)",    # 12PM–3PM
        "Late Afternoon (3:00 PM - 6:00 PM)",      # 3PM–6PM
        "Evening (6:00 PM - 9:00 PM)",             # 6PM–9PM
        "Night (9:00 PM - 12:00 AM)",              # 9PM–12AM
    )
    # Blocks are three hours wide and start at midnight.
    return block_labels[int(hour) // 3]

# Map every parsed hour to its three-hour block label
hour_blocks = df['hour_parsed'].apply(label_time_block)
df['hour_block_ui'] = hour_blocks

# The two helper columns were only needed to derive the block label
df = df.drop(['hour_ui_clean', 'hour_parsed'], axis=1)

df

Unnamed: 0,if_cancelled,if_diverted,arrdelayminutes,if_delay,airline_ui,origin_ui,destination_ui,month_ui,day_ui,hour_ui,hour_block_ui
0,0,0,0,0,Southwest Airlines,ATL (Hartsfield-Jackson Atlanta International),AUS (Austin - Bergstrom International),September,Sunday,10:00 AM,Late Morning (9:00 AM - 12:00 PM)
1,0,0,0,0,Southwest Airlines,ATL (Hartsfield-Jackson Atlanta International),AUS (Austin - Bergstrom International),September,Sunday,02:00 PM,Early Afternoon (12:00 PM - 3:00 PM)
2,0,0,0,0,Southwest Airlines,ATL (Hartsfield-Jackson Atlanta International),BNA (Nashville International),September,Sunday,12:00 PM,Early Afternoon (12:00 PM - 3:00 PM)
3,0,0,0,0,Southwest Airlines,ATL (Hartsfield-Jackson Atlanta International),BWI (Baltimore/Washington International Thurgo...,September,Sunday,07:00 PM,Evening (6:00 PM - 9:00 PM)
4,0,0,0,0,Southwest Airlines,ATL (Hartsfield-Jackson Atlanta International),BWI (Baltimore/Washington International Thurgo...,September,Sunday,11:00 AM,Late Morning (9:00 AM - 12:00 PM)
...,...,...,...,...,...,...,...,...,...,...,...
4143938,0,0,4,0,United Air Lines,IAD (Washington Dulles International),EWR (Newark Liberty International),October,Tuesday,06:00 AM,Morning (6:00 AM - 9:00 AM)
4143939,0,0,0,0,United Air Lines,LAX (Los Angeles International),EWR (Newark Liberty International),October,Tuesday,08:00 AM,Morning (6:00 AM - 9:00 AM)
4143940,0,0,0,0,United Air Lines,DEN (Denver International),DTW (Detroit Metro Wayne County),October,Tuesday,11:00 AM,Late Morning (9:00 AM - 12:00 PM)
4143941,0,0,0,0,United Air Lines,IAH (George Bush Intercontinental/Houston),IAD (Washington Dulles International),October,Tuesday,08:00 PM,Evening (6:00 PM - 9:00 PM)


In [None]:
#############################################################################################################
### SUMMARY STATISTICS DATASET
#############################################################################################################

# Dimensions that identify one row of the summary table
summary_cols = ['airline_ui', 'origin_ui', 'destination_ui', 'month_ui', 'day_ui', 'hour_block_ui']

# One row per combination of all six dimensions.
# groupby does not modify its input, so the frame is grouped directly rather than
# duplicating the whole frame with .copy() first.
df_summary = df.groupby(summary_cols).agg(
    total_flights = ('if_delay', 'count'),              # non-null if_delay entries in the group
    delayed_flights = ('if_delay', 'sum'),
    cancelled_flights = ('if_cancelled', 'sum'),
    diverted_flights = ('if_diverted', 'sum'),
    total_delay_minutes = ('arrdelayminutes', 'sum'),
    # delay_minutes_90th = ('arrdelayminutes', lambda x: int(x.quantile(0.90))),
    # delay_minutes_95th = ('arrdelayminutes', lambda x: int(x.quantile(0.95))),
    # delay_minutes_99th = ('arrdelayminutes', lambda x: int(x.quantile(0.99)))
).reset_index()

# NOTE(review): this is the same path that is read into `df` at the top of the notebook;
# enabling this write would overwrite that input file.
# df_summary.to_parquet('../data/processed/summary_dataset.parquet')

df_summary

In [None]:
import pandas as pd
import itertools

# Dimensions a summary can be grouped by, in output-column order
summary_cols = [
    'airline_ui',
    'origin_ui',
    'destination_ui',
    'month_ui',
    'day_ui',
    'hour_block_ui',
]

# Source column -> aggregation(s) handed to groupby().agg()
agg_dict = dict(
    if_delay=['count', 'sum'],
    if_cancelled='sum',
    if_diverted='sum',
    arrdelayminutes='sum',
)

def make_summary(df, group_cols):
    """Aggregate ``df`` over ``group_cols`` and pad the result to the full schema.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named in ``agg_dict`` and in ``group_cols``.
    group_cols : iterable of str
        Grouping columns; tuples (e.g. from itertools.combinations) are accepted.

    Returns
    -------
    pandas.DataFrame
        Every column of ``summary_cols`` first (the string "None" where a
        dimension was not grouped on), then the flattened aggregate columns.

    NOTE(review): a later cell defines another function with this same name,
    which replaces this one once that cell has run.
    """
    # groupby wants a list of keys; a tuple would be treated as a single key
    keys = list(group_cols)

    aggregated = df.groupby(keys, dropna=False).agg(agg_dict).reset_index()

    # Collapse the two-level column index into names such as "if_delay_sum";
    # the grouping columns have an empty second level, hence the strip
    aggregated.columns = [
        '_'.join(parts).strip('_')
        for parts in aggregated.columns.to_flat_index()
    ]

    # Dimensions that were not grouped on get the placeholder string "None"
    for missing in (col for col in summary_cols if col not in keys):
        aggregated[missing] = "None"

    # Consistent schema: all dimensions first, then the remaining columns
    metric_cols = [col for col in aggregated.columns if col not in summary_cols]
    return aggregated[summary_cols + metric_cols]

# Build one summary for every 4-column subset of the grouping dimensions
summaries = []
for group_cols in itertools.combinations(summary_cols, 4):
    summaries.append(make_summary(df, group_cols))

# Stack the rows of every subset into a single frame
df_summary_6 = pd.concat(summaries, ignore_index=True)


In [None]:
# final_summary

#df_summary_6.to_parquet('test_dataset.parquet')

In [9]:

# Dimensions a summary can be grouped by, in output-column order
summary_cols = [
    'airline_ui',
    'origin_ui',
    'destination_ui',
    'month_ui',
    'day_ui',
    'hour_block_ui',
]

# Bin edges for the arrdelayminutes histogram; the last bin is open-ended
hist_bins = [0, 60, 120, 300, np.inf]

def compute_hist(arr, bins=None):
    """Count how many values of ``arr`` fall into each histogram bin.

    Parameters
    ----------
    arr : array-like of numbers
        Values to bin (the notebook passes ``arrdelayminutes``).
    bins : sequence of bin edges, optional
        Edges passed to ``numpy.histogram``. Defaults to the notebook-level
        ``hist_bins``.

    Returns
    -------
    numpy.ndarray
        One uint32 count per bin (``len(bins) - 1`` entries).
    """
    edges = hist_bins if bins is None else bins
    counts, _ = np.histogram(arr, bins=edges)
    # uint16 silently wraps above 65,535, which corrupts the counts of any
    # group with more rows than that in one bin; uint32 holds about 4.29 billion.
    return counts.astype(np.uint32)

def make_summary(df, group_cols):
    """Summarise flight outcomes for one choice of grouping columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``if_delay``, ``if_cancelled``, ``if_diverted`` and
        ``arrdelayminutes``, plus every column named in ``group_cols``.
    group_cols : list of str
        Columns to group by. An empty list yields a single overall row.

    Returns
    -------
    pandas.DataFrame
        All ``summary_cols`` (None where a dimension was not grouped on),
        followed by total_flights, the three outcome sums, the 90th/95th/99th
        percentiles of ``arrdelayminutes`` and its histogram from ``compute_hist``.
    """
    metric_cols = [
        'total_flights', 'if_delay_sum', 'if_cancelled_sum', 'if_diverted_sum',
        'arrdelayminutes_p90', 'arrdelayminutes_p95', 'arrdelayminutes_p99',
        'arrdelayminutes_hist'
    ]

    if not group_cols:
        # Overall summary: one row with every grouping column left as None
        delays = df['arrdelayminutes']
        overall = {col: [None] for col in summary_cols}
        overall['total_flights'] = [len(df)]
        overall['if_delay_sum'] = [df['if_delay'].sum()]
        overall['if_cancelled_sum'] = [df['if_cancelled'].sum()]
        overall['if_diverted_sum'] = [df['if_diverted'].sum()]
        overall['arrdelayminutes_p90'] = [np.percentile(delays, 90)]
        overall['arrdelayminutes_p95'] = [np.percentile(delays, 95)]
        overall['arrdelayminutes_p99'] = [np.percentile(delays, 99)]
        overall['arrdelayminutes_hist'] = [compute_hist(delays)]
        return pd.DataFrame(overall)[summary_cols + metric_cols]

    # Normal case: aggregate per group, keeping rows whose keys are missing
    by_group = df.groupby(group_cols, dropna=False)
    named_aggs = {
        'if_delay_sum': ('if_delay', 'sum'),
        'if_cancelled_sum': ('if_cancelled', 'sum'),
        'if_diverted_sum': ('if_diverted', 'sum'),
        'total_flights': ('if_delay', 'count'),
        'arrdelayminutes_p90': ('arrdelayminutes', lambda x: np.percentile(x, 90)),
        'arrdelayminutes_p95': ('arrdelayminutes', lambda x: np.percentile(x, 95)),
        'arrdelayminutes_p99': ('arrdelayminutes', lambda x: np.percentile(x, 99)),
    }
    result = by_group.agg(**named_aggs).reset_index()

    # Histograms come from a separate apply and are attached by position, which
    # relies on both results listing the groups in the same order
    result['arrdelayminutes_hist'] = by_group['arrdelayminutes'].apply(compute_hist).values

    # Dimensions that were not grouped on are filled with None
    for col in summary_cols:
        if col not in group_cols:
            result[col] = None

    # Same column order for every summary so they can be concatenated
    return result[summary_cols + metric_cols]


# Generate all summaries: the overall (0-field) row first, then every
# combination of 1 to 4 grouping columns
grouping_sets = [[]]
for n_fields in range(1, 5):
    grouping_sets.extend(
        list(combo) for combo in itertools.combinations(summary_cols, n_fields)
    )

all_summaries = [make_summary(df, cols) for cols in grouping_sets]

# Stack every summary into one frame with a fresh index
final_summary = pd.concat(all_summaries, ignore_index=True)

# Persist the combined table
final_summary.to_parquet('summary_1to4.parquet', index=False)


In [22]:
#final_summary_nohist = final_summary.drop(columns=['arrdelayminutes_hist'])

num_cols = [
    'if_delay_sum', 'if_cancelled_sum', 'if_diverted_sum',
    'arrdelayminutes_p90', 'arrdelayminutes_p95', 'arrdelayminutes_p99',
]
cat_cols = ['airline_ui', 'origin_ui', 'destination_ui', 'month_ui', 'day_ui', 'hour_block_ui']

# Cast on a new frame so final_summary itself keeps its original dtypes:
# counts and percentiles become integers, the dimensions become categoricals
dtype_map = {col: int for col in num_cols}
dtype_map.update({col: 'category' for col in cat_cols})

# NOTE(review): despite the name, the histogram column is still present here
# because the drop above is commented out - confirm this is intended
final_summary_nohist = final_summary.astype(dtype_map)

final_summary_nohist.to_parquet('nohist.parquet', index=False)

#final_summary.to_parquet('test_dataset.parquet')