In [1]:
import pandas as pd
import os

### Preprocess Data

In [2]:
# Only these fields are read from the raw CSV; every later cell works on
# this subset.
columns = [
    'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
    'OP_CARRIER', 'ORIGIN', 'DEST',
    'DEP_TIME', 'ARR_DELAY',
]
# Rows with a missing value in any of the selected columns are discarded.
rawdata_2018 = (
    pd.read_csv('2018.csv', usecols=columns)
    .dropna()
)

In [3]:
# Build the working frame without touching `rawdata_2018`, so this cell can
# be re-run safely (an in-place overwrite of DEP_TIME would divide the
# already-converted values by 100 again on every re-execution).
#
# DEP_TIME is divided by 100 and rounded to the nearest whole number to get
# HOUR_OF_DAY -- presumably DEP_TIME is encoded as hhmm; verify against the
# dataset description.
# NOTE(review): because this rounds instead of flooring, a value such as
# 1555 maps to 16 rather than 15 -- confirm that this is intended.
d2018 = (
    rawdata_2018
    .assign(DEP_TIME=lambda frame: (frame['DEP_TIME'] / 100).round())
    .rename(columns={'DEP_TIME': 'HOUR_OF_DAY', 'OP_CARRIER': 'IATA_CODE'})
)

In [4]:
# Join the airline lookup table onto the flights via the carrier code, then
# drop the code itself. The merge uses the default (inner) join, so flights
# whose IATA_CODE is absent from airlines.csv are not kept.
d2018 = (
    pd.read_csv('airlines.csv')
    .merge(d2018, on='IATA_CODE')
    .drop(columns='IATA_CODE')
)

### Generate Aggregations

In [5]:
def aggregate_data(df, aggr_cols):
    """Summarise arrival delay per group.

    Groups ``df`` by ``aggr_cols`` and returns one row per group, ordered by
    the group keys, with the mean ``ARR_DELAY`` and the number of rows.

    Only ``ARR_DELAY`` is aggregated. Other columns of ``df`` are ignored, so
    non-numeric extra columns cannot break the mean computation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``aggr_cols`` plus ``ARR_DELAY``.
    aggr_cols : list of str
        Column names to group by.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: the group keys (with ``ORIGIN``, ``DEST`` and
        ``AIRLINE`` renamed to ``origin``, ``dest`` and ``label``),
        ``value`` (mean ``ARR_DELAY`` rounded to one decimal) and
        ``volume`` (row count of the group).
    """
    group_keys = list(aggr_cols)
    # One pass over the groups yields both statistics; groupby sorts by the
    # keys by default, which gives the output its ordering.
    summary = (
        df.groupby(group_keys)['ARR_DELAY']
        .agg(value='mean', volume='size')
        .reset_index()
    )
    summary['value'] = summary['value'].round(decimals=1)
    return summary.rename(columns={"ORIGIN": "origin", "DEST": "dest", 'AIRLINE': "label"})



In [6]:
# Maps each output name to its group-by columns: the route and airline keys
# followed by one time-period column. The output name is the lower-cased
# name of that period column.
aggregations = {
    period_col.lower(): ['ORIGIN', 'DEST', 'AIRLINE', period_col]
    for period_col in ('MONTH', 'DAY_OF_WEEK', 'DAY_OF_MONTH', 'HOUR_OF_DAY')
}

In [7]:
# Write one CSV per entry in `aggregations`. The last group-by column of
# each entry is the time-period column and is renamed to "period".
for period_name, group_cols in aggregations.items():
    period_df = aggregate_data(d2018, group_cols)
    period_df = period_df.rename(columns={group_cols[-1]: "period"})
    out_name = "flights_delay_" + period_name + ".csv"
    period_df.to_csv(out_name, index=False)
    print("Saved " + os.path.join(os.getcwd(), out_name))

Saved /workspace/project/app/data/flights_delay_month.csv
Saved /workspace/project/app/data/flights_delay_day_of_week.csv
Saved /workspace/project/app/data/flights_delay_day_of_month.csv
Saved /workspace/project/app/data/flights_delay_hour_of_day.csv
