## Setup

In [43]:
# imports
import os
import zipfile
import pandas as pd
import numpy as np
import glob as glob

In [44]:
# function to open zipped file and read into df
def unzip_to_df(zip_filepath, file_inside_zip, **read_csv_kwargs):
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as z:
            with z.open(file_inside_zip) as f:
                df = pd.read_csv(f, dtype={'Manufacturer Code': str}, **read_csv_kwargs)
        return df
    except Exception as e:
        print(f'Error occured: {e}')
        return None

In [45]:
# get df
zip_path = '../Work/DB_Work/Output/flight_delays.zip'
file_name = 'flight_delays.csv'

delays_df = unzip_to_df(zip_path, file_name)

In [46]:
# display
delays_df.head()

Unnamed: 0,Delay Bin,Departure Delay,Date,Year,Month,Day,Day of Week,Month (sin),Month (cos),Day (sin),...,Air Temperature,Dew Point Temperature,Relative Humidity,Wind Speed,Wind Direction,Wind Gust,Visibility,Ceiling,Sea Level Pressure,Weather Condition Code
0,Early,-2,01/01/2020,2020,1,1,2,0.5,0.866025,0.201299,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
1,11-30 min,11,01/01/2020,2020,1,1,2,0.5,0.866025,0.201299,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
2,31-60 min,45,01/01/2020,2020,1,1,2,0.5,0.866025,0.201299,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
3,Early,-8,01/01/2020,2020,1,1,2,0.5,0.866025,0.201299,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
4,Early,-11,01/01/2020,2020,1,1,2,0.5,0.866025,0.201299,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,


# Start Airport
---

In [47]:
# get weather df
weather_path = '../Work/DB_Work/Data/Weather/LAX_KLAX_1997-2025.csv'
weather_df = pd.read_csv(
    weather_path,
    usecols=[
        'Date', 'Time','date_time', 'precip_accum_one_hour', 'precip_accum_six_hour', 'air_temp', 'dew_point_temperature', 
        'relative_humidity', 'wind_speed', 'wind_direction', 'wind_gust', 'visibility', 'ceiling', 'sea_level_pressure', 
        'weather_cond_code'
    ],
    dtype={
        'weather_cond_code': 'Int64',
        'ceiling': 'Int64'
    }  #make sure it gets read as nullable int type
)
# display
weather_df.head()

Unnamed: 0,Date,Time,date_time,precip_accum_one_hour,air_temp,relative_humidity,wind_speed,wind_direction,sea_level_pressure,visibility,weather_cond_code,wind_gust,precip_accum_six_hour,ceiling,dew_point_temperature
0,03/20/25,17:53,03/20/25-17:53,,59.0,62.1,11.51,240.0,1017.2,10.0,,,,,46.04
1,03/20/25,16:53,03/20/25-16:53,,60.98,51.51,13.81,240.0,1017.3,10.0,,,,,42.98
2,03/20/25,15:53,03/20/25-15:53,,62.06,51.67,17.26,250.0,1017.7,10.0,,,,,44.06
3,03/20/25,14:53,03/20/25-14:53,,62.96,50.06,13.81,260.0,1018.3,10.0,,,,,44.06
4,03/20/25,13:53,03/20/25-13:53,,62.06,53.48,12.66,260.0,1018.9,10.0,,,,,44.96


## Data Check
---

### Data types

In [48]:
# data types
weather_df.dtypes

Date                      object
Time                      object
date_time                 object
precip_accum_one_hour    float64
air_temp                 float64
relative_humidity        float64
wind_speed               float64
wind_direction           float64
sea_level_pressure       float64
visibility               float64
weather_cond_code          Int64
wind_gust                float64
precip_accum_six_hour    float64
ceiling                    Int64
dew_point_temperature    float64
dtype: object

In [49]:
# create datetime col and remove old one
weather_df['DateTime'] = pd.to_datetime(weather_df['date_time'], format='%m/%d/%y-%H:%M')
weather_df['Hour'] = pd.to_datetime(weather_df['Time'], format='%H:%M').dt.round('h').dt.hour
weather_df = weather_df.drop(columns=['date_time']).copy()

# display
weather_df.head()

Unnamed: 0,Date,Time,precip_accum_one_hour,air_temp,relative_humidity,wind_speed,wind_direction,sea_level_pressure,visibility,weather_cond_code,wind_gust,precip_accum_six_hour,ceiling,dew_point_temperature,DateTime,Hour
0,03/20/25,17:53,,59.0,62.1,11.51,240.0,1017.2,10.0,,,,,46.04,2025-03-20 17:53:00,18
1,03/20/25,16:53,,60.98,51.51,13.81,240.0,1017.3,10.0,,,,,42.98,2025-03-20 16:53:00,17
2,03/20/25,15:53,,62.06,51.67,17.26,250.0,1017.7,10.0,,,,,44.06,2025-03-20 15:53:00,16
3,03/20/25,14:53,,62.96,50.06,13.81,260.0,1018.3,10.0,,,,,44.06,2025-03-20 14:53:00,15
4,03/20/25,13:53,,62.06,53.48,12.66,260.0,1018.9,10.0,,,,,44.96,2025-03-20 13:53:00,14


### Remove old dates

In [50]:
# filter out old dates
yr_threshold = 2019
weather_df = weather_df[weather_df['DateTime'].dt.year >= yr_threshold].copy()

### Check nulls

In [51]:
# check nulls and len
print(f'Number of rows: {len(weather_df)}')
weather_df.isna().sum()

Number of rows: 63383


Date                         0
Time                         0
precip_accum_one_hour    59529
air_temp                     2
relative_humidity            3
wind_speed                  19
wind_direction            3218
sea_level_pressure        9135
visibility                   7
weather_cond_code        51899
wind_gust                61345
precip_accum_six_hour    62682
ceiling                  34422
dew_point_temperature        9
DateTime                     0
Hour                         0
dtype: int64

## Analyzing nulls
---

### Check if nulls cover large ranges

In [52]:
# function to check nulls per col depending on date group
def null_check(main_df, date_grp_cols):
    # copy df
    df = main_df.copy()

    # groupby day--> see if there are still nulls for that entire day
    null_check_df = df.groupby(date_grp_cols).count()

    # display if which columns don't have a single value for a year (count)
    columns = list(null_check_df.columns)
    null_col_cnt = []
    for col in columns:
        null_time = len(null_check_df[null_check_df[col] == 0])

        # only add columns that have nulls
        if null_time > 0:
            null_dict = {
                'Column': col,
                f'Null {date_grp_cols}': null_time,
                '% Null': round(null_time/len(null_check_df) * 100, 2)
            }
        
            null_col_cnt.append(null_dict)
    
    # sort by % null
    sorted_null_cnt = sorted(null_col_cnt, key=lambda x: x['% Null'], reverse=True)

    # display
    print(f'Number of {date_grp_cols}: {len(null_check_df)}')

    # return
    return sorted_null_cnt

##### Checking if there any valid column values for the entire day

In [53]:
null_check(weather_df, 'Date')

Number of Date: 2271


[{'Column': 'precip_accum_six_hour', 'Null Date': 1918, '% Null': 84.46},
 {'Column': 'precip_accum_one_hour', 'Null Date': 1909, '% Null': 84.06},
 {'Column': 'wind_gust', 'Null Date': 1612, '% Null': 70.98},
 {'Column': 'weather_cond_code', 'Null Date': 1278, '% Null': 56.27},
 {'Column': 'ceiling', 'Null Date': 465, '% Null': 20.48}]

##### Checking if there any valid column values for each hour per day

In [54]:
null_check(weather_df, ['Date', 'Hour'])

Number of ['Date', 'Hour']: 54350


[{'Column': 'precip_accum_six_hour',
  "Null ['Date', 'Hour']": 53649,
  '% Null': 98.71},
 {'Column': 'wind_gust', "Null ['Date', 'Hour']": 52534, '% Null': 96.66},
 {'Column': 'precip_accum_one_hour',
  "Null ['Date', 'Hour']": 52008,
  '% Null': 95.69},
 {'Column': 'weather_cond_code',
  "Null ['Date', 'Hour']": 47070,
  '% Null': 86.61},
 {'Column': 'ceiling', "Null ['Date', 'Hour']": 31962, '% Null': 58.81},
 {'Column': 'wind_direction', "Null ['Date', 'Hour']": 2501, '% Null': 4.6},
 {'Column': 'sea_level_pressure', "Null ['Date', 'Hour']": 110, '% Null': 0.2},
 {'Column': 'wind_speed', "Null ['Date', 'Hour']": 5, '% Null': 0.01},
 {'Column': 'relative_humidity', "Null ['Date', 'Hour']": 1, '% Null': 0.0},
 {'Column': 'visibility', "Null ['Date', 'Hour']": 2, '% Null': 0.0},
 {'Column': 'dew_point_temperature',
  "Null ['Date', 'Hour']": 1,
  '% Null': 0.0}]

### Check if nulls mean 0 --> **precip_accum** and **wind_gust** are null are likely 0

In [55]:
# check for following columns if 0 ever shows up 
print(f"precip_accum_one_hour: {len(weather_df[weather_df['precip_accum_one_hour'] == 0])}")
print(f"precip_accum_six_hour: {len(weather_df[weather_df['precip_accum_six_hour'] == 0])}")
print(f"ceiling: {len(weather_df[weather_df['ceiling'] == 0])}")
print(f"wind_direction: {len(weather_df[weather_df['wind_direction'] == 0])}")
print(f"wind_gust: {len(weather_df[weather_df['wind_gust'] == 0])}")
print(f"weather_cond_code: {len(weather_df[weather_df['weather_cond_code'] == 0])}")

precip_accum_one_hour: 0
precip_accum_six_hour: 0
ceiling: 56
wind_direction: 7847
wind_gust: 0
weather_cond_code: 0


### Look into **weather_cond_code** --> should DROP it!!!

##### Wind cond code--> maybe na is supposed to be 0 which means 'no value': https://docs.synopticdata.com/services/weather-condition-codes

In [56]:
# looking into unique values (have some that are out of normal weather condition codes)
weather_df['weather_cond_code'].unique()

<IntegerArray>
[  <NA>,   2493,      6,     31,     13,   2481,    493,      9,   2489,
   2494,      1,    733,   2497,     17,    751,  60093,      5,    737,
 198485, 199445,     77,   2546,   2482,   1045,     14, 199525,   2558,
    801,      7,    569,    494,     66,      8,   2485,   2557,    734,
     78,    566,    591,     85]
Length: 40, dtype: Int64

### Check **ceiling**--> correlation with visibility

##### Ceiling = 0 seems to correlate with lower visibility range but null ceiling values are often quite high (may be above detection range)
- nulls can mean "unlimited/very high ceiling" --> indicating low chance of weather related delays

In [57]:
# check visibility for ceiling = 0
weather_df[weather_df['ceiling'] == 0].visibility.value_counts()

visibility
0.50    21
0.25    14
0.12     6
1.00     4
1.50     3
8.00     3
0.75     2
0.06     1
1.75     1
2.00     1
Name: count, dtype: int64

In [58]:
# check visibility for ceiling is null
weather_df[weather_df['ceiling'].isna()].visibility.value_counts()

visibility
10.00    29000
9.00      1668
8.00      1195
7.00       848
6.00       589
5.00       394
4.00       215
3.00       194
2.50        85
2.00        59
1.50        41
1.00        30
0.75        27
0.50        26
0.25        18
1.25        14
1.75        11
15.00        1
3.50         1
0.12         1
Name: count, dtype: int64

### Check if **wind_direction** being null correlates with wind strength (wind_speed, wind_gust)

##### Wind gust --> not much of a pattern

In [59]:
# wind_direction is null--> see wind gust values
weather_df[weather_df['wind_direction'].isna()].wind_gust.value_counts()

wind_gust
17.26    7
18.41    3
16.11    3
20.71    2
21.86    2
19.56    1
Name: count, dtype: int64

In [60]:
# wind_direction is NOT null--> see wind gust values
weather_df[weather_df['wind_direction'].notna()].wind_gust.value_counts()

wind_gust
20.71    225
21.86    184
23.02    182
25.32    142
24.17    128
18.41    128
19.56    127
26.47     87
28.77     77
27.62     63
31.07     60
17.26     56
29.92     54
32.22     53
19.57     36
34.52     35
16.11     34
33.37     34
26.46     28
24.16     28
35.67     27
27.63     24
36.82     24
37.98     23
17.27     19
21.85     18
39.13     16
41.43     15
40.28     14
42.58     13
29.93     13
33.38     12
32.21     11
43.73     10
46.03      5
39.12      4
44.88      3
40.29      3
35.68      3
48.33      1
44.87      1
Name: count, dtype: int64

##### Wind speed --> not much of a pattern

In [61]:
# wind_direction is null--> see wind speed values
weather_df[weather_df['wind_direction'].isna()].wind_speed.value_counts()

wind_speed
3.45    1192
4.60     711
5.75     464
3.44     372
4.61     237
6.91     223
Name: count, dtype: int64

In [62]:
# wind_direction is NOT null--> see wind speed values
weather_df[weather_df['wind_direction'].notna()].wind_speed.value_counts()

wind_speed
0.00     7846
5.75     6393
4.60     5704
6.91     5266
3.45     4996
10.36    4099
8.06     3528
9.21     3353
12.66    3296
11.51    2915
13.81    1868
4.61     1477
3.44     1298
14.96    1291
16.11    1069
8.05     1024
9.22      941
11.50     871
13.80     504
17.26     496
18.41     381
14.97     356
19.56     205
20.71     174
17.27     131
21.86     106
23.02     105
25.32      70
24.17      69
19.57      62
26.47      40
28.77      37
21.85      30
29.92      27
27.62      26
24.16      23
31.07      17
27.63      13
26.46      12
32.22      11
33.37       9
35.67       6
36.82       6
34.52       5
32.21       3
29.93       2
39.13       1
40.28       1
41.43       1
35.68       1
Name: count, dtype: int64

##### Most likely nulls from sensor or reporting gaps

### Check time gaps for other cols --> is it safe to interpolation?

In [63]:
def find_time_gaps(main_df, col_name):
    # make df copy
    df = main_df.copy()

    # define cols
    miss_col = f'{col_name}_missing'

    # sort by time
    df = df.sort_values('DateTime').reset_index(drop=True)

    # create missing flag
    df[miss_col] = df[col_name].isna()

    # compare each row's missing flag to previous row
    df['block_id'] = (
        df[miss_col] != df[miss_col].shift()
    ).cumsum()

    # filter blocks for miss_col
    missing_blocks = df[df[miss_col]].copy()

    # summarize each missing block
    gap_summary = (
        missing_blocks
        .groupby('block_id')
        .agg(
            start_time = ('DateTime', 'min'),
            end_time = ('DateTime', 'max'),
            gap_hrs = ('DateTime', 'count')
        )
        .reset_index()
    )

    # display
    display(gap_summary['gap_hrs'].value_counts())

#### sea level pressure --> mostly ok

In [64]:
find_time_gaps(weather_df, 'sea_level_pressure')

gap_hrs
1     5500
2     1110
3      286
4       91
5       18
6        5
7        3
15       1
27       1
10       1
Name: count, dtype: int64

##### rest cols --> ok

In [65]:
find_time_gaps(weather_df, 'wind_direction')

gap_hrs
1    2427
2     309
3      41
5       6
4       5
Name: count, dtype: int64

In [66]:
find_time_gaps(weather_df, 'wind_speed')

gap_hrs
1    19
Name: count, dtype: int64

In [67]:
find_time_gaps(weather_df, 'air_temp')

gap_hrs
1    2
Name: count, dtype: int64

In [68]:
find_time_gaps(weather_df, 'relative_humidity')

gap_hrs
1    3
Name: count, dtype: int64

In [69]:
find_time_gaps(weather_df, 'visibility')

gap_hrs
1    7
Name: count, dtype: int64

In [70]:
find_time_gaps(weather_df, 'dew_point_temperature')

gap_hrs
1    7
2    1
Name: count, dtype: int64

## Handle Nulls
---

### Define function to fill nulls

In [71]:
# fix nulls
def fill_nulls(main_df):

    # ------------------------------------
    # Setup DF
    # ------------------------------------
    df = main_df.copy()  #make a copy

    # sort chronologically
    df = df.sort_values('DateTime')
    df = df.set_index('DateTime')

    # ------------------------------------
    # Fill nulls that are supposed to be 0
    # ------------------------------------
    fillna_0_cols = ['precip_accum_one_hour', 'precip_accum_six_hour', 'wind_gust']

    # show step
    print(f'1) filling nulls with 0 for: {fillna_0_cols}')

    # loop through
    for col in fillna_0_cols:
        df[col] = df[col].fillna(0)

    # -----------------------------------------
    # Drop col with too many nulls/invalid vals
    # -----------------------------------------
    # show step
    print('2) weather_cond_code--> dropping column')
    df = df.drop(columns=['weather_cond_code']).copy()

    # ---------------------------------------------------------
    # Ceiling: fill with high value & add missingness indicator
    # ---------------------------------------------------------
    # add missingness indicator col
    print('3a) ceiling--> adding missingness indicator')
    df['ceiling_missing'] = df['ceiling'].isna().astype(int)

    # fill nulls in og col with high val
    high_ceiling = 35000
    print(f'3b) ceiling--> filling nulls with {high_ceiling} (max ceiling is {df['ceiling'].max()})')
    df['ceiling'] = df['ceiling'].fillna(high_ceiling)

    # ------------------------------------------------------------
    # Wind Direction: Interpolate sin/cos (b/c it's 0-360 degrees)
    # ------------------------------------------------------------
    print('4) wind direction--> interpolate sin/cos')

    # storing sin/cos to handle cyclical nature
    df['wind_dir_rad'] = np.deg2rad(df['wind_direction'])
    df['wind_dir_sin'] = np.sin(df['wind_dir_rad'])
    df['wind_dir_cos'] = np.cos(df['wind_dir_rad'])

    # interpolate sin/cos
    df['wind_dir_sin'] = df['wind_dir_sin'].interpolate(method='time')
    df['wind_dir_cos'] = df['wind_dir_cos'].interpolate(method='time')

    # recompute direction
    df['wind_direction_interp'] = np.rad2deg(
        np.arctan2(df['wind_dir_sin'], df['wind_dir_cos'])
    )

    # convert values into 0-360 degrees
    df['wind_direction_interp'] = (df['wind_direction_interp'] + 360) % 360
    df['wind_direction_interp'] = df['wind_direction_interp'].round(1)

    # # drop old wind_dir col and sin/cos/rad cols
    # df = df.drop(columns=[
    #     'wind_direction',
    #     'wind_dir_rad',
    #     'wind_dir_sin',
    #     'wind_dir_cos'
    # ]).copy()
    # drop wind direction and rad cols
    df = df.drop(columns=[
        'wind_direction',
        #'wind_direction_interp',
        'wind_dir_rad'
    ]).copy()

    # ------------------------------------------------------------
    # Rest of nulls: Interpolate for continuous non-cylical cols
    # ------------------------------------------------------------
    # for sea level add missing column
    print('5a) sea_level_pressure--> adding missingness indicator')
    df['sea_level_pressure_missing'] = df['sea_level_pressure'].isna().astype(int)

    # interpolate for leftover cols
    # loop through
    leftover_cols = list(df.columns[df.isna().any()])
    print(f'5b) interpolate rest: {leftover_cols}')
    for col in leftover_cols:
        # interpolate
        df[col] = df[col].interpolate(method='time')
    
    print('-----------------------------------------')

    # return
    return df

In [72]:
# apply
weather2_df = fill_nulls(weather_df)
print(f'Number of rows: {len(weather2_df)}')

# display nulls
weather2_df.isna().sum()

1) filling nulls with 0 for: ['precip_accum_one_hour', 'precip_accum_six_hour', 'wind_gust']
2) weather_cond_code--> dropping column
3a) ceiling--> adding missingness indicator
3b) ceiling--> filling nulls with 35000 (max ceiling is 33000)
4) wind direction--> interpolate sin/cos
5a) sea_level_pressure--> adding missingness indicator
5b) interpolate rest: ['air_temp', 'relative_humidity', 'wind_speed', 'sea_level_pressure', 'visibility', 'dew_point_temperature']
-----------------------------------------
Number of rows: 63383


Date                          0
Time                          0
precip_accum_one_hour         0
air_temp                      0
relative_humidity             0
wind_speed                    0
sea_level_pressure            1
visibility                    0
wind_gust                     0
precip_accum_six_hour         0
ceiling                       0
dew_point_temperature         0
Hour                          0
ceiling_missing               0
wind_dir_sin                  0
wind_dir_cos                  0
wind_direction_interp         0
sea_level_pressure_missing    0
dtype: int64

### Check nulls

In [73]:
null_check(weather2_df, 'Date')

Number of Date: 2271


[]

In [74]:
null_check(weather2_df, ['Date', 'Hour'])

Number of ['Date', 'Hour']: 54350


[]

## Ouput
---

### drop columns

In [75]:
# drop
drop_cols = ['Date', 'Time','Hour']
weather2_df = weather2_df.drop(columns=drop_cols).copy()

### display df

In [76]:
weather2_df.head()

Unnamed: 0_level_0,precip_accum_one_hour,air_temp,relative_humidity,wind_speed,sea_level_pressure,visibility,wind_gust,precip_accum_six_hour,ceiling,dew_point_temperature,ceiling_missing,wind_dir_sin,wind_dir_cos,wind_direction_interp,sea_level_pressure_missing
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-01-01 00:49:00,0.0,51.8,18.6,4.61,,10.0,0.0,0.0,35000,10.4,1,1.0,6.123234000000001e-17,90.0,1
2019-01-01 00:53:00,0.0,51.08,18.8,4.61,1015.3,10.0,0.0,0.0,35000,10.04,1,0.984808,-0.1736482,100.0,0
2019-01-01 01:53:00,0.0,51.98,18.18,0.0,1016.0,10.0,0.0,0.0,35000,10.04,1,0.0,1.0,0.0,0
2019-01-01 02:53:00,0.0,48.92,21.2,5.75,1015.9,10.0,0.0,0.0,35000,10.94,1,0.173648,-0.9848078,170.0,0
2019-01-01 03:53:00,0.0,42.98,36.13,3.44,1016.1,10.0,0.0,0.0,35000,17.96,1,0.766044,-0.6427876,130.0,0


### export as zip

In [77]:
# export as zipped file
weather2_df.to_csv(
    'Output/Weather/LAX_2019-2024_cleaned.zip',
    compression={
        'method': 'zip',
        'archive_name': 'LAX_2019-2024_cleaned.csv'
    }
)

# Destination Airports
---

## Get destination airports weather csv's

### get station ids

In [78]:
# definition to convert stations from IATA code (like LAX) to station id (like KLAX)
def get_station_id(iata_code: str) -> str:
    # special mappings for Hawaii & Alaska
    special_map = {
        # Alaska
        "ANC": "PANC",
        
        # Hawaii
        "HNL": "PHNL",
        "KOA": "PHKO",
        "LIH": "PHLI",
        "OGG": "PHOG",
        "ITO": "PHTO",
    }
    
    # return special map value, if not then add K
    return special_map.get(iata_code, f"K{iata_code}")

In [79]:
# create list of station ids from list of all station codes
station_codes = list(delays_df['Destination Airport'].unique())
station_ids = []

for station_code in station_codes:
    # add to list 
    station_ids.append(get_station_id(station_code))

# display
station_ids

['KEWR',
 'KCLT',
 'KMIA',
 'KJFK',
 'KMSP',
 'KORD',
 'KIAH',
 'KDEN',
 'KDFW',
 'KAUS',
 'KBOS',
 'KSLC',
 'KPHX',
 'KIAD',
 'KHOU',
 'KMSY',
 'KATL',
 'KLAS',
 'KBWI',
 'KSFO',
 'KDTW',
 'KSMF',
 'PHNL',
 'KOAK',
 'KSEA',
 'KTUS',
 'KMDW',
 'KSTL',
 'KMCO',
 'KSJC',
 'KPHL',
 'KPDX',
 'PHOG',
 'KASE',
 'KJAC',
 'KSAN',
 'KCLE',
 'KDCA',
 'KFLL',
 'PHKO',
 'KTPA',
 'KBNA',
 'PHLI',
 'KDAL',
 'KMCI',
 'KSTS',
 'KMSN',
 'KSAT',
 'KFAT',
 'KRDM',
 'KEGE',
 'KEUG',
 'KMFR',
 'KSBA',
 'KELP',
 'KBZN',
 'KCMH',
 'KABQ',
 'KSDF',
 'KRDU',
 'KIND',
 'KSBP',
 'KMSO',
 'KMRY',
 'KBOI',
 'KMKE',
 'KSUN',
 'KHDN',
 'KCOS',
 'KCVG',
 'KMMH',
 'KPAE',
 'KPSC',
 'KMEM',
 'KRNO',
 'PHTO',
 'KBUF',
 'KOMA',
 'KSGU',
 'KACV',
 'KPRC',
 'KSCK',
 'KRDD',
 'KBDL',
 'KPSP',
 'PANC',
 'KMTJ',
 'KGEG',
 'KPIT',
 'KOKC',
 'KFCA',
 'KPBI',
 'KRSW',
 'KRIC',
 'KCHS',
 'KJAX',
 'KTUL',
 'KGJT',
 'KDRO',
 'KSAF',
 'KLGA',
 'KBTR',
 'KSBN',
 'KXNA',
 'KBIH',
 'KBHM',
 'KFLG',
 'KCID',
 'KDSM',
 'KBUR']

In [80]:
len(station_ids)

110

### convert csv's to dfs

In [81]:
# definition to convert stations from station id (like KLAX) to IATA code (like LAX)
def get_station_code(station_id: str) -> str:
    # special mappings for Hawaii & Alaska
    special_map = {
        # Alaska
        "PANC": "ANC",
        
        # Hawaii
        "PHNL": "HNL",
        "PHKO": "KOA",
        "PHLI": "LIH",
        "PHOG": "OGG",
        "PHTO": "ITO",
    }
    
    # return special map value, if not then add K
    return special_map.get(station_id, f"{station_id[1:]}")

In [82]:
# get input path
weather_folder = '../Resources/Weather'

# get csv files
csv_files = glob.glob(os.path.join(weather_folder, '*.csv'))

# cols to select
desired_cols = [
    'Date', 'Time','date_time', 'precip_accum_one_hour', 'precip_accum_six_hour', 'air_temp', 'dew_point_temperature', 
    'relative_humidity', 'wind_speed', 'wind_direction', 'wind_gust', 'visibility', 'ceiling', 'sea_level_pressure', 
    'weather_cond_code'
]

# dict to hold dataframes
destination_dfs = {}

try: 
    for file_path in csv_files:
        file_name = os.path.basename(file_path)
        # get key name
        station_id = file_name.split('_')[0]
        station_code = get_station_code(station_id)

        # read csv into df
        df = pd.read_csv(
            file_path,
            skiprows=[1],
            usecols=desired_cols,
            dtype={
                'weather_cond_code': 'Int64',
                'ceiling': 'Int64'
            }  #make sure it gets read as nullable int type
        )

        # reorder cols
        df = df[desired_cols]

        # store dict: station id is key and df is value
        destination_dfs[station_code] = df

        # print
        print(f'Done loading for: {station_code}')
except Exception as e:
    print(f'Error reading {station_code}: {e}')

Done loading for: ABQ
Done loading for: ACV
Done loading for: ASE
Done loading for: ATL
Done loading for: AUS
Done loading for: BDL
Done loading for: BHM
Done loading for: BIH
Done loading for: BIL
Done loading for: BLI
Done loading for: BNA
Done loading for: BOI
Done loading for: BOS
Done loading for: BTR
Done loading for: BUF
Done loading for: BUR
Done loading for: BWI
Done loading for: BZN
Done loading for: CHS
Done loading for: CID
Done loading for: CLE
Done loading for: CLT
Done loading for: CMH
Done loading for: COS
Done loading for: CVG
Done loading for: DAL
Done loading for: DCA
Done loading for: DEN
Done loading for: DFW
Done loading for: DRO
Done loading for: DSM
Done loading for: DTW
Done loading for: EGE
Done loading for: ELP
Done loading for: EUG
Done loading for: EWR
Done loading for: FAR
Done loading for: FAT
Done loading for: FLG
Done loading for: FLL
Done loading for: FSD
Done loading for: GEG
Done loading for: GJT
Done loading for: GRR
Done loading for: HDN
Done loadi

### Missing

In [116]:
# get input path
weather_folder = '../Resources/Weather'

# get csv files
csv_files = glob.glob(os.path.join(weather_folder, '*.csv'))

# cols to select
desired_cols = [
    'Date', 'Time','date_time', 'precip_accum_one_hour', 'precip_accum_six_hour', 'air_temp', 'dew_point_temperature', 
    'relative_humidity', 'wind_speed', 'wind_direction', 'wind_gust', 'visibility', 'ceiling', 'sea_level_pressure', 
    'weather_cond_code'
]

# dict to hold dataframes
destination_dfs = {}
missing_cols_dict = {}

try: 
    for file_path in csv_files:
        file_name = os.path.basename(file_path)
        # get key name
        station_id = file_name.split('_')[0]
        station_code = get_station_code(station_id)

        # read csv into df
        raw_df = pd.read_csv(
            file_path,
            skiprows=[1]
        )

        # see which cols the df has that you want
        actual_cols = set(raw_df.columns)
        keep_cols = list(actual_cols.intersection(desired_cols))

        # subset to those existing columns
        df = raw_df[keep_cols].copy()

        # identify which desired columns are missing 
        missing_cols = list(set(desired_cols) - actual_cols)

        # create missing cols fileed with NA
        for col in missing_cols:
            df[col] = pd.NA

        # reorder cols
        df = df[desired_cols]

        # convert some columns to nullable int if they exist
        dtype_map ={
                'weather_cond_code': 'Int64',
                'ceiling': 'Int64'
            } 
        
        for col, col_type in dtype_map.items():
            if col in df.columns:
                df[col] = df[col].astype(col_type)

        # store dict: station id is key and df is value
        destination_dfs[station_code] = df

        # also store record of which columns were missing
        missing_cols_dict[station_code] = missing_cols

        # print
        print(f'Done loading for: {station_code}. Missing columns: {missing_cols}')
except Exception as e:
    print(f'Error reading {station_code}: {e}')

  raw_df = pd.read_csv(


Done loading for: ABQ. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: ACV. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: ASE. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: ATL. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: ATW. Missing columns: ['precip_accum_six_hour']


  raw_df = pd.read_csv(


Done loading for: AUS. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BDL. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BHM. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BIH. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BIL. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BLI. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BNA. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BOI. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BOS. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BTR. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BUF. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BUR. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BWI. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: BZN. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: CHS. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: CID. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: CLE. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: CLT. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: CMH. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: COS. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: CVG. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: DAL. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: DCA. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: DEN. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: DFW. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: DRO. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: DSM. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: DTW. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: EGE. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: ELP. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: EUG. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: EWR. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: FAR. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: FAT. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: FLG. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: FLL. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: FSD. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: GEG. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: GJT. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: GRR. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: HDN. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: HOU. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: IAD. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: IAH. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: ICT. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: IDA. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: IND. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: JAC. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: JAX. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: JFK. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: LAS. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: LGA. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: LIT. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: LRD. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MCI. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MCO. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MDW. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MEM. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MFE. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MFR. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MIA. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MKE. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MMH. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MRY. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MSN. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MSO. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MSP. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MSY. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: MTJ. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: OAK. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: OKC. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: OMA. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: ORD. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: PAE. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: PBI. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: PDX. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: PHL. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: PHX. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: PIT. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: PRC. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: PSC. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: PSP. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: PVU. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: RAP. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: RDD. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: RDM. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: RDU. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: RFD. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: RIC. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: RNO. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: RSW. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SAF. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SAN. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SAT. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SBA. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SBN. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SBP. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SCK. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SDF. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SEA. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SFO. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SGF. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SGU. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SHV. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SJC. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SMF. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: SNA. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: STL. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: STS. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: TPA. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: TUL. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: TUS. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: XNA. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: ANC. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: KOA. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: LIH. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: HNL. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: OGG. Missing columns: []


  raw_df = pd.read_csv(


Done loading for: ITO. Missing columns: []


In [117]:
missing_cols_dict

{'ABQ': [],
 'ACV': [],
 'ASE': [],
 'ATL': [],
 'ATW': ['precip_accum_six_hour'],
 'AUS': [],
 'BDL': [],
 'BHM': [],
 'BIH': [],
 'BIL': [],
 'BLI': [],
 'BNA': [],
 'BOI': [],
 'BOS': [],
 'BTR': [],
 'BUF': [],
 'BUR': [],
 'BWI': [],
 'BZN': [],
 'CHS': [],
 'CID': [],
 'CLE': [],
 'CLT': [],
 'CMH': [],
 'COS': [],
 'CVG': [],
 'DAL': [],
 'DCA': [],
 'DEN': [],
 'DFW': [],
 'DRO': [],
 'DSM': [],
 'DTW': [],
 'EGE': [],
 'ELP': [],
 'EUG': [],
 'EWR': [],
 'FAR': [],
 'FAT': [],
 'FLG': [],
 'FLL': [],
 'FSD': [],
 'GEG': [],
 'GJT': [],
 'GRR': [],
 'HDN': [],
 'HOU': [],
 'IAD': [],
 'IAH': [],
 'ICT': [],
 'IDA': [],
 'IND': [],
 'JAC': [],
 'JAX': [],
 'JFK': [],
 'LAS': [],
 'LGA': [],
 'LIT': [],
 'LRD': [],
 'MCI': [],
 'MCO': [],
 'MDW': [],
 'MEM': [],
 'MFE': [],
 'MFR': [],
 'MIA': [],
 'MKE': [],
 'MMH': [],
 'MRY': [],
 'MSN': [],
 'MSO': [],
 'MSP': [],
 'MSY': [],
 'MTJ': [],
 'OAK': [],
 'OKC': [],
 'OMA': [],
 'ORD': [],
 'PAE': [],
 'PBI': [],
 'PDX': [],
 'PHL

In [65]:
set(station_codes) - set(list(missing_cols_dict.keys()))

{'SLC'}

### Get airport codes for looping

In [83]:
# get list of airports in main dict
airport_codes = list(destination_dfs.keys())
airport_codes

['ABQ',
 'ACV',
 'ASE',
 'ATL',
 'AUS',
 'BDL',
 'BHM',
 'BIH',
 'BIL',
 'BLI',
 'BNA',
 'BOI',
 'BOS',
 'BTR',
 'BUF',
 'BUR',
 'BWI',
 'BZN',
 'CHS',
 'CID',
 'CLE',
 'CLT',
 'CMH',
 'COS',
 'CVG',
 'DAL',
 'DCA',
 'DEN',
 'DFW',
 'DRO',
 'DSM',
 'DTW',
 'EGE',
 'ELP',
 'EUG',
 'EWR',
 'FAR',
 'FAT',
 'FLG',
 'FLL',
 'FSD',
 'GEG',
 'GJT',
 'GRR',
 'HDN',
 'HOU',
 'IAD',
 'IAH',
 'ICT',
 'IDA',
 'IND',
 'JAC',
 'JAX',
 'JFK',
 'LAS',
 'LGA',
 'LIT',
 'LRD',
 'MCI',
 'MCO',
 'MDW',
 'MEM',
 'MFE',
 'MFR',
 'MIA',
 'MKE',
 'MMH',
 'MRY',
 'MSN',
 'MSO',
 'MSP',
 'MSY',
 'MTJ',
 'OAK',
 'OKC',
 'OMA',
 'ORD',
 'PAE',
 'PBI',
 'PDX',
 'PHL',
 'PHX',
 'PIT',
 'PRC',
 'PSC',
 'PSP',
 'PVU',
 'RAP',
 'RDD',
 'RDM',
 'RDU',
 'RFD',
 'RIC',
 'RNO',
 'RSW',
 'SAF',
 'SAN',
 'SAT',
 'SBA',
 'SBN',
 'SBP',
 'SCK',
 'SDF',
 'SEA',
 'SFO',
 'SGF',
 'SGU',
 'SHV',
 'SJC',
 'SMF',
 'SNA',
 'STL',
 'STS',
 'TPA',
 'TUL',
 'TUS',
 'XNA',
 'ANC',
 'KOA',
 'LIH',
 'HNL',
 'OGG',
 'ITO']

## Data Check

### Create DateTime & Hour columns

In [84]:
# loop for each df
for airport in airport_codes:
    # save df reference
    df = destination_dfs[airport]
    
    # create datetime col
    df['DateTime'] = pd.to_datetime(df['date_time'], format='%m/%d/%y-%H:%M')
    df['Hour'] = pd.to_datetime(df['Time'], format='%H:%M').dt.round('h').dt.hour
    
    # remove old one
    destination_dfs[airport] = df.drop(columns=['date_time']).copy()

### Remove old dates

In [85]:
# filter out old dates
date_threshold = pd.Timestamp('2018-12-01')
for airport in airport_codes:
    # save df reference
    df = destination_dfs[airport]

    # filter
    destination_dfs[airport] = df[df['DateTime'] >= date_threshold].copy()

### Check data types for one df

In [86]:
destination_dfs[airport_codes[2]].dtypes

Date                             object
Time                             object
precip_accum_one_hour           float64
precip_accum_six_hour           float64
air_temp                        float64
dew_point_temperature           float64
relative_humidity               float64
wind_speed                      float64
wind_direction                  float64
wind_gust                       float64
visibility                      float64
ceiling                           Int64
sea_level_pressure              float64
weather_cond_code                 Int64
DateTime                 datetime64[ns]
Hour                              int32
dtype: object

## Handle Nulls

### Apply function

In [87]:
# apply function to handle nulls for each df
for airport in airport_codes:
    destination_dfs[airport] = fill_nulls(destination_dfs[airport])

1) filling nulls with 0 for: ['precip_accum_one_hour', 'precip_accum_six_hour', 'wind_gust']
2) weather_cond_code--> dropping column
3a) ceiling--> adding missingness indicator
3b) ceiling--> filling nulls with 35000 (max ceiling is 35000)
4) wind direction--> interpolate sin/cos
5a) sea_level_pressure--> adding missingness indicator
5b) interpolate rest: ['air_temp', 'dew_point_temperature', 'relative_humidity', 'wind_speed', 'visibility', 'sea_level_pressure']
-----------------------------------------
1) filling nulls with 0 for: ['precip_accum_one_hour', 'precip_accum_six_hour', 'wind_gust']
2) weather_cond_code--> dropping column
3a) ceiling--> adding missingness indicator
3b) ceiling--> filling nulls with 35000 (max ceiling is 12000)
4) wind direction--> interpolate sin/cos
5a) sea_level_pressure--> adding missingness indicator
5b) interpolate rest: ['air_temp', 'dew_point_temperature', 'relative_humidity', 'wind_speed', 'visibility', 'sea_level_pressure']
------------------------

### Remove old dates

In [88]:
# filter out old dates
date_threshold = pd.Timestamp('2019-01-01')
for airport in airport_codes:
    # save df reference
    df = destination_dfs[airport]

    # reset index
    df = df.reset_index()

    # filter
    destination_dfs[airport] = df[df['DateTime'] >= date_threshold].copy()

### Check nulls

In [89]:
# single check here
destination_dfs[airport_codes[2]].isna().sum()

DateTime                      0
Date                          0
Time                          0
precip_accum_one_hour         0
precip_accum_six_hour         0
air_temp                      0
dew_point_temperature         0
relative_humidity             0
wind_speed                    0
wind_gust                     0
visibility                    0
ceiling                       0
sea_level_pressure            0
Hour                          0
ceiling_missing               0
wind_dir_sin                  0
wind_dir_cos                  0
wind_direction_interp         0
sea_level_pressure_missing    0
dtype: int64

In [90]:
# check all of the df's for any null columns
for airport in airport_codes:
    df = destination_dfs[airport]
    print(f'{airport}: {df.columns[df.isna().any()].tolist()}')

ABQ: []
ACV: []
ASE: []
ATL: []
AUS: []
BDL: []
BHM: []
BIH: []
BIL: []
BLI: []
BNA: []
BOI: []
BOS: []
BTR: []
BUF: []
BUR: []
BWI: []
BZN: []
CHS: []
CID: []
CLE: []
CLT: []
CMH: []
COS: []
CVG: []
DAL: []
DCA: []
DEN: []
DFW: []
DRO: []
DSM: []
DTW: []
EGE: []
ELP: []
EUG: []
EWR: []
FAR: []
FAT: []
FLG: []
FLL: []
FSD: []
GEG: []
GJT: []
GRR: []
HDN: []
HOU: []
IAD: []
IAH: []
ICT: []
IDA: []
IND: []
JAC: []
JAX: []
JFK: []
LAS: []
LGA: []
LIT: []
LRD: []
MCI: []
MCO: []
MDW: []
MEM: []
MFE: []
MFR: []
MIA: []
MKE: []
MMH: []
MRY: []
MSN: []
MSO: []
MSP: []
MSY: []
MTJ: []
OAK: []
OKC: []
OMA: []
ORD: []
PAE: []
PBI: []
PDX: []
PHL: []
PHX: []
PIT: []
PRC: []
PSC: []
PSP: []
PVU: []
RAP: []
RDD: []
RDM: []
RDU: []
RFD: []
RIC: []
RNO: []
RSW: []
SAF: []
SAN: []
SAT: []
SBA: []
SBN: []
SBP: []
SCK: []
SDF: []
SEA: []
SFO: []
SGF: []
SGU: []
SHV: []
SJC: []
SMF: []
SNA: []
STL: []
STS: []
TPA: []
TUL: []
TUS: []
XNA: []
ANC: []
KOA: []
LIH: []
HNL: []
OGG: []
ITO: []


## Export

### Drop columns

In [91]:
# drop columns
drop_cols = ['Date', 'Time', 'Hour']
for airport in airport_codes:
    destination_dfs[airport] = df.drop(columns=drop_cols).copy()

In [92]:
# look at one updated col list
destination_dfs[airport_codes[0]].dtypes

DateTime                      datetime64[ns]
precip_accum_one_hour                float64
precip_accum_six_hour                float64
air_temp                             float64
dew_point_temperature                float64
relative_humidity                    float64
wind_speed                           float64
wind_gust                            float64
visibility                           float64
ceiling                                Int64
sea_level_pressure                   float64
ceiling_missing                        int32
wind_dir_sin                         float64
wind_dir_cos                         float64
wind_direction_interp                float64
sea_level_pressure_missing             int32
dtype: object

In [93]:
# set index as DateTime
for airport in airport_codes:
    # save df reference
    df = destination_dfs[airport]

    # reset index
    df = df.set_index('DateTime', inplace=True)

### Rename columns with dest_

In [94]:
# loop to add destination indicator to each column
for airport in airport_codes:
    df = destination_dfs[airport]
    df.columns = ['dest_' + col for col in df.columns if col != 'DateTime']

In [95]:
# list one dfs columns
destination_dfs[airport_codes[0]].columns

Index(['dest_precip_accum_one_hour', 'dest_precip_accum_six_hour',
       'dest_air_temp', 'dest_dew_point_temperature', 'dest_relative_humidity',
       'dest_wind_speed', 'dest_wind_gust', 'dest_visibility', 'dest_ceiling',
       'dest_sea_level_pressure', 'dest_ceiling_missing', 'dest_wind_dir_sin',
       'dest_wind_dir_cos', 'dest_wind_direction_interp',
       'dest_sea_level_pressure_missing'],
      dtype='object')

### Export zipped

In [96]:
# export each df as a zipped file
for airport in airport_codes:
    df = destination_dfs[airport]
    df.to_csv(
        f'Output/Weather/{airport}_2018-2025_cleaned.zip',
        compression={
            'method': 'zip',
            'archive_name': f'{airport}_2018-2025_cleaned.csv'
        }
    )