## Setup

In [119]:
# imports
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# function to open zipped file and read into df
def unzip_to_df(zip_filepath, file_inside_zip, **read_csv_kwargs):
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as z:
            with z.open(file_inside_zip) as f:
                df = pd.read_csv(f, dtype={'Manufacturer Code': str}, **read_csv_kwargs)
        return df
    except Exception as e:
        print(f'Error occured: {e}')
        return None

In [3]:
# get df
zip_path = '../Work/DB_Work/Output/flight_delays.zip'
file_name = 'flight_delays.csv'

delays_df = unzip_to_df(zip_path, file_name)

In [4]:
# display
delays_df.head()

Unnamed: 0,Delay Bin,Departure Delay,Date,Year,Month,Day,Day of Week,Month (sin),Month (cos),Day (sin),...,Air Temperature,Dew Point Temperature,Relative Humidity,Wind Speed,Wind Direction,Wind Gust,Visibility,Ceiling,Sea Level Pressure,Weather Condition Code
0,Early,-2,01/01/2020,2020,1,1,2,0.5,0.866025,0.201299,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
1,11-30 min,11,01/01/2020,2020,1,1,2,0.5,0.866025,0.201299,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
2,31-60 min,45,01/01/2020,2020,1,1,2,0.5,0.866025,0.201299,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
3,Early,-8,01/01/2020,2020,1,1,2,0.5,0.866025,0.201299,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
4,Early,-11,01/01/2020,2020,1,1,2,0.5,0.866025,0.201299,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,


In [5]:
# get weather df
weather_path = '../Work/DB_Work/Data/Weather/LAX_KLAX_1997-2025.csv'
weather_df = pd.read_csv(
    weather_path,
    usecols=[
        'Date', 'Time','date_time', 'precip_accum_one_hour', 'precip_accum_six_hour', 'air_temp', 'dew_point_temperature', 
        'relative_humidity', 'wind_speed', 'wind_direction', 'wind_gust', 'visibility', 'ceiling', 'sea_level_pressure', 
        'weather_cond_code'
    ],
    dtype={
        'weather_cond_code': 'Int64',
        'ceiling': 'Int64'
    }  #make sure it gets read as nullable int type
)
# display
weather_df.head()

Unnamed: 0,Date,Time,date_time,precip_accum_one_hour,air_temp,relative_humidity,wind_speed,wind_direction,sea_level_pressure,visibility,weather_cond_code,wind_gust,precip_accum_six_hour,ceiling,dew_point_temperature
0,03/20/25,17:53,03/20/25-17:53,,59.0,62.1,11.51,240.0,1017.2,10.0,,,,,46.04
1,03/20/25,16:53,03/20/25-16:53,,60.98,51.51,13.81,240.0,1017.3,10.0,,,,,42.98
2,03/20/25,15:53,03/20/25-15:53,,62.06,51.67,17.26,250.0,1017.7,10.0,,,,,44.06
3,03/20/25,14:53,03/20/25-14:53,,62.96,50.06,13.81,260.0,1018.3,10.0,,,,,44.06
4,03/20/25,13:53,03/20/25-13:53,,62.06,53.48,12.66,260.0,1018.9,10.0,,,,,44.96


## Data Check
---

### Data types

In [6]:
# data types
weather_df.dtypes

Date                      object
Time                      object
date_time                 object
precip_accum_one_hour    float64
air_temp                 float64
relative_humidity        float64
wind_speed               float64
wind_direction           float64
sea_level_pressure       float64
visibility               float64
weather_cond_code          Int64
wind_gust                float64
precip_accum_six_hour    float64
ceiling                    Int64
dew_point_temperature    float64
dtype: object

In [7]:
# create datetime col and remove old one
weather_df['DateTime'] = pd.to_datetime(weather_df['date_time'], format='%m/%d/%y-%H:%M')
weather_df['Hour'] = pd.to_datetime(weather_df['Time'], format='%H:%M').dt.round('h').dt.hour
weather_df = weather_df.drop(columns=['date_time']).copy()

# display
weather_df.head()

Unnamed: 0,Date,Time,precip_accum_one_hour,air_temp,relative_humidity,wind_speed,wind_direction,sea_level_pressure,visibility,weather_cond_code,wind_gust,precip_accum_six_hour,ceiling,dew_point_temperature,DateTime,Hour
0,03/20/25,17:53,,59.0,62.1,11.51,240.0,1017.2,10.0,,,,,46.04,2025-03-20 17:53:00,18
1,03/20/25,16:53,,60.98,51.51,13.81,240.0,1017.3,10.0,,,,,42.98,2025-03-20 16:53:00,17
2,03/20/25,15:53,,62.06,51.67,17.26,250.0,1017.7,10.0,,,,,44.06,2025-03-20 15:53:00,16
3,03/20/25,14:53,,62.96,50.06,13.81,260.0,1018.3,10.0,,,,,44.06,2025-03-20 14:53:00,15
4,03/20/25,13:53,,62.06,53.48,12.66,260.0,1018.9,10.0,,,,,44.96,2025-03-20 13:53:00,14


### Remove old dates

In [8]:
# filter out old dates
yr_threshold = 2020
weather_df = weather_df[weather_df['DateTime'].dt.year >= yr_threshold].copy()

### Check nulls

In [9]:
# check nulls and len
print(f'Number of rows: {len(weather_df)}')
weather_df.isna().sum()

Number of rows: 53393


Date                         0
Time                         0
precip_accum_one_hour    50309
air_temp                     2
relative_humidity            3
wind_speed                  16
wind_direction            2636
sea_level_pressure        7871
visibility                   7
weather_cond_code        43417
wind_gust                51646
precip_accum_six_hour    52846
ceiling                  29105
dew_point_temperature        9
DateTime                     0
Hour                         0
dtype: int64

## Analyzing nulls
---

### Check if nulls cover large ranges

In [101]:
# function to check nulls per col depending on date group
def null_check(main_df, date_grp_cols):
    # copy df
    df = main_df.copy()

    # groupby day--> see if there are still nulls for that entire day
    null_check_df = df.groupby(date_grp_cols).count()

    # display if which columns don't have a single value for a year (count)
    columns = list(null_check_df.columns)
    null_col_cnt = []
    for col in columns:
        null_time = len(null_check_df[null_check_df[col] == 0])

        # only add columns that have nulls
        if null_time > 0:
            null_dict = {
                'Column': col,
                f'Null {date_grp_cols}': null_time,
                '% Null': round(null_time/len(null_check_df) * 100, 2)
            }
        
            null_col_cnt.append(null_dict)
    
    # sort by % null
    sorted_null_cnt = sorted(null_col_cnt, key=lambda x: x['% Null'], reverse=True)

    # display
    print(f'Number of {date_grp_cols}: {len(null_check_df)}')

    # return
    return sorted_null_cnt

##### Checking if there any valid column values for the entire day

In [26]:
null_check(weather_df, 'Date')

Number of Date: 1906


[{'Column': 'precip_accum_six_hour', 'Null Date': 1626, '% Null': 85.31},
 {'Column': 'precip_accum_one_hour', 'Null Date': 1617, '% Null': 84.84},
 {'Column': 'wind_gust', 'Null Date': 1340, '% Null': 70.3},
 {'Column': 'weather_cond_code', 'Null Date': 1062, '% Null': 55.72},
 {'Column': 'ceiling', 'Null Date': 393, '% Null': 20.62}]

##### Checking if there any valid column values for each hour per day

In [27]:
null_check(weather_df, ['Date', 'Hour'])

Number of ['Date', 'Hour']: 45606


[{'Column': 'precip_accum_six_hour',
  "Null ['Date', 'Hour']": 45059,
  '% Null': 98.8},
 {'Column': 'wind_gust', "Null ['Date', 'Hour']": 44056, '% Null': 96.6},
 {'Column': 'precip_accum_one_hour',
  "Null ['Date', 'Hour']": 43768,
  '% Null': 95.97},
 {'Column': 'weather_cond_code',
  "Null ['Date', 'Hour']": 39328,
  '% Null': 86.23},
 {'Column': 'ceiling', "Null ['Date', 'Hour']": 26976, '% Null': 59.15},
 {'Column': 'wind_direction', "Null ['Date', 'Hour']": 2020, '% Null': 4.43},
 {'Column': 'sea_level_pressure', "Null ['Date', 'Hour']": 90, '% Null': 0.2},
 {'Column': 'wind_speed', "Null ['Date', 'Hour']": 5, '% Null': 0.01},
 {'Column': 'relative_humidity', "Null ['Date', 'Hour']": 1, '% Null': 0.0},
 {'Column': 'visibility', "Null ['Date', 'Hour']": 2, '% Null': 0.0},
 {'Column': 'dew_point_temperature',
  "Null ['Date', 'Hour']": 1,
  '% Null': 0.0}]

### Check if nulls mean 0 --> **precip_accum** and **wind_gust** are null are likely 0

In [28]:
# check for following columns if 0 ever shows up 
print(f"precip_accum_one_hour: {len(weather_df[weather_df['precip_accum_one_hour'] == 0])}")
print(f"precip_accum_six_hour: {len(weather_df[weather_df['precip_accum_six_hour'] == 0])}")
print(f"ceiling: {len(weather_df[weather_df['ceiling'] == 0])}")
print(f"wind_direction: {len(weather_df[weather_df['wind_direction'] == 0])}")
print(f"wind_gust: {len(weather_df[weather_df['wind_gust'] == 0])}")
print(f"weather_cond_code: {len(weather_df[weather_df['weather_cond_code'] == 0])}")

precip_accum_one_hour: 0
precip_accum_six_hour: 0
ceiling: 52
wind_direction: 6724
wind_gust: 0
weather_cond_code: 0


### Look into **weather_cond_code** --> should DROP it!!!

##### Wind cond code--> maybe na is supposed to be 0 which means 'no value': https://docs.synopticdata.com/services/weather-condition-codes

In [29]:
# looking into unique values (have some that are out of normal weather condition codes)
weather_df['weather_cond_code'].unique()

<IntegerArray>
[  <NA>,   2493,      6,     31,     13,   2481,    493,      9,   2489,
   2494,      1,    733,   2497,     17,    751,  60093,      5,    737,
 198485, 199445,     77,   2546,   2482,   1045,     14, 199525,   2558,
    801,      7,    569,    494,     66,      8,   2485,   2557,    734,
     78,    566,    591]
Length: 39, dtype: Int64

### Check **ceiling**--> correlation with visibility

##### Ceiling = 0 seems to correlate with lower visibility range but null ceiling values are often quite high (may be above detection range)
- nulls can mean "unlimited/very high ceiling" --> indicating low chance of weather related delays

In [33]:
# check visibility for ceiling = 0
weather_df[weather_df['ceiling'] == 0].visibility.value_counts()

visibility
0.50    19
0.25    14
0.12     6
1.00     3
1.50     3
8.00     3
0.06     1
1.75     1
2.00     1
0.75     1
Name: count, dtype: int64

In [34]:
# check visibility for ceiling is null
weather_df[weather_df['ceiling'].isna()].visibility.value_counts()

visibility
10.00    24362
9.00      1453
8.00      1041
7.00       749
6.00       520
5.00       340
4.00       196
3.00       166
2.50        79
2.00        51
1.50        34
1.00        28
0.50        22
0.75        21
0.25        16
1.25        10
1.75         9
15.00        1
3.50         1
0.12         1
Name: count, dtype: int64

### Check if **wind_direction** being null correlates with wind strength (wind_speed, wind_gust)

##### Wind gust --> not much of a pattern

In [49]:
# wind_direction is null--> see wind gust values
weather_df[weather_df['wind_direction'].isna()].wind_gust.value_counts()

wind_gust
17.26    7
18.41    3
20.71    2
21.86    2
19.56    1
16.11    1
Name: count, dtype: int64

In [50]:
# wind_direction is NOT null--> see wind gust values
weather_df[weather_df['wind_direction'].notna()].wind_gust.value_counts()

wind_gust
20.71    200
21.86    184
23.02    156
24.17    128
19.56    127
25.32    120
18.41    104
26.47     87
28.77     63
27.62     63
17.26     56
29.92     54
32.22     53
31.07     50
33.37     34
16.11     30
34.52     28
35.67     27
37.98     21
36.82     19
39.13     16
41.43     14
40.28     14
42.58     13
26.46     10
27.63     10
43.73     10
24.16      8
19.57      6
46.03      5
32.21      4
17.27      4
44.88      3
29.93      3
21.85      2
39.12      2
48.33      1
33.38      1
40.29      1
Name: count, dtype: int64

##### Wind speed --> not much of a pattern

In [51]:
# wind_direction is null--> see wind speed values
weather_df[weather_df['wind_direction'].isna()].wind_speed.value_counts()

wind_speed
3.45    1192
4.60     711
5.75     374
6.91     187
3.44     100
4.61      56
Name: count, dtype: int64

In [52]:
# wind_direction is NOT null--> see wind speed values
weather_df[weather_df['wind_direction'].notna()].wind_speed.value_counts()

wind_speed
0.00     6723
4.60     5704
5.75     5425
3.45     4996
6.91     4412
8.06     3528
10.36    3420
9.21     3353
11.51    2915
12.66    2709
13.81    1868
14.96    1291
16.11     885
17.26     496
4.61      446
3.44      410
18.41     312
8.05      287
9.22      250
11.50     220
19.56     205
20.71     141
13.80     111
21.86     106
14.97      96
23.02      78
24.17      69
25.32      58
26.47      40
28.77      31
17.27      29
29.92      27
27.62      26
19.57      16
31.07      14
32.22      11
33.37       9
21.85       8
36.82       6
35.67       6
34.52       5
26.46       4
27.63       4
24.16       4
39.13       1
40.28       1
41.43       1
Name: count, dtype: int64

##### Most likely nulls from sensor or reporting gaps

### Check time gaps for other cols --> is it safe to interpolation?

In [133]:
def find_time_gaps(main_df, col_name):
    # make df copy
    df = main_df.copy()

    # define cols
    miss_col = f'{col_name}_missing'

    # sort by time
    df = df.sort_values('DateTime').reset_index(drop=True)

    # create missing flag
    df[miss_col] = df[col_name].isna()

    # compare each row's missing flag to previous row
    df['block_id'] = (
        df[miss_col] != df[miss_col].shift()
    ).cumsum()

    # filter blocks for miss_col
    missing_blocks = df[df[miss_col]].copy()

    # summarize each missing block
    gap_summary = (
        missing_blocks
        .groupby('block_id')
        .agg(
            start_time = ('DateTime', 'min'),
            end_time = ('DateTime', 'max'),
            gap_hrs = ('DateTime', 'count')
        )
        .reset_index()
    )

    # display
    display(gap_summary['gap_hrs'].value_counts())

#### sea level pressure --> mostly ok

In [134]:
find_time_gaps(weather_df, 'sea_level_pressure')

gap_hrs
1     4679
2      968
3      255
4       82
5       15
6        5
7        3
27       1
10       1
Name: count, dtype: int64

##### rest cols --> ok

In [135]:
find_time_gaps(weather_df, 'wind_direction')

gap_hrs
1    2014
2     250
3      30
5       4
4       3
Name: count, dtype: int64

In [137]:
find_time_gaps(weather_df, 'wind_speed')

gap_hrs
1    16
Name: count, dtype: int64

In [136]:
find_time_gaps(weather_df, 'air_temp')

gap_hrs
1    2
Name: count, dtype: int64

In [138]:
find_time_gaps(weather_df, 'relative_humidity')

gap_hrs
1    3
Name: count, dtype: int64

In [139]:
find_time_gaps(weather_df, 'visibility')

gap_hrs
1    7
Name: count, dtype: int64

In [140]:
find_time_gaps(weather_df, 'dew_point_temperature')

gap_hrs
1    7
2    1
Name: count, dtype: int64

## Handle Nulls
---

### Define function to fill nulls

In [None]:
# fix nulls
def fill_nulls(main_df):

    # ------------------------------------
    # Setup DF
    # ------------------------------------
    df = main_df.copy()  #make a copy

    # sort chronologically
    df = df.sort_values('DateTime')
    df = df.set_index('DateTime')

    # ------------------------------------
    # Fill nulls that are supposed to be 0
    # ------------------------------------
    fillna_0_cols = ['precip_accum_one_hour', 'precip_accum_six_hour', 'wind_gust']

    # show step
    print(f'1) filling nulls with 0 for: {fillna_0_cols}')

    # loop through
    for col in fillna_0_cols:
        df[col] = df[col].fillna(0)

    # -----------------------------------------
    # Drop col with too many nulls/invalid vals
    # -----------------------------------------
    # show step
    print('2) weather_cond_code--> dropping column')
    df = df.drop(columns=['weather_cond_code']).copy()

    # ---------------------------------------------------------
    # Ceiling: fill with high value & add missingness indicator
    # ---------------------------------------------------------
    # add missingness indicator col
    print('3a) ceiling--> adding missingness indicator')
    df['ceiling_missing'] = df['ceiling'].isna().astype(int)

    # fill nulls in og col with high val
    high_ceiling = 35000
    print(f'3b) ceiling--> filling nulls with {high_ceiling} (max ceiling is {df['ceiling'].max()})')
    df['ceiling'] = df['ceiling'].fillna(high_ceiling)

    # ------------------------------------------------------------
    # Wind Direction: Interpolate sin/cos (b/c it's 0-360 degrees)
    # ------------------------------------------------------------
    print('4) wind direction--> interpolate sin/cos')

    # storing sin/cos to handle cyclical nature
    df['wind_dir_rad'] = np.deg2rad(df['wind_direction'])
    df['wind_dir_sin'] = np.sin(df['wind_dir_rad'])
    df['wind_dir_cos'] = np.cos(df['wind_dir_rad'])

    # interpolate sin/cos
    df['wind_dir_sin'] = df['wind_dir_sin'].interpolate(method='time')
    df['wind_dir_cos'] = df['wind_dir_cos'].interpolate(method='time')

    # recompute direction
    df['wind_direction_interp'] = np.rad2deg(
        np.arctan2(df['wind_dir_sin'], df['wind_dir_cos'])
    )

    # convert values into 0-360 degrees
    df['wind_direction_interp'] = (df['wind_direction_interp'] + 360) % 360
    df['wind_direction_interp'] = df['wind_direction_interp'].round(1)

    # drop old wind_dir col and sin/cos/rad cols
    df = df.drop(columns=[
        'wind_direction',
        'wind_dir_rad',
        'wind_dir_sin',
        'wind_dir_cos'
    ]).copy()

    # return
    return df

In [88]:
# apply
weather2_df = fill_nulls(weather_df)
print(f'Number of rows: {len(weather2_df)}')

# display nulls
weather2_df.isna().sum()

1) filling nulls with 0 for: ['precip_accum_one_hour', 'precip_accum_six_hour', 'wind_gust']
2) weather_cond_code--> dropping column
3a) ceiling--> adding missingness indicator
3b) ceiling--> filling nulls with 35000 (max ceiling is 33000)
4) wind direction--> interpolate sin/cos
Number of rows: 53393


Date                        0
Time                        0
precip_accum_one_hour       0
air_temp                    2
relative_humidity           3
wind_speed                 16
sea_level_pressure       7871
visibility                  7
wind_gust                   0
precip_accum_six_hour       0
ceiling                     0
dew_point_temperature       9
Hour                        0
ceiling_missing             0
wind_direction_interp       0
dtype: int64

### Check nulls

In [102]:
null_check(weather2_df, 'Date')

Number of Date: 1906


[]

In [103]:
null_check(weather2_df, ['Date', 'Hour'])

Number of ['Date', 'Hour']: 45606


[{'Column': 'sea_level_pressure', "Null ['Date', 'Hour']": 90, '% Null': 0.2},
 {'Column': 'wind_speed', "Null ['Date', 'Hour']": 5, '% Null': 0.01},
 {'Column': 'relative_humidity', "Null ['Date', 'Hour']": 1, '% Null': 0.0},
 {'Column': 'visibility', "Null ['Date', 'Hour']": 2, '% Null': 0.0},
 {'Column': 'dew_point_temperature',
  "Null ['Date', 'Hour']": 1,
  '% Null': 0.0}]