## Setup

In [None]:
# imports
import zipfile
import pandas as pd

In [3]:
# function to open zipped file and read into df
def unzip_to_df(zip_filepath, file_inside_zip, **read_csv_kwargs):
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as z:
            with z.open(file_inside_zip) as f:
                df = pd.read_csv(f, dtype={'Manufacturer Code': str}, **read_csv_kwargs)
        return df
    except Exception as e:
        print(f'Error occured: {e}')
        return None

In [4]:
# get df
zip_path = '../Work/DB_Work/Output/flight_delays.zip'
file_name = 'flight_delays.csv'

delays_df = unzip_to_df(zip_path, file_name)

In [5]:
# display
delays_df.head()

Unnamed: 0,Delay Bin,Departure Delay,Date,Year,Month,Day,Day of Week,Scheduled Departure Hour,Scheduled Departure Time,Actual Departure Time,...,Air Temperature,Dew Point Temperature,Relative Humidity,Wind Speed,Wind Direction,Wind Gust,Visibility,Ceiling,Sea Level Pressure,Weather Condition Code
0,Early,-2,01/01/2020,2020,1,1,2,0,00:15,00:13,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
1,11-30 min,11,01/01/2020,2020,1,1,2,0,00:30,00:41,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
2,31-60 min,45,01/01/2020,2020,1,1,2,0,00:30,01:15,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
3,Early,-8,01/01/2020,2020,1,1,2,0,00:30,00:22,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
4,Early,-11,01/01/2020,2020,1,1,2,0,00:40,00:29,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,


In [27]:
# get weather df
weather_path = '../Work/DB_Work/Data/Weather/LAX_KLAX_1997-2025.csv'
weather_df = pd.read_csv(
    weather_path,
    usecols=[
        'Date', 'Time','date_time', 'precip_accum_one_hour', 'precip_accum_six_hour', 'air_temp', 'dew_point_temperature', 
        'relative_humidity', 'wind_speed', 'wind_direction', 'wind_gust', 'visibility', 'ceiling', 'sea_level_pressure', 
        'weather_cond_code'
    ],
    dtype={"weather_cond_code": "Int64"}  #make sure it gets read as nullable int type
)
# display
weather_df.head()

Unnamed: 0,Date,Time,date_time,precip_accum_one_hour,air_temp,relative_humidity,wind_speed,wind_direction,sea_level_pressure,visibility,weather_cond_code,wind_gust,precip_accum_six_hour,ceiling,dew_point_temperature
0,03/20/25,17:53,03/20/25-17:53,,59.0,62.1,11.51,240.0,1017.2,10.0,,,,,46.04
1,03/20/25,16:53,03/20/25-16:53,,60.98,51.51,13.81,240.0,1017.3,10.0,,,,,42.98
2,03/20/25,15:53,03/20/25-15:53,,62.06,51.67,17.26,250.0,1017.7,10.0,,,,,44.06
3,03/20/25,14:53,03/20/25-14:53,,62.96,50.06,13.81,260.0,1018.3,10.0,,,,,44.06
4,03/20/25,13:53,03/20/25-13:53,,62.06,53.48,12.66,260.0,1018.9,10.0,,,,,44.96


## Data Check
---

### Data types

In [28]:
# data types
weather_df.dtypes

Date                      object
Time                      object
date_time                 object
precip_accum_one_hour    float64
air_temp                 float64
relative_humidity        float64
wind_speed               float64
wind_direction           float64
sea_level_pressure       float64
visibility               float64
weather_cond_code          Int64
wind_gust                float64
precip_accum_six_hour    float64
ceiling                  float64
dew_point_temperature    float64
dtype: object

In [29]:
# create datetime col and remove old one
weather_df['DateTime'] = pd.to_datetime(weather_df['date_time'], format='%m/%d/%y-%H:%M')
weather_df['Hour'] = pd.to_datetime(weather_df['Time'], format='%H:%M').dt.round('h').dt.hour
weather_df = weather_df.drop(columns=['date_time']).copy()

# display
weather_df.head()

Unnamed: 0,Date,Time,precip_accum_one_hour,air_temp,relative_humidity,wind_speed,wind_direction,sea_level_pressure,visibility,weather_cond_code,wind_gust,precip_accum_six_hour,ceiling,dew_point_temperature,DateTime,Hour
0,03/20/25,17:53,,59.0,62.1,11.51,240.0,1017.2,10.0,,,,,46.04,2025-03-20 17:53:00,18
1,03/20/25,16:53,,60.98,51.51,13.81,240.0,1017.3,10.0,,,,,42.98,2025-03-20 16:53:00,17
2,03/20/25,15:53,,62.06,51.67,17.26,250.0,1017.7,10.0,,,,,44.06,2025-03-20 15:53:00,16
3,03/20/25,14:53,,62.96,50.06,13.81,260.0,1018.3,10.0,,,,,44.06,2025-03-20 14:53:00,15
4,03/20/25,13:53,,62.06,53.48,12.66,260.0,1018.9,10.0,,,,,44.96,2025-03-20 13:53:00,14


### Remove old dates

In [30]:
# filter out old dates
yr_threshold = 2020
weather_df = weather_df[weather_df['DateTime'].dt.year >= yr_threshold].copy()

### Check nulls

In [31]:
# check nulls and len
print(f'Number of rows: {len(weather_df)}')
weather_df.isna().sum()

Number of rows: 53393


Date                         0
Time                         0
precip_accum_one_hour    50309
air_temp                     2
relative_humidity            3
wind_speed                  16
wind_direction            2636
sea_level_pressure        7871
visibility                   7
weather_cond_code        43417
wind_gust                51646
precip_accum_six_hour    52846
ceiling                  29105
dew_point_temperature        9
DateTime                     0
Hour                         0
dtype: int64

## Analyzing nulls
---

### Check if nulls cover large ranges

In [77]:
# function to check nulls per col depending on date group
def null_check(main_df, date_grp_cols):
    # copy df
    df = main_df.copy()

    # groupby day--> see if there are still nulls for that entire day
    null_check_df = df.groupby(date_grp_cols).count().reset_index()

    # display if which columns don't have a single value for a year (count)
    columns = list(null_check_df.columns)
    null_col_cnt = []
    for col in columns:
        null_time = len(null_check_df[null_check_df[col] == 0])

        # only add columns that have nulls
        if null_time > 0:
            null_dict = {
                'Column': col,
                f'Null {date_grp_cols}': null_time,
                '% Null': round(null_time/len(null_check_df) * 100, 2)
            }
        
            null_col_cnt.append(null_dict)
    
    # sort by % null
    sorted_null_cnt = sorted(null_col_cnt, key=lambda x: x['% Null'], reverse=True)

    # display
    print(f'Number of {date_grp_cols}: {len(null_check_df)}')
    display(sorted_null_cnt)

    # return
    sorted_null_cnt

##### Checking if there any valid column values for the entire day

In [78]:
null_check(weather_df, 'Date')

Number of Date: 1906


[{'Column': 'precip_accum_six_hour', 'Null Date': 1626, '% Null': 85.31},
 {'Column': 'precip_accum_one_hour', 'Null Date': 1617, '% Null': 84.84},
 {'Column': 'wind_gust', 'Null Date': 1340, '% Null': 70.3},
 {'Column': 'weather_cond_code', 'Null Date': 1062, '% Null': 55.72},
 {'Column': 'ceiling', 'Null Date': 393, '% Null': 20.62}]

##### Checking if there any valid column values for each hour per day

In [79]:
null_check(weather_df, ['Date', 'Hour'])

Number of ['Date', 'Hour']: 45606


[{'Column': 'precip_accum_six_hour',
  "Null ['Date', 'Hour']": 45059,
  '% Null': 98.8},
 {'Column': 'wind_gust', "Null ['Date', 'Hour']": 44056, '% Null': 96.6},
 {'Column': 'precip_accum_one_hour',
  "Null ['Date', 'Hour']": 43768,
  '% Null': 95.97},
 {'Column': 'weather_cond_code',
  "Null ['Date', 'Hour']": 39328,
  '% Null': 86.23},
 {'Column': 'ceiling', "Null ['Date', 'Hour']": 26976, '% Null': 59.15},
 {'Column': 'wind_direction', "Null ['Date', 'Hour']": 2020, '% Null': 4.43},
 {'Column': 'Hour', "Null ['Date', 'Hour']": 1901, '% Null': 4.17},
 {'Column': 'sea_level_pressure', "Null ['Date', 'Hour']": 90, '% Null': 0.2},
 {'Column': 'wind_speed', "Null ['Date', 'Hour']": 5, '% Null': 0.01},
 {'Column': 'relative_humidity', "Null ['Date', 'Hour']": 1, '% Null': 0.0},
 {'Column': 'visibility', "Null ['Date', 'Hour']": 2, '% Null': 0.0},
 {'Column': 'dew_point_temperature',
  "Null ['Date', 'Hour']": 1,
  '% Null': 0.0}]

### Check if nulls mean 0 --> precip_accum and wind_gust are null are likely 0

In [35]:
# check for following columns if 0 ever shows up 
print(f"precip_accum_one_hour: {len(weather_df[weather_df['precip_accum_one_hour'] == 0])}")
print(f"precip_accum_six_hour: {len(weather_df[weather_df['precip_accum_six_hour'] == 0])}")
print(f"ceiling: {len(weather_df[weather_df['ceiling'] == 0])}")
print(f"wind_direction: {len(weather_df[weather_df['wind_direction'] == 0])}")
print(f"wind_gust: {len(weather_df[weather_df['wind_gust'] == 0])}")
print(f"weather_cond_code: {len(weather_df[weather_df['weather_cond_code'] == 0])}")

precip_accum_one_hour: 0
precip_accum_six_hour: 0
ceiling: 52
wind_direction: 6724
wind_gust: 0
weather_cond_code: 0


### Look into weather_cond_code --> should DROP it!!!

##### Wind cond code--> maybe na is supposed to be 0 which means 'no value': https://docs.synopticdata.com/services/weather-condition-codes

In [None]:
# looking into unique values (have some that are out of normal weather condition codes)
weather_df['weather_cond_code'].unique()

<IntegerArray>
[  <NA>,   2493,      6,     31,     13,   2481,    493,      9,   2489,
   2494,      1,    733,   2497,     17,    751,  60093,      5,    737,
 198485, 199445,     77,   2546,   2482,   1045,     14, 199525,   2558,
    801,      7,    569,    494,     66,      8,   2485,   2557,    734,
     78,    566,    591]
Length: 39, dtype: Int64

### Check if wind direction being 0 correlates with wind speed as 0

##### Yes--> so nulls in wind direction likely don't mean 0

In [37]:
# see wind speed values for wind_direction=0
weather_df[weather_df['wind_direction'] == 0].wind_speed.value_counts()

wind_speed
0.00    6723
3.45       1
Name: count, dtype: int64

In [38]:
# see wind direction values for wind speed = 0
weather_df[weather_df['wind_speed'] == 0].wind_direction.value_counts()

wind_direction
0.0    6723
Name: count, dtype: int64

In [None]:
# check if wind speed is 0 for any wind_direction
weather_df[weather_df['wind_direction'].isna()].wind_speed.value_counts()

wind_speed
3.45    1192
4.60     711
5.75     374
6.91     187
3.44     100
4.61      56
Name: count, dtype: int64

### check cieling

In [40]:
weather_df[weather_df['ceiling'] == 0].value_counts()

Series([], Name: count, dtype: int64)

In [41]:
weather_df[weather_df['ceiling'].isna()]

Unnamed: 0,Date,Time,precip_accum_one_hour,air_temp,relative_humidity,wind_speed,wind_direction,sea_level_pressure,visibility,weather_cond_code,wind_gust,precip_accum_six_hour,ceiling,dew_point_temperature,DateTime,Hour
0,03/20/25,17:53,,59.00,62.10,11.51,240.0,1017.2,10.0,,,,,46.04,2025-03-20 17:53:00,18
1,03/20/25,16:53,,60.98,51.51,13.81,240.0,1017.3,10.0,,,,,42.98,2025-03-20 16:53:00,17
2,03/20/25,15:53,,62.06,51.67,17.26,250.0,1017.7,10.0,,,,,44.06,2025-03-20 15:53:00,16
3,03/20/25,14:53,,62.96,50.06,13.81,260.0,1018.3,10.0,,,,,44.06,2025-03-20 14:53:00,15
4,03/20/25,13:53,,62.06,53.48,12.66,260.0,1018.9,10.0,,,,,44.96,2025-03-20 13:53:00,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53387,01/01/20,05:53,,51.08,49.97,4.61,70.0,1017.7,10.0,,,,,33.08,2020-01-01 05:53:00,6
53388,01/01/20,04:53,,51.98,48.34,3.44,120.0,1017.6,10.0,,,,,33.08,2020-01-01 04:53:00,5
53390,01/01/20,02:53,,53.96,43.04,4.61,60.0,1017.9,10.0,,,,,32.00,2020-01-01 02:53:00,3
53391,01/01/20,01:53,,53.06,46.46,3.44,60.0,1017.8,10.0,,,,,33.08,2020-01-01 01:53:00,2


## Handle Nulls
---

### Define function to fill nulls

In [51]:
# fix nulls
def fill_nulls(main_df):

    # make a copy
    df = main_df.copy()

    # ------------------------------------
    # Fill nulls that are supposed to be 0
    # ------------------------------------
    df['precip_accum_one_hour'] = df['precip_accum_one_hour'].fillna(0)
    df['precip_accum_six_hour'] = df['precip_accum_six_hour'].fillna(0)
    df['wind_gust'] = df['wind_gust'].fillna(0)

    # ------------------------------------
    # Drop col with too many nulls
    # ------------------------------------
    df = df.drop(columns=['weather_cond_code']).copy()

    # return
    return df

In [52]:
# apply
weather2_df = fill_nulls(weather_df)
print(f'Number of rows: {len(weather2_df)}')
weather2_df.isna().sum()

Number of rows: 53393


Date                         0
Time                         0
precip_accum_one_hour        0
air_temp                     2
relative_humidity            3
wind_speed                  16
wind_direction            2636
sea_level_pressure        7871
visibility                   7
wind_gust                    0
precip_accum_six_hour        0
ceiling                  29105
dew_point_temperature        9
DateTime                     0
Hour                         0
dtype: int64

### Check nulls

In [83]:
null_check(weather2_df, 'Date')

Number of Date: 1906


[{'Column': 'ceiling', 'Null Date': 393, '% Null': 20.62}]

In [81]:
null_check(weather2_df, ['Date', 'Hour'])

Number of ['Date', 'Hour']: 45606


[{'Column': 'ceiling', "Null ['Date', 'Hour']": 26976, '% Null': 59.15},
 {'Column': 'wind_direction', "Null ['Date', 'Hour']": 2020, '% Null': 4.43},
 {'Column': 'Hour', "Null ['Date', 'Hour']": 1901, '% Null': 4.17},
 {'Column': 'sea_level_pressure', "Null ['Date', 'Hour']": 90, '% Null': 0.2},
 {'Column': 'wind_speed', "Null ['Date', 'Hour']": 5, '% Null': 0.01},
 {'Column': 'relative_humidity', "Null ['Date', 'Hour']": 1, '% Null': 0.0},
 {'Column': 'visibility', "Null ['Date', 'Hour']": 2, '% Null': 0.0},
 {'Column': 'dew_point_temperature',
  "Null ['Date', 'Hour']": 1,
  '% Null': 0.0}]