## Setup

In [1]:
# imports
import zipfile
import pandas as pd

In [2]:
# function to open zipped file and read into df
def unzip_to_df(zip_filepath, file_inside_zip, **read_csv_kwargs):
    """Read one CSV member of a zip archive into a DataFrame.

    Parameters
    ----------
    zip_filepath : path to the zip archive.
    file_inside_zip : name of the CSV member inside the archive.
    **read_csv_kwargs : forwarded to ``pd.read_csv``. A ``dtype`` given here
        is honoured: a dict is merged on top of the default
        ``{'Manufacturer Code': str}`` (caller entries win); any other
        non-None value replaces the default entirely.

    Returns
    -------
    The loaded DataFrame, or ``None`` if anything fails while opening or
    parsing (the error is printed rather than raised).
    """
    # Pull dtype out of the kwargs first: passing it both explicitly and via
    # **read_csv_kwargs would raise "multiple values for keyword argument".
    dtype = read_csv_kwargs.pop('dtype', None)
    if dtype is None:
        dtype = {'Manufacturer Code': str}
    elif isinstance(dtype, dict):
        dtype = {'Manufacturer Code': str, **dtype}

    try:
        with zipfile.ZipFile(zip_filepath, 'r') as z:
            with z.open(file_inside_zip) as f:
                df = pd.read_csv(f, dtype=dtype, **read_csv_kwargs)
        return df
    except Exception as e:
        # best-effort loader: report the problem and let the caller check for None
        print(f'Error occured: {e}')
        return None

In [4]:
# load the flight-delay CSV straight out of its zip archive
zip_path = '../Work/DB_Work/Output/flight_delays.zip'
file_name = 'flight_delays.csv'

delays_df = unzip_to_df(zip_filepath=zip_path, file_inside_zip=file_name)

In [5]:
# preview the first rows of the flight-delay frame
# NOTE: unzip_to_df returns None on failure, so this raises AttributeError if the load did not succeed
delays_df.head()

Unnamed: 0,Delay Bin,Departure Delay,Date,Year,Month,Day,Day of Week,Scheduled Departure Hour,Scheduled Departure Time,Actual Departure Time,...,Air Temperature,Dew Point Temperature,Relative Humidity,Wind Speed,Wind Direction,Wind Gust,Visibility,Ceiling,Sea Level Pressure,Weather Condition Code
0,Early,-2,01/01/2020,2020,1,1,2,0,00:15,00:13,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
1,11-30 min,11,01/01/2020,2020,1,1,2,0,00:30,00:41,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
2,31-60 min,45,01/01/2020,2020,1,1,2,0,00:30,01:15,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
3,Early,-8,01/01/2020,2020,1,1,2,0,00:30,00:22,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,
4,Early,-11,01/01/2020,2020,1,1,2,0,00:40,00:29,...,57.02,33.08,40.23,5.75,80.0,0.0,10.0,,1017.5,


In [7]:
# read only the weather observation columns that are needed from the station CSV
weather_path = '../Work/DB_Work/Data/Weather/LAX_KLAX_1997-2025.csv'
weather_cols = [
    'Date', 'Time', 'date_time',
    'precip_accum_one_hour', 'precip_accum_six_hour',
    'air_temp', 'dew_point_temperature', 'relative_humidity',
    'wind_speed', 'wind_direction', 'wind_gust',
    'visibility', 'ceiling', 'sea_level_pressure', 'weather_cond_code',
]
weather_df = pd.read_csv(weather_path, usecols=weather_cols)

# preview
weather_df.head()

Unnamed: 0,Date,Time,date_time,precip_accum_one_hour,air_temp,relative_humidity,wind_speed,wind_direction,sea_level_pressure,visibility,weather_cond_code,wind_gust,precip_accum_six_hour,ceiling,dew_point_temperature
0,03/20/25,17:53,03/20/25-17:53,,59.0,62.1,11.51,240.0,1017.2,10.0,,,,,46.04
1,03/20/25,16:53,03/20/25-16:53,,60.98,51.51,13.81,240.0,1017.3,10.0,,,,,42.98
2,03/20/25,15:53,03/20/25-15:53,,62.06,51.67,17.26,250.0,1017.7,10.0,,,,,44.06
3,03/20/25,14:53,03/20/25-14:53,,62.96,50.06,13.81,260.0,1018.3,10.0,,,,,44.06
4,03/20/25,13:53,03/20/25-13:53,,62.06,53.48,12.66,260.0,1018.9,10.0,,,,,44.96


## Data Check
---

### Data types

In [8]:
# inspect the dtype pandas inferred for each weather column
weather_df.dtypes

Date                      object
Time                      object
date_time                 object
precip_accum_one_hour    float64
air_temp                 float64
relative_humidity        float64
wind_speed               float64
wind_direction           float64
sea_level_pressure       float64
visibility               float64
weather_cond_code        float64
wind_gust                float64
precip_accum_six_hour    float64
ceiling                  float64
dew_point_temperature    float64
dtype: object

In [9]:
# parse the combined 'MM/DD/YY-HH:MM' text into real timestamps,
# then swap the raw text column out for the parsed one (appended as the last column)
parsed_ts = pd.to_datetime(weather_df['date_time'], format='%m/%d/%y-%H:%M')
weather_df = weather_df.drop(columns=['date_time']).assign(DateTime=parsed_ts)

# preview
weather_df.head()

Unnamed: 0,Date,Time,precip_accum_one_hour,air_temp,relative_humidity,wind_speed,wind_direction,sea_level_pressure,visibility,weather_cond_code,wind_gust,precip_accum_six_hour,ceiling,dew_point_temperature,DateTime
0,03/20/25,17:53,,59.0,62.1,11.51,240.0,1017.2,10.0,,,,,46.04,2025-03-20 17:53:00
1,03/20/25,16:53,,60.98,51.51,13.81,240.0,1017.3,10.0,,,,,42.98,2025-03-20 16:53:00
2,03/20/25,15:53,,62.06,51.67,17.26,250.0,1017.7,10.0,,,,,44.06,2025-03-20 15:53:00
3,03/20/25,14:53,,62.96,50.06,13.81,260.0,1018.3,10.0,,,,,44.06,2025-03-20 14:53:00
4,03/20/25,13:53,,62.06,53.48,12.66,260.0,1018.9,10.0,,,,,44.96,2025-03-20 13:53:00


### Remove old dates

In [10]:
# keep only observations whose year is yr_threshold or later
yr_threshold = 2020
recent_mask = weather_df['DateTime'].dt.year.ge(yr_threshold)
weather_df = weather_df.loc[recent_mask].copy()

### Check nulls

In [11]:
# total row count, followed by the number of missing values in each column
print(f'Number of rows: {len(weather_df)}')
weather_df.isnull().sum()

Number of rows: 53393


Date                         0
Time                         0
precip_accum_one_hour    50309
air_temp                     2
relative_humidity            3
wind_speed                  16
wind_direction            2636
sea_level_pressure        7871
visibility                   7
weather_cond_code        43417
wind_gust                51646
precip_accum_six_hour    52846
ceiling                  29105
dew_point_temperature        9
DateTime                     0
dtype: int64

In [12]:
# collapse to one row per 'Date' value; count() gives the non-null observations per column
null_check_df = weather_df.groupby('Date').count()

# for every column, tally the days on which it has no recorded value at all (count == 0)
columns = list(null_check_df.columns)
empty_days = null_check_df.eq(0).sum()
null_col_cnt = [
    {'Column': col, 'Null Rows': int(empty_days[col])}
    for col in columns
]

# display
print(f'Number of days: {len(null_check_df)}')
null_col_cnt

Number of days: 1906


[{'Column': 'Time', 'Null Rows': 0},
 {'Column': 'precip_accum_one_hour', 'Null Rows': 1617},
 {'Column': 'air_temp', 'Null Rows': 0},
 {'Column': 'relative_humidity', 'Null Rows': 0},
 {'Column': 'wind_speed', 'Null Rows': 0},
 {'Column': 'wind_direction', 'Null Rows': 0},
 {'Column': 'sea_level_pressure', 'Null Rows': 0},
 {'Column': 'visibility', 'Null Rows': 0},
 {'Column': 'weather_cond_code', 'Null Rows': 1062},
 {'Column': 'wind_gust', 'Null Rows': 1340},
 {'Column': 'precip_accum_six_hour', 'Null Rows': 1626},
 {'Column': 'ceiling', 'Null Rows': 393},
 {'Column': 'dew_point_temperature', 'Null Rows': 0},
 {'Column': 'DateTime', 'Null Rows': 0}]

## Handle Nulls
---