In [2]:
import polars as pl
from pathlib import Path
from dateutil import parser

In [3]:
# Directory holding the raw hourly Newark weather CSV files (relative to the working directory).
PATH_WEATHER = Path("Data/Predict With - X/Weather/Newark")
# Directory that the cleaned output is written to (relative to the working directory).
PATH_CLEAN = Path("Clean")

In [4]:
def print_entire_df(df):
    """Print ``df`` with every row and every column shown.

    The display options are applied through a scoped ``pl.Config`` context,
    so they only affect this one ``print`` call.
    """
    full_table_options = {
        "tbl_rows": -1,  # Display all rows
        "tbl_cols": -1,  # Display all columns
        "tbl_width_chars": 999,
    }
    with pl.Config(**full_table_options):
        print(df)

# DATA READING

In [5]:

# Load the raw hourly observations, one CSV per calendar year. The file names
# differ only by the year, so a single parameterised read replaces six
# copy-pasted calls. The per-year variable names are kept because the
# cleaning cell concatenates them by name.
data_2019, data_2020, data_2021, data_2022, data_2023, data_2024 = (
    pl.read_csv(PATH_WEATHER / f"hourly_weather_newark_{year}.csv")
    for year in range(2019, 2025)  # 2019 through 2024 inclusive
)

# Preview the first two years as examples of the raw layout.
print(data_2019)
print(data_2020)

shape: (10_169, 10)
┌────────────┬───────────┬───────────┬──────────┬───┬───────────┬──────────┬───────────┬───────────┐
│ Time       ┆ Temperatu ┆ Dew Point ┆ Humidity ┆ … ┆ Wind Gust ┆ Pressure ┆ Precipita ┆ Condition │
│ ---        ┆ re        ┆ ---       ┆ ---      ┆   ┆ ---       ┆ ---      ┆ tion      ┆ ---       │
│ str        ┆ ---       ┆ str       ┆ str      ┆   ┆ str       ┆ str      ┆ ---       ┆ str       │
│            ┆ str       ┆           ┆          ┆   ┆           ┆          ┆ str       ┆           │
╞════════════╪═══════════╪═══════════╪══════════╪═══╪═══════════╪══════════╪═══════════╪═══════════╡
│ 2019-01-01 ┆ 46F       ┆ 46F       ┆ 100%     ┆ … ┆ 0mph      ┆ 29.73in  ┆ 0.1in     ┆ Light     │
│ 1:51 AM    ┆           ┆           ┆          ┆   ┆           ┆          ┆           ┆ Rain      │
│ 2019-01-01 ┆ 47F       ┆ 46F       ┆ 97%      ┆ … ┆ 0mph      ┆ 29.67in  ┆ 0.0in     ┆ Light     │
│ 2:51 AM    ┆           ┆           ┆          ┆   ┆           ┆      

# DATA CLEANING

In [6]:
# Stack the yearly frames into one table before cleaning.
weather_merged = pl.concat([data_2019, data_2020, data_2021, data_2022, data_2023, data_2024])

weather_clean = (
    weather_merged
    .rename({"Time": "Date"})
    .with_columns(  # CONVERT DATE STRINGS TO DATETIME OBJECTS, ROUNDED TO THE NEAREST HOUR
        pl.col("Date")
        .map_elements(lambda date_str: parser.parse(date_str), return_dtype=pl.Datetime)
        .dt.round("1h")
    )
    .drop_nulls("Date")  # WE ONLY HAPPEN TO BE UNABLE TO PARSE ONE ROW, SO THIS IS FINE
    .with_columns(
        # Strip the unit suffix from each measurement and store it as a float.
        pl.col("Temperature").str.strip_suffix("F").cast(pl.Float64),
        pl.col("Dew Point").str.strip_suffix("F").cast(pl.Float64),
        pl.col("Humidity").str.strip_suffix("%").cast(pl.Float64) / 100,  # percent -> fraction
        pl.col("Wind Speed").str.strip_suffix("mph").cast(pl.Float64),
        pl.col("Wind Gust").str.strip_suffix("mph").cast(pl.Float64),
        pl.col("Pressure").str.strip_suffix("in").cast(pl.Float64),
        pl.col("Precipitation").str.strip_suffix("in").cast(pl.Float64)
    )
    # Stable sort: rows that were rounded to the same hour keep their input
    # order, so "last" below refers to the last one in the merged input.
    .sort(by="Date", maintain_order=True)
    # maintain_order=True keeps the chronological order produced by the sort.
    # Without it the row order after de-duplication is not guaranteed, and the
    # diff() below would then measure gaps between arbitrary rows.
    .unique("Date", keep="last", maintain_order=True)  # KEEPING THE LAST OBSERVATION IS FAIR
    .with_columns(
        # Minutes elapsed since the previous retained row.
        pl.col("Date").diff()
        .fill_null(pl.duration(hours=0))  # Fill first row where difference is null
        .dt.total_minutes()
        .alias("Difference")
    )
)

# Create the output directory if needed so the write works on a fresh checkout.
PATH_CLEAN.mkdir(parents=True, exist_ok=True)
weather_clean.write_csv(PATH_CLEAN / "weather.csv", datetime_format="%Y-%m-%d %H:%M:%S")

In [7]:
# Summary statistics for each column of the cleaned frame, as a quick sanity check.
weather_clean.describe()

statistic,Date,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Precipitation,Condition,Difference
str,str,f64,f64,f64,str,f64,f64,f64,f64,str,f64
"""count""","""52423""",52423.0,52423.0,52423.0,"""52408""",52423.0,52423.0,52423.0,52423.0,"""52423""",52423.0
"""null_count""","""0""",0.0,0.0,0.0,"""15""",0.0,0.0,0.0,0.0,"""0""",0.0
"""mean""","""2022-01-01 02:42:50.535833""",57.646586,42.88196,0.611892,,9.372642,4.666387,30.001391,0.003933,,60.210595
"""std""",,17.744791,18.702371,0.197968,,5.374168,10.264642,0.374462,0.031034,,14.226911
"""min""","""2019-01-01 01:00:00""",0.0,-18.0,0.0,"""CALM""",0.0,0.0,0.0,0.0,"""Blowing Snow / Windy""",0.0
"""25%""","""2020-07-01 15:00:00""",43.0,28.0,0.46,,6.0,0.0,29.86,0.0,,60.0
"""50%""","""2021-12-31 23:00:00""",58.0,44.0,0.61,,8.0,0.0,30.0,0.0,,60.0
"""75%""","""2023-07-03 05:00:00""",73.0,59.0,0.77,,13.0,0.0,30.16,0.0,,60.0
"""max""","""2025-01-01 00:00:00""",102.0,77.0,1.0,"""WSW""",43.0,71.0,30.95,1.5,"""Wintry Mix / Windy""",1620.0
