# Daily Weather Data for Each ZIP Code

k-d trees for nearest-neighbour search

https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html

https://web.stanford.edu/class/cs168/l/l3.pdf

- Default number of neighbours: k = 1
- Default metric: Euclidean distance (Minkowski norm with p = 2)

In [1]:
import pandas as pd
import geopandas as gpd
import pyarrow.dataset as ds
import time
import random
from tqdm import tqdm
from matplotlib import pyplot
pd.options.mode.chained_assignment = None  # default='warn'
pd.options.mode.copy_on_write = False

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
pd.options.mode.copy_on_write = False
from pathlib import Path
from datetime import datetime
from joblib import Parallel, delayed
import numpy as np
import re
from typing import List
import geopandas as gp
gp.options.io_engine = "pyogrio"
from scipy.spatial import cKDTree
from shapely.ops import transform
from shapely.geometry import Point, Polygon
import pyproj


def assign_stations_to_localities(stations_df: "gp.GeoDataFrame", localities: "gp.GeoDataFrame", station_list: List[str] = None, closest=True) -> "gp.GeoDataFrame":
    """Attach weather-station records to localities.

    With ``closest=True`` every locality receives the single nearest station,
    found with a k-d tree (k=1, Euclidean distance) over three coordinate
    columns. Otherwise the localities are spatially joined with the stations
    through ``localities.sjoin``.

    Parameters
    ----------
    stations_df
        Station table. Needs ``stationId`` and ``geometry``; the
        nearest-station mode additionally reads ``longitude_m``,
        ``latitude_m`` and ``height_100``. The argument is not modified.
    localities
        Locality table. The nearest-station mode reads ``longitude_x``,
        ``latitude_y`` and ``altitude_100``.
    station_list
        Optional whitelist of ``stationId`` values; ``None`` keeps every
        station.
    closest
        ``True`` (default) for nearest-station matching, ``False`` for the
        spatial join.

    Returns
    -------
    The localities with the matched station's columns appended. The station's
    geometry is carried in the ``station_geometry`` column.
    """
    if station_list is not None:
        stations_df = stations_df.loc[stations_df["stationId"].isin(station_list)].reset_index(drop=True)

    if closest:
        # Bucheli et al. Method
        # NOTE(review): the distance mixes the horizontal coordinates with
        # height_100 / altitude_100, so all three columns must be on
        # comparable scales on both sides — confirm against the preprocessing.
        station_xyz = stations_df[["longitude_m", "latitude_m", "height_100"]].to_numpy(dtype=float)
        locality_xyz = localities[["longitude_x", "latitude_y", "altitude_100"]].to_numpy(dtype=float)

        tree = cKDTree(station_xyz)
        _, nearest = tree.query(locality_xyz)

        # iloc + reset_index yields a fresh frame, so adding the column below
        # never writes through to the caller's station table.
        nearest_stations = stations_df.iloc[nearest].reset_index(drop=True)
        nearest_stations = nearest_stations.assign(
            station_geometry=nearest_stations.geometry
        ).drop(columns="geometry")

        # Both frames now carry a 0..n-1 index, so the column-wise concat
        # pairs each locality with its nearest station row by position.
        return pd.concat([localities.reset_index(), nearest_stations], axis=1)

    # Spatial-join mode: keep the station geometry under its own name.
    # assign() works on a copy, leaving the caller's frame untouched.
    stations_with_geometry = stations_df.assign(station_geometry=stations_df.geometry)
    return localities.sjoin(stations_with_geometry)

In [5]:
# Workspace roots. `ws2` is the root the cells below read from and write to;
# `ws` is not referenced in the visible cells.
ws = Path("/home/aschneuwl/workspace/")
ws2 = Path("/mnt/wks3/aschneuwl/workspace")

In [6]:
def thi(t: float, rel_h: float) -> float:
    """Return the THI value for one temperature / relative-humidity pair.

    Evaluates ``(1.8*t + 32) - (0.55 - 0.0055*rel_h) * (1.8*t - 26)``.

    NOTE(review): the ``1.8*t + 32`` term suggests ``t`` is in degrees Celsius
    and the 0.0055 factor suggests ``rel_h`` is a percentage — confirm against
    the input data.
    """
    scaled_temperature = 1.8 * t
    humidity_weight = 0.55 - (0.0055 * rel_h)
    return (scaled_temperature + 32) - humidity_weight * (scaled_temperature - 26)

In [7]:
from joblib import Parallel, delayed

In [6]:
def _nearest_grid_values(grid_df, query_xy, value_columns):
    """Return ``value_columns`` of the grid cell closest to each query point."""
    # Grid cells are located by their E/N columns; k=1, Euclidean distance.
    tree = cKDTree(grid_df[["E", "N"]].to_numpy(dtype=float))
    _, nearest = tree.query(query_xy)
    return grid_df.iloc[nearest][value_columns].reset_index(drop=True)


def create_weather_variables_for_day(df_max, df_min, df_mean, station_data, stations_meta, localities, df_precip, df_sunshine)-> pd.DataFrame:
    """Build one day's weather table with one row per locality (ZIP4).

    Gridded variables are taken from the grid cell nearest to each locality;
    relative humidity is taken from the nearest weather station that has a
    record in ``station_data``.

    Parameters
    ----------
    df_max, df_min, df_mean, df_precip, df_sunshine
        One day's gridded data with ``E`` and ``N`` coordinate columns and the
        value column ``TmaxD``, ``TminD``, ``TabsD``, ``RhiresD`` or ``SrelD``
        respectively. ``df_max`` also supplies the ``time`` column of the
        result. The inputs are not modified.
    station_data
        One day's station records with ``stationId``, ``relHumDMax``,
        ``relHumDMin`` and ``relHumDMean``. If a station appears more than
        once, its first row is used.
    stations_meta
        Station metadata as expected by ``assign_stations_to_localities``.
    localities
        Locality table with ``ZIP4``, ``longitude_x``, ``latitude_y`` plus the
        columns ``assign_stations_to_localities`` reads.
        NOTE(review): ``longitude_x`` / ``latitude_y`` are compared directly
        with the grids' ``E`` / ``N`` — presumably the same projected
        coordinate system; confirm.

    Returns
    -------
    DataFrame with columns ``ZIP4``, ``time``, ``TmaxD``, ``TminD``, ``TabsD``,
    ``RhiresD``, ``SrelD``, ``relHumDMax``, ``relHumDMin``, ``relHumDMean`` and
    ``stationId``, sorted by ``ZIP4`` and ``time``.
    """
    # The query points are identical for every grid, so build them once.
    locality_xy = localities[["longitude_x", "latitude_y"]].to_numpy(dtype=float)

    # No per-cell point geometry is built for the grids: only E/N and the
    # value columns are ever read, and skipping it leaves the inputs untouched.
    grid_sources = [
        (df_max, ["time", "TmaxD"]),   # Daily Max Temperature
        (df_min, ["TminD"]),           # Daily Min Temperature
        (df_mean, ["TabsD"]),          # Daily Mean Temperature
        (df_precip, ["RhiresD"]),      # Daily Precipitation
        (df_sunshine, ["SrelD"]),      # Sunshine duration
    ]

    # Every piece carries a fresh 0..n-1 index so the column-wise concat pairs
    # rows by position even when `localities` has a non-default index.
    pieces = [localities[["ZIP4"]].reset_index(drop=True)]
    pieces.extend(
        _nearest_grid_values(grid_df, locality_xy, value_columns)
        for grid_df, value_columns in grid_sources
    )
    gridded_temp = pd.concat(pieces, axis=1)

    # Humidity from Weather Stations
    humidity_columns = ["relHumDMax", "relHumDMin", "relHumDMean"]
    assigned_stations = assign_stations_to_localities(stations_meta, localities, list(station_data.stationId.values), closest=True)[["ZIP4", "stationId"]]
    # One lookup row per station (its first record), joined in a single merge
    # instead of scanning station_data once per locality and column.
    station_humidity = station_data.drop_duplicates("stationId")[["stationId"] + humidity_columns]
    assigned_stations = assigned_stations.merge(station_humidity, on="stationId", how="left")

    # NOTE(review): the join below is keyed on ZIP4; if ZIP4 is not unique in
    # `localities`, the matching rows are combined pairwise — verify uniqueness.
    weather_zip_day = (
        gridded_temp.set_index("ZIP4")
        .merge(
            assigned_stations.set_index("ZIP4")[humidity_columns + ["stationId"]],
            left_index=True,
            right_index=True,
            how="outer",
        )
        .reset_index()
    )

    # THI columns (thi_min / thi_max / thi_mean, see `thi`) are not derived
    # here; that computation was left disabled.

    # Sort
    return weather_zip_day.sort_values(["ZIP4", "time"])

In [7]:
def open_paritioned_df(year, fpath):
    """Load one year of a hive-partitioned parquet dataset as a pandas DataFrame.

    Opens the dataset at ``fpath`` and keeps only the rows whose ``year``
    field equals ``year``.
    """
    dataset = ds.dataset(fpath, format='parquet', partitioning='hive')
    year_filter = ds.field('year') == year
    return dataset.to_table(filter=year_filter).to_pandas()

In [8]:
def worker(year: int):
    """Build and store the daily per-ZIP weather table for one calendar year.

    Loads the five gridded datasets for ``year``, the station humidity table,
    the station metadata and the localities, calls
    ``create_weather_variables_for_day`` for every day from 1 January to
    31 December (printing each date), and writes the concatenated result to
    ``weather_zip_{year}.parquet`` in the meteo directory under ``ws2``.

    Returns the path of the written file.
    """
    # Ignore Pandas Warnings (set inside the function as well, so the options
    # also apply when this runs in a separate worker process).
    pd.options.mode.chained_assignment = None  # default='warn'
    pd.options.mode.copy_on_write = False

    meteo_dir = ws2 / Path("data/preprocessed/meteo/")
    geo_dir = ws2 / Path("data/preprocessed/geo/")

    # Load Data
    tmin_grid = open_paritioned_df(year, meteo_dir / Path("TminD.parquet"))
    tmax_grid = open_paritioned_df(year, meteo_dir / Path("TmaxD.parquet"))
    tmean_grid = open_paritioned_df(year, meteo_dir / Path("TabsD.parquet"))
    precip_grid = open_paritioned_df(year, meteo_dir / Path("RhiresD.parquet"))
    sunshine_grid = open_paritioned_df(year, meteo_dir / Path("SrelD.parquet"))
    humidity_table = pd.read_parquet(meteo_dir / Path("relHumD_min_max_mean_meteosuisse_stations.parquet"))
    stations_meta = gpd.read_parquet(meteo_dir / Path("meteosuisse_stations_meta.parquet"))
    localities = gpd.read_parquet(geo_dir / Path("swiss_localities_with_altitudes.parquet"))

    def rows_for(frame, day):
        # Rows of `frame` whose `time` equals the given 'YYYY-MM-DD' string.
        return frame[frame["time"] == day]

    # Create Daily Weather Variables
    all_days = pd.date_range(start=f"{year}-01-01", end=f"{year}-12-31").strftime('%Y-%m-%d').tolist()

    daily_tables = []
    for day in all_days:
        print(day)
        daily_tables.append(
            create_weather_variables_for_day(
                gpd.GeoDataFrame(rows_for(tmax_grid, day)),
                gpd.GeoDataFrame(rows_for(tmin_grid, day)),
                gpd.GeoDataFrame(rows_for(tmean_grid, day)),
                pd.DataFrame(rows_for(humidity_table, day)),
                gpd.GeoDataFrame(stations_meta),
                gpd.GeoDataFrame(localities),
                gpd.GeoDataFrame(rows_for(precip_grid, day)),
                gpd.GeoDataFrame(rows_for(sunshine_grid, day)),
            )
        )

    fname = meteo_dir / Path(f"weather_zip_{year}.parquet")
    pd.concat(daily_tables).to_parquet(fname)

    return fname

In [None]:
# Run the pipeline for a single year (1980) in the notebook process —
# presumably a trial run before the parallel fan-out below.
worker(1980)

In [9]:
# One `worker` call per year 1980-2023, spread over up to 25 joblib workers;
# `all_files` collects the parquet paths the workers return.
# NOTE: tqdm wraps the task generator, so its bar tracks task submission, not
# completion (it hit 100% immediately in the recorded run below); joblib's
# verbose=10 lines are the real progress report.
all_files = Parallel(n_jobs=25, batch_size=1, verbose=10)(delayed(worker)(i) for i in tqdm(range(1980,2024,1)))

  0%|                                                                                                                                                                                                                                                                                                                        | 0/44 [00:00<?, ?it/s][Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 193.11it/s]
[Parallel(n_jobs=25)]: Done   5 out of  44 | elapsed: 121.2min remaining: 945.2min
[Parallel(n_jobs=25)]: Done  10 out of  44 | elapsed: 125.3min remaining: 426.1min
[Parallel(n_jobs=25)]: Done  15 out of  44 | elapsed: 129.3min remaining: 25

2004-01-01
2004-01-02
2004-01-03
2004-01-04
2004-01-05
2004-01-06
2004-01-07
2004-01-08
2004-01-09
2004-01-10
2004-01-11
2004-01-12
2004-01-13
2004-01-14
2004-01-15
2004-01-16
2004-01-17
2004-01-18
2004-01-19
2004-01-20
2004-01-21
2004-01-22
2004-01-23
2004-01-24
2004-01-25
2004-01-26
2004-01-27
2004-01-28
2004-01-29
2004-01-30
2004-01-31
2004-02-01
2004-02-02
2004-02-03
2004-02-04
2004-02-05
2004-02-06
2004-02-07
2004-02-08
2004-02-09
2004-02-10
2004-02-11
2004-02-12
2004-02-13
2004-02-14
2004-02-15
2004-02-16
2004-02-17
2004-02-18
2004-02-19
2004-02-20
2004-02-21
2004-02-22
2004-02-23
2004-02-24
2004-02-25
2004-02-26
2004-02-27
2004-02-28
2004-02-29
2004-03-01
2004-03-02
2004-03-03
2004-03-04
2004-03-05
2004-03-06
2004-03-07
2004-03-08
2004-03-09
2004-03-10
2004-03-11
2004-03-12
2004-03-13
2004-03-14
2004-03-15
2004-03-16
2004-03-17
2004-03-18
2004-03-19
2004-03-20
2004-03-21
2004-03-22
2004-03-23
2004-03-24
2004-03-25
2004-03-26
2004-03-27
2004-03-28
2004-03-29
2004-03-30
2004-03-31

[Parallel(n_jobs=25)]: Done  25 out of  44 | elapsed: 143.3min remaining: 108.9min


1985-01-01
1985-01-02
1985-01-03
1985-01-04
1985-01-05
1985-01-06
1985-01-07
1985-01-08
1985-01-09
1985-01-10
1985-01-11
1985-01-12
1985-01-13
1985-01-14
1985-01-15
1985-01-16
1985-01-17
1985-01-18
1985-01-19
1985-01-20
1985-01-21
1985-01-22
1985-01-23
1985-01-24
1985-01-25
1985-01-26
1985-01-27
1985-01-28
1985-01-29
1985-01-30
1985-01-31
1985-02-01
1985-02-02
1985-02-03
1985-02-04
1985-02-05
1985-02-06
1985-02-07
1985-02-08
1985-02-09
1985-02-10
1985-02-11
1985-02-12
1985-02-13
1985-02-14
1985-02-15
1985-02-16
1985-02-17
1985-02-18
1985-02-19
1985-02-20
1985-02-21
1985-02-22
1985-02-23
1985-02-24
1985-02-25
1985-02-26
1985-02-27
1985-02-28
1985-03-01
1985-03-02
1985-03-03
1985-03-04
1985-03-05
1985-03-06
1985-03-07
1985-03-08
1985-03-09
1985-03-10
1985-03-11
1985-03-12
1985-03-13
1985-03-14
1985-03-15
1985-03-16
1985-03-17
1985-03-18
1985-03-19
1985-03-20
1985-03-21
1985-03-22
1985-03-23
1985-03-24
1985-03-25
1985-03-26
1985-03-27
1985-03-28
1985-03-29
1985-03-30
1985-03-31
1985-04-01

[Parallel(n_jobs=25)]: Done  30 out of  44 | elapsed: 221.1min remaining: 103.2min


1983-01-01
1983-01-02
1983-01-03
1983-01-04
1983-01-05
1983-01-06
1983-01-07
1983-01-08
1983-01-09
1983-01-10
1983-01-11
1983-01-12
1983-01-13
1983-01-14
1983-01-15
1983-01-16
1983-01-17
1983-01-18
1983-01-19
1983-01-20
1983-01-21
1983-01-22
1983-01-23
1983-01-24
1983-01-25
1983-01-26
1983-01-27
1983-01-28
1983-01-29
1983-01-30
1983-01-31
1983-02-01
1983-02-02
1983-02-03
1983-02-04
1983-02-05
1983-02-06
1983-02-07
1983-02-08
1983-02-09
1983-02-10
1983-02-11
1983-02-12
1983-02-13
1983-02-14
1983-02-15
1983-02-16
1983-02-17
1983-02-18
1983-02-19
1983-02-20
1983-02-21
1983-02-22
1983-02-23
1983-02-24
1983-02-25
1983-02-26
1983-02-27
1983-02-28
1983-03-01
1983-03-02
1983-03-03
1983-03-04
1983-03-05
1983-03-06
1983-03-07
1983-03-08
1983-03-09
1983-03-10
1983-03-11
1983-03-12
1983-03-13
1983-03-14
1983-03-15
1983-03-16
1983-03-17
1983-03-18
1983-03-19
1983-03-20
1983-03-21
1983-03-22
1983-03-23
1983-03-24
1983-03-25
1983-03-26
1983-03-27
1983-03-28
1983-03-29
1983-03-30
1983-03-31
1983-04-01

[Parallel(n_jobs=25)]: Done  35 out of  44 | elapsed: 226.1min remaining: 58.1min


1981-01-01
1981-01-02
1981-01-03
1981-01-04
1981-01-05
1981-01-06
1981-01-07
1981-01-08
1981-01-09
1981-01-10
1981-01-11
1981-01-12
1981-01-13
1981-01-14
1981-01-15
1981-01-16
1981-01-17
1981-01-18
1981-01-19
1981-01-20
1981-01-21
1981-01-22
1981-01-23
1981-01-24
1981-01-25
1981-01-26
1981-01-27
1981-01-28
1981-01-29
1981-01-30
1981-01-31
1981-02-01
1981-02-02
1981-02-03
1981-02-04
1981-02-05
1981-02-06
1981-02-07
1981-02-08
1981-02-09
1981-02-10
1981-02-11
1981-02-12
1981-02-13
1981-02-14
1981-02-15
1981-02-16
1981-02-17
1981-02-18
1981-02-19
1981-02-20
1981-02-21
1981-02-22
1981-02-23
1981-02-24
1981-02-25
1981-02-26
1981-02-27
1981-02-28
1981-03-01
1981-03-02
1981-03-03
1981-03-04
1981-03-05
1981-03-06
1981-03-07
1981-03-08
1981-03-09
1981-03-10
1981-03-11
1981-03-12
1981-03-13
1981-03-14
1981-03-15
1981-03-16
1981-03-17
1981-03-18
1981-03-19
1981-03-20
1981-03-21
1981-03-22
1981-03-23
1981-03-24
1981-03-25
1981-03-26
1981-03-27
1981-03-28
1981-03-29
1981-03-30
1981-03-31
1981-04-01

[Parallel(n_jobs=25)]: Done  40 out of  44 | elapsed: 232.7min remaining: 23.3min


1982-01-01
1982-01-02
1982-01-03
1982-01-04
1982-01-05
1982-01-06
1982-01-07
1982-01-08
1982-01-09
1982-01-10
1982-01-11
1982-01-12
1982-01-13
1982-01-14
1982-01-15
1982-01-16
1982-01-17
1982-01-18
1982-01-19
1982-01-20
1982-01-21
1982-01-22
1982-01-23
1982-01-24
1982-01-25
1982-01-26
1982-01-27
1982-01-28
1982-01-29
1982-01-30
1982-01-31
1982-02-01
1982-02-02
1982-02-03
1982-02-04
1982-02-05
1982-02-06
1982-02-07
1982-02-08
1982-02-09
1982-02-10
1982-02-11
1982-02-12
1982-02-13
1982-02-14
1982-02-15
1982-02-16
1982-02-17
1982-02-18
1982-02-19
1982-02-20
1982-02-21
1982-02-22
1982-02-23
1982-02-24
1982-02-25
1982-02-26
1982-02-27
1982-02-28
1982-03-01
1982-03-02
1982-03-03
1982-03-04
1982-03-05
1982-03-06
1982-03-07
1982-03-08
1982-03-09
1982-03-10
1982-03-11
1982-03-12
1982-03-13
1982-03-14
1982-03-15
1982-03-16
1982-03-17
1982-03-18
1982-03-19
1982-03-20
1982-03-21
1982-03-22
1982-03-23
1982-03-24
1982-03-25
1982-03-26
1982-03-27
1982-03-28
1982-03-29
1982-03-30
1982-03-31
1982-04-01

[Parallel(n_jobs=25)]: Done  44 out of  44 | elapsed: 237.1min finished


In [10]:
# Display the per-year parquet paths returned by the workers.
all_files

[PosixPath('/mnt/wks3/aschneuwl/workspace/data/preprocessed/meteo/weather_zip_1980.parquet'),
 PosixPath('/mnt/wks3/aschneuwl/workspace/data/preprocessed/meteo/weather_zip_1981.parquet'),
 PosixPath('/mnt/wks3/aschneuwl/workspace/data/preprocessed/meteo/weather_zip_1982.parquet'),
 PosixPath('/mnt/wks3/aschneuwl/workspace/data/preprocessed/meteo/weather_zip_1983.parquet'),
 PosixPath('/mnt/wks3/aschneuwl/workspace/data/preprocessed/meteo/weather_zip_1984.parquet'),
 PosixPath('/mnt/wks3/aschneuwl/workspace/data/preprocessed/meteo/weather_zip_1985.parquet'),
 PosixPath('/mnt/wks3/aschneuwl/workspace/data/preprocessed/meteo/weather_zip_1986.parquet'),
 PosixPath('/mnt/wks3/aschneuwl/workspace/data/preprocessed/meteo/weather_zip_1987.parquet'),
 PosixPath('/mnt/wks3/aschneuwl/workspace/data/preprocessed/meteo/weather_zip_1988.parquet'),
 PosixPath('/mnt/wks3/aschneuwl/workspace/data/preprocessed/meteo/weather_zip_1989.parquet'),
 PosixPath('/mnt/wks3/aschneuwl/workspace/data/preprocessed/

In [11]:
# Stitch the per-year files into one frame.
# NOTE: concat keeps each file's own row labels (no ignore_index), so index
# labels repeat in the result — do not rely on the index being unique.
daily_weather_data_zip_centers = pd.concat([pd.read_parquet(f) for f in all_files])

In [None]:
# Calendar year of each row; used as the partition column when the frame is written.
daily_weather_data_zip_centers["year"] = daily_weather_data_zip_centers["time"].dt.year

In [None]:

# Lagged copies of every weather variable: <var>_t1 holds the previous row's
# value for the same ZIP4, <var>_t2 the value two rows back.
# shift() works purely on row order, so put the rows in chronological order
# within each ZIP4 first instead of relying on the order the files were read.
daily_weather_data_zip_centers = daily_weather_data_zip_centers.sort_values(["ZIP4", "time"])

for column in ("TabsD", "TmaxD", "TminD", "RhiresD", "SrelD", "relHumDMax", "relHumDMin", "relHumDMean"):
    for lag in (1, 2):
        daily_weather_data_zip_centers[f"{column}_t{lag}"] = daily_weather_data_zip_centers.groupby('ZIP4')[column].shift(lag)

In [12]:
# Write one partition directory per year.
# pyarrow names the files of every write uniquely, so writing into a dataset
# directory that already holds data would leave the old files in place and
# duplicate the rows on the next read. "delete_matching" clears each year
# partition right before it is rewritten, which makes re-running this cell safe.
daily_weather_data_zip_centers.to_parquet(
    "/mnt/wks3/aschneuwl/workspace/data/preprocessed/meteo/daily_weather_data_zip_centers.parquet",
    partition_cols=["year"],
    existing_data_behavior="delete_matching",
)

In [None]:
# Remove the per-year files listed in `all_files` — presumably no longer
# needed once the combined dataset has been written.
# missing_ok lets the cell be re-run after an earlier or partial cleanup.
for yearly_file in all_files:
    yearly_file.unlink(missing_ok=True)

In [None]:
def thi(t: float, rel_h: float) -> float:
    """Return the THI value for one temperature / relative-humidity pair.

    Evaluates ``(1.8*t + 32) - (0.55 - 0.0055*rel_h) * (1.8*t - 26)``.

    NOTE(review): the ``1.8*t + 32`` term suggests ``t`` is in degrees Celsius
    and the 0.0055 factor suggests ``rel_h`` is a percentage — confirm against
    the input data.
    """
    scaled_temperature = 1.8 * t
    humidity_weight = 0.55 - (0.0055 * rel_h)
    return (scaled_temperature + 32) - humidity_weight * (scaled_temperature - 26)

In [11]:
# Reload the combined, year-partitioned dataset from disk.
daily_weather_data_zip_centers = pd.read_parquet(ws2 / Path("data/preprocessed/meteo/daily_weather_data_zip_centers.parquet"))

In [12]:
# Chronological order within each ZIP4 — the groupby/shift lag features
# computed further down depend on this row order.
daily_weather_data_zip_centers = daily_weather_data_zip_centers.sort_values(["ZIP4", "time"])

Unnamed: 0,ZIP4,time,TmaxD,TminD,TabsD,RhiresD,SrelD,relHumDMax,relHumDMin,relHumDMean,stationId,year
0,1000,1980-01-01,-1.336796,-6.070635,-3.480043,0.000000,41.029152,88.0,73.0,81.1,CHD,1980
1,1003,1980-01-01,1.447627,-2.048699,-0.100141,0.016947,38.444569,98.0,65.0,82.9,CLA,1980
2,1004,1980-01-01,1.066458,-3.111316,-0.803274,0.018315,39.063473,98.0,65.0,82.9,CLA,1980
3,1005,1980-01-01,1.271688,-2.532606,-0.422468,0.012610,38.361488,98.0,65.0,82.9,CLA,1980
4,1006,1980-01-01,2.653433,-2.097073,0.448977,0.010497,38.606064,98.0,65.0,82.9,CLA,1980
...,...,...,...,...,...,...,...,...,...,...,...,...
3176,9652,2023-12-31,9.768590,-2.125596,2.631709,6.366999,4.006831,94.4,57.6,82.4,MMRIC,2023
3177,9655,2023-12-31,9.567946,-2.100390,2.831954,5.651462,4.947568,97.9,39.0,68.4,MMWAA,2023
3178,9656,2023-12-31,7.388371,-2.526915,1.938660,5.123246,4.810104,91.6,45.8,69.1,MMWET,2023
3179,9657,2023-12-31,10.765450,-0.130328,4.686968,4.613649,5.206018,97.9,39.0,68.4,MMWAA,2023


In [15]:
# Lagged copies of every weather variable: <var>_t1 holds the previous row's
# value for the same ZIP4, <var>_t2 the value two rows back. shift() works on
# row order, so the frame must already be ordered by time within each ZIP4.
for column in ("TabsD", "TmaxD", "TminD", "RhiresD", "SrelD", "relHumDMax", "relHumDMin", "relHumDMean"):
    for lag in (1, 2):
        daily_weather_data_zip_centers[f"{column}_t{lag}"] = daily_weather_data_zip_centers.groupby('ZIP4')[column].shift(lag)

In [16]:
# Write the frame (now including the lag columns) back as one partition
# directory per year.
# This targets the same directory the data was read from. pyarrow names the
# files of every write uniquely, so without further options the old files would
# stay next to the new ones and every row would be read twice afterwards.
# "delete_matching" clears each year partition right before it is rewritten.
daily_weather_data_zip_centers.to_parquet(
    "/mnt/wks3/aschneuwl/workspace/data/preprocessed/meteo/daily_weather_data_zip_centers.parquet",
    partition_cols=["year"],
    existing_data_behavior="delete_matching",
)