In [1]:
import pandas as pd
from pathlib import Path
from joblib import Parallel, delayed
import itertools
import numpy as np
from tqdm import tqdm
from typing import List

In [2]:
# Root directory under which all input and output files of this notebook live.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
workspace = Path("/mnt/wks3/aschneuwl/workspace")

In [3]:
# Input tables, both resolved relative to `workspace`.
# The `/` operator on a Path accepts plain strings, so no extra Path() wrapper is needed.
milk_data_fpath = workspace / "data/preprocessed/dairy/k33_milk_fat_protein_iqa_filtered.parquet"
weather_fpath = workspace / "data/preprocessed/meteo/daily_weather_data_zip_centers.parquet"

In [4]:
# Load the complete weather table (all years, all columns) for a quick visual inspection.
w = pd.read_parquet(weather_fpath)

In [5]:
# Show the weather table; pandas truncates the rendering to the first/last rows and columns.
w

Unnamed: 0,ZIP4,time,TmaxD,TminD,TabsD,RhiresD,SrelD,relHumDMax,relHumDMin,relHumDMean,...,RhiresD_t2,SrelD_t1,SrelD_t2,relHumDMax_t1,relHumDMax_t2,relHumDMin_t1,relHumDMin_t2,relHumDMean_t1,relHumDMean_t2,year
0,1000,1980-01-01,-1.336796,-6.070635,-3.480043,0.000000,41.029152,88.0,73.0,81.1,...,,,,,,,,,,1980
1,1003,1980-01-01,1.447627,-2.048699,-0.100141,0.016947,38.444569,98.0,65.0,82.9,...,,,,,,,,,,1980
2,1004,1980-01-01,1.066458,-3.111316,-0.803274,0.018315,39.063473,98.0,65.0,82.9,...,,,,,,,,,,1980
3,1005,1980-01-01,1.271688,-2.532606,-0.422468,0.012610,38.361488,98.0,65.0,82.9,...,,,,,,,,,,1980
4,1006,1980-01-01,2.653433,-2.097073,0.448977,0.010497,38.606064,98.0,65.0,82.9,...,,,,,,,,,,1980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3176,9652,2023-12-31,9.768590,-2.125596,2.631709,6.366999,4.006831,94.4,57.6,82.4,...,2.195905,93.206398,0.918633,91.3,91.3,65.6,69.2,83.9,81.8,2023
3177,9655,2023-12-31,9.567946,-2.100390,2.831954,5.651462,4.947568,97.9,39.0,68.4,...,1.236877,96.219353,1.713098,95.9,97.2,38.6,51.7,83.9,77.7,2023
3178,9656,2023-12-31,7.388371,-2.526915,1.938660,5.123246,4.810104,91.6,45.8,69.1,...,0.856907,96.908875,2.523607,87.2,85.3,77.6,71.0,82.5,78.9,2023
3179,9657,2023-12-31,10.765450,-0.130328,4.686968,4.613649,5.206018,97.9,39.0,68.4,...,0.982836,97.069481,4.692240,95.9,97.2,38.6,51.7,83.9,77.7,2023


In [4]:
# Distinct years covered by the weather table, ascending; one merge task is run per year.
# Only the `year` column is read, so the wide weather table is not loaded in full
# just to list the years. `sorted` already returns a list, so no extra list() is needed.
years = sorted(pd.read_parquet(weather_fpath, columns=["year"]).year.unique())

In [5]:
def merge_milk_weather_data(year: int, milk_fpath: Path, weather_fpath: Path):
    """Join one year of milk records with the weather of the same ZIP code and day.

    Both parquet files are read with a row filter on ``year == year`` (pyarrow
    engine), so each call only loads a single year of each table. Both tables
    must carry a ``year`` column.

    Args:
        year: Year to load from both tables.
        milk_fpath: Parquet file with the milk records; joined on ``zip`` and
            ``sampleWeighingDate``.
        weather_fpath: Parquet file with the daily weather; joined on ``ZIP4``
            and ``time``.

    Returns:
        A DataFrame with one row per milk record that has a matching weather
        row (inner join). The weather-side join keys and the suffixed year
        columns are removed; the milk-side year is kept as ``year`` and placed
        as the last column.
    """
    year_filter = [[("year", "==", year)]]
    milk = pd.read_parquet(milk_fpath, filters=year_filter, engine="pyarrow")
    weather = pd.read_parquet(weather_fpath, filters=year_filter, engine="pyarrow")

    return (
        milk.merge(
            weather,
            how="inner",
            left_on=["zip", "sampleWeighingDate"],
            right_on=["ZIP4", "time"],
        )
        # Both inputs have a `year` column, so the merge suffixes them to
        # year_x / year_y; re-expose the milk-side value under the plain name.
        .assign(year=lambda merged: merged["year_x"])
        .drop(columns=["time", "ZIP4", "year_x", "year_y"])
    )

In [6]:
# One merge task per year, spread over all available cores (n_jobs=-1).
# tqdm wraps the iterable that Parallel consumes, so the bar advances as tasks
# are handed to joblib rather than as the merges finish.
merge_jobs = (
    delayed(merge_milk_weather_data)(year, milk_data_fpath, weather_fpath)
    for year in tqdm(years)
)
filtered = Parallel(n_jobs=-1)(merge_jobs)
# Discard missing results so that only DataFrames are left in the list.
filtered = [frame for frame in filtered if frame is not None]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:02<00:00, 21.93it/s]


In [7]:
# Stack the per-year merge results into a single table.
# Years with weather but no matching milk samples yield empty frames. Concatenating
# empty entries is deprecated in pandas (FutureWarning; presumably the warning echoed
# in this cell's output, to be confirmed), so they are excluded explicitly.
non_empty = [part for part in filtered if not part.empty]
# If every part is empty, fall back to the full list so the result is still an
# (empty) DataFrame with the expected columns instead of a "No objects to concatenate" error.
filtered = pd.concat(non_empty or filtered)

  filtered = pd.concat(filtered)
  filtered = pd.concat(filtered)


In [13]:
# Sanity check: (rows, columns) of the merged milk/weather table.
filtered.shape

(132266438, 49)

In [9]:
# Earliest sample date that survived the inner join with the weather data.
filtered.sampleWeighingDate.min()

Timestamp('1982-01-05 00:00:00')

In [10]:
# Latest sample date that survived the inner join with the weather data.
filtered.sampleWeighingDate.max()

Timestamp('2023-12-31 00:00:00')

In [14]:
# Alphabetical list of the merged table's columns (milk fields plus weather features).
sorted(filtered.columns)

['RhiresD',
 'RhiresD_t1',
 'RhiresD_t2',
 'SrelD',
 'SrelD_t1',
 'SrelD_t2',
 'TabsD',
 'TabsD_t1',
 'TabsD_t2',
 'TmaxD',
 'TmaxD_t1',
 'TmaxD_t2',
 'TminD',
 'TminD_t1',
 'TminD_t2',
 'aceton',
 'acetonIr',
 'acetonMmol',
 'altitude',
 'animalBreedCode',
 'animalId',
 'bhbConcentration',
 'calvingDate',
 'days_in_milk',
 'farmIdLocationSample',
 'fat',
 'lactationNumber',
 'lactose',
 'locationType',
 'milk',
 'milkUreaNitrogen',
 'milkingMethod',
 'protein',
 'relHumDMax',
 'relHumDMax_t1',
 'relHumDMax_t2',
 'relHumDMean',
 'relHumDMean_t1',
 'relHumDMean_t2',
 'relHumDMin',
 'relHumDMin_t1',
 'relHumDMin_t2',
 'sampleMethod',
 'sampleWeighingDate',
 'somaticCellCount',
 'stationId',
 'weighingType',
 'year',
 'zip']

In [21]:
# Preview the output location: the milk file's base name (text before the first ".")
# with a "_with_weather" suffix, in the preprocessed dairy folder.
# Single quotes are used inside the f-string because reusing the enclosing double
# quotes in a replacement field is a SyntaxError before Python 3.12.
workspace / f"data/preprocessed/dairy/{milk_data_fpath.name.split('.')[0]}_with_weather.parquet"

PosixPath('/mnt/wks3/aschneuwl/workspace/data/preprocessed/dairy/k33_milk_fat_protein_iqa_filtered_with_weather.parquet')

In [12]:
# Persist the merged table in the preprocessed dairy folder, named after the milk
# input file (text before the first ".") plus a "_with_weather" suffix.
# Single quotes are used inside the f-string because reusing the enclosing double
# quotes in a replacement field is a SyntaxError before Python 3.12.
merged_fpath = workspace / f"data/preprocessed/dairy/{milk_data_fpath.name.split('.')[0]}_with_weather.parquet"
filtered.to_parquet(merged_fpath)