In [1]:
import pandas as pd

In [51]:
# Read the two processed inputs: annual crop yields and monthly state weather.
df_crop, df_weather = map(
    pd.read_parquet,
    (
        "data/processed/crop_yield_annual.parquet",
        "data/processed/weather_state_month.parquet",
    ),
)

# Print each frame's column dtypes so the merge keys can be compared.
# NOTE(review): the printed dtypes below list `yield` as object, not a numeric
# type — confirm it is converted before any numeric use.
print(df_crop.dtypes, '\n')
print(df_weather.dtypes)

state_alpha              object
state_name               object
year                      int64
crop                     object
yield_unit               object
yield                    object
reference_period_desc    object
source_desc              object
dtype: object 

state          object
year            int64
month           int64
avg_temp      float64
prcp          float64
n_stations      int64
temp_Z        float64
prcp_Z        float64
dtype: object


In [52]:
# Count missing values per column in each frame before any cleaning.
print(df_crop.isna().sum(), '\n')
print(df_weather.isna().sum(), '\n')

# Listwise deletion: drop every weather row that contains any missing value.
# The name is rebound to the result rather than using inplace=True, so the
# frame object that was loaded is not modified behind the reader's back.
# NOTE(review): each weather row carries a `month`, so dropping a row removes
# that month for its state; confirm later per-year aggregates tolerate gaps.
df_weather = df_weather.dropna()

# Compare year coverage of the two frames (printed as: max, min).
print(df_weather['year'].max(), df_weather['year'].min())
print(df_crop['year'].max(), df_crop['year'].min())

state_alpha              0
state_name               0
year                     0
crop                     0
yield_unit               0
yield                    0
reference_period_desc    0
source_desc              0
dtype: int64 

state          0
year           0
month          0
avg_temp      35
prcp           8
n_stations     0
temp_Z        35
prcp_Z         8
dtype: int64 

2026 1884
2025 1961


In [53]:
# Display the weather frame's column labels (the cell's last expression is its output).
df_weather.columns

Index(['state', 'year', 'month', 'avg_temp', 'prcp', 'n_stations', 'temp_Z',
       'prcp_Z'],
      dtype='object')

In [54]:
# Restrict the weather rows to the years covered by the crop data. The bounds
# are read from df_crop instead of being typed in, so they keep matching the
# crop file if it is refreshed (the year-range printout above shows 1961-2025).
df_weather = df_weather[
    df_weather["year"].between(df_crop["year"].min(), df_crop["year"].max())
]

# Collapse to one row per (state, year): mean temperature, total
# precipitation, mean of the Z-score columns, and mean station count.
# NOTE(review): this rebinds df_weather from monthly to annual rows, so the
# cell cannot be re-run on its own; the monthly columns it aggregates are gone
# after the first run. Reload the weather data before re-running.
# NOTE(review): prcp_yr sums only the monthly rows still present for a
# state-year; confirm every state-year has all of its months.
df_weather = df_weather.groupby(["state", "year"], as_index=False).agg(
        avg_temp_yr=("avg_temp", "mean"),
        prcp_yr=("prcp", "sum"),
        temp_Z_yr=("temp_Z", "mean"),
        prcp_Z_yr=("prcp_Z", "mean"),
        n_stations_yr=("n_stations", "mean"),
    )

In [55]:
# Attach the weather columns to each crop row, matching on state code and year.
# how="inner" keeps only crop rows that have a matching weather row.
# validate="many_to_one" makes pandas raise MergeError if a (state, year) key
# occurs more than once in df_weather, instead of silently duplicating crop
# rows, e.g. when the weather frame has not been aggregated to one row per
# state-year before this cell runs.
# The key columns have different names on each side, so the result keeps both
# `state_alpha` and `state`.
df_panel = df_crop.merge(
    df_weather,
    left_on=["state_alpha", "year"],
    right_on=["state", "year"],
    how="inner",
    validate="many_to_one",
)