In [2]:
import pandas as pd
import requests
import time



In [9]:
# GET NASA DATA
# api docs: https://power.larc.nasa.gov/api/pages/?urls.primaryName=Monthly+%26+Annual#/Data%20Requests/monthly_single_point_data_request_api_temporal_monthly_point_get
#
# For every row of counties.csv, request monthly NASA POWER values at the
# county's INTPTLAT/INTPTLONG point, keep the April-September months, and build
# one record per (county, year, month) with one column per requested parameter.
# The combined table is written to nasa_weather.csv.

counties = pd.read_csv('counties.csv')

BASE_URL = "https://power.larc.nasa.gov/api/temporal/monthly/point"
PARAMETERS = "ALLSKY_SFC_SW_DWN,RH2M,WS2M,WS2M_MAX" # solar radiation, humidity, wind speed avg, wind speed max
START_DATE = "1984"
END_DATE = "2024"
REQUEST_TIMEOUT = 60  # seconds; without a timeout one stalled connection blocks the whole loop
# NOTE(review): -999 is the missing-data marker described in the POWER docs; it is
# only used when the response header does not carry its own fill_value -- confirm.
DEFAULT_FILL_VALUE = -999.0

all_data = []
failed_counties = []  # counties whose request or parsing failed; reported after the loop
for idx, row in counties.iterrows():
    county_name = row['NAME']

    # specify counties by their lon/lat centroids
    lat = row['INTPTLAT']
    lon = row['INTPTLONG']

    print(f"[{idx+1}/{len(counties)}] Getting data for: {county_name}")

    params = {
        'start': START_DATE,
        'end': END_DATE,
        'latitude': lat,
        'longitude': lon,
        'community': 'ag',
        'parameters': PARAMETERS,
        'format': 'json'
    }

    try:
        response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        payload = response.json()
        header = payload.get('header') or {}
        fill_value = header.get('fill_value', DEFAULT_FILL_VALUE)
        data = payload['properties']['parameter']

        monthly_rows = {}

        for param_name, monthly_values in data.items():
            for month_key, value in monthly_values.items():
                # month keys are sliced as a 4-digit year followed by a 2-digit month
                year = int(month_key[:4])
                month = int(month_key[4:6])

                # use april to september (this also skips any month code above 12)
                if 4 <= month <= 9:
                    key = (county_name, year, month)
                    record = monthly_rows.setdefault(key, {
                        "County": county_name,
                        "Year": year,
                        "Month": month
                    })

                    # store the fill value as NaN so missing data is visible to isnull()
                    record[param_name] = float('nan') if value == fill_value else value

        all_data.extend(monthly_rows.values())

    except Exception as e:
        # best effort: keep going, but remember which county was lost and why
        failed_counties.append(county_name)
        print(f"  Failed for {county_name}: {e}")

    # pause between requests to stay polite to the API
    time.sleep(0.5)

if failed_counties:
    print(f"No data retrieved for {len(failed_counties)} counties: {failed_counties}")

if not all_data:
    # an empty frame has no 'County' column, so sorting would fail with an opaque KeyError
    raise RuntimeError("No NASA POWER data was retrieved; nasa_weather.csv was not written")

nasa_df = pd.DataFrame(all_data)
nasa_df = nasa_df.sort_values(['County', 'Year', 'Month']).reset_index(drop=True)
nasa_df.to_csv('nasa_weather.csv', index=False)
nasa_df

[1/72] Getting data for: Adams County
[2/72] Getting data for: Ashland County
[3/72] Getting data for: Barron County
[4/72] Getting data for: Bayfield County
[5/72] Getting data for: Brown County
[6/72] Getting data for: Buffalo County
[7/72] Getting data for: Burnett County
[8/72] Getting data for: Calumet County
[9/72] Getting data for: Chippewa County
[10/72] Getting data for: Clark County
[11/72] Getting data for: Columbia County
[12/72] Getting data for: Crawford County
[13/72] Getting data for: Dane County
[14/72] Getting data for: Dodge County
[15/72] Getting data for: Door County
[16/72] Getting data for: Douglas County
[17/72] Getting data for: Dunn County
[18/72] Getting data for: Eau Claire County
[19/72] Getting data for: Florence County
[20/72] Getting data for: Fond du Lac County
[21/72] Getting data for: Forest County
[22/72] Getting data for: Grant County
[23/72] Getting data for: Green County
[24/72] Getting data for: Green Lake County
[25/72] Getting data for: Iowa Co

Unnamed: 0,County,Year,Month,ALLSKY_SFC_SW_DWN,RH2M,WS2M,WS2M_MAX
0,Adams County,1984,4,16.85,75.80,4.40,12.36
1,Adams County,1984,5,17.63,74.25,3.26,9.64
2,Adams County,1984,6,20.75,78.96,3.42,9.15
3,Adams County,1984,7,21.90,74.48,2.68,8.19
4,Adams County,1984,8,18.52,68.76,2.31,6.40
...,...,...,...,...,...,...,...
17707,Wood County,2024,5,19.51,74.42,3.12,8.31
17708,Wood County,2024,6,19.37,80.19,3.06,7.51
17709,Wood County,2024,7,21.09,80.56,2.03,5.34
17710,Wood County,2024,8,18.64,79.33,2.38,5.90


In [26]:
# CLEAN NASA DATA
# Re-order the rows, swap the NASA parameter codes for readable column names,
# and normalise county names (no " County" suffix, lower case, "st croix"
# without the period) before saving and checking for nulls.

nasa_df = (
    nasa_df
    .sort_values(['County', 'Year'])
    .reset_index(drop=True)
    .rename(columns={
        'ALLSKY_SFC_SW_DWN': 'solar_radiation',
        'RH2M': 'humidity',
        'WS2M': 'wind_speed',
        'WS2M_MAX': 'wind_speed_max',
    })
    .assign(
        County=lambda frame: (
            frame['County']
            .str.replace(' County', '', regex=False)
            .str.lower()
            .str.replace('st. croix', 'st croix', regex=False)
        )
    )
)
nasa_df.to_csv('nasa_weather.csv', index=False)

null_counts = nasa_df.isnull().sum()
print(null_counts)
nasa_df

County             0
Year               0
Month              0
solar_radiation    0
humidity           0
wind_speed         0
wind_speed_max     0
dtype: int64


Unnamed: 0,County,Year,Month,solar_radiation,humidity,wind_speed,wind_speed_max
0,adams,1984,4,16.85,75.80,4.40,12.36
1,adams,1984,5,17.63,74.25,3.26,9.64
2,adams,1984,6,20.75,78.96,3.42,9.15
3,adams,1984,7,21.90,74.48,2.68,8.19
4,adams,1984,8,18.52,68.76,2.31,6.40
...,...,...,...,...,...,...,...
17707,wood,2024,5,19.51,74.42,3.12,8.31
17708,wood,2024,6,19.37,80.19,3.06,7.51
17709,wood,2024,7,21.09,80.56,2.03,5.34
17710,wood,2024,8,18.64,79.33,2.38,5.90


In [None]:
# GET PRISM DATA
# prism data: https://prism.oregonstate.edu/explorer/bulk.php
#
# Each export is read, its Date column is parsed, Year and Month columns are
# derived from it, and only the April-September rows are kept. The per-file
# frames are then stacked (original row labels are preserved).

csv_list = ['prism_weather_1984-1994.csv', 'prism_weather_1995-2009.csv', 'prism_weather_2010-2024.csv']

dfs = [
    pd.read_csv(path)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .assign(
        Year=lambda frame: frame['Date'].dt.year,
        Month=lambda frame: frame['Date'].dt.month,
    )
    # use april to september
    .loc[lambda frame: frame['Month'].between(4, 9)]
    for path in csv_list
]

prism_df = pd.concat(dfs)
prism_df

Unnamed: 0,Name,Longitude,Latitude,Elevation (ft),Date,ppt (inches),tmin (degrees F),tmean (degrees F),tmax (degrees F),tdmean (degrees F),vpdmin (hPa),vpdmax (hPa),Year,Month
3,Adams,-89.7672,43.9738,991,1984-04-01,3.83,35.5,46.5,57.5,29.7,1.57,10.85,1984,4
4,Adams,-89.7672,43.9738,991,1984-05-01,2.27,41.4,53.7,65.9,40.2,1.12,13.40,1984,5
5,Adams,-89.7672,43.9738,991,1984-06-01,5.96,56.2,68.1,80.0,57.0,1.20,18.43,1984,6
6,Adams,-89.7672,43.9738,991,1984-07-01,3.29,56.9,69.4,81.9,58.0,1.08,20.90,1984,7
7,Adams,-89.7672,43.9738,991,1984-08-01,2.54,58.5,70.9,83.2,60.5,0.85,20.17,1984,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12952,Wood,-90.0388,44.4614,1060,2024-05-01,8.23,47.1,58.7,70.3,47.1,0.86,15.00,2024,5
12953,Wood,-90.0388,44.4614,1060,2024-06-01,6.51,55.4,65.8,76.3,57.8,0.55,14.60,2024,6
12954,Wood,-90.0388,44.4614,1060,2024-07-01,3.69,59.0,69.6,80.2,61.3,0.44,16.81,2024,7
12955,Wood,-90.0388,44.4614,1060,2024-08-01,6.70,57.5,68.9,80.3,60.6,0.46,17.65,2024,8


In [17]:
# CLEAN PRISM DATA
# Rename the PRISM columns to short names, order the rows, normalise county
# names (lower case, "st croix" without the period), drop the location/date
# columns, then save the result and report null counts.
prism_df = prism_df.rename(columns={
    'Name': 'County',
    'ppt (inches)': 'precip',
    'tmin (degrees F)': 'temp_min',
    'tmean (degrees F)': 'temp_mean',
    'tmax (degrees F)': 'temp_max',
    'tdmean (degrees F)': 'dewpoint_mean',
    'vpdmax (hPa)': 'vpd_max',
    'vpdmin (hPa)': 'vpd_min'
})

# include Month so the row order within a county-year does not depend on input order
prism_df = prism_df.sort_values(['County', 'Year', 'Month']).reset_index(drop=True)
prism_df['County'] = (
    prism_df['County']
    .str.lower()
    .str.replace('st. croix', 'st croix', regex=False)
)
# errors='ignore' keeps the cell re-runnable: on a second run these columns are already gone
prism_df = prism_df.drop(columns=['Longitude', 'Latitude', 'Elevation (ft)', 'Date'], errors='ignore')
prism_df.to_csv('prism_weather.csv', index=False)
print(prism_df.isnull().sum())
prism_df

County           0
precip           0
temp_min         0
temp_mean        0
temp_max         0
dewpoint_mean    0
vpd_min          0
vpd_max          0
Year             0
Month            0
dtype: int64


Unnamed: 0,County,precip,temp_min,temp_mean,temp_max,dewpoint_mean,vpd_min,vpd_max,Year,Month
0,adams,3.83,35.5,46.5,57.5,29.7,1.57,10.85,1984,4
1,adams,2.27,41.4,53.7,65.9,40.2,1.12,13.40,1984,5
2,adams,5.96,56.2,68.1,80.0,57.0,1.20,18.43,1984,6
3,adams,3.29,56.9,69.4,81.9,58.0,1.08,20.90,1984,7
4,adams,2.54,58.5,70.9,83.2,60.5,0.85,20.17,1984,8
...,...,...,...,...,...,...,...,...,...,...
17707,wood,8.23,47.1,58.7,70.3,47.1,0.86,15.00,2024,5
17708,wood,6.51,55.4,65.8,76.3,57.8,0.55,14.60,2024,6
17709,wood,3.69,59.0,69.6,80.2,61.3,0.44,16.81,2024,7
17710,wood,6.70,57.5,68.9,80.3,60.6,0.46,17.65,2024,8


In [27]:
# MERGE WEATHER DATA
# Left-join the PRISM columns onto the NASA rows by county, year and month,
# save the merged table, and report null counts (a null in a PRISM column
# means that NASA row found no PRISM match).

weather_df = pd.merge(nasa_df, prism_df, on=['Year', 'County', 'Month'], how='left')
# weather_clean.csv must hold the merged frame, not one of its inputs
weather_df.to_csv('weather_clean.csv', index=False)
print(weather_df.isnull().sum())
weather_df

County             0
Year               0
Month              0
solar_radiation    0
humidity           0
wind_speed         0
wind_speed_max     0
precip             0
temp_min           0
temp_mean          0
temp_max           0
dewpoint_mean      0
vpd_min            0
vpd_max            0
dtype: int64


Unnamed: 0,County,Year,Month,solar_radiation,humidity,wind_speed,wind_speed_max,precip,temp_min,temp_mean,temp_max,dewpoint_mean,vpd_min,vpd_max
0,adams,1984,4,16.85,75.80,4.40,12.36,3.83,35.5,46.5,57.5,29.7,1.57,10.85
1,adams,1984,5,17.63,74.25,3.26,9.64,2.27,41.4,53.7,65.9,40.2,1.12,13.40
2,adams,1984,6,20.75,78.96,3.42,9.15,5.96,56.2,68.1,80.0,57.0,1.20,18.43
3,adams,1984,7,21.90,74.48,2.68,8.19,3.29,56.9,69.4,81.9,58.0,1.08,20.90
4,adams,1984,8,18.52,68.76,2.31,6.40,2.54,58.5,70.9,83.2,60.5,0.85,20.17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17707,wood,2024,5,19.51,74.42,3.12,8.31,8.23,47.1,58.7,70.3,47.1,0.86,15.00
17708,wood,2024,6,19.37,80.19,3.06,7.51,6.51,55.4,65.8,76.3,57.8,0.55,14.60
17709,wood,2024,7,21.09,80.56,2.03,5.34,3.69,59.0,69.6,80.2,61.3,0.44,16.81
17710,wood,2024,8,18.64,79.33,2.38,5.90,6.70,57.5,68.9,80.3,60.6,0.46,17.65


In [None]:
# GET SOIL DATA
# soil grids data: https://rest.isric.org/soilgrids/v2.0/docs#/default/
#
# For every row of counties.csv, query the SoilGrids properties endpoint for the
# 0-5cm mean of each property at the county's INTPTLAT/INTPTLONG point. If the
# point yields no clay value, nearby offsets are tried in turn. Counties with no
# usable point are listed after the loop instead of being dropped silently.
# NOTE(review): values are stored exactly as returned; the sample output suggests
# scaled integer units (e.g. phh2o 60) -- confirm conversion factors before use.

counties = pd.read_csv('counties.csv')

BASE_URL = "https://rest.isric.org/soilgrids/v2.0/properties/query"
# soil ph (H2O), clay content, sand content, silt content, soil organic carbon, total nitrogen, bulk density, cation exchange capacity 
PROPERTIES = ["phh2o", "clay", "sand", "silt", "soc", "nitrogen", "bdod", "cec"]

# offsets (in degrees; 1-2 km shifts) to try if initial centroid fails
OFFSETS = [(0, 0), (0.01, 0), (-0.01, 0), (0, 0.01), (0, -0.01), (0.02, 0), (-0.02, 0), (0, 0.02), (0, -0.02)]

REQUEST_TIMEOUT = 60  # seconds; without a timeout one stalled connection blocks the whole loop

all_data = []
failed_counties = []  # counties for which no offset produced a clay value
for idx, row in counties.iterrows():
    county_name = row['NAME']

    # original centroid
    original_lat = row['INTPTLAT']
    original_lon = row['INTPTLONG']

    print(f"[{idx+1}/{len(counties)}] Getting data for: {county_name}")

    success = False
    for delta_lat, delta_lon in OFFSETS:
        current_lat = original_lat + delta_lat
        current_lon = original_lon + delta_lon

        params = {
            'lat': current_lat,
            'lon': current_lon,
            'depth': ['0-5cm'],
            'property': PROPERTIES,
            'value': ['mean']
        }

        # fresh record per attempt so a failed attempt leaves nothing behind
        layer_data = {
            'County': county_name
        }

        try:
            response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()

            layers = response.json()['properties']['layers']

            for layer in layers:
                # only one depth is requested, so the first depth entry is the one we want
                layer_data[layer['name']] = layer['depths'][0]['values']['mean']

            # clay is the indicator for "this point has data"; .get() treats a
            # missing clay layer like a null one and moves on to the next offset
            if layer_data.get('clay') is not None:
                all_data.append(layer_data)
                success = True
                break

        except Exception as e:
            # best effort: report and fall through to the next offset
            print(f"  Error on attempt: {e}")

        # wait between attempts to respect the service's rate limits
        time.sleep(5)

    if not success:
        failed_counties.append(county_name)
        print(f"  No soil data found for {county_name} at any offset")

    time.sleep(5)

if failed_counties:
    print(f"No soil data retrieved for {len(failed_counties)} counties: {failed_counties}")

soil_df = pd.DataFrame(all_data)
print(soil_df.isnull().sum())

soil_df

[1/72] Getting data for: Adams County
[2/72] Getting data for: Ashland County
[3/72] Getting data for: Barron County
[4/72] Getting data for: Bayfield County
[5/72] Getting data for: Brown County
[6/72] Getting data for: Buffalo County
[7/72] Getting data for: Burnett County
[8/72] Getting data for: Calumet County
[9/72] Getting data for: Chippewa County
[10/72] Getting data for: Clark County
[11/72] Getting data for: Columbia County
[12/72] Getting data for: Crawford County
[13/72] Getting data for: Dane County
[14/72] Getting data for: Dodge County
[15/72] Getting data for: Door County
[16/72] Getting data for: Douglas County
[17/72] Getting data for: Dunn County
[18/72] Getting data for: Eau Claire County
[19/72] Getting data for: Florence County
[20/72] Getting data for: Fond du Lac County
[21/72] Getting data for: Forest County
[22/72] Getting data for: Grant County
[23/72] Getting data for: Green County
[24/72] Getting data for: Green Lake County
[25/72] Getting data for: Iowa Co

Unnamed: 0,County,bdod,cec,clay,nitrogen,phh2o,sand,silt,soc
0,Adams County,132,163,192,698,60,489,319,375
1,Ashland County,115,249,258,601,51,216,526,605
2,Barron County,133,150,185,516,55,317,498,677
3,Bayfield County,111,266,187,545,50,480,333,668
4,Brown County,134,289,308,781,66,328,364,429
...,...,...,...,...,...,...,...,...,...
67,Waukesha County,136,189,277,912,64,153,570,374
68,Waupaca County,132,169,236,1029,60,411,353,468
69,Waushara County,132,187,158,918,60,555,287,494
70,Winnebago County,128,225,385,876,63,184,430,527


In [None]:
# CLEAN SOIL DATA
# Normalise county names (no " County" suffix, lower case, "st croix" without
# the period), save the table, and report null counts.
soil_df['County'] = (
    soil_df['County']
    .str.replace(' County', '', regex=False)
    .str.lower()
    .str.replace('st. croix', 'st croix', regex=False)
)
soil_df.to_csv('soil_clean.csv', index=False)

null_counts = soil_df.isnull().sum()
print(null_counts)
soil_df

County      0
bdod        0
cec         0
clay        0
nitrogen    0
phh2o       0
sand        0
silt        0
soc         0
dtype: int64


Unnamed: 0,County,bdod,cec,clay,nitrogen,phh2o,sand,silt,soc
0,adams,132,163,192,698,60,489,319,375
1,ashland,115,249,258,601,51,216,526,605
2,barron,133,150,185,516,55,317,498,677
3,bayfield,111,266,187,545,50,480,333,668
4,brown,134,289,308,781,66,328,364,429
...,...,...,...,...,...,...,...,...,...
67,waukesha,136,189,277,912,64,153,570,374
68,waupaca,132,169,236,1029,60,411,353,468
69,waushara,132,187,158,918,60,555,287,494
70,winnebago,128,225,385,876,63,184,430,527


In [25]:
# CLEAN YIELD DATA
# crop yield data: https://quickstats.nass.usda.gov/
#
# Keep 1984-2024, lower-case the county names, remove the aggregate
# "other counties" rows, and reduce the table to Year / County / Yield.
# Built as one chain so no column is assigned into a filtered slice
# (which is what triggers pandas' SettingWithCopyWarning).

yield_df = pd.read_csv('yield.csv')
yield_df = (
    yield_df[yield_df['Year'].between(1984, 2024)]
    .assign(County=lambda frame: frame['County'].str.lower())
    # aggregate rows are matched after lower-casing, so any capitalisation is caught
    .loc[lambda frame: ~frame['County'].isin(['other counties', 'other (combined) counties'])]
    [['Year', 'County', 'Value']]
    .rename(columns={'Value': 'Yield'})
)

yield_df.to_csv('yield_clean.csv', index=False)
print(yield_df.isnull().sum())
yield_df


Year      0
County    0
Yield     0
dtype: int64


Unnamed: 0,Year,County,Yield
1,2024,adams,120.1
2,2024,green lake,168.3
3,2024,juneau,141.0
4,2024,marquette,126.0
5,2024,portage,159.5
...,...,...,...
2695,1984,monroe,113.9
2696,1984,pepin,109.2
2697,1984,pierce,106.1
2698,1984,st croix,102.9
