In [68]:
import urllib.request
import urllib.parse
import pandas as pd
from io import StringIO
import pandas as pd
import os
import altair as alt

In [97]:
def get_data(parameters):

    base_url = 'http://quickstats.nass.usda.gov/api/api_GET/?key=D48CBA42-D6F6-31A1-9EF1-46D1A70988F5&'

    full_url = base_url + parameters
        
    result = urllib.request.urlopen(full_url)
    get_data = result.read().decode('utf-8')
            
    df = pd.read_csv(StringIO(get_data))
            
    return df  

filter_list = [
    ('IOWA','CORN','PRODUCTION','CORN, GRAIN - PRODUCTION, MEASURED IN BU'),
    ('IOWA','CORN','AREA HARVESTED','CORN, GRAIN - ACRES HARVESTED'),
    ('IOWA','CORN','YIELD','CORN, GRAIN - YIELD, MEASURED IN BU / ACRE'),
    ('ILLINOIS','SOYBEANS','PRODUCTION','SOYBEANS - PRODUCTION, MEASURED IN BU'),
    ('ILLINOIS','SOYBEANS','AREA HARVESTED','SOYBEANS - ACRES HARVESTED'),
    ('ILLINOIS','SOYBEANS','YIELD','SOYBEANS - YIELD, MEASURED IN BU / ACRE'),
    ('IDAHO','BARLEY','PRODUCTION','BARLEY - PRODUCTION, MEASURED IN BU'),
    ('IDAHO','BARLEY','AREA HARVESTED','BARLEY - ACRES HARVESTED'),
    ('IDAHO','BARLEY','YIELD','BARLEY - YIELD, MEASURED IN BU / ACRE'),
    ('SOUTH DAKOTA','OATS','PRODUCTION','OATS - PRODUCTION, MEASURED IN BU'),
    ('SOUTH DAKOTA','OATS','AREA HARVESTED','OATS - ACRES HARVESTED'),
    ('SOUTH DAKOTA','OATS','YIELD','OATS - YIELD, MEASURED IN BU / ACRE') 
    ]
combined_df = pd.DataFrame()

for state,commodity,stat_desc,description in filter_list:
    source_desc = 'source_desc=SURVEY'
    sector_desc = '&sector_desc=CROPS'
    commodity_desc = f'&commodity_desc={commodity}'
    statisticcat_desc = '&statisticcat_desc=' + urllib.parse.quote(stat_desc)
    short_desc = '&short_desc=' + urllib.parse.quote(description)
    location_desc = '&location_desc='+ urllib.parse.quote(state)
    year__GE = '&year__GE=2013'
    format = '&format=CSV'

    parameters = source_desc + sector_desc + commodity_desc + statisticcat_desc + short_desc + location_desc + year__GE + format

    df = get_data(parameters)
    df = df[(df['year']!=2024)&(df['reference_period_desc']=='YEAR')]
    df = df[['short_desc','year','Value']]
    combined_df = pd.concat([combined_df, df], ignore_index=True)

combined_df.head() 


Unnamed: 0,short_desc,year,Value
0,"CORN, GRAIN - PRODUCTION, MEASURED IN BU",2023,2522550000
1,"CORN, GRAIN - PRODUCTION, MEASURED IN BU",2022,2470000000
2,"CORN, GRAIN - PRODUCTION, MEASURED IN BU",2021,2539800000
3,"CORN, GRAIN - PRODUCTION, MEASURED IN BU",2020,2283300000
4,"CORN, GRAIN - PRODUCTION, MEASURED IN BU",2019,2564100000


In [70]:
def read_crop_data(csv_paths, crop):
    dfs = []
    for path in csv_paths:
        df = pd.read_csv(path)
        # Select columns
        df = df[["NAME", "DATE", "PRCP", "TAVG", "TMAX", "TMIN"]]

        # Extract State Abbreviation
        df["STATE"] = df["NAME"].str.extract(r",\s*([A-Z]{2})\s")[0]
        df = df.drop(columns=["NAME"])

        # Extract Year and Month
        df["YEAR"] = df["DATE"].str.split("-").str[0]
        df["MONTH"] = df["DATE"].str.split("-").str[1]
        df = df.drop(columns={"DATE"})

        # Map of numerical month value to month
        month_mapping = {
            "01": "January",
            "02": "February",
            "03": "March",
            "04": "April",
            "05": "May",
            "06": "June",
            "07": "July",
            "08": "August",
            "09": "September",
            "10": "October",
            "11": "November",
            "12": "December",
        }

        # Replace numerical month value with month
        df["MONTH"] = df["MONTH"].replace(month_mapping)

        # Create column for corresponding crop type
        df["CROP"] = crop

        dfs.append(df)

    #Combining dataframes of the same crop
    df_combined = pd.concat(dfs, ignore_index = True)

    return df_combined


In [71]:
# Relative paths for .csv data for each crop
corn_paths = ["noaa_csv_files/Corn_1.csv", "noaa_csv_files/Corn_2.csv"]
soybean_paths = ["noaa_csv_files/Soybean_1.csv", "noaa_csv_files/Soybean_2.csv"]
barley_paths = ["noaa_csv_files/Barley_1.csv", "noaa_csv_files/Barley_2.csv", "noaa_csv_files/Barley_3.csv"]
oats_paths = ["noaa_csv_files/Oats_1.csv", "noaa_csv_files/Oats_2.csv"]

# Crop types
crops = ["corn", "soybean", "barley", "oats"]

#List of .csv paths
paths = [corn_paths, soybean_paths, barley_paths, oats_paths]

#Use for loop and read_crop_data() fo create crop dataframes
crop_data = {}
for path, crop in zip(paths, crops):
    crop_data[crop] = read_crop_data(path, crop)

for crop in crops:
    crop_data[crop].to_csv(os.path.join("cleaned_csvs", crop + ".csv"))

In [90]:
print(crop_data['corn']['STATE'].unique())
print(crop_data['soybean']['STATE'].unique())
print(crop_data['barley']['STATE'].unique())
print(crop_data['oats']['STATE'].unique())

['IA' 'NE' 'MN' 'IL']
['NE' 'IA' 'IL' 'MN' 'IN']
['ID' 'MT' 'ND' 'WA' 'CO' 'WY']
['ND' 'SD' 'MN' 'WI' 'IA']


In [72]:
soybean_df = crop_data["soybean"]
growing_months = ['June', 'July', 'August', 'September']

soybean_df = soybean_df[(soybean_df['STATE']=='IA')]
soybean_df = soybean_df[soybean_df['MONTH'].isin(growing_months)].dropna()
grouped_df = soybean_df.groupby('YEAR').agg({'PRCP': 'mean', 'TMIN': 'mean', 'TMAX': 'mean', 'TAVG': 'mean'}).reset_index()

grouped_df['YEAR'] = grouped_df['YEAR'].astype(int)
grouped_df


Unnamed: 0,YEAR,PRCP,TMIN,TMAX,TAVG
0,2013,2.955,59.175,81.05,70.1
1,2014,6.365,57.6,78.25,67.925
2,2015,6.0975,58.925,80.575,69.75
3,2016,3.08,59.0,82.4,70.7
4,2017,2.9475,57.15,82.025,69.575
5,2018,5.3725,59.775,81.275,70.575
6,2019,3.605,58.95,81.125,70.05
7,2020,1.8225,59.325,83.85,71.6
8,2021,2.6075,59.375,84.775,72.1
9,2022,2.305,58.525,84.85,71.7


In [73]:
merged_df = pd.merge(combined_df, grouped_df, left_on='year', right_on='YEAR')
merged_df = merged_df.drop('year', axis=1)
#merged_df = merged_df[merged_df["short_desc"]=="SOYBEANS - ACRES HARVESTED"]
merged_df.head()

Unnamed: 0,short_desc,Value,YEAR,PRCP,TMIN,TMAX,TAVG
0,"SOYBEANS - PRODUCTION, MEASURED IN BU",573040000,2023,2.8425,59.8,82.825,71.3
1,"SOYBEANS - PRODUCTION, MEASURED IN BU",586755000,2022,2.305,58.525,84.85,71.7
2,"SOYBEANS - PRODUCTION, MEASURED IN BU",631890000,2021,2.6075,59.375,84.775,72.1
3,"SOYBEANS - PRODUCTION, MEASURED IN BU",505980000,2020,1.8225,59.325,83.85,71.6
4,"SOYBEANS - PRODUCTION, MEASURED IN BU",501600000,2019,3.605,58.95,81.125,70.05


In [74]:
yield_df = merged_df[merged_df['short_desc']=='SOYBEANS - YIELD, MEASURED IN BU / ACRE']
yield_df['YEAR'] = yield_df['YEAR'].astype(str)

# Create Altair chart for Yield
yield_chart = alt.Chart(yield_df).mark_line().encode(
    x='YEAR:O',
    y=alt.Y('Value', title='Yield (Bushels/Acre)', scale=alt.Scale(domain=[40, 70])),
    color=alt.value('blue'),
).properties(
    title='Soybean Yield and Average Temperature in Iowa Over Time (Growing Season June-Sep)',
    width=800, 
    height=400 
)

# Create Altair chart for Average Temperature
temp_chart = alt.Chart(yield_df).mark_line().encode(
    x='YEAR:O',
    y=alt.Y('TAVG', title='Average Temperature (°F)', scale=alt.Scale(domain=[65, 75])),
    color=alt.value('red')
)

precip_chart = alt.Chart(yield_df).mark_line().encode(
    x='YEAR:O',
    y=alt.Y('PRCP', title='Average Precipitation', scale=alt.Scale(domain=[0, 7]),axis=alt.Axis(offset=40)),
    color=alt.value('green')
)

# Overlay the charts
combined_chart = alt.layer(temp_chart, yield_chart,precip_chart).resolve_scale(
    y='independent'
)

combined_chart.display()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yield_df['YEAR'] = yield_df['YEAR'].astype(str)


In [77]:
# Scatter plot for yield vs. temperature
scatter_temp = alt.Chart(yield_df).mark_circle(size=60).encode(
    x=alt.X('TAVG:Q', title='Average Temperature (°F)', scale=alt.Scale(domain=[65, 75])),
    y=alt.Y('Value:Q', title='Yield (Bushels/Acre)', scale=alt.Scale(domain=[40, 70])),
    color='YEAR:O', 
    tooltip=['Value', 'TAVG']
).properties(
    title='Yield vs. Avg Temperature'
)

# Scatter plot for yield vs. precipitation
scatter_precip = alt.Chart(yield_df).mark_circle(size=60).encode(
    x=alt.X('PRCP:Q', title='Average Precipitation (mm)', scale=alt.Scale(domain=[0, 7])),
    y=alt.Y('Value:Q', title='Yield (Bushels/Acre)', scale=alt.Scale(domain=[40, 70])),
    color='YEAR:O', 
    tooltip=['Value', 'PRCP']
).properties(
    title='Yield vs. Avg Precipitation'
)

# Display the charts
scatter_temp.display()
scatter_precip.display()