In [None]:

import pandas as pd
import os 

# The project root is taken to be the parent of the current working directory.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_of_cwd)

# Cleaned outputs are written under <project_root>/data/cleaned.
cleaned_dir = os.path.join(project_root, "data", "cleaned")

# Create the output folder up front; exist_ok=True makes this a no-op when it is already there.
os.makedirs(cleaned_dir, exist_ok=True)

In [None]:
# Load illinois_corn_wide.csv from <project_root>/data/processed.
# The path is built from project_root (defined in the setup cell) so that the
# input is resolved against the same base directory as the cleaned output,
# instead of a separate hard-coded "../" relative path.
input_path = os.path.join(project_root, "data", "processed", "illinois_corn_wide.csv")
df = pd.read_csv(input_path)
df

Unnamed: 0,year,acres_planted,acres_harvested,production_bushels,yield_bu_per_acre
0,1902,,10850000.0,4.448500e+08,41.0
1,1903,,10550000.0,3.745250e+08,35.5
2,1904,,10500000.0,4.095000e+08,39.0
3,1905,,10500000.0,4.410000e+08,42.0
4,1906,,10500000.0,4.095000e+08,39.0
...,...,...,...,...,...
119,2021,11000000.0,10850000.0,2.191700e+09,202.0
120,2022,10800000.0,10421518.0,2.143284e+09,214.0
121,2023,11200000.0,11050000.0,2.276300e+09,206.0
122,2024,10800000.0,10650000.0,2.311050e+09,217.0


In [25]:
# Inspect the dtype pandas inferred for each column when reading the CSV.
df.dtypes

year                    int64
acres_planted         float64
acres_harvested       float64
production_bushels    float64
yield_bu_per_acre     float64
dtype: object

In [26]:
# Count the missing (NaN) values in each column.
df.isna().sum()

year                   0
acres_planted         24
acres_harvested        0
production_bushels     0
yield_bu_per_acre      0
dtype: int64

We notice that there are 24 missing values in "acres_planted" from years 1902-1925, but since we are doing analysis on acres_harvested, we will leave these observations in the dataset.

In [27]:
# Summary statistics for every column, with the values still in their original units.
df.describe()

Unnamed: 0,year,acres_planted,acres_harvested,production_bushels,yield_bu_per_acre
count,124.0,100.0,124.0,124.0,124.0
mean,1963.5,10235920.0,9799961.0,962879300.0,92.655645
std,35.939764,1332565.0,1401729.0,674815100.0,56.480589
min,1902.0,7645000.0,6713000.0,148357000.0,22.1
25%,1932.75,9201000.0,8535500.0,394855200.0,40.75
50%,1963.5,10412500.0,9992500.0,701875000.0,79.0
75%,1994.25,11200000.0,10872500.0,1436940000.0,135.0
max,2025.0,13200000.0,13096230.0,2409000000.0,219.0


In [28]:
# Express the acreage and production columns in millions so the tables are easier to read.
# NOTE: this rescales df in place, so running this cell a second time without
# re-reading the CSV would divide the same columns by one million again.
ONE_MILLION = 1000000
for column_name in ("acres_harvested", "acres_planted", "production_bushels"):
    df[column_name] = df[column_name] / ONE_MILLION
df

Unnamed: 0,year,acres_planted,acres_harvested,production_bushels,yield_bu_per_acre
0,1902,,10.850000,444.850000,41.0
1,1903,,10.550000,374.525000,35.5
2,1904,,10.500000,409.500000,39.0
3,1905,,10.500000,441.000000,42.0
4,1906,,10.500000,409.500000,39.0
...,...,...,...,...,...
119,2021,11.0,10.850000,2191.700000,202.0
120,2022,10.8,10.421518,2143.284486,214.0
121,2023,11.2,11.050000,2276.300000,206.0
122,2024,10.8,10.650000,2311.050000,217.0


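We divided the acreage and production columns (`acres_planted`, `acres_harvested`, and `production_bushels`) by one million to make them easier to read and interpret. From this point on, these three columns are expressed in millions (of acres or bushels); `year` and `yield_bu_per_acre` are unchanged.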

In [29]:
# Summary statistics again, now that the acreage and production columns have been divided by one million.
df.describe()

Unnamed: 0,year,acres_planted,acres_harvested,production_bushels,yield_bu_per_acre
count,124.0,100.0,124.0,124.0,124.0
mean,1963.5,10.23592,9.799961,962.879346,92.655645
std,35.939764,1.332565,1.401729,674.815136,56.480589
min,1902.0,7.645,6.713,148.357,22.1
25%,1932.75,9.201,8.5355,394.85525,40.75
50%,1963.5,10.4125,9.9925,701.875,79.0
75%,1994.25,11.2,10.8725,1436.94,135.0
max,2025.0,13.2,13.096231,2409.0,219.0


No outliers were detected and, apart from the known missing `acres_planted` values for 1902-1925, every column contains only valid data, so no further cleaning is needed for this dataset.

In [None]:
# Write the cleaned table to <cleaned_dir>/nass_clean.csv.
output_filename = "nass_clean.csv"
output_path = os.path.join(cleaned_dir, output_filename)

# index=False: the DataFrame's row index is not written as a column of the CSV.
df.to_csv(output_path, index=False)
# Confirmation message names the dataset this notebook actually writes.
print(f"\nCleaned NASS corn data written to: {output_path}")