In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

AGRI_WOH = '../data/agri_woh.csv'

# 1 Data Cleaning

In [None]:
# Read the raw CSV export into a DataFrame, then list the columns it provides.
woh = pd.read_csv(filepath_or_buffer=AGRI_WOH)
woh.columns

In [None]:
# Discard columns holding exactly one distinct non-null value.
# nunique() ignores NaN, so all-null columns (count 0) are not touched here.
unique_counts = woh.nunique()
single_valued_cols = unique_counts.loc[unique_counts == 1].index
woh = woh.drop(columns=single_valued_cols)
woh.columns

In [None]:
# Discard columns in which every entry is missing.
woh = woh.dropna(axis=1, how='all')
woh.columns

In [None]:
# drop redundant columns
# NOTE(review): presumably these codes duplicate the 'Ag District' / 'County'
# name columns -- confirm against the data dictionary.
# errors='ignore' keeps the cell re-runnable and tolerant of a column that an
# earlier cleaning step already removed; a plain drop() would raise KeyError.
cols_to_drop = ['Ag District Code', 'County ANSI']
woh = woh.drop(columns=cols_to_drop, errors='ignore')
woh.columns

In [None]:
# transform 'Value' from string to proper numbers (float)
# - astype(str) makes the cell safe to re-run (the column is already float
#   after the first run) and safe if read_csv parsed the column as numeric.
# - regex=True is explicit: pandas >= 2.0 treats the pattern as a literal
#   string by default, which would leave every value untouched.
# - the decimal point is kept, so fractional values are no longer corrupted
#   (the previous pattern turned '3.5' into 35).
# - anything that still cannot be parsed (empty string, stray dots, text
#   codes) becomes NaN and is then set to 0, as before.
# NOTE(review): replacing missing values with 0 makes them indistinguishable
# from real zeros downstream -- confirm this is intended.
value_text = woh['Value'].astype(str).str.replace(r'[^0-9.]', '', regex=True)
woh['Value'] = pd.to_numeric(value_text, errors='coerce')\
                 .fillna(0)\
                 .astype('float')

In [None]:
# Order rows by county, then year, and renumber the index from zero.
woh = woh.sort_values(by=['County', 'Year'])
woh = woh.reset_index(drop=True)
woh.head()

# 2 Descriptive Summary

In [None]:
# Each distinct (agricultural district, county) pair, listed by county name.
district_county = woh.loc[:, ['Ag District', 'County']].drop_duplicates()
district_county.sort_values(by='County')

In [None]:
# Distinct commodities present in the data, in order of first appearance.
woh['Commodity'].unique()

In [None]:
# Every distinct 'Data Item' label recorded for the commodities.
woh.loc[:, 'Data Item'].unique()

# 3 Viz: Yield

In [None]:
# Productivity is measured through the data items whose label mentions YIELD.
is_yield_item = woh['Data Item'].str.contains('YIELD')
woh.loc[is_yield_item, 'Data Item'].unique()

# BU = bushels (64 pints of dry goods)

In [None]:
# One line chart per yield metric, with one line per county.
is_yield_item = woh['Data Item'].str.contains('YIELD')
data_items = woh.loc[is_yield_item, 'Data Item'].unique()

for data_item in data_items:
    # Reshape this metric's rows into a Year x County table of values.
    by_county = woh.loc[woh['Data Item'] == data_item]
    by_county = by_county.pivot(index='Year', columns='County', values='Value')
    by_county.index = by_county.index.astype('int')
    first_year, last_year = by_county.index.min(), by_county.index.max()

    ax = by_county.plot.line(marker='.', figsize=(9, 5))

    # Show a tick for every year present, with vertical labels.
    ax.set_xticks(by_county.index)
    plt.setp(ax.get_xticklabels(), rotation=90)
    ax.set_xlabel('')

    ax.legend(bbox_to_anchor=(1, 1))
    ax.set_title("%s\n(%d-%d)" % (data_item, first_year, last_year))
    ax.grid(linestyle='--', alpha=.3)
    
# analysis
# It seems that yield data is available only for corn and hay.

# 4 Viz: Combined Yield

In [None]:
# Yield data items grouped by unit of measure, so that only series sharing
# a unit are combined in one chart.

# Measured in tons per acre.
yields_in_tons = [
    'CORN, SILAGE - YIELD, MEASURED IN TONS / ACRE',
    'HAY - YIELD, MEASURED IN TONS / ACRE',
]

# Measured in bushels per acre.
yields_in_bu = [
    'CORN, GRAIN - YIELD, MEASURED IN BU / ACRE',
    'OATS - YIELD, MEASURED IN BU / ACRE',
    'SOYBEANS - YIELD, MEASURED IN BU / ACRE',
    'WHEAT - YIELD, MEASURED IN BU / ACRE',
]

In [None]:
# Two stacked-area panels sharing the year axis: one per unit of measure.
_, axes = plt.subplots(2, sharex=True)
ylabels = ['Ton per Acre', 'Bushels per Acre']
titles = ['Combined Corn (Silage) and Hay Yield, measured in Tons per Acre',
          'Combined Corn (Grain), Oats, Soybean and Wheat Yield, measured in Bushels per Acre']
categories = [yields_in_tons, yields_in_bu]

for i, (ax, cat) in enumerate(zip(axes, categories)):
    # Sum the values of every data item in this group per (Year, County).
    in_category = woh['Data Item'].isin(cat)
    subset_df = woh.loc[in_category].pivot_table(
        index='Year', columns='County', values='Value', aggfunc='sum')

    # Only the top panel draws the county legend.
    subset_df.plot.area(figsize=(16, 7), ax=ax, cmap='Set3',
                        legend=(i == 0), stacked=True)

    ax.set_ylabel(ylabels[i])
    ax.set_xticks(subset_df.index)
    ax.set_title(titles[i])

# Re-anchor the top panel's legend at the axes' upper-right corner.
axes[0].legend(bbox_to_anchor=(1, 1))
