In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Path (relative to this notebook) of the agricultural-census finance CSV
# that the data-cleaning section reads into `df`.
AGRI_CENSUS = '../data/agri_census_finance.csv'

# County names the plotting cells use to restrict the data (matched against
# the 'County' column).
# NOTE(review): "WOH" presumably stands for "West of Hudson" — confirm.
WOH = ['DELAWARE', 'GREENE', 'SCHOHARIE', 'SULLIVAN', 'ULSTER']

# 1 Data Cleaning

In [None]:
# Read the raw census extract into `df` and show which columns it has.
df = pd.read_csv(filepath_or_buffer=AGRI_CENSUS)
df.columns

In [None]:
# Remove constant columns: a column with exactly one distinct non-null value
# (nunique ignores NaN by default) cannot distinguish any rows.
num_unique = df.nunique()
cols_to_drop = num_unique.index[num_unique.eq(1).to_numpy()]
df = df.drop(columns=cols_to_drop)
df.columns, df.shape

In [None]:
# Remove columns that hold no data at all (every entry is null).
col_is_null = df.isna().all(axis=0)
cols_to_drop = col_is_null.index[col_is_null.to_numpy()]
df = df.drop(columns=cols_to_drop)
df.columns, df.shape

In [None]:
# drop redundant columns
# errors='ignore' keeps this cell re-runnable: without it a second execution
# (or an earlier cleaning cell having already removed one of these columns)
# raises KeyError because the labels are no longer present.
cols_to_drop = ['Ag District Code', 'County ANSI', 'CV (%)']
df = df.drop(columns=cols_to_drop, errors='ignore')
df.columns, df.shape

In [None]:
# Order rows by county, then by year, and renumber the index from 0 so that
# it follows the new order.
sort_keys = ['County', 'Year']
df = df.sort_values(by=sort_keys)
df = df.reset_index(drop=True)
df.head()

# 2 Visualization

In [None]:
# Alphabetical list of the distinct 'Data Item' labels available for plotting.
sorted(df['Data Item'].drop_duplicates())

In [None]:
def _census_values_to_float(col):
    """Convert one column of raw census 'Value' entries to floats.

    Every character other than a digit, a decimal point or a minus sign is
    removed before parsing, so fractional and negative numbers keep their
    magnitude and sign. Entries that contain no number at all (missing cells
    or purely non-numeric text) become NaN.
    """
    # A column that is already numeric (e.g. all-NaN after the pivot) has no
    # .str accessor and needs no cleaning.
    if pd.api.types.is_numeric_dtype(col):
        return col.astype('float')
    # regex=True is explicit because pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave the values untouched.
    cleaned = col.astype(str).str.replace(r'[^0-9.\-]', '', regex=True)
    # errors='coerce' maps unparseable leftovers (such as '') to NaN instead
    # of silently leaving the whole column as strings.
    return pd.to_numeric(cleaned, errors='coerce')


def plot_census_data(subset_df):
    """Draw one line per county showing 'Value' over 'Year'.

    Parameters
    ----------
    subset_df : pandas.DataFrame
        Long-format rows with 'Year', 'County' and 'Value' columns. Each
        (Year, County) pair must appear at most once; otherwise ``pivot``
        raises ``ValueError``.
        NOTE(review): 'Value' is presumably formatted text such as
        thousands-separated numbers — confirm against the CSV.

    Returns
    -------
    None
        The chart is drawn through pandas/matplotlib; callers in this
        notebook add the title afterwards with ``plt.title``.

    Notes
    -----
    Missing or non-numeric values are plotted as gaps (NaN), not as zeros.
    """
    # data wrangling: one row per year, one column per county
    wide_df = subset_df.pivot(index='Year', columns='County', values='Value')
    wide_df = wide_df.apply(_census_values_to_float, axis=0)
    wide_df.index = wide_df.index.astype('int')

    # plot
    wide_df.plot.line(marker='.', figsize=(9, 5))
    plt.xticks(wide_df.index, rotation=90)   # one tick per census year
    plt.xlabel('')                           # the year ticks speak for themselves
    plt.legend(bbox_to_anchor=(1, 1))        # legend anchored at axes point (1, 1)
    plt.grid(linestyle='--', alpha=.3)       # faint dashed grid

In [None]:
# Plot the 'COMMODITY TOTALS - SALES, MEASURED IN $ / OPERATION' item,
# restricted to Domain == 'TOTAL', for the WOH counties.
data_item = 'COMMODITY TOTALS - SALES, MEASURED IN $ / OPERATION'
domain = 'TOTAL'
row_mask = (
    df['Data Item'].eq(data_item)
    & df['Domain'].eq(domain)
    & df['County'].isin(WOH)
)
subset_df = df.loc[row_mask]
plot_census_data(subset_df)
plt.title(data_item + '\n(WOH Counties)')


In [None]:
# Plot the 'CROP TOTALS - SALES, MEASURED IN $' item, restricted to
# Domain == 'TOTAL', for the WOH counties.
data_item = 'CROP TOTALS - SALES, MEASURED IN $'
domain = 'TOTAL'
row_mask = (
    df['Data Item'].eq(data_item)
    & df['Domain'].eq(domain)
    & df['County'].isin(WOH)
)
subset_df = df.loc[row_mask]

plot_census_data(subset_df)
plt.title(data_item + '\n(WOH Counties)')


In [None]:
# Plot the 'CROP TOTALS - SALES, MEASURED IN PCT OF FARM SALES' item for the
# WOH counties.
# NOTE(review): unlike the sibling plotting cells, no Domain filter is applied
# here — confirm this item only occurs with a single Domain, otherwise the
# pivot inside plot_census_data will fail on duplicate (Year, County) pairs.
data_item = 'CROP TOTALS - SALES, MEASURED IN PCT OF FARM SALES'
row_mask = df['Data Item'].eq(data_item) & df['County'].isin(WOH)
subset_df = df.loc[row_mask]
plot_census_data(subset_df)
plt.title(data_item + '\n(WOH Counties)')

In [None]:
# Plot the 'INCOME, FARM-RELATED - RECEIPTS, MEASURED IN $ / OPERATION' item,
# restricted to Domain == 'TOTAL', for the WOH counties.
data_item = 'INCOME, FARM-RELATED - RECEIPTS, MEASURED IN $ / OPERATION'
row_mask = (
    df['Data Item'].eq(data_item)
    & df['Domain'].eq('TOTAL')
    & df['County'].isin(WOH)
)
subset_df = df.loc[row_mask]

plot_census_data(subset_df)
plt.title(data_item + '\n(WOH Counties)')


In [None]:
# investigate ulster's sharp increase from 2012 to 2017
def _income_to_float(col):
    """Parse one column of raw 'Value' entries to floats; unparseable -> 0."""
    # Already-numeric columns have no .str accessor and need no cleaning.
    if pd.api.types.is_numeric_dtype(col):
        return col.astype('float').fillna(0)
    # Keep digits, the decimal point and the minus sign so fractional and
    # negative amounts are not distorted. regex=True is explicit because
    # pandas >= 2.0 treats the pattern as a literal string by default.
    cleaned = col.astype(str).str.replace(r'[^0-9.\-]', '', regex=True)
    # NOTE(review): entries with no number in them are plotted as 0, as in the
    # original cell; a 0 may therefore stand for "no data", which can by
    # itself look like a sharp change between years — confirm before
    # interpreting.
    return pd.to_numeric(cleaned, errors='coerce').fillna(0)


# All Ulster rows whose data item mentions INCOME, at the TOTAL domain.
# na=False treats a missing 'Data Item' as "no match" instead of producing an
# unusable NaN in the boolean mask.
ulster_mask = (
    (df['County'] == 'ULSTER')
    & df['Data Item'].str.contains('INCOME', regex=False, na=False)
    & (df['Domain'] == 'TOTAL')
)
ulster = (
    df.loc[ulster_mask]
    .pivot(index='Year', columns='Data Item', values='Value')
    .apply(_income_to_float, axis=0)
)

ulster.plot()
plt.xticks(ulster.index)
# Shorten the legend entries by removing the shared prefix; regex=False makes
# the literal matching explicit.
legend_labels = (
    ulster.columns
    .str.replace('INCOME, FARM-RELATED, ', '', regex=False)
    .str.replace('INCOME, FARM-RELATED - ', '', regex=False)
)
plt.legend(labels=legend_labels, bbox_to_anchor=(1, 1))
plt.title('Ulster County: INCOME, FARM-RELATED Data')