In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# input files (paths are relative, one directory above the notebook's working dir)
QCEW_FILEPATH = '../data/QCEW_2000-2018_cleaned_naics_sectors.csv'
LAP_ACRES_FILEPATH = '../data/LAP_landuse_acres.csv'

# constants
# county groupings (presumably West / East of Hudson -- TODO confirm naming)
WOH_COUNTIES = [
    'Delaware',
    'Greene',
    'Schoharie',
    'Sullivan',
    'Ulster',
]
EOH_COUNTIES = [
    'Dutchess',
    'Putnam',
    'Westchester',
]
# QCEW value columns that get correlated against land-use acres
METRICS = [
    'Real Total Salary',
    'Real Average Salary',
    'Average Employment',
    'Establishments',
]
# aggregate 'NAICS Title' rows that plot_corr excludes from its per-industry heatmaps
TOTAL_INDUSTRIES = [
    'Total, All Industries',
    'Total, All Private',
    'Total, All Government',
]
# values of the LAP `landuse` column used throughout the notebook
LANDUSE_TYPES = [
    'Agricultural Land',
    'Forest Land',
    'Urban/Built-up Land',
    'Others',
]
       

In [None]:
# Load the QCEW table. The first CSV column becomes the index; later cells
# compare that index against county names (e.g. `qcew.index == county`).
qcew = pd.read_csv(QCEW_FILEPATH, index_col=0)
qcew.head()

In [None]:
# load LAP Acres
# The first CSV column becomes the index; later cells read `year`, `county`,
# `landuse` and `landuse_acres` as regular columns of this frame.
lap_acres = pd.read_csv(LAP_ACRES_FILEPATH, index_col=0)
lap_acres.head()

In [None]:
# years avail for our datasets
print('years avail for qcew:', qcew.Year.min(), '-', qcew.Year.max())
print('years avail for lap:', lap_acres.year.min(), '-', lap_acres.year.max())
# LAP years live in the `year` column (as used on the line above), not in the
# frame's index, so the intersection must be taken against that column.
common_years = set(qcew.Year).intersection(lap_acres.year)
# sorted only for a stable, readable printout; common_years itself stays a set
print('common years:', sorted(common_years))

In [None]:
# keep only the rows of each dataset whose year is one of common_years
lap_in_common = lap_acres['year'].isin(common_years)
qcew_in_common = qcew['Year'].isin(common_years)
lap_acres = lap_acres.loc[lap_in_common]
qcew = qcew.loc[qcew_in_common]

In [None]:
# show the distinct land-use categories present in the (year-truncated) LAP data
lap_acres.landuse.unique()

# Output correlations as JSON (for visualization)

In [None]:
# WOH: industries x metric x counties x landuse
# Builds correlations[metric][landuse_type][county] -> {industry: corr value},
# where each value is the correlation between that industry's yearly metric
# and the yearly acres of the given land-use type in the same county.
correlations = {}
for metric in METRICS:
    # The economic pivot (year x industry) depends only on metric and county,
    # so build it once per county instead of once per land-use type.
    econ_by_county = {}
    for county in WOH_COUNTIES:
        econ = qcew[qcew.index == county].pivot(index='Year',
                                                columns='NAICS Title',
                                                values=metric)
        # keep only industries that have a value for every year
        econ_by_county[county] = econ.dropna(axis='columns', how='any')

    correlations[metric] = {}
    for landuse_type in LANDUSE_TYPES:
        correlations[metric][landuse_type] = {}
        for county in WOH_COUNTIES:
            # work on a copy so the cached pivot is not modified
            subset = econ_by_county[county].copy()

            # total acres per year for this land-use type and county, as a Series
            acres_by_year = lap_acres[(lap_acres.landuse == landuse_type)
                                      & (lap_acres.county == county)]\
                                .groupby('year')['landuse_acres'].sum()
            # NOTE(review): years with no matching LAP rows are absent from
            # acres_by_year, become NaN after index alignment, and are then
            # excluded by DataFrame.corr. If such years should count as 0 acres,
            # reindex acres_by_year onto subset.index with fill_value=0 -- confirm intent.
            subset['landuse_acres'] = acres_by_year

            # last row of the correlation matrix is landuse_acres; drop its
            # self-correlation (last column) to leave industry: corr val
            correlations[metric][landuse_type][county] = subset.corr().iloc[-1, :-1].to_dict()

In [None]:
import json

# serialise the nested correlations dict and write it out for the viz
serialized = json.dumps(correlations)
with open('../output/woh_corr_4.json', 'w') as out_file:
    out_file.write(serialized)

# Visualization

In [None]:
def plot_corr(landuse_type, metric):
    """Draw a heatmap of industry-by-county correlations with land-use acres.

    For every 'NAICS Title' in `qcew` except the TOTAL_INDUSTRIES aggregates,
    the yearly `metric` of each WOH county is correlated (via
    DataFrame.corrwith, matched on county columns) with the yearly acres of
    `landuse_type` in that county. Industries whose year x county table
    contains any missing value are skipped.

    Parameters
    ----------
    landuse_type : str
        Value of the `landuse` column of `lap_acres` to correlate against.
    metric : str
        Name of the `qcew` column to correlate.

    Returns
    -------
    None
        The heatmap is drawn on a newly created figure.
    """
    # Both of these are independent of the industry, so compute them once
    # rather than on every loop iteration.
    lap_subset = lap_acres[lap_acres.landuse==landuse_type]\
                    .pivot_table(index='year',
                                 columns='county',
                                 values='landuse_acres',
                                 aggfunc='sum',
                                 fill_value=0)
    woh_qcew = qcew[qcew.index.isin(WOH_COUNTIES)]

    correlations = {}
    for industry in np.setdiff1d(qcew['NAICS Title'].unique(), TOTAL_INDUSTRIES):
        qcew_subset = woh_qcew[woh_qcew['NAICS Title']==industry]\
                        .pivot_table(index='Year',
                                     columns='County',
                                     values=metric)
        # skip industries that lack a value for some year/county combination
        if qcew_subset.isnull().any().any():
            continue

        # one correlation per county column present in both tables
        correlations[industry] = qcew_subset.corrwith(lap_subset).to_dict()

    correlations = pd.DataFrame(correlations)
    # plot: industries as rows, counties as columns
    _, ax = plt.subplots(figsize=(8,6))
    sns.heatmap(correlations.transpose(),
                center=0, vmin=-1, vmax=1,
                cmap=sns.diverging_palette(10, 145, sep=40, n=10),
                annot=True,
                ax=ax)
    ax.set_title('Correlation between\nLanduse (acres acquired): %s\nEconomic Metric: %s'%(landuse_type, metric))


In [None]:
# heatmap for Forest Land acres vs. Real Total Salary
plot_corr(landuse_type='Forest Land', metric='Real Total Salary')

In [None]:
# (metric, county, industry) combinations whose trends are plotted further down
_REQUEST_FIELDS = ('metric', 'county', 'industry')
_REQUEST_ROWS = [
    ('Real Average Salary', 'Greene', 'Other Services, Ex. Public Admin'),
    ('Real Average Salary', 'Schoharie', 'Agriculture, Forestry, Fishing and Hunting'),
    ('Real Average Salary', 'Ulster', 'Agriculture, Forestry, Fishing and Hunting'),
    ('Real Average Salary', 'Greene', 'Arts, Entertainment, and Recreation'),
    ('Real Average Salary', 'Schoharie', 'Administrative and Waste Services'),
    ('Average Employment', 'Ulster', 'Information'),
    ('Average Employment', 'Ulster', 'Arts, Entertainment, and Recreation'),
    ('Establishments', 'Ulster', 'Manufacturing'),
    ('Establishments', 'Ulster', 'Professional and Technical Services'),
    ('Establishments', 'Greene', 'Retail Trade'),
]
# one dict per row, keyed by metric / county / industry in that order
requests = [dict(zip(_REQUEST_FIELDS, row)) for row in _REQUEST_ROWS]

In [None]:
# plot trends for specific correlations
from sklearn.preprocessing import MinMaxScaler
from matplotlib.lines import Line2D  # NOTE(review): not used in this cell; kept in case other cells rely on it

scaler = MinMaxScaler()

for req in requests:
    metric = req['metric']
    county = req['county']
    industry = req['industry']

    ### data wrangling
    # lap data: acres per year (rows) and land-use type (columns) for this county.
    # reindex (instead of selecting [LANDUSE_TYPES]) so that a county with no
    # rows for one of the land-use types gets a zero column rather than a KeyError.
    subset = lap_acres[lap_acres.county==county]\
                    .pivot_table(index='year',
                                 columns='landuse',
                                 values='landuse_acres',
                                 aggfunc='sum',
                                 fill_value=0)\
                    .reindex(columns=LANDUSE_TYPES, fill_value=0)
    subset['All Lands'] = subset.sum(axis=1)

    # add economic data to it (left join on year: only years present in the
    # LAP pivot are kept); the metric ends up as the last column
    subset = subset.join(qcew[(qcew.index==county)
                       & (qcew['NAICS Title']==industry)]\
                    .set_index('Year')
                    [[metric]])
    # min max scale the data so acres and the metric can share one y-axis
    subset = pd.DataFrame(scaler.fit_transform(subset),
                          columns=subset.columns,
                          index=subset.index)
    # legend labels: "<land series> (<correlation with the metric>)" for each
    # land column, followed by the metric itself
    labels = ['%s (%.2f)'%pair for pair in subset.corr().iloc[-1, :-1].items()]
    labels.append(metric)
    ### plotting
    # plot lap acres
    title = 'Metric: %s\nCounty: %s\nIndustry: %s'%(metric, county, industry)
    ax = subset.iloc[:, :-1].plot(figsize=(12,5),
                                  title=title)
    # plot economic metric
    subset[metric].plot(ax=ax, linewidth=4, color='sienna')

    # use the explicit Axes rather than pyplot's "current axes" state
    ax.legend(labels=labels,
              bbox_to_anchor=(1,1))
    ax.set_xlabel('Year')
    ax.grid(True)

In [None]:
# one correlation heatmap per land-use type for the Average Employment metric
for landuse_type in LANDUSE_TYPES:
    plot_corr(landuse_type=landuse_type, metric='Average Employment')

In [None]:
# one correlation heatmap per land-use type for the Establishments metric
for landuse_type in LANDUSE_TYPES:
    plot_corr(landuse_type=landuse_type, metric='Establishments')