In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# ---------------------------------------------------------------------------
# Input files (paths are relative to the notebook's working directory)
# ---------------------------------------------------------------------------
QCEW_FILEPATH = '../data/QCEW_2000-2018_cleaned_naics_sectors.csv'  # economic data
LAP_COUNTS_FILEPATH = '../data/LAP_annual_counts.csv'               # annual LAP counts
LAP_ACRES_FILEPATH = '../data/LAP_annual_acres.csv'                 # annual LAP acres
LAP_ACRES_NORM_FILEPATH = '../data/LAP_annual_acres_normalized.csv' # annual LAP acres, normalized

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# County groups (presumably West-of-Hudson / East-of-Hudson -- confirm).
WOH_COUNTIES = [
    'Delaware',
    'Greene',
    'Schoharie',
    'Sullivan',
    'Ulster',
]
EOH_COUNTIES = [
    'Dutchess',
    'Putnam',
    'Westchester',
]

# QCEW value columns that are correlated against the LAP data.
METRICS = [
    'Real Total Salary',
    'Real Average Salary',
    'Average Employment',
    'Establishments',
]

# 'NAICS Title' values that are aggregates rather than individual industries.
TOTAL_INDUSTRIES = [
    'Total, All Industries',
    'Total, All Private',
    'Total, All Government',
]

In [None]:
# Load the cleaned QCEW table; the first CSV column is used as the row index
# (later cells compare that index against county names).
qcew = pd.read_csv(QCEW_FILEPATH, index_col=0)
qcew.head(n=5)

In [None]:
# Load the annual LAP acres table; the first CSV column is used as the row
# index (later cells treat it as the year and the columns as counties).
lap_acres = pd.read_csv(LAP_ACRES_FILEPATH, index_col=0)
lap_acres.head(n=5)

In [None]:
# Report the year span covered by each dataset and the years present in both.
qcew_years = qcew.Year
lap_years = lap_acres.index

print('years avail for qcew:', qcew_years.min(), '-', qcew_years.max())
print('years avail for lap:', lap_years.min(), '-', lap_years.max())

common_years = set(qcew_years) & set(lap_years)
print('common years:', common_years)

In [None]:
# Keep only the rows whose year appears in both datasets
# (expected to be 2000-2018 per the original note).
lap_in_common = lap_acres.index.isin(common_years)
qcew_in_common = qcew.Year.isin(common_years)

lap_acres = lap_acres.loc[lap_in_common]
qcew = qcew.loc[qcew_in_common]

# Plot correlation of all industries for each metric

In [None]:
# Build one record per (industry, metric): the correlation, for every county,
# between that industry's yearly metric and the county's yearly LAP acres.
# Combinations whose year-by-county table contains any null are skipped.
#
# NOTE: `lap_acres` is loaded from LAP_ACRES_FILEPATH (not the normalized file).
# Dividing a county's series by a constant does not change its Pearson
# correlation, so per-county normalization would give the same values.
corr_data = []
industries = qcew['NAICS Title'].unique()

# Rows whose index (county) has a matching column in lap_acres; this does not
# depend on the industry or metric, so compute it once.
in_lap_counties = qcew.index.isin(lap_acres.columns)

for industry in industries:
    industry_rows = qcew[(qcew['NAICS Title'] == industry) & in_lap_counties]

    for category in METRICS:
        # rows = years, columns = counties, values = this industry's metric
        qcew_f = industry_rows.pivot_table(index='Year',
                                           columns='County',
                                           values=category)
        # discard this industry for this metric if there are null values
        if qcew_f.isnull().any().any():
            continue

        # corrwith pairs columns by county label.  Look the results up by
        # label (not by position) so every value is guaranteed to belong to
        # the county it is stored under, even if lap_acres has extra columns.
        county_corr = qcew_f.corrwith(lap_acres).reindex(qcew_f.columns)

        entry = county_corr.to_dict()
        entry.update({'industry': industry,
                      'metric': category})
        corr_data.append(entry)

# One row per (industry, metric); one column per county plus the two id columns.
correlations = pd.DataFrame(corr_data)
correlations.head()

### Export correlations as JSON for the visualization

In [None]:
import json

# Read the existing correlation export; the cells below add entries to it and
# write the result to a different file.
with open('../output/woh_corr_4.json', 'r') as src:
    data = json.loads(src.read())

In [None]:
# Coerce the LAP row index to integers (presumably so it aligns with the
# integer 'Year' index used in the join of the next cell -- confirm).
lap_acres.index = lap_acres.index.astype(int)

In [None]:
# Add an 'All Landuse Types' layer to the export.  Resulting structure:
#   data[metric]['All Landuse Types'][county] = {industry: correlation}
# where the correlation is between the industry's yearly metric in that county
# and the county's yearly LAP acres.
for metric in METRICS:
    by_county = {}
    for county in WOH_COUNTIES + EOH_COUNTIES:
        # rows = years, columns = industries, values = metric for this county
        industry_data = qcew[qcew.index == county].pivot(index='Year',
                                                         columns='NAICS Title',
                                                         values=metric)
        # keep only industries with a value for every year
        industry_data = industry_data.dropna(axis='columns', how='any')

        # Correlate each industry column with the county's acres series,
        # aligned on year.  This selects the result by label rather than by
        # the position of a joined 'acres' column and avoids building the
        # full industry-by-industry correlation matrix.
        by_county[county] = industry_data.corrwith(lap_acres[county]).to_dict()

    # setdefault tolerates a metric that is absent from the loaded JSON.
    data.setdefault(metric, {})['All Landuse Types'] = by_county

In [None]:
# Sanity check: print the keys found at each nesting level of the export
# (top level, one metric, its 'All Landuse Types' entry, one county).
sample_metric = data['Real Average Salary']
sample_landuse = sample_metric['All Landuse Types']
sample_county = sample_landuse['Putnam']

for level in (data, sample_metric, sample_landuse, sample_county):
    print(level.keys())
    print()

In [None]:
# Write the augmented structure to a new file (the input export is left as is).
# NOTE(review): json.dump emits the non-standard token NaN for NaN floats by
# default; confirm the consumer of this file tolerates that.
with open('../output/corr_4.json', 'w') as out:
    json.dump(data, fp=out)


In [None]:
# Per-county fraction, later rendered as a percentage in the axis labels
# (presumably the share of county area involved -- confirm the definition).
pc_area = dict(
    Delaware=.57,
    Greene=.47,
    Schoharie=.09,
    Sullivan=.07,
    Ulster=.31,
    Dutchess=.04,
    Putnam=.59,
    Westchester=.38,
)

# Counties ordered by ascending pc_area, overall and within each group.
sorted_counties = sorted(pc_area, key=pc_area.get)
sorted_woh = [name for name in sorted_counties if name in WOH_COUNTIES]
sorted_eoh = [name for name in sorted_counties if name in EOH_COUNTIES]

In [None]:
# One annotated heatmap per metric: rows are industries, columns are counties
# (WOH counties first, then EOH, each group ordered by ascending pc_area).
heatmap_columns = sorted_woh + sorted_eoh

for metric in METRICS:
    metric_rows = correlations.loc[correlations.metric == metric]
    qcew_corr = metric_rows.set_index('industry')[heatmap_columns]

    # red-to-green diverging colormap centred on zero correlation
    diverging_cmap = sns.diverging_palette(10, 145, sep=40, as_cmap=True)

    _, ax = plt.subplots(figsize=(11, 8))
    sns.heatmap(
        qcew_corr,
        ax=ax,
        vmin=-1,
        vmax=1,
        center=0,
        cmap=diverging_cmap,
        annot=True,
    )
    ax.set_ylabel('')
    ax.set_title('%s (2000-2018)'%metric, pad=30)
    # county names along the top edge, no tick marks along the bottom
    ax.tick_params(labeltop=True, bottom=False)

In [None]:
# Multi-line axis label per county: group, county name, pc_area as a percent.
# round() is used instead of int(): int() truncates, and binary floating point
# makes e.g. 0.57 * 100 evaluate to 56.99999999999999, which int() would
# display as 56% rather than 57%.
labels = {}
for k, v in pc_area.items():
    soh = 'WOH' if k in WOH_COUNTIES else 'EOH'
    labels[k] = f"{soh}\n{k}\n{round(v * 100)}%"

In [None]:
# For each metric, draw a heatmap in which every county column is sorted
# independently (largest correlation at the top).  Rows therefore no longer
# correspond to a single industry; the plot shows each county's distribution.
# Aggregate "Total, ..." industries are excluded.
for metric in METRICS:
    keep = ((correlations.metric == metric)
            & ~correlations.industry.isin(TOTAL_INDUSTRIES))
    metric_df = correlations.loc[keep, sorted_counties]

    # Sort with pandas rather than the builtin sorted(): comparisons against
    # NaN are always False, so sorted() can return an arbitrary order when a
    # correlation is missing.  NaNs are placed first so that, after the
    # reversal below, they end up at the bottom of the column.
    sorted_vals = {
        county: metric_df[county].sort_values(na_position='first').tolist()
        for county in metric_df.columns
    }

    _, ax = plt.subplots(figsize=(10,8))
    sns.heatmap(pd.DataFrame(sorted_vals)[::-1],  # reverse: descending from the top
                ax=ax,
                vmin=-1,vmax=1,
                center=0, 
                cmap=sns.diverging_palette(10, 145, sep=40, n=10),
                annot=True)
    ax.set_ylabel('')
    ax.set_xticklabels([labels[c] for c in sorted_counties])
    ax.set_title('%s (sorted values)'%metric, pad=30)
    ax.tick_params(labeltop=True, bottom=False)

In [None]:
# For every metric, print which industry has the highest and the lowest
# correlation in each county (aggregate "Total, ..." rows excluded).
all_counties = WOH_COUNTIES + EOH_COUNTIES
not_total = ~(correlations.industry.isin(TOTAL_INDUSTRIES))

for metric in METRICS:
    print('===== %s ====='%metric)

    by_industry = correlations[(correlations.metric == metric) & not_total]\
                      .set_index('industry')[all_counties]

    print('==> Max correlations:')
    print(by_industry.idxmax())
    print('\n')
    print('==> Min correlations:')
    print(by_industry.idxmin())
    print('\n\n')

# Plot trendlines for select EOH counties/industries

In [None]:
# (metric, county, industry) combinations to draw trendlines for.
_REQUEST_FIELDS = ('metric', 'county', 'industry')
_REQUEST_ROWS = [
    ('Real Average Salary', 'Westchester', 'Arts, Entertainment, and Recreation'),
    ('Real Average Salary', 'Putnam', 'Other Services, Ex. Public Admin'),
    ('Average Employment', 'Putnam', 'Information'),
    ('Average Employment', 'Putnam', 'Health Care and Social Assistance'),
    ('Establishments', 'Dutchess', 'Finance and Insurance'),
    ('Establishments', 'Putnam', 'Professional and Technical Services'),
]

# Each request is a dict keyed by 'metric', 'county' and 'industry'.
requests = [dict(zip(_REQUEST_FIELDS, row)) for row in _REQUEST_ROWS]

In [None]:
# Plot, for each requested (metric, county, industry), the min-max scaled
# yearly LAP acres of the county together with the scaled economic metric.
from sklearn.preprocessing import MinMaxScaler
from matplotlib.lines import Line2D

scaler = MinMaxScaler()

for req in requests:
    metric = req['metric']
    county = req['county']
    industry = req['industry']

    ### data wrangling
    # lap data: one column holding the county's acres, indexed like lap_acres
    subset = lap_acres[[county]].rename(columns={county: 'All Lands'})

    # add economic data to it (joined on the index; the metric is re-indexed by Year)
    subset = subset.join(qcew[(qcew.index==county) 
                       & (qcew['NAICS Title']==industry)]\
                    .set_index('Year')[[metric]])

    # min max scale the data so both series share the same axis
    subset = pd.DataFrame(scaler.fit_transform(subset), 
                          columns=subset.columns,
                          index=subset.index)

    # Legend entries: each land column annotated with its correlation to the
    # metric (the last column), followed by the metric itself.
    # Named `legend_labels` rather than `labels` so this cell does not
    # overwrite the county-label dict `labels` used by the sorted heatmaps.
    legend_labels = ['%s (%.2f)'%pair for pair in subset.corr().iloc[-1, :-1].items()]
    legend_labels.append(metric)

    ### plotting
    # plot lap acres
    title = 'Metric: %s\nCounty: %s\nIndustry: %s'%(metric, county, industry)
    ax = subset.iloc[:, :-1].plot(figsize=(12,3),
                                  title=title, 
                                  color='tab:purple'
                                 )
    # plot economic metric
    subset[metric].plot(ax=ax, linewidth=4, color='sienna')

    # Use the Axes we already hold instead of pyplot's implicit current axes.
    ax.legend(labels=legend_labels,
              bbox_to_anchor=(1,1))
    ax.set_xlabel('Year')
    ax.grid(True)