In [None]:
# importing necessary packages

import datetime as dt # time and date modifications
import os # environment access (data-access token)
import random # randomizer library
import warnings # handle system warnigns

import matplotlib.pyplot as plt # graphing library
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # add-on customization
from scipy import stats # statistical testing library
from sklearn.preprocessing import StandardScaler # normalizer

In [None]:
# modifying notebook saves: autosave interval argument is 180
%autosave 180

# modifying pandas defaults: floating point display precision of 3
pd.options.display.precision = 3

# modifying matplotlib for inline display
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# modifying seaborn: default theme, then white grid style and "notebook" context
sns.set()
sns.set_style("whitegrid")
sns.set_context("notebook")
# alternative palettes, currently disabled:
#colors = sns.color_palette("GnBu", 10).as_hex()
#colors= sns.color_palette("mako_r", 10)
# shared 12-colour palette; passed as `palette=colors` to the bar plots below
colors = sns.color_palette("muted", n_colors=12)

#colors = ['#fcde9c', '#faa476', '#f0746e', '#e34f6f', '#dc3977', '#b9257a', '#7c1d6f']

# modifying warnings: every warning is ignored from here on
# NOTE(review): this blanket filter also hides deprecation warnings from pandas/seaborn — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
def dfStats(df):
  """Print a quick structural summary of ``df``.

  Printed in order: the ``df.info()`` report, the number of unique values
  in each column, the columns whose values are all NaN, and the number of
  fully duplicated rows.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to summarise; it is not modified.

  Returns
  -------
  None
  """
  print('Dataframe stats: \n================================================')
  # df.info() writes its own report and returns None, so it is called
  # directly; wrapping it in print() appended a stray "None" line.
  df.info()

  print('\nUnique values in columns:\n')
  for column in df.columns:
    print(f'{column}:{df[column].nunique()}')

  # isna().all() flags only the columns that are NaN in every row, not
  # columns that merely contain some missing values.
  print(f'\nColumns with NaNs:\n{df.columns[df.isna().all()].to_list()}')
  print(f'\nDuplicates in dataset:\n{df.duplicated().sum()}')

def pairPlot(df):
  """Draw a seaborn pair plot of ``df`` and display the resulting figure."""
  sns.pairplot(df)
  plt.show()

def colRename(df):
  """Normalise column labels: spaces become underscores, text is lower-cased.

  The labels are rewritten on the frame that was passed in, and that same
  frame is returned.
  """
  underscored = df.columns.str.replace(' ', '_')
  df.columns = underscored.str.lower()
  return df

def nullFinder(df):
  """Return, as a list, the labels of columns in which every value is NaN."""
  allMissing = df.isna().all()
  return df.columns[allMissing].to_list()

def dropColumns(df):
  """Drop, in place, the columns reported by ``nullFinder`` and return ``df``."""
  emptyCols = nullFinder(df)
  df.drop(columns=emptyCols, inplace=True)
  return df

def categorizer(df):
  """Cast low-cardinality columns of ``df`` to the ``category`` dtype.

  A column is converted when its number of unique values is at most one
  tenth of the row count. The frame is modified in place and returned.
  """
  uniqueCounts = {label: df[label].nunique() for label in df.columns}
  threshold = df.shape[0]/10

  lowCardinality = [label for label, count in uniqueCounts.items() if count <= threshold]
  for label in lowCardinality:
    df[label] = df[label].astype('category')
  return df

def replacer(listVals, name, frame=None):
  """Relabel values in a frame's ``county`` column.

  Parameters
  ----------
  listVals : list
      Values of the ``county`` column to be replaced.
  name : str
      Label written in place of every value found in ``listVals``.
  frame : pandas.DataFrame, optional
      Frame whose ``county`` column is updated. When omitted, the
      notebook-level ``df`` is used, so existing two-argument calls keep
      working.

  Returns
  -------
  None
  """
  target = df if frame is None else frame
  # Assign the result back instead of calling replace(..., inplace=True) on
  # the attribute-accessed column: that chained in-place form acts on an
  # intermediate Series and is not guaranteed to update the parent frame
  # (it does nothing under pandas Copy-on-Write).
  target['county'] = target['county'].replace(listVals, name)

def caller(df):
  """Run the intake steps on ``df`` and return the cleaned frame.

  Steps, in order: print summary statistics, normalise the column names,
  drop all-NaN columns, then show a pair plot. ``df`` itself is modified
  by the rename and drop steps.
  """
  dfStats(df)
  colRename(df)
  cleaned = dropColumns(df)
  pairPlot(df)
  return cleaned

# Data Wrangling

![County Categorization](https://upload.wikimedia.org/wikipedia/commons/thumb/3/3d/Map_of_New_York_Economic_Regions.svg/2560px-Map_of_New_York_Economic_Regions.svg.png)

![Counties](https://www.familysearch.org/wiki/en/img_auth.php/d/d6/New-york-county-map.gif)

In [None]:
# Location of the cleaned QCEW extract (selected industries).
QCEW_URL = 'https://raw.githubusercontent.com/angelialau/Watershed_Investments/master/data/QCEW_2000-2018_cleaned_selected_ind.csv'

# An access token must not be committed with the notebook: read it from the
# QCEW_GITHUB_TOKEN environment variable and append it only when it is set.
qcew_token = os.environ.get('QCEW_GITHUB_TOKEN')
qcew = pd.read_csv(f'{QCEW_URL}?token={qcew_token}' if qcew_token else QCEW_URL)
qcew.head()

In [None]:
# Distinct industry titles present in the raw data.
qcew['NAICS Title'].unique()

### Natural resource related employment:
##### 'Total, All Industries', 'Agriculture, Forestry, Fishing and Hunting', 'Wood Product Manufacturing', 'Forestry and Logging', 'Logging', 'Agriculture & Forestry Support Activity', 'Support Activities for Crop Production', 'Fishing, Hunting and Trapping', 'Fishing'


In [None]:
# Earliest and latest year covered by the data.
qcew['Year'].min(), qcew['Year'].max()

In [None]:
# (rows, columns) of the raw frame.
qcew.shape

In [None]:
# Print stats, normalise column names, drop all-NaN columns and pair-plot.
# caller works in place and returns the same object, so `df` and `qcew`
# refer to one frame from here on.
df=caller(qcew)


In [None]:
# Preview the frame with the renamed (lower-case, underscore) columns.
df.head()

In [None]:
# Copy taken before the county values are relabelled below, so temp_df
# keeps the original county names for the later West-of-Hudson analysis.
temp_df = df.copy()

In [None]:
# County groupings; each list is relabelled with one region name in the next cell.
# West of Hudson
WOH = ['Delaware', 'Sullivan', 'Ulster', 'Greene', 'Schoharie']
# Southern Tier
UpST = ['Steuben', 'Schuyler', 'Chemung', 'Tompkins', 'Tioga', 'Chenango', 'Broome']
# North Country
UpNC = ['Essex','Clinton','Franklin','St. Lawrence', 'Jefferson', 'Lewis','Hamilton']
# Central New York
UpCNY = ['Oneida', 'Herkimer', 'Fulton', 'Montgomery', 'Otsego', 'Oswego', 'Onondaga','Cayuga','Cortland', 'Madison',]
# Capital District
UpCD = [ 'Albany', 'Columbia', 'Warren', 'Washington', 'Saratoga', 'Schenectady', 'Rensselaer' ]
# Western New York
UpWNY = ['Niagara','Erie','Chautauqua','Cattaraugus','Allegany']
# Hudson Valley
DnHV = ['Dutchess', 'Orange', 'Putnam', 'Rockland', 'Westchester']
# Finger Lakes
UpFL = ['Orleans','Wyoming','Livingston','Ontario','Yates','Seneca','Wayne','Genesee','Monroe']
# New York City
NY= ['Richmond','Kings','New York','Queens','Bronx']
# Long Island
LI=['Nassau','Suffolk']

In [None]:
# Region label -> counties that receive it. The Finger Lakes label was
# previously misspelled 'Finger Lankes', which showed up in every legend.
regionCounties = {
    'West of Hudson': WOH,
    'Southern Tier': UpST,
    'North Country': UpNC,
    'Central New York': UpCNY,
    'Capital District': UpCD,
    'Western New York': UpWNY,
    'Hudson Valley': DnHV,
    'Finger Lakes': UpFL,
    'New York City': NY,
    'Long Island': LI,
}

# Overwrite each county name in df.county with its region label.
for regionName, regionMembers in regionCounties.items():
    replacer(regionMembers, regionName)

In [None]:
# Preview after the county values were replaced by region labels.
df.head()

In [None]:
# Row count per industry title, before the industry filter below.
df.naics_title.value_counts()

In [None]:
# Industry titles to keep; every row with a different title is dropped in place.
keptTitles = [
    'Museums, Parks and Historical Sites',
    'RV Parks and Recreational Camps',
    'Agriculture, Forestry, Fishing and Hunting',
    'Wood Product Manufacturing',
    'Forestry and Logging',
    'Logging',
    'Agriculture & Forestry Support Activity',
    'Support Activities for Crop Production',
    'Fishing',
    'Fishing, Hunting and Trapping',
]
rowsToDrop = df.index[~df.naics_title.isin(keptTitles)]
df.drop(rowsToDrop, inplace=True)

In [None]:
# Row count per industry title after the filter.
df.naics_title.value_counts()

In [None]:
# Preview of the filtered frame.
df.head()

In [None]:
# Renumber the rows after the filter; drop=True discards the old index
# instead of keeping it as a column.
df.reset_index(inplace=True, drop=True)
df.head()

In [None]:
# Total of every numeric column per (region label, industry).
# numeric_only=True keeps text columns out of the sum, so the result does
# not depend on how the installed pandas treats non-numeric columns.
indCounty = (
    df.groupby(['county', 'naics_title'])
      .sum(numeric_only=True)
      .reset_index()
)
# Assign back rather than fillna(..., inplace=True) on an attribute-accessed
# column, which is not guaranteed to update the parent frame.
indCounty['average_employment'] = indCounty['average_employment'].fillna(0)
indCounty.head()

In [None]:
# Employment per industry with one bar per region label, on a log y-axis.
fig, ax = plt.subplots(figsize=(30, 10))
sns.barplot(x='naics_title', y='average_employment', hue='county',
            data=indCounty, palette=colors, ax=ax)
ax.set_title('Occupation average employment by Counties', fontsize=20)
ax.set_xlabel('Occupation', fontsize=16)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylabel('Average Employment', fontsize=16)
ax.legend(loc='upper right')
ax.set_yscale('log')

In [None]:
# Independent copy restricted to the four columns used for the
# change-over-time table built in the following cells.
empCh = df[['county','naics_title','average_employment','year']].copy()
empCh.head()

In [None]:
# One row per (county, naics_title) pair, one column per year; cells hold
# the summed average_employment.
empCh = empCh.pivot_table(
    values='average_employment',
    index=['county', 'naics_title'],
    columns=['year'],
    aggfunc='sum',
    observed=True,
)
empCh.head()

In [None]:
# Keep only the first and last year columns of the pivot (positions 0 and -1).
# presumably these are 2000 and 2018, the labels assigned two cells below — verify against the Year min/max.
empCh = empCh.iloc[:,[0,-1]]
empCh.head()

In [None]:
# Treat missing employment values as 0.
empCh = empCh.fillna(0)
empCh.head()

In [None]:
empCh.columns = ['2000', '2018']
# Percentage change from 2000 to 2018. Missing values were filled with 0,
# so a zero baseline is possible; dividing by it would give +/-inf. Such
# rows get NaN instead, because a percentage change from 0 is undefined.
baseline2000 = empCh['2000'].replace(0, np.nan)
empCh['Diff'] = (empCh['2018'] - empCh['2000']) / baseline2000 * 100
empCh.reset_index(inplace=True)
empCh.head()

In [None]:
# Percentage change in employment from 2000 to 2018, per industry and region label.
# No plt.clf() here: with no figure open it would create an extra empty
# figure that is then displayed alongside the real one.
plt.figure(figsize=(40,8))
sns.barplot(x='naics_title', y='Diff', hue='county', data=empCh, palette=colors, edgecolor='k', lw=1)
plt.title('Change in occupation from 2000 to 2018', fontsize=20)
plt.xlabel('Occupation', fontsize=16)
# 'Diff' is a percentage change, so the axis is labelled as such.
plt.ylabel('% Change', fontsize=16)
plt.xticks(rotation=10, fontsize = 12, ha='center')
plt.legend(loc='upper right', fontsize=12)
plt.show()

In [None]:
# Summed employment per (industry, region label), reshaped so that each
# industry is a column and each region label a row.
groupTotals = df.groupby(['naics_title', 'county']).sum()
naturalResourcedf = groupTotals[['average_employment']].reset_index()
naturalResourcedf = naturalResourcedf.pivot_table(
    values='average_employment',
    index='county',
    columns=['naics_title'],
    aggfunc='sum',
    observed=True,
)

# Express every industry column as a percentage of that column's total.
for title in naturalResourcedf.columns:
  titleColumn = naturalResourcedf[title]
  naturalResourcedf[title] = titleColumn*100/titleColumn.sum()

# Back to long form: columns naics_title, county and 0 (the percentage).
naturalResourcedf = naturalResourcedf.unstack().to_frame().reset_index()
naturalResourcedf.head()

In [None]:
# Share of each industry's employment held by each region label.
# The data source is naturalResourcedf (long form, percentage in column 0);
# the previously referenced name `x1` is never defined in this notebook.
plt.figure(figsize=(35,8))
sns.barplot(x='naics_title', y=0, hue='county', data=naturalResourcedf, palette=colors, edgecolor='k', lw=1)
plt.xticks(rotation=10, fontsize = 12, ha='center')
plt.xlabel('Occupation', fontsize=13)
plt.ylabel('% employment concentration', fontsize=13)
plt.legend(bbox_to_anchor=(1.1, 0.99), loc='upper right', ncol=1, fontsize=12)
plt.title('Employment concentration in natural resource occupation', fontsize=18)
plt.show()

In [None]:
# temp_df was copied before the relabelling, so it still holds the original county names.
temp_df.head()

In [None]:
# (rows, columns) before filtering temp_df.
temp_df.shape

In [None]:
# Same industry filter as applied to df: keep only these titles in temp_df.
keptTitlesWOH = [
    'Museums, Parks and Historical Sites',
    'RV Parks and Recreational Camps',
    'Agriculture, Forestry, Fishing and Hunting',
    'Wood Product Manufacturing',
    'Forestry and Logging',
    'Logging',
    'Agriculture & Forestry Support Activity',
    'Support Activities for Crop Production',
    'Fishing',
    'Fishing, Hunting and Trapping',
]
otherIndustryRows = temp_df.index[~temp_df.naics_title.isin(keptTitlesWOH)]
temp_df.drop(otherIndustryRows, inplace=True)
temp_df.shape

#### WOH = ['Delaware', 'Sullivan', 'Ulster', 'Greene', 'Schoharie']

In [None]:
# Keep only the rows whose county is one of the five listed in WOH.
outsideWOH = temp_df.index[~temp_df.county.isin(WOH)]
temp_df.drop(outsideWOH, inplace=True)
temp_df.shape

In [None]:
# Preview after both the industry filter and the county filter.
temp_df.head()

In [None]:
# Employment per (county, naics_title) and year for the filtered temp_df,
# reduced to the first and last year columns of the pivot.
wohColumns = ['county', 'naics_title', 'average_employment', 'year']
WempCh = temp_df[wohColumns].copy()
WempCh = WempCh.pivot_table(
    values='average_employment',
    index=['county', 'naics_title'],
    columns=['year'],
    aggfunc='sum',
    observed=True,
)
WempCh = WempCh.iloc[:, [0, -1]]
WempCh.head()

In [None]:
# Treat missing employment values as 0 (assigned back rather than filled
# in place, because WempCh was produced by an .iloc selection).
WempCh = WempCh.fillna(0)
WempCh.columns = ['2000', '2018']
# Percentage change from 2000 to 2018. A zero baseline would give +/-inf,
# so those rows get NaN instead: the change from 0 is undefined.
wohBaseline2000 = WempCh['2000'].replace(0, np.nan)
WempCh['Diff'] = (WempCh['2018'] - WempCh['2000']) / wohBaseline2000 * 100
WempCh.reset_index(inplace=True)
WempCh.head()

In [None]:
# Percentage change in employment from 2000 to 2018 for the counties kept in temp_df.
# No plt.clf() here: with no figure open it would create an extra empty
# figure that is then displayed alongside the real one.
plt.figure(figsize=(37,5))
sns.barplot(x='naics_title', y='Diff', hue='county', data=WempCh, palette=colors, edgecolor='k', lw=1)
plt.title('WOH Occupation percentage change from 2000 to 2018', fontsize=20)
plt.xlabel('Occupation', fontsize=16)
plt.ylabel('% Change', fontsize=16)
plt.xticks(rotation=0, fontsize = 12, ha='center')
plt.legend(loc='upper right', fontsize=12)
plt.show()

In [None]:
# Summed employment per (industry, county) for the filtered temp_df,
# reshaped so that each industry is a column and each county a row.
wohTotals = temp_df.groupby(['naics_title', 'county']).sum()
naturalResourceWOH = wohTotals[['average_employment']].reset_index()
naturalResourceWOH = naturalResourceWOH.pivot_table(
    values='average_employment',
    index='county',
    columns=['naics_title'],
    aggfunc='sum',
    observed=True,
)

# Express every industry column as a percentage of that column's total.
for title in naturalResourceWOH.columns:
  titleColumn = naturalResourceWOH[title]
  naturalResourceWOH[title] = titleColumn*100/titleColumn.sum()

# Back to long form: columns naics_title, county and 0 (the percentage).
naturalResourceWOH = naturalResourceWOH.unstack().to_frame().reset_index()
naturalResourceWOH.head()

In [None]:
# Share of each industry's employment held by each county kept in temp_df.
# The data source is naturalResourceWOH (long form, percentage in column 0);
# the previously referenced name `y1` is never defined in this notebook.
plt.figure(figsize=(35,8))
sns.barplot(x='naics_title', y=0, hue='county', data=naturalResourceWOH, palette=colors, edgecolor='k', lw=1)
plt.xticks(rotation=10, fontsize = 12, ha='center')
plt.xlabel('Occupation', fontsize=13)
plt.ylabel('% employment concentration', fontsize=13)
plt.legend(bbox_to_anchor=(1.1, 0.99), loc='upper right', ncol=1, fontsize=12)
plt.title('Employment concentration in natural resource occupation (WOH)', fontsize=18)
plt.show()