In [3]:
import pandas as pd

# Notebook-wide pandas settings: show every column when a frame is displayed,
# and turn off the chained-assignment (SettingWithCopy) warning.
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)

# NYC Open Data CSV export; the file is downloaded each time this cell runs.
data_url = 'https://data.cityofnewyork.us/api/views/kpav-sd4t/rows.csv?accessType=DOWNLOAD'

# source_df keeps every row of the download, with missing values replaced by ''.
source_df = pd.read_csv(data_url).fillna('')

# df keeps only the first row seen for each Job ID.
df = source_df.drop_duplicates(subset='Job ID')

Method

- We will use the midpoint of the salary range for each position.
- For the purposes of this analysis, all daily and hourly positions are annualized as if they were full time:
  - Hourly Positions will be estimated at 40 hours a week for 50 weeks
  - Daily Salaries will be estimated at 5 days a week for 50 weeks
- The length of time a job is left open will be used as a proxy for how difficult it is to fill
  - This is calculated as the time elapsed between the original posting date and the last update
- Duplicate rows are removed - only the first occurrence of each Job ID is kept
  
  
Notes:

Positions like City Seasonal Aide and City Park Worker have hundreds of low-paying open positions.

In [4]:
#df[df['Agency']=='FIRE DEPARTMENT'].head(2)

In [5]:
# Average advertised salary per agency.
#
# Reads:    source_df (every posting row) and df (one row per Job ID).
# Writes:   df['has_external_posting'], plus the frames `salary`,
#           `salary_grouped` and `ordered_salaries`.
#
# Annualization assumptions (see the Method notes): hourly positions are
# treated as 40 hours a week and daily positions as 5 days a week, both for
# 50 weeks a year.
HOURS_PER_WEEK = 40
DAYS_PER_WEEK = 5
WEEKS_PER_YEAR = 50

# A job counts as externally posted when ANY of its rows in the full source
# data is an 'External' posting, so the lookup uses source_df rather than the
# de-duplicated df.
external_posted_job_ids = source_df.loc[source_df['Posting Type'] == 'External', 'Job ID'].unique()
# Vectorized membership test (replaces a per-row Python `in` scan of the array).
df['has_external_posting'] = df['Job ID'].isin(external_posted_job_ids)

# Explicit copy: the derived columns below belong to `salary` only.
salary = df[['Agency', '# Of Positions', 'Salary Range From', 'Salary Range To',
             'Salary Frequency', 'has_external_posting']].copy()
salary['salary_midpoint'] = salary['Salary Range From'] + (salary['Salary Range To'] - salary['Salary Range From']) / 2

# Annualize hourly and daily rates in place. `.loc` with a boolean mask is used
# because the older `.ix` indexer was removed in pandas 1.0.
is_hourly = salary['Salary Frequency'] == 'Hourly'
is_daily = salary['Salary Frequency'] == 'Daily'
salary.loc[is_hourly, 'salary_midpoint'] = salary.loc[is_hourly, 'salary_midpoint'] * HOURS_PER_WEEK * WEEKS_PER_YEAR
salary.loc[is_daily, 'salary_midpoint'] = salary.loc[is_daily, 'salary_midpoint'] * DAYS_PER_WEEK * WEEKS_PER_YEAR

# Total advertised pay for a posting = open positions x annualized midpoint.
salary['salary_total'] = salary['# Of Positions'] * salary['salary_midpoint']

# numeric_only=True keeps the text column 'Salary Frequency' out of the
# aggregation explicitly instead of relying on pandas to drop it.
salary_grouped = salary.groupby('Agency').sum(numeric_only=True)

# Position-weighted mean: an agency's total pay divided by its total positions.
salary_grouped['average_salary'] = salary_grouped['salary_total'] / salary_grouped['# Of Positions']
ordered_salaries = salary_grouped[['average_salary']].sort_values('average_salary', ascending=False)

In [6]:
%%javascript
require.config({
    paths: {
        d3: '//d3js.org/d3.v4.min'
    }
});

<IPython.core.display.Javascript object>

In [7]:
from IPython.display import Javascript

# Serialize the per-agency averages to JSON and hand them to the browser as
# the global `window.chartData`, where the JavaScript cells can read them.
chart_payload = ordered_salaries.to_json()
Javascript('window.chartData={};'.format(chart_payload))

<IPython.core.display.Javascript object>

In [8]:
%%javascript
require(['d3'], function(d3){
  $("#chart").remove();
  //create canvas
  element.append("<div id='chart'></div>");
  $("#chart").width("960px");
  $("#chart").height("600px");        
  var margin = {top: 20, right: 20, bottom: 30, left: 40};
  var width = 880 - margin.left - margin.right;
  var height = 500 - margin.top - margin.bottom;
  var svg = d3.select("#chart").append("svg")
    //.style("position", "relative")
    //.style("max-width", "960px")
    .attr("width", width + "px")
    .attr("height", (height + 50) + "px")
    .append("g")
    .attr("transform", "translate(" + margin.left + "," + margin.top + ")");

    svg.append("text").attr('x',10).attr('y',100).text(JSON.stringify(chartData))
});

<IPython.core.display.Javascript object>