In [90]:
from IPython.display import HTML

# Notebook-wide presentation: loads the Roboto web font, restyles rendered
# markdown cells and dataframe borders, hides the In/Out prompts, and adds a
# button that toggles visibility of the code inputs. The toggle runs once on
# document ready, so code inputs start out hidden.
HTML("""
<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet' type='text/css'>
<style>
div.text_cell_render {
    /* The comma is required: without it the whole font-family declaration is
       invalid CSS and the browser discards it, so Roboto is never applied. */
    font-family: 'Roboto', sans-serif;
    text-align: justify;
    padding-left: 100px;
    padding-right:100px;
    line-height: 130%;
    font-size: 115%;
    width: inherit;
}

.dataframe * {
    border: 1px solid lightgray !important;
}
div.output_prompt {display: none;}
div.prompt {display: none;}
</style>

<script>
var codeVisible = true;
function code_toggle() {
 if (codeVisible){
   $('div.input').hide();
 } else {
   $('div.input').show();
 }
 codeVisible = !codeVisible;
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle code visibility"></form>
""")

In [11]:
import pandas as pd

# Show every column when a frame is displayed, and silence the
# SettingWithCopy warning for the column assignments made in later cells.
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)

# Download the NYC Jobs export, blank out missing values, and keep only the
# first row seen for each Job ID.
data_url = 'https://data.cityofnewyork.us/api/views/kpav-sd4t/rows.csv?accessType=DOWNLOAD'
source_df = pd.read_csv(data_url).fillna('')
df = source_df.drop_duplicates(subset='Job ID')

### Job Data for NYC's municipal agencies

Through [NYC Open Data](http://opendata.cityofnewyork.us/), the city provides data on all open jobs. It is a rich dataset that includes information such as salary ranges, job requirements, and posting dates. We use this data to compare different city agencies, although discrepancies (listed below) between [this dataset](https://data.cityofnewyork.us/City-Government/NYC-Jobs/kpav-sd4t) and the agencies' own career portals suggest that it is not completely up to date.

In [74]:
# Agencies to preview, and the columns shown for them.
included_agencies = [
    'OFFICE OF MANAGEMENT & BUDGET',
    'FINANCIAL INFO SVCS AGENCY',
    'DEPARTMENT OF CITY PLANNING',
]
included_columns = [
    'Agency',
    '# Of Positions',
    'Business Title',
    'Salary Range From',
    'Salary Range To',
]

# Allow up to 220 characters per cell so long business titles are not cut off.
pd.set_option('display.max_colwidth', 220)
df.loc[df['Agency'].isin(included_agencies), included_columns]


#jobid = 277860

#source_df[source_df['Job ID']==jobid].head(1121)
#df[df['Agency']=='DEPT OF ENVIRONMENT PROTECTION'].head(1121)['Posting Date']

Unnamed: 0,Agency,# Of Positions,Business Title,Salary Range From,Salary Range To
259,OFFICE OF MANAGEMENT & BUDGET,1,DEPUTY CHIEF OF STAFF,106475,106475
713,DEPARTMENT OF CITY PLANNING,1,Business Improvement Analyst,63817,75000
1368,DEPARTMENT OF CITY PLANNING,1,Senior Facilities Planner - Planning Coordination,65000,78000
1483,DEPARTMENT OF CITY PLANNING,1,Planning Analyst - Planning Coordination,55000,65000
1644,FINANCIAL INFO SVCS AGENCY,1,QTP Automation Script Developer,75000,109000
1709,FINANCIAL INFO SVCS AGENCY,1,WINDOWS ADMINISTRATOR,100000,120000
2073,FINANCIAL INFO SVCS AGENCY,2,DATA CENTER - COMPUTER ASSOCIATE,47692,69493
2436,FINANCIAL INFO SVCS AGENCY,1,ASSISTANT EXECUTIVE DIRECTOR (AED) of AGENCY OPERATIONS,89283,180000
2448,DEPARTMENT OF CITY PLANNING,1,"Deputy Executive Director - Land Use, Environmental Review & Technical Review",140000,160000
2830,FINANCIAL INFO SVCS AGENCY,1,TECH SUPPORT SCHEDULER,57223,80000


In [34]:
# Working-time assumptions used to turn hourly and daily rates into annual pay.
HOURS_PER_WEEK = 40
DAYS_PER_WEEK = 5
WEEKS_PER_YEAR = 50

# Flag every job that has at least one 'External' posting. The IDs come from
# the full frame (source_df) because df keeps only the first row per Job ID.
external_posted_job_ids = source_df.loc[source_df['Posting Type'] == 'External', 'Job ID'].unique()
df['has_external_posting'] = df['Job ID'].isin(external_posted_job_ids)

# Work on an explicit copy so the columns added below never write through to df.
salary = df[['Agency', '# Of Positions', 'Salary Range From', 'Salary Range To',
             'Salary Frequency', 'has_external_posting']].copy()

# Midpoint of the posted range. Dividing by 2.0 forces true division, so the
# result is not floored when the columns hold integers under Python 2.
salary['salary_midpoint'] = (
    salary['Salary Range From']
    + (salary['Salary Range To'] - salary['Salary Range From']) / 2.0
)

# Annualize hourly and daily rates. `.loc` replaces the removed `.ix` indexer.
is_hourly = salary['Salary Frequency'] == 'Hourly'
is_daily = salary['Salary Frequency'] == 'Daily'
salary.loc[is_hourly, 'salary_midpoint'] *= HOURS_PER_WEEK * WEEKS_PER_YEAR
salary.loc[is_daily, 'salary_midpoint'] *= DAYS_PER_WEEK * WEEKS_PER_YEAR

# Total pay per posting = number of openings x annualized midpoint.
salary['salary_total'] = salary['# Of Positions'] * salary['salary_midpoint']

# Per-agency totals; the average salary is weighted by the number of openings.
salary_grouped = salary.groupby('Agency').sum()
salary_grouped['average_salary'] = salary_grouped['salary_total'] / salary_grouped['# Of Positions']

# Ascending order for both chart inputs, with Agency restored as a regular
# column so it is included in the JSON records.
ordered_salaries = (
    salary_grouped[['average_salary']]
    .sort_values('average_salary', ascending=True)
    .reset_index()
)
ordered_position_counts = (
    salary_grouped[['# Of Positions']]
    .sort_values('# Of Positions', ascending=True)
    .reset_index()
)

In [6]:
%%javascript
// Register the location of d3 v4 with RequireJS so that later cells can load
// it with require(['d3'], ...).
var d3ModulePath = '//d3js.org/d3.v4.min';
require.config({paths: {d3: d3ModulePath}});

<IPython.core.display.Javascript object>

In [126]:
from IPython.display import Javascript

# Serialize the per-agency average salaries as a JSON array of records and
# expose it to the browser as window.chartData.
salaries_json = ordered_salaries.to_json(orient='records')
Javascript('window.chartData={};'.format(salaries_json))

<IPython.core.display.Javascript object>

In [80]:
from IPython.display import HTML
# Inject the stylesheet for the d3 bar charts rendered further down:
#   .bar      - fill colour of the bars
#   .axis ... - thin light-grey axis lines and ticks
#   .x path   - hides the x-axis domain line
#   .toolTip  - an absolutely positioned box that is hidden by default
HTML("""
<style>
.bar {
  fill: steelblue;
}
.axis path,
.axis line {
  fill: none;
  stroke: #D4D8DA;
  stroke-width: 1px;
  shape-rendering: crispEdges;
}
.x path {
  display: none;
}
.toolTip {
	position: absolute;
  display: none;
  min-width: 80px;
  height: auto;
  background: none repeat scroll 0 0 #ffffff;
  border: 1px solid #6F257F;
  padding: 14px;
  text-align: center;
}
</style>
""")

In [206]:
%%javascript
/**
 * Draw a horizontal bar chart, one bar per agency, inside `el`.
 *
 * el          - target handed to d3.select; a new <svg> is appended to it
 * chartData   - array of records; each is read for `Agency` and for `metric`
 * metric      - name of the record field that determines the bar length
 * title       - text drawn centred at the top of the chart
 * xAxisFormat - d3.format specifier applied to the x-axis tick labels
 * d3          - the d3 module (scaleLinear, scaleBand, axisTop, axisLeft, ...)
 */
window.createBarChart = function createBarChart(el, chartData, metric, title, xAxisFormat,d3) {
  var margin = {top: 30, right: 20, bottom: 20, left: 120};
  var width = 960 - margin.left - margin.right;
  var height = 1100 - margin.top - margin.bottom;
  var marginShift = "translate(" + margin.left + "," + margin.top + ")";

  // Record accessors shared by the scales and the bars.
  function agencyOf(row) { return row.Agency; }
  function metricOf(row) { return row[metric]; }
  function formatTick(value) { return d3.format(xAxisFormat)(value); }

  // Full-size <svg> with a root group shifted by the margins.
  var root = d3.select(el).append("svg")
      .attr("width", width + margin.left + margin.right)
      .attr("height", height + margin.top + margin.bottom)
    .append("g")
      .attr("transform", marginShift);

  // The scale ranges subtract the margins a second time, matching the second
  // margin shift applied to the plot group below.
  var xScale = d3.scaleLinear()
      .range([0, width - margin.left - margin.right])
      .domain([0, d3.max(chartData, metricOf)]);
  var yScale = d3.scaleBand()
      .range([height - margin.top - margin.bottom, 0])
      .domain(chartData.map(agencyOf))
      .padding(0.1);

  // Plot group: holds both axes and the bars.
  var plot = root.append("g")
      .attr("transform", marginShift);

  var xAxis = d3.axisTop(xScale).ticks(5).tickFormat(formatTick);
  plot.append("g")
      .attr("class", "x axis")
      .call(xAxis);

  // The title lives in the root group, centred on the chart width.
  root.append("text")
      .attr("transform","translate(" + (width/2) + " ," + 0 + ")")
      .style("text-anchor", "middle")
      .text(title);

  plot.append("g")
      .attr("class", "y axis")
      .call(d3.axisLeft(yScale));

  // One rectangle per record, starting at x = 0 and as long as its metric.
  plot.selectAll(".bar")
      .data(chartData)
    .enter().append("rect")
      .attr("class", "bar")
      .attr("x", 0)
      .attr("height", yScale.bandwidth())
      .attr("y", function(row) { return yScale(agencyOf(row)); })
      .attr("width", function(row) { return xScale(metricOf(row)); });
}

<IPython.core.display.Javascript object>

### Salaries by Agency

The Office of Management & Budget tops the list of average salaries, although the agency has only one position listed - Deputy Chief of Staff.

In [207]:
%%javascript
require(['d3'], function(d3){
    // Remove any chart left by an earlier run of this cell, then add a fresh
    // container to this cell's output area and size it.
    var chartSelector = '#chart';
    $(chartSelector).remove();
    element.append("<div id='chart'></div>");
    $(chartSelector).width("960px").height("1200px");

    // Average salary per agency, read from window.chartData.
    window.createBarChart(
        chartSelector,
        window.chartData,
        'average_salary',
        'NYC City Jobs - Average Salary by Department',
        "$,.2s",
        d3
    );
});

<IPython.core.display.Javascript object>

### Open Positions

Topping the list of open positions by agency is the Department of Parks and Recreation, which is currently hiring hundreds of seasonal park workers at a low hourly rate. This also explains the department's low placement in the average salary chart.

In [173]:
from IPython.display import Javascript

# Serialize the per-agency open-position counts as a JSON array of records and
# expose it to the browser as window.positionCountData.
position_counts_json = ordered_position_counts.to_json(orient='records')
Javascript('window.positionCountData={};'.format(position_counts_json))

<IPython.core.display.Javascript object>

In [209]:
%%javascript
require(['d3'], function(d3){
  // Remove any chart left by an earlier run of this cell, then add a fresh
  // container to this cell's output area and size it.
  var chartSelector = '#chart2';
  $(chartSelector).remove();
  element.append("<div id='chart2'></div>");
  $(chartSelector).width("960px").height("1200px");

  // Open positions per agency, read from window.positionCountData.
  createBarChart(
    chartSelector,
    window.positionCountData,
    '# Of Positions',
    'NYC City Jobs - Open Positions by Department',
    ",",
    d3
  );
});

<IPython.core.display.Javascript object>

### Method

- The midpoint of the salary range for each position is used
- Duplicate rows are removed - only the first occurrence of each Job ID is kept. (This removes duplicates when a job is posted both internally and externally.)
- All daily and hourly positions will be annualized:
  - Hourly Positions are estimated at 40 hours a week for 50 weeks
  - Daily Salaries are estimated at 5 days a week for 50 weeks
- Time since original posting is used as a proxy for difficulty to fill

### Data Concerns

This dataset does not always match the career information on the agency pages, raising potential concerns about its accuracy. Examples include:
- The Office of Management & Budget only includes one job posted in 2015, however there are dozens of more recent jobs on the [agency's career page](http://www1.nyc.gov/site/omb/careers/careers.page)
- The Department of Environmental Protection includes jobs posted all the way back to 2012.  The oldest posting on the [agency's career page](http://www.nyc.gov/html/dep/html/job_opportunities/index_wide.shtml) at the time of writing was in November 2016.

In [54]:
# Tests
# Sanity checks on the loaded data; each assert stops the notebook with its
# message when the expectation does not hold.

# The analysis text was written against the 3/14/2017 snapshot of the dataset.
assert source_df['Process Date'][0]=='03/14/2017 00:00:00', 'The file posted on 3/14 is not being used'

# `True not in series` looks at the Series' index labels, not its values, so it
# could never detect a duplicate. Check the values explicitly instead.
assert not df.duplicated('Job ID').any(), 'The datset contains duplicate Job IDs'

# Both True and False are expected to occur in the flag column.
assert len(df['has_external_posting'].unique()) == 2, 'There are more than 2 values for Boolean column has_external_posting'

# The parenthesised form prints the same text on Python 2 and Python 3.
print('All tests pass')

All tests pass
