In [2]:
import zipfile
import numpy as np
import pandas as pd
import re
import seaborn as sns

from collections import defaultdict

## Names dataset for assigning gender

Top 1000 US baby names for each year since 1880, with the sex and the number of registrations recorded for each name.

https://www.ssa.gov/oact/babynames/limits.html

In [3]:
# Tally how often each name was registered as male vs. female across every
# ".txt" member of the archive. `names` maps name -> [male_total, female_total];
# a name that has never been seen yields a fresh [0, 0] pair.
names = defaultdict(lambda: [0, 0])

# Context managers guarantee the archive and each member file are closed
# even if a row fails to decode or parse.
with zipfile.ZipFile('/data/names.zip') as zf:
    for name in zf.namelist():
        if not name.endswith(".txt"):
            continue
        with zf.open(name) as f:
            # Iterate lazily instead of loading the whole member with readlines().
            for line in f:
                row = line.decode("utf-8").strip()
                if not row:
                    continue  # tolerate blank lines (e.g. a trailing newline)

                # Each row is "name,sex,count".
                elements = row.split(",")
                current_counts = names[elements[0]]

                # Any sex code other than 'M' is counted as female, as before.
                if elements[1] == 'M':
                    current_counts[0] += int(elements[2])
                else:
                    current_counts[1] += int(elements[2])

In [149]:
def m_or_f(name, name_counts=None):
    """Guess a gender for a first name from male/female registration counts.

    The name is split on spaces and hyphens, so a compound name such as
    "Mary-Ann" is scored by averaging the male counts and the female counts
    of its parts.

    Parameters
    ----------
    name : str
        First name to classify. Parts are looked up as exact, case-sensitive
        keys of the count table.
    name_counts : mapping, optional
        Maps a name to a ``[male_count, female_count]`` pair. Defaults to the
        notebook-level ``names`` table.

    Returns
    -------
    list
        ``['M', share]`` or ``['F', share]``, where ``share`` is the winning
        gender's fraction of the averaged counts; ``[None, 0]`` when the
        value is not a string, has no non-empty parts, is unknown, or the
        male and female averages are tied.
    """
    if name_counts is None:
        name_counts = names

    if not isinstance(name, str):
        # Missing values (e.g. NaN read from a CSV) cannot be split.
        return [None, 0]

    # Drop empty fragments produced by doubled or trailing separators.
    parts = [part for part in re.split(r'[\ -]', name) if part]
    if not parts:
        return [None, 0]

    # Use .get() rather than indexing: indexing a defaultdict would insert a
    # new [0, 0] entry for every unknown name that is looked up.
    pairs = [name_counts.get(part, (0, 0)) for part in parts]
    m, f = (np.average([pair[loc] for pair in pairs]) for loc in (0, 1))

    if m > f:
        return ['M', m / (m + f)]
    elif f > m:
        return ['F', f / (m + f)]
    else:
        return [None, 0]

## Load salary data

In [150]:
# Read the 2015 salary file into a DataFrame.
salaries = pd.read_csv('/data/2015-combined-salary-seconded-en.csv')

# Remove "$" and "," from the pay strings so the column becomes numeric.
salaries['Salary Paid'] = salaries['Salary Paid'].map(
    lambda amount: float(re.sub(r'[\$,]', '', amount))
)

# m_or_f returns [label, share]; only the label is stored.
salaries['gender'] = salaries['First name'].map(lambda first: m_or_f(first)[0])

# Report how many rows received a gender label.
total_rows = len(salaries)
matched_rows = int(salaries['gender'].notnull().sum())
print("Matched", matched_rows, "names out of", total_rows, "entries")

Matched 110388 names out of 115431 entries


In [151]:
# Preview the first row to check the parsed pay and the new gender column.
salaries.head(n=1)

Unnamed: 0,Sector,Last name,First name,Salary Paid,Taxable Benefits,Employer,Job title,Calendar Year,gender
0,Government of Ontario - Ministries,Aniol,Richard,106143.8,$177.35,Aboriginal Affairs,Senior Negotiator / Négociateur principal,2015,M


In [152]:
# Mean salary for each assigned gender label.
salaries.groupby('gender')['Salary Paid'].agg('mean')

gender
F    123639.206867
M    129158.439588
Name: Salary Paid, dtype: float64

In [153]:
# Mean salary per (sector, gender) pair, as a Series with a two-level index.
sector_grouped = (
    salaries
    .groupby(['Sector', 'gender'])['Salary Paid']
    .agg('mean')
)

In [154]:
# Peek at the first five (sector, gender) means.
sector_grouped.head(5)

Sector                             gender
Colleges                           F         113483.243822
                                   M         113424.158068
Crown Agencies                     F         133738.329018
                                   M         136596.097323
Government of Ontario - Judiciary  F         188860.401954
Name: Salary Paid, dtype: float64

In [155]:
# Pivot the gender index level into columns: one row per sector, F and M side by side.
sector_grouped.unstack(level='gender')



gender,F,M
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1
Colleges,113483.243822,113424.158068
Crown Agencies,133738.329018,136596.097323
Government of Ontario - Judiciary,188860.401954,207865.903164
Government of Ontario - Legislative Assembly & Offices,132952.340556,136534.997442
Government of Ontario - Ministries,126456.92179,123681.39511
Hospitals & Boards of Public Health,122169.697159,148089.663957
Municipalities & Services,118083.064644,119186.192592
Ontario Power Generation,131757.753402,142485.834662
Other Public Sector Employers,127708.508234,140126.27348
School Boards,112812.168479,114195.710656


In [156]:
from bokeh.plotting import figure, output_file, show

# Placeholder sample points; this chart is not derived from the salary data.
x = [1, 2, 3, 4, 5]
y = [6, 7, 2, 4, 5]

# Write the plot to a standalone HTML file.
output_file("lines.html", title="line plot example")

# Create a new plot with a title and axis labels.
p = figure(title="simple line example", x_axis_label='x', y_axis_label='y')

# Add a line renderer with a legend entry and line thickness.
# `legend_label` replaces the `legend=` keyword, which Bokeh deprecated in
# 1.4 and removed in 3.0.
p.line(x, y, legend_label="Temp.", line_width=2)

# Show the results.
show(p)