In [23]:
import csv
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import os
import random
import matplotlib

from matplotlib_venn import venn2
from faculty_hiring.misc.plotting import *  # Definitions for LABEL_SIZE and such
from faculty_hiring.misc.util import *
from faculty_hiring.parse import faculty_parser, institution_parser
from faculty_hiring.parse import load

In [24]:
df = pd.read_csv('../data/attrition.csv', index_col=0)[["place", "count_ret", "count_not", "attrition_frac"]]
df.head(20)

Unnamed: 0,place,count_ret,count_not,attrition_frac
0,Arizona State University,26,14,0.35
1,Auburn University,12,6,0.333333
2,"Binghamton University, SUNY",14,5,0.263158
3,Boston University,17,2,0.105263
4,Brandeis University,10,2,0.166667
5,Brigham Young University,18,10,0.357143
6,Brown University,16,10,0.384615
7,California Institute of Technology,1,15,0.9375
8,Carleton University,22,6,0.214286
9,Carnegie Mellon University,74,51,0.408


## Adjust departmental rates for error

In [26]:
correction = pd.read_csv('../data/uni_correction.tsv', sep='\t')
rate = sum(correction['truth']) / sum(correction['empirical'])
print rate

0.891008174387


In [27]:
df['attrition_frac'] = df['attrition_frac']*rate

## Select a few institutions to highlight

In [30]:
df.sort_values(by = ['attrition_frac'], ascending = False).head(10)

Unnamed: 0,place,count_ret,count_not,attrition_frac
7,California Institute of Technology,1,15,0.83532
33,Georgia Institute of Technology,25,64,0.640725
174,University of Southern Mississippi,5,12,0.628947
90,Temple University,7,16,0.619832
69,Oregon Health & Science University,4,8,0.594005
77,Queen's University,12,23,0.58552
195,Virginia Commonwealth University,4,7,0.567005
143,University of Memphis,5,8,0.548313
13,Claremont Graduate University,2,3,0.534605
52,Missouri University of Science & Technology,7,10,0.524122


In [31]:
# 1: Georgia Institute of Technology has several schools and institutes within the 
# College of Computing, three of which were created between 2011-2017 (Institute for Data & 
# High Performance Computing, Institute for People and Technology, Institute for Robotics 
# & Intelligent Machines).
#
# 2: Oregon Health & Science University has annual contracts, not typical for the TT system

In [32]:
# 3: Colorado School of Mines hired five new assistant professors. Two professors appeared to 
# have left, and one moved to a different university.
df[df['place'] == "Colorado School of Mines"]

Unnamed: 0,place,count_ret,count_not,attrition_frac
17,Colorado School of Mines,3,3,0.445504


In [33]:
# 3: Two full professors retired and one moved to a different university. 
# Two professors had ambiguous titles - "Professor of Natural Science", and Dean
df[df['place'] == "Harvard University"]

Unnamed: 0,place,count_ret,count_not,attrition_frac
35,Harvard University,16,5,0.212145


In [34]:
# 3: One assistant professor moved to industry.
df[df['place'] == "Rice University"]

Unnamed: 0,place,count_ret,count_not,attrition_frac
79,Rice University,17,1,0.0495


In [36]:
df.sort_values(by = ['attrition_frac'], ascending = True).head(5)

Unnamed: 0,place,count_ret,count_not,attrition_frac
168,University of Rhode Island,7,0,0.0
123,University of Denver,10,0,0.0
120,"University of Colorado, Denver",7,0,0.0
79,Rice University,17,1,0.0495
141,"University of Massachusetts, Boston",11,1,0.074251


In [37]:
print "Average department attrition: %.4f" % np.mean(df["attrition_frac"])

Average department retention: 0.3013
