In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats
import csv

from scipy import stats
from collections import Counter
from scripts import regression, plot_utils, cohort_utils
from scripts.parse.institution_parser import INST_NAME_ALIASES
from scipy.stats import mannwhitneyu, ks_2samp, chisquare, chi2_contingency, ttest_ind, ranksums
from statsmodels.stats.proportion import proportions_ztest
from scripts.load_data import load_all_faculty
import statsmodels.formula.api as smf

from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Helvetica']

color_mapping = {'CS': ('#5777D9', '#293866'), 
                 'Business': ('#CC3A35', '#661D1B'), 
                 'History': ('#8BCC60', '#466630')}

%matplotlib inline

In [2]:
# Read in history, business and computer science survey frame files
his_ids = pd.read_excel('../data/survey_data/his_busi_survey/HIS_intro_2018_10_22b_unlocked.xlsx')
his_ids_more = pd.read_excel('../data/survey_data/his_busi_survey/HIS_allinvited_exceptheads.xls')

his_frame = pd.concat([his_ids, his_ids_more], sort=False)



In [3]:
busi_ids = pd.read_excel('../data/survey_data/his_busi_survey/intro_sep2019_unlocked.xlsx')
busi_ids_more = pd.read_excel('../data/survey_data/his_busi_survey/BUSI_participant_codes_send1.xls')

bus_frame = pd.concat([busi_ids, busi_ids_more], sort=False)



In [4]:
cs_frame = pd.read_csv("../data/survey_data/cs_survey/frame_feb5_2020.tsv", sep='\t', header=0)

In [5]:
# Merge on departmental prestige. First, standardize university names
his_frame['university_name_standard'] = his_frame['u_university'].apply(
        lambda x: INST_NAME_ALIASES[x] if x in INST_NAME_ALIASES else x)
bus_frame['university_name_standard'] = bus_frame['u_university'].apply(
        lambda x: INST_NAME_ALIASES[x] if x in INST_NAME_ALIASES else x)
cs_frame['university_name_standard'] = cs_frame['university'].apply(
        lambda x: INST_NAME_ALIASES[x] if x in INST_NAME_ALIASES else x)

In [6]:
# Extract prestige / ranking data
PRESTIGE = '../data/survey_data/faculty_2011/%s_vertexlist.txt'
for field, field_data in [('History', his_frame), ('Business', bus_frame), ('CS', cs_frame)]:
    pi_rank_mapping = {}

    with open(PRESTIGE % field) as rankings:

        pi_reader = csv.DictReader(rankings, dialect='excel-tab')
        for row in pi_reader:
            pi_rank_mapping[row['institution']] = row


        field_data['prestige_inv'] = field_data['university_name_standard'].apply(
            lambda x: pi_rank_mapping[x]['pi'] if x in pi_rank_mapping else np.nan)
        field_data['prestige_rank_inv'] = field_data['university_name_standard'].apply(
            lambda x: pi_rank_mapping[x]['# u'] if x in pi_rank_mapping else np.nan)

        field_data['prestige_inv'] = pd.to_numeric(field_data['prestige_inv'])
        field_data['prestige_rank_inv'] = pd.to_numeric(field_data['prestige_rank_inv'])

In [7]:
# Read in responses
df = load_all_faculty()



## Response rate

In [8]:
df[df.likely_department == 'Computer Science'].shape[0], \
df[df.likely_department == 'Business'].shape[0], \
df[df.likely_department == 'History'].shape[0], \
df.shape[0] # Total number of responses

(1139, 1321, 992, 3452)

In [9]:
print(cs_frame.shape[0], bus_frame.shape[0], his_frame.shape[0])
# print(len(cs_inv), len(busi_ids) + len(busi_ids_more), len(his_ids) + len(his_ids_more))

5792 9573 4336


In [10]:
# Total number of departments across the three fields
len(df[df.likely_department == "Computer Science"].university_name_standard.dropna().unique()), \
len(df[df.likely_department == "Business"].university_name_standard.dropna().unique()), \
len(df[df.likely_department == "History"].university_name_standard.dropna().unique()), \
(200+113+140)

(200, 113, 140, 453)

In [11]:
# Among those who completed first page, total number of departments across the three fields
completed_first_part = (df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0))
len(df[completed_first_part & (df.likely_department == "Computer Science")].university_name_standard.dropna().unique()), \
len(df[completed_first_part & (df.likely_department == "Business")].university_name_standard.dropna().unique()), \
len(df[completed_first_part & (df.likely_department == "History")].university_name_standard.dropna().unique()), \
(200+112+138)

(200, 112, 138, 450)

In [12]:
# Overall response rate (most generous interpretation)
df.shape[0] / (len(cs_frame) + len(bus_frame) + len(his_frame))

0.1752195320034516

In [13]:
# Field response rate
print(
    'CS: %.4f' %  (100.0*df[df.likely_department == "Computer Science"].shape[0] / (len(cs_frame))),
    'BUS: %.4f' % (100.0*df[df.likely_department == "Business"].shape[0] / (len(bus_frame))),
    'HIS: %.4f' % (100.0*df[df.likely_department == "History"].shape[0] / (len(his_frame))),
)

CS: 19.6651 BUS: 13.7992 HIS: 22.8782


In [14]:
# Overall response rate (stricer definition)
df[df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0)].shape[0] / \
(len(cs_frame) + len(bus_frame) + len(his_frame))

0.15552510024871835

In [15]:
# Field response rate
print(
    'CS: %.4f' %  (100.0*df[(df.likely_department == "Computer Science") &
                      (df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0))].shape[0] / \
                   (len(cs_frame))),
    'BUS: %.4f' % (100.0*df[(df.likely_department == "Business") &
                      (df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0))].shape[0] / \
                   (len(bus_frame))),
    'HIS: %.4f' % (100.0*df[(df.likely_department == "History")  &
                      (df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0))].shape[0] / \
                   (len(his_frame))),
)

CS: 19.4406 BUS: 10.8639 HIS: 20.7103


In [16]:
print(
    df[df.likely_department == "Computer Science"].shape[0], 
    df[df.likely_department == "Business"].shape[0],
    df[df.likely_department == "History"].shape[0])

1139 1321 992


In [17]:
print(
    df[(df.likely_department == "Computer Science") & (df.gender_ans > 0)].shape[0], 
    df[(df.likely_department == "Business") & (df.gender_ans > 0)].shape[0],
    df[(df.likely_department == "History") & (df.gender_ans > 0)].shape[0])

1129 1050 898


### Response rate on year of selected questions

In [18]:
print(sum(df.consent == 1), sum(df.consent == 1)/len(df.consent))
print(sum(df.tenured == 1), sum(df.tenured == 1)/len(df.tenured))

3452 1.0
3444 0.9976825028968713


In [19]:
print(sum(df.age_coded > 0), sum(df.age_coded > 0)/len(df.age_coded)) # First page

3074 0.8904982618771726


In [20]:
print(sum(df.gender_ans > 0), sum(df.gender_ans > 0)/len(df.gender_ans)) # First page

3077 0.8913673232908459


In [21]:
print(sum(df.children.isin([1.0,2.0])), sum(df.children.isin([1.0,2.0]))/len(df.children)) # First page

3075 0.8907879490150638


In [22]:
# Completed any of the first page questions
print(sum((df.children.isin([1.0,2.0]) | (df.gender_ans > 0) | (df.age_coded > 0))), # First page
      sum((df.children.isin([1.0,2.0]) | (df.gender_ans > 0) | (df.age_coded > 0))) / len(df.consent)) 

3085 0.8936848203939745


In [23]:
# Completed all of the first page questions
print(sum((df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0))), # First page
      sum((df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0))) / len(df.consent)) 

3064 0.8876013904982619


In [24]:
# Completed any of the second page
print(sum((df.tenstop > 0) | (df.parleave_ideal > 0) | (df.parleave_neutral > 0)), 
      sum((df.tenstop > 0) | (df.parleave_ideal > 0) | (df.parleave_neutral > 0)) / len(df), 
      sum((df.tenstop > 0) | (df.parleave_ideal > 0) | (df.parleave_neutral > 0)) / \
      len(df[(df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0))]))

2919 0.8455967555040557 0.9526762402088773


In [25]:
print(sum(df.parleave_elig_child1 > 0) / len(df.parleave_elig_child1), # Second page
      sum(df.parleave_elig_child1 > 0) / sum(df.children == 2.0), 
      sum(df.parleave_elig_child1 > 0), sum(df.children == 2.0))

0.49710312862108924 0.7853546910755149 1716 2185


In [26]:
print(sum(df.aim_min >= 0) / len(df.aim_min), # Middle
      sum(df.aim_min >= 0) / len(df.aim_min),
      sum(df.aim_min >= 0) / len(df[(df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0))]),
      sum(df.aim_min >= 0), len(df.aim_min))

0.7885283893395133 0.7885283893395133 0.8883812010443864 2722 3452


In [27]:
print(sum(df.injnorm_wchild >= 0) / len(df.injnorm_wchild), # Middle
      sum(df.injnorm_wchild >= 0) / len(df[(df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0))]),
      sum(df.injnorm_wchild >= 0), len(df.injnorm_wchild))

0.7320393974507532 0.8247389033942559 2527 3452


In [28]:
print(sum(df.desnorm_wchild >= 0) / len(df.desnorm_wchild), # Middle
      sum(df.desnorm_wchild >= 0) / len(df[(df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0))]),
      sum(df.desnorm_wchild >= 0), len(df.desnorm_wchild))

0.7242178447276941 0.8159268929503917 2500 3452


In [29]:
print(sum(df.current_parleave > 0) / len(df.current_parleave), # Last page
      sum(df.current_parleave > 0) / len(df[(df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0))]),
      sum(df.current_parleave > 0), 
      len(df.current_parleave)
     )

0.7891077636152954 0.8890339425587467 2724 3452


## Total publications

In [30]:
pub_count = 0
for each in df['dblp_pubs'].dropna():
    pub_count += len(each)
print(pub_count)

100972


In [31]:
print(len(df[df.likely_department == "Business"]['dblp_pubs'].dropna()), 
      len(df[df.likely_department == "Business"]['dblp_pubs']),
      len(df[df.likely_department == "Business"]['dblp_pubs'].dropna()) / \
      len(df[df.likely_department == "Business"]['dblp_pubs']))
pub_count = 0
for each in df[df.likely_department == "Business"]['dblp_pubs'].dropna():
    pub_count += len(each)
print(pub_count)

525 1321 0.39742619227857684
15352


In [32]:
print(len(df[df.likely_department == "History"]['dblp_pubs'].dropna()),
      len(df[df.likely_department == "History"]['dblp_pubs']),
      len(df[df.likely_department == "History"]['dblp_pubs'].dropna()) / \
      len(df[df.likely_department == "History"]['dblp_pubs']))
pub_count = 0
for each in df[df.likely_department == "History"]['dblp_pubs'].dropna():
    pub_count += len(each)
print(pub_count)

294 992 0.2963709677419355
6346


In [33]:
print(len(df[df.likely_department == "Computer Science"]['dblp_pubs'].dropna()),
      len(df[df.likely_department == "Computer Science"]['dblp_pubs']),
      len(df[df.likely_department == "Computer Science"]['dblp_pubs'].dropna()) / \
      len(df[df.likely_department == "Computer Science"]['dblp_pubs']))
pub_count = 0
for each in df[df.likely_department == "Computer Science"]['dblp_pubs'].dropna():
    pub_count += len(each)
print(pub_count)

1061 1139 0.9315188762071993
79274


### Sample publications by gender and parenthood status

In [34]:
# Record information about publication type and authorship position for random samples by field

In [35]:
# women_in_cs_pubs = df[(df.likely_department == "Computer Science") & (df.gender_ans == 1) &
#                       (df.dblp_pubs.notna())][['dblp_pubs', 'sid', 'likely_department', 'name', 
#                                                'gender_ans']].sample(10)
# men_in_cs_pubs = df[(df.likely_department == "Computer Science") & (df.gender_ans == 2) &
#                     (df.dblp_pubs.notna())][['dblp_pubs', 'sid', 'likely_department', 'name',
#                                              'gender_ans']].sample(10)

# women_in_bus_pubs = df[(df.likely_department == "Business") & (df.gender_ans == 1) &
#                        (df.dblp_pubs.notna())][['dblp_pubs', 'sid', 'likely_department', 'name',
#                                                 'gender_ans']].sample(10)
# men_in_bus_pubs = df[(df.likely_department == "Business") & (df.gender_ans == 2) &
#                      (df.dblp_pubs.notna())][['dblp_pubs', 'sid', 'likely_department', 'name',
#                                               'gender_ans']].sample(10)

# women_in_his_pubs = df[(df.likely_department == "History") & (df.gender_ans == 1) &
#                        (df.dblp_pubs.notna())][['dblp_pubs', 'sid', 'likely_department', 'name',
#                                                 'gender_ans']].sample(10)
# men_in_his_pubs = df[(df.likely_department == "History") & (df.gender_ans == 2) &
#                      (df.dblp_pubs.notna())][['dblp_pubs', 'sid', 'likely_department', 'name',
#                                               'gender_ans']].sample(10)

# pd.concat([women_in_cs_pubs, men_in_cs_pubs, women_in_bus_pubs, men_in_bus_pubs, women_in_his_pubs, 
#            men_in_his_pubs], sort=False, axis=0).to_csv('../data/survey_data/pub_validation/pub_sample.csv')

In [36]:
pub_validation = pd.read_csv('../data/survey_data/pub_validation/pub_sample_annotated.csv')
pub_validation.groupby(['likely_department', 'gender_ans'])['gender_ans'].count()

likely_department  gender_ans
Business           1             10
                   2             10
Computer Science   1             10
                   2             10
History            1             10
                   2             10
Name: gender_ans, dtype: int64

In [37]:
pub_validation_props = pub_validation.copy()

gender_ans_mapping = {2: 'Men', 1: 'Women'}
pub_validation_props['gender_ans'] = pub_validation_props['gender_ans'].apply(lambda x: gender_ans_mapping[x])
pub_validation_props[['book_prop', 'proceedings_prop', 'chapters_prop', 'journal_prop']] = \
pub_validation_props[['book_count', 'proceedings_count', 
                      'chapters_count', 'journal_count']].divide(pub_validation_props['total_pub_count'], 
                                                                 axis=0)

In [38]:
pub_validation_props.groupby(['likely_department', 'gender_ans'])['total_pub_count'].sum()

likely_department  gender_ans
Business           Men           263
                   Women         158
Computer Science   Men           685
                   Women         662
History            Men           223
                   Women         408
Name: total_pub_count, dtype: int64

In [39]:
pub_validation_props.groupby(['likely_department', 'gender_ans'])['total_pub_count'].sum().sum()

2399

In [40]:
pub_validation_props[['likely_department', 'gender_ans', 'book_prop', 'proceedings_prop', 'chapters_prop',
                       'journal_prop']].groupby(['likely_department', 'gender_ans']).mean()*100

Unnamed: 0_level_0,Unnamed: 1_level_0,book_prop,proceedings_prop,chapters_prop,journal_prop
likely_department,gender_ans,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Business,Men,0.645161,0.0,6.894935,92.459904
Business,Women,0.333333,0.0,7.972222,91.694444
Computer Science,Men,0.0,68.112042,0.0,31.887958
Computer Science,Women,0.0,73.318833,0.0,26.681167
History,Men,17.563973,0.0,38.793732,43.642295
History,Women,12.05345,0.0,37.373153,50.573396


In [41]:
temp = pub_validation_props[['likely_department', 'gender_ans', 'book_prop', 
                             'proceedings_prop', 'chapters_prop', 
                             'journal_prop']].groupby(['likely_department', 'gender_ans']).mean()

# Business
bus_men = temp.loc[('Business', 'Men')]; bus_women = temp.loc[('Business', 'Women')]
obs = np.array([
    [bus_men['book_prop'], bus_men['chapters_prop'], bus_men['journal_prop']],
    [bus_women['book_prop'], bus_women['chapters_prop'], bus_women['journal_prop']]])

chi2, p, dof, expected = chi2_contingency(obs, correction = False)
print(chi2, p, dof, expected)

# History
bus_men = temp.loc[('History', 'Men')]; bus_women = temp.loc[('History', 'Women')]
obs = np.array([
    [bus_men['book_prop'], bus_men['chapters_prop'], bus_men['journal_prop']],
    [bus_women['book_prop'], bus_women['chapters_prop'], bus_women['journal_prop']]])

chi2, p, dof, expected = chi2_contingency(obs, correction = False)
print(chi2, p, dof, expected)

# CS
bus_men = temp.loc[('Computer Science', 'Men')]; bus_women = temp.loc[('Computer Science', 'Women')]
obs = np.array([
    [bus_men['proceedings_prop'], bus_men['journal_prop']],
    [bus_women['proceedings_prop'], bus_women['journal_prop']]])

chi2, p, dof, expected = chi2_contingency(obs, correction = False)
print(chi2, p, dof, expected)

0.0018061667050095344 0.9990973243045407 2 [[0.00489247 0.07433579 0.92077174]
 [0.00489247 0.07433579 0.92077174]]
0.015616605602199197 0.9922221028050227 2 [[0.14808711 0.38083443 0.47107846]
 [0.14808711 0.38083443 0.47107846]]
0.00654571949403767 0.9355170097868151 1 [[0.70715437 0.29284563]
 [0.70715437 0.29284563]]


In [42]:
pub_validation_props[['likely_department', 'gender_ans', 
                      'avg_num_authors']].groupby(['likely_department', 'gender_ans']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_num_authors
likely_department,gender_ans,Unnamed: 2_level_1
Business,Men,2.516762
Business,Women,3.089461
Computer Science,Men,3.672987
Computer Science,Women,4.257894
History,Men,1.103673
History,Women,1.304378


In [43]:
pub_validation_props[['likely_department', 'gender_ans', 
                      'avg_num_authors']].groupby(['likely_department', 'gender_ans']).std()

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_num_authors
likely_department,gender_ans,Unnamed: 2_level_1
Business,Men,0.610909
Business,Women,1.40191
Computer Science,Men,0.86537
Computer Science,Women,1.339294
History,Men,0.188325
History,Women,0.547665


In [44]:
# pub_validation.groupby(['likely_department', 'gender_ans'])[['book_count', 'proceedings_count', 
#    'chapters_count', 'journal_count', 'avg_num_authors']].mean()

## Use Genderize.io to classify the genders of survey frame

In [45]:
# cs_frame[['name']].to_csv('cs_frame_names.tsv', index=False)
# his_frame[['u_name', 'firstname', 'lastname']].to_csv('his_frame_names.tsv', index=False)
# bus_frame[['u_name', 'firstname', 'lastname']].to_csv('bus_frame_names.tsv', index=False)

In [46]:
cs_frame['genderize'] = pd.read_csv('../data/survey_data/gender_frame/cs_frame_name_genderize.tsv').values

In [47]:
temp = df[df.likely_department == 'Computer Science'].merge(
    cs_frame, left_on='sid', right_on='survey sid', suffixes=('_ans', '_inv'))
print(temp.shape)
print(pd.crosstab(temp[temp.gender_ans.isin([1,2]) & (temp.genderize != 'unclear')]['genderize'], 
                  temp[(temp.gender_ans.isin([1,2])) & (temp.genderize != 'unclear')]['gender_ans']))
print((156+716)/(156+716+9+2), (156)/(156+9), (716)/(716+2))

(1139, 466)
gender_ans    1    2
genderize           
female      156    2
male          9  716
0.9875424688561721 0.9454545454545454 0.9972144846796658


In [48]:
his_frame['genderize'] = pd.read_csv('../data/survey_data/gender_frame/his_frame_name_genderize.tsv').values

In [49]:
temp = pd.concat([
    df[df.likely_department == 'History'].merge(
        his_frame, left_on='pid', right_on='code', how='inner'),
    df[df.likely_department == 'History'].merge(
        his_frame, left_on='pid', right_on='session_id', how='inner')]).copy(deep=True).reset_index()
print(temp.shape)
print(pd.crosstab(temp[temp.gender_ans.isin([1,2]) & (temp.genderize != 'unclear')]['genderize'], 
                  temp[temp.gender_ans.isin([1,2]) & (temp.genderize != 'unclear')]['gender_ans']))
print((324+411)/(324+411+3+5), (324)/(324+5), (411)/(411+3))

(992, 502)
gender_ans    1    2
genderize           
female      324    3
male          5  411
0.9892328398384926 0.9848024316109423 0.9927536231884058


In [50]:
bus_frame['genderize'] = pd.read_csv('../data/survey_data/gender_frame/bus_frame_name_genderize.tsv').values

In [51]:
temp = pd.concat([
    df[df.likely_department == 'Business'].merge(
        bus_frame, left_on='pid', right_on='code', how='inner'),
    df[df.likely_department == 'Business'].merge(
        bus_frame, left_on='pid', right_on='session_id', how='inner')]).copy(deep=True).reset_index()
print(temp.shape)
print(pd.crosstab(temp[(temp.gender_ans.isin([1,2])) & (temp.genderize != 'unclear')]['genderize'], 
                  temp[(temp.gender_ans.isin([1,2])) & (temp.genderize != 'unclear')]['gender_ans']))
print((230+589)/(230+589+11+2), (230)/(230+11), (589)/(589+2))

(1321, 502)
gender_ans    1    2
genderize           
female      230    2
male         11  589
0.984375 0.9543568464730291 0.9966159052453468


In [52]:
# Overall accuracy
temp = pd.concat([
    df[df.likely_department == 'Computer Science'].merge(
        cs_frame, left_on='sid', right_on='survey sid', 
        suffixes=('_ans', '_inv'), 
        how='inner')[['gender_ans', 'genderize']],
    df[df.likely_department == 'History'].merge(
        his_frame, left_on='pid', right_on='code', 
        how='inner')[['gender_ans', 'genderize']],
    df[df.likely_department == 'History'].merge(
        his_frame, left_on='pid', right_on='session_id', 
        how='inner')[['gender_ans', 'genderize']],
    df[df.likely_department == 'Business'].merge(
        bus_frame, left_on='pid', right_on='code', 
        how='inner')[['gender_ans', 'genderize']],
    df[df.likely_department == 'Business'].merge(
        bus_frame, left_on='pid', right_on='session_id', 
        how='inner')[['gender_ans', 'genderize']]], axis=0).copy(deep=True).reset_index()

print(temp.shape)
print(pd.crosstab(temp[(temp.gender_ans.isin([1,2])) & (temp.genderize != 'unclear')]['genderize'], 
                  temp[(temp.gender_ans.isin([1,2])) & (temp.genderize != 'unclear')]['gender_ans']))
print((710+1716)/(1716+710+7+25), (710)/(710+25), (1716)/(1716+7))

(3452, 3)
gender_ans    1     2
genderize            
female      710     7
male         25  1716
0.9869812855980472 0.9659863945578231 0.995937318630296


In [53]:
pd.concat([cs_frame['genderize'], his_frame['genderize'], bus_frame['genderize']]).value_counts(normalize=True)

male       0.567788
unclear    0.249835
female     0.182377
Name: genderize, dtype: float64

## Representativeness among survey respondents

In [54]:
completed_first_part = (df.children.isin([1.0,2.0]) & (df.gender_ans > 0) & (df.age_coded > 0))

#### Computer Science

In [55]:
cs_faculty = (df.likely_department == "Computer Science")

In [56]:
# Gender -- M (2) / F (1)
frame_dist = dict(cs_frame[cs_frame.genderize.isin(['male', 'female'])].genderize.value_counts(normalize=True))
resp_dist = dict(df[completed_first_part & cs_faculty].gender_ans.value_counts(normalize=True))                
                  
print(frame_dist) # Frame            
print(resp_dist)  # Responses

obs = np.array([[len(cs_frame[cs_frame.genderize == 'male']), 
                 len(cs_frame[cs_frame.genderize == 'female'])], 
                [len(df[completed_first_part & cs_faculty & (df.gender_ans == 2)]), 
                 len(df[completed_first_part & cs_faculty & (df.gender_ans == 1)])]])

print(stats.chi2_contingency(obs, correction=False))

{'male': 0.8545497969906855, 'female': 0.14545020300931455}
{2: 0.7868561278863233, 1: 0.20515097690941386, 4: 0.007104795737122558, 3: 0.0008880994671403197}
(24.902228024976882, 6.031267966832184e-07, 1, array([[3523.90045249,  663.09954751],
       [ 940.09954751,  176.90045249]]))


In [57]:
# Rank -- 3: full professor, 2: associate professor, 1: full professor
frame_dist = dict(cs_frame[cs_frame.factitle.isin([1, 2, 3])].factitle.value_counts(normalize=False))
resp_dist = dict(df[completed_first_part & cs_faculty].factitle.value_counts(normalize=False))

print(dict(cs_frame[cs_frame.factitle.isin([1, 2, 3])].factitle.value_counts(normalize=True))) # Frame
print(dict(df[completed_first_part & cs_faculty].factitle.value_counts(normalize=True))) # Responses

obs = np.array([[frame_dist[3], frame_dist[2], frame_dist[1]],
                [resp_dist[3],  resp_dist[2],  resp_dist[1]]])

chi2, p, dof, expected = chi2_contingency(obs, correction = False)
print(chi2, p, dof, expected)

{3: 0.5029887482419128, 2: 0.26828410689170185, 1: 0.22872714486638537}
{3.0: 0.4715808170515098, 1.0: 0.2921847246891652, 2.0: 0.23623445825932504}
21.356375260874213 2.304209884834833e-05 2 [[2831.47872028 1495.87555034 1360.64572938]
 [ 560.52127972  296.12444966  269.35427062]]


In [58]:
print(cs_frame.prestige.mean(), cs_frame.prestige.std())
print(df[completed_first_part & cs_faculty].prestige_inv.mean(), 
      df[completed_first_part & cs_faculty].prestige_inv.std())

print(ks_2samp(cs_frame.prestige.dropna(), 
               df[completed_first_part & cs_faculty].prestige_inv.dropna()))

86.52224352331606 54.10902176186857
86.14853333333333 53.970120228449645
KstestResult(statistic=0.02373747841105354, pvalue=0.6530227179334973)


#### Business

In [59]:
business_faculty = (df.likely_department == "Business")

In [60]:
# Gender -- M (2) / F (1)
frame_dist = dict(bus_frame[bus_frame.genderize.isin(['male', 'female'])].genderize.value_counts(normalize=True))
resp_dist = dict(df[completed_first_part & business_faculty].gender_ans.value_counts(normalize=True))                

print(frame_dist)
print(resp_dist)

obs = np.array([[len(bus_frame[bus_frame.genderize == 'male']), 
                 len(bus_frame[bus_frame.genderize == 'female'])], 
                [len(df[completed_first_part & business_faculty & (df.gender_ans == 2)]), 
                 len(df[completed_first_part & business_faculty & (df.gender_ans == 1)])]])

print(stats.chi2_contingency(obs, correction=False))

{'male': 0.7645220849311177, 'female': 0.23547791506888227}
{2: 0.6701923076923076, 1: 0.3211538461538462, 4: 0.007692307692307693, 3: 0.0009615384615384616}
(37.87613580408626, 7.538171202230037e-10, 1, array([[5303.42913776, 1737.57086224],
       [ 776.57086224,  254.42913776]]))


In [61]:
# Rank -- 3: full professor, 2: associate professor, 1: full professor
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']

frame_dist = dict(bus_frame[bus_frame.u_factitle.isin(tt_keys)].u_factitle.value_counts(normalize=False))
resp_dist = dict(df[completed_first_part & business_faculty].curtitle.value_counts(normalize=False))

print(dict(bus_frame[bus_frame.u_factitle.isin(tt_keys)].u_factitle.value_counts(normalize=True))) # Frame
print(dict(df[completed_first_part & business_faculty].curtitle.value_counts(normalize=True))) # Responses

obs = np.array([[frame_dist['Full Professor'], 
                 frame_dist['Associate Professor'], 
                 frame_dist['Assistant Professor']],
                  [resp_dist[3.0],  resp_dist[2.0],  resp_dist[1.0]]])

chi2, p, dof, expected = chi2_contingency(obs, correction = False)
print(chi2, p, dof, expected)

{'Full Professor': 0.41697747277338504, 'Assistant Professor': 0.3044905266298672, 'Associate Professor': 0.2785320005967477}
{3.0: 0.4173076923076923, 1.0: 0.2971153846153846, 2.0: 0.25769230769230766, 4.0: 0.027884615384615386}
0.8953628805989438 0.6391082457382842 2 [[2805.80593726 1855.18602541 2042.00803733]
 [ 423.19406274  279.81397459  307.99196267]]


In [62]:
print(bus_frame.prestige_inv.mean(), bus_frame.prestige_inv.std())
print(df[completed_first_part & business_faculty].prestige_inv.mean(), 
      df[completed_first_part & business_faculty].prestige_inv.std())

print(ks_2samp(bus_frame.prestige_inv.dropna(), 
               df[completed_first_part & business_faculty].prestige_inv.dropna()))

51.893656749555944 31.543476752759847
51.35623915139826 31.256111132699516
KstestResult(statistic=0.026035145444486502, pvalue=0.5443662725284539)


#### History

In [63]:
history_faculty = (df.likely_department == "History")

In [64]:
# Gender -- M (2) / F (1)
frame_dist = dict(his_frame[his_frame.genderize.isin(['male', 'female'])].genderize.value_counts(normalize=True))
resp_dist = dict(df[completed_first_part & history_faculty].gender_ans.value_counts(normalize=True))                

print(frame_dist)
print(resp_dist)

obs = np.array([[len(his_frame[his_frame.genderize == 'male']), 
                 len(his_frame[his_frame.genderize == 'female'])], 
                [len(df[completed_first_part & history_faculty & (df.gender_ans == 2)]), 
                 len(df[completed_first_part & history_faculty & (df.gender_ans == 1)])]])

print(stats.chi2_contingency(obs, correction=False))

{'male': 0.6265840608279358, 'female': 0.3734159391720642}
{2: 0.5300668151447662, 1: 0.46547884187082406, 3: 0.0022271714922048997, 4: 0.0022271714922048997}
(26.551482902219266, 2.5660767394421224e-07, 1, array([[2157.76175478, 1393.23824522],
       [ 543.23824522,  350.76175478]]))


In [65]:
# Rank -- 3: full professor, 2: associate professor, 1: full professor
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']

frame_dist = dict(his_frame[his_frame.u_factitle.isin(tt_keys)].u_factitle.value_counts(normalize=False))
resp_dist = dict(df[completed_first_part & history_faculty].curtitle.value_counts(normalize=False))

print(dict(his_frame[his_frame.u_factitle.isin(tt_keys)].u_factitle.value_counts(normalize=True))) # Frame
print(dict(df[completed_first_part & history_faculty].curtitle.value_counts(normalize=True))) # Responses

obs = np.array([[frame_dist['Full Professor'], frame_dist['Associate Professor'], frame_dist['Assistant Professor']],
                  [resp_dist[3.0],  resp_dist[2.0],  resp_dist[1.0]]])

chi2, p, dof, expected = chi2_contingency(obs, correction = False)
print(chi2, p, dof, expected)

{'Full Professor': 0.4530141843971631, 'Associate Professor': 0.383274231678487, 'Assistant Professor': 0.16371158392434987}
{3.0: 0.42761692650334077, 2.0: 0.3841870824053452, 1.0: 0.16369710467706014, 4.0: 0.024498886414253896}
0.6045916712909183 0.7391193746856666 2 [[1522.8        1304.34929577  556.85070423]
 [ 394.2         337.65070423  144.14929577]]


In [66]:
print(his_frame.prestige_inv.mean(), his_frame.prestige_inv.std())
print(df[completed_first_part & history_faculty].prestige_inv.mean(), 
      df[completed_first_part & history_faculty].prestige_inv.std())

print(ks_2samp(his_frame.prestige_inv.dropna(), 
               df[completed_first_part & history_faculty].prestige_inv.dropna()))

62.31507204610951 39.16652779558621
66.71326280623609 38.45489621286801
KstestResult(statistic=0.07017194790857685, pvalue=0.0013065833618075562)


## Racial demographics

In [67]:
cs_racial_demographics = df[df.likely_department == "Computer Science"][['white', 'hisp', 'black', 'asian', 'native', 
                          'hawaii', 'otherace', 'narace']].replace(-77, np.nan)
cs_racial_demographics.describe()

Unnamed: 0,white,hisp,black,asian,native,hawaii,otherace,narace
count,1139.0,1139.0,1139.0,1139.0,1139.0,1139.0,1139.0,1139.0
mean,0.716418,0.022827,0.015803,0.204565,0.00439,0.000878,0.023705,0.040386
std,0.450934,0.149417,0.124769,0.403561,0.066139,0.02963,0.152195,0.19695
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [68]:
bus_racial_demographics = df[df.likely_department == "Business"][['white', 'hisp', 'black', 'asian', 'native', 
                          'hawaii', 'otherace', 'narace']].replace(-77, np.nan)
bus_racial_demographics.describe()

Unnamed: 0,white,hisp,black,asian,native,hawaii,otherace,narace
count,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0
mean,0.794007,0.034644,0.029026,0.117978,0.003745,0.000936,0.012172,0.020599
std,0.404614,0.182963,0.167959,0.322733,0.061113,0.0306,0.109706,0.142105
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [69]:
his_racial_demographics = df[df.likely_department == "History"][['white', 'hisp', 'black', 'asian', 'native', 
                          'hawaii', 'otherace', 'narace']].replace(-77, np.nan)
his_racial_demographics.describe()

Unnamed: 0,white,hisp,black,asian,native,hawaii,otherace,narace
count,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0
mean,0.842222,0.055556,0.037778,0.05,0.007778,0.002222,0.016667,0.03
std,0.364735,0.229189,0.190764,0.218066,0.087897,0.047114,0.12809,0.170682
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [70]:
df[['white', 'hisp', 'black', 'asian', 'native', 'hawaii', 
    'otherace', 'narace']].replace(-77, np.nan).describe()

Unnamed: 0,white,hisp,black,asian,native,hawaii,otherace,narace
count,3107.0,3107.0,3107.0,3107.0,3107.0,3107.0,3107.0,3107.0
mean,0.77953,0.036369,0.026714,0.130029,0.00515,0.001287,0.017702,0.030576
std,0.41463,0.187238,0.161272,0.336389,0.071588,0.035863,0.131887,0.172194
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Representativeness among those that had publications

In [71]:
print('Computer Science')
frame_dist = dict(cs_frame[cs_frame.genderize.isin(['male', 'female'])].genderize.value_counts(normalize=True))
resp_dist = dict(df[(df.likely_department == "Computer Science") & completed_first_part & 
                    (df.dblp_pubs.notna())].gender_ans.value_counts(normalize=True))                
print('Gender:')
print(frame_dist)
print(resp_dist)

# 
obs = np.array([[len(cs_frame[cs_frame.genderize == 'male']), 
                 len(cs_frame[cs_frame.genderize == 'female'])], 
                [len(df[(df.likely_department == "Computer Science") & 
                    (df.dblp_pubs.notna()) & (df.gender_ans == 2)]), 
                 len(df[(df.likely_department == "Computer Science") & 
                    (df.dblp_pubs.notna()) & (df.gender_ans == 1)])]])

print(stats.chi2_contingency(obs, correction=False))

print('\nRank:')
# Rank -- 3: full professor, 2: associate professor, 1: full professor
frame_dist = dict(cs_frame[cs_frame.factitle.isin([1, 2, 3])].factitle.value_counts(normalize=False))
resp_dist = dict(df[(df.likely_department == "Computer Science") & completed_first_part &
                    (df.dblp_pubs.notna())].factitle.value_counts(normalize=False))

print(dict(cs_frame[cs_frame.factitle.isin([1, 2, 3])].factitle.value_counts(normalize=True))) # Frame
print(dict(df[(df.likely_department == "Computer Science") & completed_first_part &
                    (df.dblp_pubs.notna())].factitle.value_counts(normalize=True))) # Responses

obs = np.array([[frame_dist[3], frame_dist[2], frame_dist[1]],
                [resp_dist[3],  resp_dist[2],  resp_dist[1]]])

chi2, p, dof, expected = chi2_contingency(obs, correction = False)
print(chi2, p, dof, expected)

print('\nPrestige:')
print(cs_frame.prestige.mean(), cs_frame.prestige.std())
print(df[(df.likely_department == "Computer Science") & (df.dblp_pubs.notna()) 
         & completed_first_part].prestige_inv.mean(), 
      df[(df.likely_department == "Computer Science") & (df.dblp_pubs.notna()) 
         & completed_first_part].prestige_inv.std())

print(ks_2samp(cs_frame.prestige.dropna(), 
               df[(df.likely_department == "Computer Science") &
                  (df.dblp_pubs.notna()) & (df.gender_ans > 0)].prestige_inv.dropna()))

Computer Science
Gender:
{'male': 0.8545497969906855, 'female': 0.14545020300931455}
{2: 0.7874165872259294, 1: 0.2049571020019066, 4: 0.00667302192564347, 3: 0.0009532888465204957}
(23.168262192437027, 1.4842688215887199e-06, 1, array([[3527.32734226,  659.67265774],
       [ 878.67265774,  164.32734226]]))

Rank:
{3: 0.5029887482419128, 2: 0.26828410689170185, 1: 0.22872714486638537}
{3.0: 0.45948522402287895, 1.0: 0.30409914204003813, 2.0: 0.23641563393708293}
27.717575077545355 9.576461037371376e-07 2 [[2822.47053585 1497.77527089 1367.75419326]
 [ 520.52946415  276.22472911  252.24580674]]

Prestige:
86.52224352331606 54.10902176186857
86.33170801526718 53.49335824345852
KstestResult(statistic=0.026208775588345008, pvalue=0.5638399100372817)


In [72]:
print('Business')
frame_dist = dict(bus_frame[bus_frame.genderize.isin(['male', 'female'])].genderize.value_counts(normalize=True))
resp_dist = dict(df[(df.likely_department == "Business") & 
                    completed_first_part & 
                    (df.dblp_pubs.notna())].gender_ans.value_counts(normalize=True))                
print('Gender:')
print(frame_dist)
print(resp_dist)

# 
obs = np.array([[len(bus_frame[bus_frame.genderize == 'male']), 
                 len(bus_frame[bus_frame.genderize == 'female'])], 
                [len(df[(df.likely_department == "Business") & 
                    (df.dblp_pubs.notna()) & (df.gender_ans == 2)]), 
                 len(df[(df.likely_department == "Business") & 
                    (df.dblp_pubs.notna()) & (df.gender_ans == 1)])]])

print(stats.chi2_contingency(obs, correction=False))

print('\nRank:')
# Rank -- 3: full professor, 2: associate professor, 1: full professor
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
frame_dist = dict(bus_frame[bus_frame.u_factitle.isin(tt_keys)].u_factitle.value_counts(normalize=False))
resp_dist = dict(df[(df.likely_department == "Business") & completed_first_part &
                    (df.dblp_pubs.notna())].curtitle.value_counts(normalize=False))

print(dict(bus_frame[bus_frame.u_factitle.isin(tt_keys)].u_factitle.value_counts(normalize=True))) # Frame
print(dict(df[(df.likely_department == "Business") & completed_first_part &
                    (df.dblp_pubs.notna())].curtitle.value_counts(normalize=True))) # Responses

obs = np.array([[frame_dist['Full Professor'], frame_dist['Associate Professor'], frame_dist['Assistant Professor']],
                  [resp_dist[3.0],  resp_dist[2.0], resp_dist[1.0]]])

chi2, p, dof, expected = chi2_contingency(obs, correction = False)
print(chi2, p, dof, expected)

print('\nPrestige:')
print(bus_frame.prestige_inv.mean(), bus_frame.prestige_inv.std())
print(df[(df.likely_department == "Business") & (df.dblp_pubs.notna()) 
         & completed_first_part].prestige_inv.mean(), 
      df[(df.likely_department == "Business") & (df.dblp_pubs.notna()) 
         & completed_first_part].prestige_inv.std())

print(ks_2samp(bus_frame.prestige_inv.dropna(), 
               df[(df.likely_department == "Business") & (df.dblp_pubs.notna()) 
                  & completed_first_part].prestige_inv.dropna()))

Business
Gender:
{'male': 0.7645220849311177, 'female': 0.23547791506888227}
{2: 0.6724137931034483, 1: 0.3275862068965517}
(22.223429568888612, 2.4269409016904657e-06, 1, array([[5338.46311475, 1702.53688525],
       [ 396.53688525,  126.46311475]]))

Rank:
{'Full Professor': 0.41697747277338504, 'Assistant Professor': 0.3044905266298672, 'Associate Professor': 0.2785320005967477}
{3.0: 0.4042145593869732, 1.0: 0.31992337164750956, 2.0: 0.25862068965517243, 4.0: 0.017241379310344827}
1.132547308032424 0.5676367133736813 2 [[2792.29739468 1859.67378049 2051.02882483]
 [ 213.70260532  142.32621951  156.97117517]]

Prestige:
51.893656749555944 31.543476752759847
45.003486590038314 29.49380223304564
KstestResult(statistic=0.12250455959113397, pvalue=6.521539837045509e-07)


In [73]:
print('History')
frame_dist = dict(his_frame[his_frame.genderize.isin(['male', 'female'])].genderize.value_counts(normalize=True))
resp_dist = dict(df[(df.likely_department == "History") & 
                    completed_first_part & 
                    (df.dblp_pubs.notna())].gender_ans.value_counts(normalize=True))                
print('Gender:')
print(frame_dist)
print(resp_dist)

# 
obs = np.array([[len(his_frame[his_frame.genderize == 'male']), 
                 len(his_frame[his_frame.genderize == 'female'])], 
                [len(df[(df.likely_department == "History") & 
                    (df.dblp_pubs.notna()) & (df.gender_ans == 2)]), 
                 len(df[(df.likely_department == "History") & 
                    (df.dblp_pubs.notna()) & (df.gender_ans == 1)])]])

print(stats.chi2_contingency(obs, correction=False))

print('\nRank:')
# Rank -- 3: full professor, 2: associate professor, 1: full professor
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
frame_dist = dict(his_frame[his_frame.u_factitle.isin(tt_keys)].u_factitle.value_counts(normalize=False))
resp_dist = dict(df[(df.likely_department == "History") & 
                    (df.dblp_pubs.notna()) & completed_first_part].curtitle.value_counts(normalize=False))

print(dict(his_frame[his_frame.u_factitle.isin(tt_keys)].u_factitle.value_counts(normalize=True))) # Frame
print(dict(df[(df.likely_department == "History") & 
              (df.dblp_pubs.notna()) & completed_first_part].curtitle.value_counts(normalize=True))) # Responses

obs = np.array([[frame_dist['Full Professor'], frame_dist['Associate Professor'], frame_dist['Assistant Professor']],
                  [resp_dist[3.0],  resp_dist[2.0], resp_dist[1.0]]])

chi2, p, dof, expected = chi2_contingency(obs, correction = False)
print(chi2, p, dof, expected)

print('\nPrestige:')
print(his_frame.prestige_inv.mean(), his_frame.prestige_inv.std())
print(df[(df.likely_department == "History") & (df.dblp_pubs.notna()) 
         & completed_first_part].prestige_inv.mean(), 
      df[(df.likely_department == "History") & (df.dblp_pubs.notna()) 
         & completed_first_part].prestige_inv.std())

print(ks_2samp(his_frame.prestige_inv.dropna(), 
               df[(df.likely_department == "History") &
                  (df.dblp_pubs.notna()) & completed_first_part].prestige_inv.dropna()))

History
Gender:
{'male': 0.6265840608279358, 'female': 0.3734159391720642}
{2: 0.5597269624573379, 1: 0.4402730375426621}
(5.143012463005998, 0.02334011387943421, 1, array([[2206.90400624, 1344.09599376],
       [ 182.09599376,  110.90400624]]))

Rank:
{'Full Professor': 0.4530141843971631, 'Associate Professor': 0.383274231678487, 'Assistant Professor': 0.16371158392434987}
{3.0: 0.45733788395904434, 2.0: 0.3651877133105802, 1.0: 0.15017064846416384, 4.0: 0.027303754266211604}
0.352821169940445 0.8382737300748013 2 [[1537.51103843 1294.94031071  551.54865086]
 [ 129.48896157  109.05968929   46.45134914]]

Prestige:
62.31507204610951 39.16652779558621
62.146382252559725 37.99293120829963
KstestResult(statistic=0.051096182785651756, pvalue=0.4563973254408068)
