In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import plot_utils
from util import INST_NAME_ALIASES, binomial_confidence_interval, load_in_data
from scipy.stats import mannwhitneyu, ks_2samp, chisquare, chi2_contingency, ttest_ind, ranksums

from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Helvetica']

%matplotlib inline

  rcParams['text.latex.preamble'] = [


In [2]:
df = pd.read_csv('../data/coded_data/data_linked.csv', low_memory=False)

canadian_universities = ['University of Waterloo', 'University of Alberta', 'University of Western Ontario', 
                         'York University', 'University of Toronto', 'University of Ottawa', 
                         'Carleton University', 'Concordia University, Montreal, Canada', 
                         'University of Calgary', 'McGill University', 'University of Montreal', 
                         'Université de Montréal', 'University of Saskatchewan', 
                         'University of British Columbia', 'University of New Brunswick', 
                         'Memorial University of Newfoundland', 'University of Victoria',
                         "Queen's University, Kingston Ontario", 'Simon Fraser University', 
                         'University of Regina', 'University of Manitoba', 'Dalhousie University', 
                         'McMaster University']
df = df[~df['Institution'].isin(canadian_universities)] # Drop Canada

In [3]:
df['likely_department'].value_counts()

Biology                     1557
Business                    1293
Psychology                  1145
Physics and/or Astronomy    1063
Computer Science            1009
History                      992
Sociology                    523
Anthropology                 400
Name: likely_department, dtype: int64

In [4]:
# Read in survey frame files
his_ids = pd.read_excel('../data/survey_data/history/HIS_intro_2018_10_22b_unlocked.xlsx')
his_ids_more = pd.read_excel('../data/survey_data/history/HIS_allinvited_exceptheads.xls')
his_frame = pd.concat([his_ids, his_ids_more], sort=False)
his_frame['Gender'] = pd.read_csv('../data/survey_data/history/his_frame_name_genderize.tsv').values
his_frame['Gender'] = his_frame['Gender'].map({'male': 'M', 'female': 'F'})
his_frame['Rank'] = his_frame['u_factitle']



In [5]:
busi_ids = pd.read_excel('../data/survey_data/business/intro_sep2019_unlocked.xlsx')
busi_ids_more = pd.read_excel('../data/survey_data/business/BUSI_participant_codes_send1.xls')
bus_frame = pd.concat([busi_ids, busi_ids_more], sort=False)
bus_frame['Gender'] = pd.read_csv('../data/survey_data/business/bus_frame_name_genderize.tsv').values
bus_frame['Gender'] = bus_frame['Gender'].map({'male': 'M', 'female': 'F'})
bus_frame['Rank'] = bus_frame['u_factitle']



In [6]:
cs_frame = pd.read_csv('../data/survey_data/cs/frame_feb5_2020.tsv', sep='\t', header=0)
cs_frame['Gender'] = pd.read_csv('../data/survey_data/cs/cs_frame_name_genderize.tsv').values
cs_frame['Gender'] = cs_frame['Gender'].map({'male': 'M', 'female': 'F'})
cs_frame['Rank'] = cs_frame['factitle'].map({1: 'Assistant Professor', 
                                             2: 'Associate Professor', 
                                             3: 'Full Professor'})

In [7]:
bio_frame = pd.read_csv('../data/survey_data/bio_mini/bio_mini_frame_w_gender_and_rank.csv')

In [8]:
all_other_fields_frame = pd.read_excel('../data/survey_data/data2020/SurveyEmailsTracking.xlsx')

In [9]:
remove_these_entries = ['Dead', 'No Longer Tenured', 'No Longer at Institution', 'Not Primary Appointment', 
                        'Not Tenure Track', 'Not at Institution', 'Not in Field', 'Not tenure Track & Retired',
                        'Retired']

In [10]:
# Drops 199 entries from faculty who told us they were not tenure-track
all_other_fields_frame[~all_other_fields_frame['SurveyDatabaseChangeReason'].isin(remove_these_entries)].shape

(16846, 17)

In [11]:
all_other_fields_frame = all_other_fields_frame[~all_other_fields_frame['SurveyDatabaseChangeReason'].isin(
    remove_these_entries)]
all_other_fields_frame['Rank'] = all_other_fields_frame['Rank'].map(
    {'Professor': 'Full Professor', 'Assistant Professor': 'Assistant Professor', 
     'Associate Professor': 'Associate Professor'})

In [12]:
gender_frame = pd.read_csv('../data/survey_data/data2020/Winter_2020_EmailFrame_PredictedGenders.csv')
all_other_fields_frame = pd.merge(all_other_fields_frame, gender_frame[['PersonId', 'Gender']], on='PersonId', 
                                  how='left')

In [13]:
# How many departments did we survey?

In [14]:
ndept_phys = len(all_other_fields_frame[all_other_fields_frame['Field'] == 'Physics/Astronomy']['InstitutionName'].dropna().unique())
ndept_soc = len(all_other_fields_frame[all_other_fields_frame['Field'] == 'Sociology']['InstitutionName'].dropna().unique())
ndept_psych = len(all_other_fields_frame[all_other_fields_frame['Field'] == 'Psychology']['InstitutionName'].dropna().unique())
ndept_anthro = len(all_other_fields_frame[all_other_fields_frame['Field'] == 'Anthropology']['InstitutionName'].dropna().unique())
ndept_bio = len(bio_frame['InstitutionName'].dropna().unique())

his_frame['university_name_standard'] = his_frame.u_university.apply(lambda x: INST_NAME_ALIASES.get(x, x))
ndept_his = len(his_frame['university_name_standard'].dropna().unique())
bus_frame['university_name_standard'] = bus_frame.u_university.apply(lambda x: INST_NAME_ALIASES.get(x, x))
ndept_bus = len(bus_frame['university_name_standard'].dropna().unique())
cs_frame['university_name_standard'] = cs_frame.university.apply(lambda x: INST_NAME_ALIASES.get(x, x))
# We allowed people not initially emailed to fill out the survey, so two people from out of sample universities 
# responded (Aalto University [Finland] & Colorado School of Medicine)
ndept_cs = len(cs_frame['university_name_standard'].dropna().unique()) 

print(ndept_phys, ndept_soc, ndept_psych, ndept_anthro, ndept_bio, ndept_his, ndept_bus, ndept_cs-2)
print(sum([ndept_phys, ndept_soc, ndept_psych, ndept_anthro, ndept_bio, ndept_his, ndept_bus, ndept_cs-2]))

194 122 236 104 243 144 112 205
1360


In [15]:
# Read in rankings information
rankings = pd.read_csv('../data/prestige/History_USNWR.csv')
his_frame = his_frame.merge(rankings, right_on = 'institution', 
                            left_on = 'university_name_standard', how='left')
his_frame['USNWR_coded'] = pd.to_numeric(his_frame['USNWR'], errors='coerce')

rankings = pd.read_csv('../data/prestige/Business_USNWR.csv')
bus_frame = bus_frame.merge(rankings, right_on = 'institution', 
                            left_on = 'university_name_standard', how='left')
bus_frame['USNWR_coded'] = pd.to_numeric(bus_frame['USNWR'], errors='coerce')

rankings = pd.read_csv('../data/prestige/CS_USNWR.csv')
cs_frame = cs_frame.merge(rankings, right_on = 'institution', 
                          left_on = 'university_name_standard', how='left')
cs_frame['USNWR_coded'] = pd.to_numeric(cs_frame['USNWR'], errors='coerce')

rankings = pd.read_csv('../data/prestige/Biology_USNWR.csv')
bio_frame = bio_frame.merge(rankings, right_on = 'institution', 
                           left_on = 'InstitutionName', how='left')
bio_frame['USNWR_coded'] = pd.to_numeric(bio_frame['USNWR'], errors='coerce')

rankings = pd.read_csv('../data/prestige/Physics and_or Astronomy_USNWR.csv')
phys_frame = all_other_fields_frame[all_other_fields_frame['Field'] == 'Physics/Astronomy']
phys_frame = phys_frame.merge(rankings, right_on = 'institution', left_on = 'InstitutionName', how='left')
phys_frame['USNWR_coded'] = pd.to_numeric(phys_frame['USNWR'], errors='coerce')

rankings = pd.read_csv('../data/prestige/Sociology_USNWR.csv')
soc_frame = all_other_fields_frame[all_other_fields_frame['Field'] == 'Sociology']
soc_frame = soc_frame.merge(rankings, right_on = 'institution', left_on = 'InstitutionName', how='left')
soc_frame['USNWR_coded'] = pd.to_numeric(soc_frame['USNWR'], errors='coerce')

rankings = pd.read_csv('../data/prestige/Psychology_USNWR.csv')
psych_frame = all_other_fields_frame[all_other_fields_frame['Field'] == 'Psychology']
psych_frame = psych_frame.merge(rankings, right_on = 'institution', left_on = 'InstitutionName', how='left')
psych_frame['USNWR_coded'] = pd.to_numeric(psych_frame['USNWR'], errors='coerce')

rankings = pd.read_csv('../data/prestige/Anthropology_NRC.csv')
anthro_frame = all_other_fields_frame[all_other_fields_frame['Field'] == 'Anthropology']
anthro_frame = anthro_frame.merge(rankings, right_on = 'institution', left_on = 'InstitutionName', how='left')
anthro_frame['NRC_coded'] = pd.to_numeric(anthro_frame['NRC R-Rank High'], errors='coerce')

In [16]:
# What was response rate?

In [17]:
resp = df[df.likely_department == 'History'].shape[0] # By field (13.5% to 23.0%)
total = his_frame.shape[0]
print('History', resp/total)

resp = df[df.likely_department == 'Business'].shape[0]
total = bus_frame.shape[0]
print('Business', resp/total)

resp = df[df.likely_department == 'Computer Science'].shape[0]
total = cs_frame.shape[0]
print('Computer Science', resp/total)

resp = df[df.likely_department == 'Physics and/or Astronomy'].shape[0]
total = phys_frame.shape[0]
print('Physics/Astronomy', resp/total)

resp = df[df.likely_department == 'Sociology'].shape[0]
total = soc_frame.shape[0]
print('Sociology', resp/total)

resp = df[df.likely_department == 'Psychology'].shape[0]
total = psych_frame.shape[0]
print('Psychology', resp/total)

resp = df[df.likely_department == 'Anthropology'].shape[0]
total = anthro_frame.shape[0]
print('Anthropology', resp/total)

resp = df[df.likely_department == 'Biology'].shape[0]
total = bio_frame.shape[0]
print('Biology', resp/total)

History 0.22878228782287824
Business 0.13506737699780633
Computer Science 0.17420580110497239
Physics/Astronomy 0.18096697310180457
Sociology 0.21165520032375557
Psychology 0.1759643460888274
Anthropology 0.20060180541624875
Biology 0.1534746180384426


In [18]:
# Overall response rate
total_frame = pd.concat([his_frame, bus_frame, cs_frame, bio_frame, all_other_fields_frame], 
                        sort=False).shape[0]
print((df.shape[0]/total_frame), df.shape[0], total_frame)

0.17152831320140496 8009 46692


In [19]:
# Provided a parent's education level (out of responses)
provided_basic_responses = (~df.p1_edu.isna())
print(df[provided_basic_responses].shape[0]/df.shape[0], df[provided_basic_responses].shape[0], df.shape[0])

0.8994880759145961 7204 8009


In [20]:
# Provided their zip code
provided_basic_responses = (~df.zip.isna())
print(df[provided_basic_responses].shape[0]/df.shape[0], df[provided_basic_responses].shape[0], df.shape[0])

0.6001997752528405 4807 8009


In [8]:
# Abroad responses (missing responses have -66, -99, and empty strings)
provided_basic_responses = (df.zip.isna() & (~df.abroad.isin(['-66', '-99', '', ' ']))) 
print(df[provided_basic_responses].shape[0]/df.shape[0], df[provided_basic_responses].shape[0], df.shape[0])
# print(list(df[~df.abroad.isin(['-66', '-99', '', ' '])]['abroad'].unique()))

0.25059308278187037 2007 8009


In [21]:
# Provided their year of birth
provided_basic_responses = (~df.age_coded.isna())
print(df[provided_basic_responses].shape[0]/df.shape[0], df[provided_basic_responses].shape[0], df.shape[0])

0.9002372331127482 7210 8009


In [22]:
df['USNWR'] = pd.to_numeric(df['USNWR'])
df['NRC'] = pd.to_numeric(df['NRC'])

max_dept_usnwr = df.groupby(['likely_department'])['USNWR'].max()
max_dept_nrc = df.groupby(['likely_department'])['NRC'].max()

def remove_missing(row):
    if (row['likely_department'] in max_dept_usnwr) and not np.isnan(max_dept_usnwr[row['likely_department']]):
        return row['USNWR']
    elif (row['likely_department'] in max_dept_nrc) and not np.isnan(max_dept_nrc[row['likely_department']]):
        return row['NRC']
    else:
        return np.nan

df['prestige'] = df.apply(remove_missing, axis=1)

In [23]:
# Can link prestige
provided_basic_responses = (~df.prestige.isna())
print(1-df[provided_basic_responses].shape[0]/df.shape[0], df[provided_basic_responses].shape[0], df.shape[0])

0.09576726183044071 7242 8009


In [24]:
# Representative with respect to gender, rank, prestige?
provided_basic_responses = ((~df.p1_edu.isna()) | (~df.zip.isna()))
pd.options.display.float_format = '{:,.1f}'.format
df[provided_basic_responses].shape

(7581, 311)

In [25]:
resp = df[(df.likely_department == 'Anthropology') & provided_basic_responses]
total = anthro_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
print('\n>>> Anthropology', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
}) 

print('gender: ', chi2_contingency(
    [[len(total[total['Gender'] == 'M']), len(total[total['Gender'] == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.NRC_coded.mean(), total.NRC_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.NRC_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.NRC_coded.dropna()))

temp_df


>>> Anthropology 0.17803410230692077
gender:  (0.006937350197940794, 0.9336203496211356, 1, array([[ 31.19565217,  31.80434783],
       [173.80434783, 177.19565217]]))
rank:  (3.965010070784102, 0.1377238018295685, 2, array([[410.01715266, 635.52658662, 946.45626072],
       [ 69.98284734, 108.47341338, 161.54373928]]))
prestige:  (26.521885521885523, 21.523879553356338) (22.644106463878327, 19.446594741780928)
KstestResult(statistic=0.08993099563441768, pvalue=0.03697134649026057)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,48.7,50.1,0.6,0.6,17.4,30.6,52.1,355
1,Population,50.8,49.2,--,--,21.1,32.1,46.7,1994


In [26]:
resp = df[(df.likely_department == 'Biology') & provided_basic_responses]
total = bio_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor', 'Professor']
print('\n>>> Biology', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Professor']]),
    'N': [len(resp), len(total)]
}) 

print('gender: ', chi2_contingency(
    [[len(total[total.Gender == 'M']), len(total[total.Gender == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df


>>> Biology 0.152686052242484
gender:  (13.018091930648897, 0.00030849598555885967, 1, array([[6086.31370647, 2681.68629353],
       [1069.68629353,  471.31370647]]))
rank:  (3437.259446709768, 0.0, 2, array([[1962.76041038, 2516.89680145,  664.34278817],
       [ 566.23958962,  726.10319855,  191.65721183]]))
prestige:  (83.33880171184023, 61.37022175213649) (83.17081125152875, 64.47640989355091)
KstestResult(statistic=0.03574708078204045, pvalue=0.09593016646437613)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,65.1,34.3,0.1,0.4,12.7,29.6,57.7,1549
1,Population,70.1,29.9,--,--,23.1,27.6,49.3,10145


In [27]:
resp = df[(df.likely_department == 'Business') & provided_basic_responses]
total = bus_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
print('\n>>> Business', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
})

print('gender: ', chi2_contingency(
    [[len(total[total.Gender == 'M']), len(total[total.Gender == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df


>>> Business 0.13506737699780633
gender:  (39.18297288435738, 3.858909169269995e-10, 1, array([[5302.16848771, 1738.83151229],
       [ 762.83151229,  250.16848771]]))
rank:  (2.198791147950787, 0.33307234046019063, 2, array([[2043.8627222 , 1848.17373816, 2810.96353964],
       [ 306.1372778 ,  276.82626184,  421.03646036]]))
prestige:  (42.42462087421945, 27.515999898498528) (43.43192076228686, 28.71672266945643)
KstestResult(statistic=0.030363056162242302, pvalue=0.3180150175410601)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,66.7,32.4,0.1,0.8,30.8,25.7,43.5,1293
1,Population,76.5,23.5,--,--,30.4,27.9,41.7,9573


In [28]:
resp = df[(df.likely_department == 'Computer Science') & provided_basic_responses]
total = cs_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
print('\n>>> Computer Science', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
}) 

print('gender: ', chi2_contingency(
    [[len(total[total.Gender == 'M']), len(total[total.Gender == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df


>>> Computer Science 0.1724792817679558
gender:  (21.185642236280444, 4.168758057110428e-06, 1, array([[3530.15224111,  656.84775889],
       [ 833.84775889,  155.15224111]]))
rank:  (43.396064485409035, 3.772804738168263e-10, 2, array([[1374.99536608, 1496.24096386, 2816.76367006],
       [ 190.00463392,  206.75903614,  389.23632994]]))
prestige:  (65.44604316546763, 47.169692103664545) (65.65963606286186, 48.535874132005745)
KstestResult(statistic=0.020349689762546575, pvalue=0.8824734919341346)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,78.8,20.3,0.1,0.8,33.6,22.5,43.9,999
1,Population,85.5,14.5,--,--,22.9,26.8,50.3,5792


In [29]:
resp = df[(df.likely_department == 'History') & provided_basic_responses]
total = his_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
print('>>> History', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
})

print('gender: ', chi2_contingency(
    [[len(total[total.Gender == 'M']), len(total[total.Gender == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df

>>> History 0.22878228782287824
gender:  (26.158064526074334, 3.1458053931145735e-07, 1, array([[2157.76175478, 1393.23824522],
       [ 543.23824522,  350.76175478]]))
rank:  (0.5534307934764104, 0.7582702725967513, 2, array([[ 556.72001877, 1304.04318235, 1523.23679887],
       [ 144.27998123,  337.95681765,  394.76320113]]))
prestige:  (55.84385026737968, 39.044656832012116) (51.8574706782254, 38.24292253111374)
KstestResult(statistic=0.048250783322925386, pvalue=0.05723497919514109)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,53.0,46.5,0.2,0.2,16.8,39.3,43.9,992
1,Population,62.7,37.3,--,--,16.4,38.3,45.3,4336


In [30]:
resp = df[(df.likely_department == 'Physics and/or Astronomy') & provided_basic_responses]
total = phys_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
print('\n>>> Physics/Astronomy', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
})

print('gender: ', chi2_contingency(
    [[len(total[total['Gender'] == 'M']), len(total[total['Gender'] == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df


>>> Physics/Astronomy 0.15951651344909773
gender:  (0.012654415467714725, 0.91043343447616, 1, array([[  5.61802575,   1.38197425],
       [742.38197425, 182.61802575]]))
rank:  (0.8863634912431907, 0.6419905175185426, 2, array([[1056.83862395, 1148.73763473, 3666.42374133],
       [ 162.16137605,  176.26236527,  562.57625867]]))
prestige:  (52.41224018475751, 39.63807112328518) (51.513609995537706, 41.67344615771217)
KstestResult(statistic=0.05214803272623468, pvalue=0.03707154216802422)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,79.3,19.6,0.2,1.0,17.5,18.6,63.8,937
1,Population,85.7,14.3,--,--,18.1,19.7,62.2,5874


In [31]:
resp = df[(df.likely_department == 'Psychology') & provided_basic_responses]
total = psych_frame
print('\n>>> Psychology', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
}) 

print('gender: ', chi2_contingency(
    [[len(total[total['Gender'] == 'M']), len(total[total['Gender'] == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df


>>> Psychology 0.1509143998770555
gender:  (4.112233177248129, 0.04257411073043223, 1, array([[108.77306733, 117.22693267],
       [470.22693267, 506.77306733]]))
rank:  (2.9744674510956925, 0.2259969612610424, 2, array([[1480.23184133, 1881.2369338 , 3143.53122487],
       [ 217.76815867,  276.7630662 ,  462.46877513]]))
prestige:  (83.78335233751426, 59.87340942629635) (88.18549747048904, 60.52347852805889)
KstestResult(statistic=0.05612322400641463, pvalue=0.01806602721397177)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,46.5,53.1,0.2,0.2,20.7,30.4,48.9,982
1,Population,54.4,45.6,--,--,23.1,28.7,48.2,6507


In [32]:
resp = df[(df.likely_department == 'Sociology') & provided_basic_responses]
total = soc_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
print('\n>>> Sociology', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
})  

print('gender: ', chi2_contingency(
    [[len(total[total['Gender'] == 'M']), len(total[total['Gender'] == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df


>>> Sociology 0.18089842169162282
gender:  (0.31784590580646743, 0.5729050614033764, 1, array([[ 39.13636364,  44.86363636],
       [206.86363636, 237.13636364]]))
rank:  (4.583722153661682, 0.10107817245285348, 2, array([[ 522.13536966,  734.040958  , 1185.82367234],
       [  93.86463034,  131.959042  ,  213.17632766]]))
prestige:  (38.44642857142857, 25.926261508471033) (40.03223981900452, 27.290252351680486)
KstestResult(statistic=0.06773478622218118, pvalue=0.09988795191686217)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,45.6,53.7,0.2,0.4,17.5,31.9,50.6,447
1,Population,50.0,50.0,--,--,22.1,29.7,48.2,2471


In [33]:
# All the fields wrapped up together

In [34]:
provided_basic_responses = ((~df.p1_edu.isna()) | (~df.zip.isna()))
resp = df[provided_basic_responses]
total = pd.concat([his_frame, bus_frame, cs_frame, phys_frame, soc_frame, psych_frame, anthro_frame])
print('\n>>> All', resp.shape[0], resp.shape[0]/total.shape[0])

resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
}) 

print('gender: ', chi2_contingency(
    [[len(total[total['Gender'] == 'M']), len(total[total['Gender'] == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

pi_resp = pd.concat([resp.prestige, resp.prestige]); pi_total = pd.concat([total.NRC_coded, total.USNWR_coded])
print('prestige: ', (pi_resp.mean(), pi_resp.std()), (pi_total.mean(), pi_total.std()))
print(ks_2samp(pi_resp.dropna(), pi_total.dropna()))
print(mannwhitneyu(pi_resp.dropna(), pi_total.dropna()))

temp_df


>>> All 7581 0.20743152652748514
gender:  (319.77102456814595, 1.6247146697199926e-71, 1, array([[10825.04332631,  4333.95667369],
       [ 5112.95667369,  2047.04332631]]))
rank:  (14.523090738327657, 0.000702022313266403, 2, array([[ 7298.58252329,  9108.13476815, 16179.28270856],
       [ 1526.41747671,  1904.86523185,  3383.71729144]]))
prestige:  (61.504735538394286, 49.494010376845026) (55.680129463209724, 45.21141577454595)
KstestResult(statistic=0.04605869387023187, pvalue=1.2730897852914849e-17)
MannwhitneyuResult(statistic=187190641.0, pvalue=1.2342789071270753e-24)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,63.1,36.2,0.2,0.6,20.7,28.4,51.0,7581
1,Population,75.1,24.9,--,--,22.8,27.9,49.4,36547


## Parental support

In [60]:
df[(~df['support_parents'].isna())].shape[0]/df.shape[0]

0.8586590086153078

In [35]:
provided_basic_responses = ~df.support_parents.isna()
resp = df[(df.likely_department == 'Anthropology') & provided_basic_responses]
total = anthro_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
print('\n>>> Anthropology', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
}) 

print('gender: ', chi2_contingency(
    [[len(total[total['Gender'] == 'M']), len(total[total['Gender'] == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.NRC_coded.mean(), total.NRC_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.NRC_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.NRC_coded.dropna()))

temp_df


>>> Anthropology 0.17051153460381144
gender:  (0.00046992776381905984, 0.9827049588279788, 1, array([[ 31.57894737,  31.42105263],
       [168.42105263, 167.57894737]]))
rank:  (3.961665090639952, 0.13795433625034387, 2, array([[409.73868047, 636.51228978, 945.74902975],
       [ 67.26131953, 104.48771022, 155.25097025]]))
prestige:  (26.424028268551236, 21.533506515763897) (22.644106463878327, 19.446594741780928)
KstestResult(statistic=0.08593962030928805, pvalue=0.059930798910133753)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,49.4,49.4,0.6,0.6,17.1,30.9,52.0,340
1,Population,50.8,49.2,--,--,21.1,32.1,46.7,1994


In [36]:
resp = df[(df.likely_department == 'Biology') & provided_basic_responses]
total = bio_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor', 'Professor']
print('\n>>> Biology', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Professor']]),
    'N': [len(resp), len(total)]
}) 

print('gender: ', chi2_contingency(
    [[len(total[total.Gender == 'M']), len(total[total.Gender == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df


>>> Biology 0.15071463775258748
gender:  (11.416301643633963, 0.000728024728207123, 1, array([[6090.47487608, 2677.52512392],
       [1056.52512392,  464.47487608]]))
rank:  (3445.4263211847356, 0.0, 2, array([[1962.65779123, 2521.41603631,  659.92617247],
       [ 559.34220877,  718.58396369,  188.07382753]]))
prestige:  (83.50685920577617, 61.24920776199653) (83.17081125152875, 64.47640989355091)
KstestResult(statistic=0.03691817725587618, pvalue=0.0810530255708064)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,65.4,34.1,0.1,0.4,12.4,29.7,57.8,1529
1,Population,70.1,29.9,--,--,23.1,27.6,49.3,10145


In [37]:
resp = df[(df.likely_department == 'Business') & provided_basic_responses]
total = bus_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
print('\n>>> Business', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
})

print('gender: ', chi2_contingency(
    [[len(total[total.Gender == 'M']), len(total[total.Gender == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df


>>> Business 0.09840175493575681
gender:  (44.123243470916265, 3.083377856308248e-11, 1, array([[5300.17101304, 1740.82898696],
       [ 703.82898696,  231.17101304]]))
rank:  (1.5378041037822123, 0.46352171181496293, 2, array([[2048.3833027, 1851.2880021, 2803.3286952],
       [ 279.6166973,  252.7119979,  382.6713048]]))
prestige:  (42.36507936507937, 27.388624011964428) (43.43192076228686, 28.71672266945643)
KstestResult(statistic=0.035028620660516346, pvalue=0.3138786207751887)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,65.9,33.3,0.1,0.6,31.4,25.9,42.7,942
1,Population,76.5,23.5,--,--,30.4,27.9,41.7,9573


In [38]:
resp = df[(df.likely_department == 'Computer Science') & provided_basic_responses]
total = cs_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
print('\n>>> Computer Science', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
}) 

print('gender: ', chi2_contingency(
    [[len(total[total.Gender == 'M']), len(total[total.Gender == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df


>>> Computer Science 0.16212016574585636
gender:  (21.31808884040257, 3.890430794756761e-06, 1, array([[3531.14570313,  655.85429688],
       [ 786.85429688,  146.14570313]]))
rank:  (41.011528979314264, 1.2429671041428553e-09, 2, array([[1370.67703796, 1494.56004978, 2822.76291226],
       [ 178.32296204,  194.43995022,  367.23708774]]))
prestige:  (65.1631982475356, 46.97921670314117) (65.65963606286186, 48.535874132005745)
KstestResult(statistic=0.02130652091786954, pvalue=0.867955843182373)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,78.8,20.6,0.1,0.5,33.5,22.0,44.5,939
1,Population,85.5,14.5,--,--,22.9,26.8,50.3,5792


In [39]:
resp = df[(df.likely_department == 'History') & provided_basic_responses]
total = his_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
print('>>> History', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
})

print('gender: ', chi2_contingency(
    [[len(total[total.Gender == 'M']), len(total[total.Gender == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df

>>> History 0.19695571955719557
gender:  (27.412105515511158, 1.6439889923447213e-07, 1, array([[2157.54919337, 1393.45080663],
       [ 516.45080663,  333.54919337]]))
rank:  (0.4663655140540208, 0.7920088151204917, 2, array([[ 553.83301708, 1305.11954459, 1525.04743833],
       [ 136.16698292,  320.88045541,  374.95256167]]))
prestige:  (56.57992565055762, 39.47682982204163) (51.8574706782254, 38.24292253111374)
KstestResult(statistic=0.055322594811968454, pvalue=0.03182705045451373)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,52.6,47.0,0.2,0.2,16.3,39.5,44.1,854
1,Population,62.7,37.3,--,--,16.4,38.3,45.3,4336


In [40]:
resp = df[(df.likely_department == 'Physics and/or Astronomy') & provided_basic_responses]
total = phys_frame
tt_keys = ['Full Professor', 'Associate Professor', 'Assistant Professor']
print('\n>>> Physics/Astronomy', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
})

print('gender: ', chi2_contingency(
    [[len(total[total['Gender'] == 'M']), len(total[total['Gender'] == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df


>>> Physics/Astronomy 0.15270684371807966
gender:  (0.017387788027390733, 0.8950928429164176, 1, array([[  5.63758389,   1.36241611],
       [714.36241611, 172.63758389]]))
rank:  (0.16870361092384004, 0.9191078520483611, 2, array([[1058.75657211, 1153.81791178, 3659.42551611],
       [ 155.24342789,  169.18208822,  536.57448389]]))
prestige:  (53.03015681544029, 39.69920486918989) (51.513609995537706, 41.67344615771217)
KstestResult(statistic=0.057956527894179585, pvalue=0.017381833447973394)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,79.6,19.3,0.2,0.9,17.8,19.3,63.0,897
1,Population,85.7,14.3,--,--,18.1,19.7,62.2,5874


In [42]:
resp = df[(df.likely_department == 'Psychology') & provided_basic_responses]
total = psych_frame
print('\n>>> Psychology', resp.shape[0]/total.shape[0])
resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    #'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
}) 

print('gender: ', chi2_contingency(
    [[len(total[total['Gender'] == 'M']), len(total[total['Gender'] == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

print('prestige: ', (resp.prestige.mean(), resp.prestige.std()), (total.USNWR_coded.mean(), total.USNWR_coded.std()))
print(ks_2samp(resp.prestige.dropna(), total.USNWR_coded.dropna()))
#print(mannwhitneyu(resp.prestige.dropna(), total.USNWR_coded.dropna()))

temp_df


>>> Psychology 0.1426156446903335
gender:  (3.972424794807886, 0.04625112462791099, 1, array([[109.07638889, 116.92361111],
       [446.92361111, 479.07638889]]))
rank:  (1.7018580798564404, 0.42701803080808, 2, array([[1484.75111291, 1875.24416565, 3145.00472144],
       [ 207.24888709,  261.75583435,  438.99527856]]))
prestige:  (83.6819830713422, 59.61676805522757) (88.18549747048904, 60.52347852805889)
KstestResult(statistic=0.055744059574520145, pvalue=0.024066387906136022)


Unnamed: 0,Group,Men,Women,Non-binary,Assistant,Associate,Full,N
0,Survey,46.7,53.1,0.2,21.1,29.7,49.1,928
1,Population,54.4,45.6,--,23.1,28.7,48.2,6507


In [44]:
resp = df[provided_basic_responses & (df.likely_department == 'Sociology')]
# total = pd.concat([his_frame, bus_frame, cs_frame, phys_frame, soc_frame, psych_frame, anthro_frame])
total = pd.concat([soc_frame])
print('\n>>> Sociology', resp.shape[0], resp.shape[0]/total.shape[0])

resp_gender_dist = dict(resp['gender_ans'].value_counts(normalize=True))
frame_gender_dist = dict(total[total['Gender'].isin(['M', 'F'])]['Gender'].value_counts(normalize=True))
resp_title_dist = dict(resp[resp['Title'].isin(tt_keys)]['Title'].value_counts(normalize=True))
frame_title_dist = dict(total[total['Rank'].isin(tt_keys)]['Rank'].value_counts(normalize=True))

temp_df = pd.DataFrame({
    'Group': ['Survey', 'Population'], 
    'Men': 100.0*np.array([resp_gender_dist['Male'], frame_gender_dist['M']]),
    'Women': 100.0*np.array([resp_gender_dist['Female'], frame_gender_dist['F']]),
    'Non-binary': [100.0*resp_gender_dist['Other identity'], '--'],
    'Undisclosed': [100.0*resp_gender_dist['Prefer not to say'], '--'],
    'Assistant': 100.0*np.array([resp_title_dist['Assistant Professor'], 
                                 frame_title_dist['Assistant Professor']]),
    'Associate': 100.0*np.array([resp_title_dist['Associate Professor'], 
                                 frame_title_dist['Associate Professor']]),
    'Full': 100.0*np.array([resp_title_dist['Full Professor'], frame_title_dist['Full Professor']]),
    'N': [len(resp), len(total)]
}) 

print('gender: ', chi2_contingency(
    [[len(total[total['Gender'] == 'M']), len(total[total['Gender'] == 'F'])],
     [len(resp[resp.gender_ans == 'Male']), len(resp[resp.gender_ans == 'Female'])]]))

print('rank: ', chi2_contingency(
    [[len(total[total['Rank'] == 'Assistant Professor']), 
      len(total[total['Rank'] == 'Associate Professor']),
      len(total[total['Rank'] == 'Full Professor'])],
     [len(resp[resp['Title'] == 'Assistant Professor']),
      len(resp[resp['Title'] == 'Associate Professor']),
      len(resp[resp['Title'] == 'Full Professor'])]]))

pi_resp = pd.concat([resp.prestige, resp.prestige]); 
pi_total = total.USNWR_coded # pi_total = pd.concat([total.NRC_coded, total.USNWR_coded])
print('prestige: ', (pi_resp.mean(), pi_resp.std()), (pi_total.mean(), pi_total.std()))
print(ks_2samp(pi_resp.dropna(), pi_total.dropna()))

temp_df


>>> Sociology 422 0.1707810602994739
gender:  (0.19477687549601902, 0.6589702508949935, 1, array([[ 39.65737052,  44.34262948],
       [197.34262948, 220.65737052]]))
rank:  (5.106491199268509, 0.07782865498287166, 2, array([[ 521.57563025,  732.7710084 , 1187.65336134],
       [  88.42436975,  124.2289916 ,  201.34663866]]))
prestige:  (38.75603217158177, 26.15363995751168) (40.03223981900452, 27.290252351680486)
KstestResult(statistic=0.06092978540147757, pvalue=0.03864766615338844)


Unnamed: 0,Group,Men,Women,Non-binary,Undisclosed,Assistant,Associate,Full,N
0,Survey,46.2,52.8,0.2,0.7,17.1,31.6,51.2,422
1,Population,50.0,50.0,--,--,22.1,29.7,48.2,2471
