# ISTP demographics

Data is from the ```ISTP Pro/Post Survey - Demographics (Vanessa)``` folder on Drive

I will create files and tree plots.

Running in the ```ete3``` conda environment (in WSL).

In [None]:
import pandas as pd
import numpy as np
import itertools
import os
import json
import scipy.stats as stats

In [None]:
from ete3 import Tree, faces, AttrFace, TreeStyle, TextFace, add_face_to_node

In [None]:
# Set the pandas 'display.max_rows' option to 20 for the DataFrame displays in this notebook.
pd.set_option('display.max_rows', 20)

In [None]:
def getDemographics(inputdf):
    """Count how many rows share each combination of demographic values.

    For every subset of 1 .. (number of columns - 1) columns, the rows of
    ``inputdf`` are grouped on that subset and the group sizes are recorded.
    Columns that are not part of a subset are set to NaN, meaning "any value".
    The subset made of *all* columns is not evaluated.

    Parameters
    ----------
    inputdf : pandas.DataFrame
        One row per respondent and one column per demographic axis.

    Returns
    -------
    groupdf : pandas.DataFrame
        Columns ``count``, ``fraction`` (count divided by the number of rows
        in ``inputdf``), ``nAxes`` (number of non-NaN demographic values in
        the row) and the demographic columns, sorted by descending count.
    groupdfTrim : pandas.DataFrame
        The rows of ``groupdf`` with ``count > 5``.
    outdf : pandas.DataFrame
        ``count``, ``fraction``, ``nAxes`` plus a ``group`` column holding the
        non-blank demographic values of the row joined with ``'; '``.

    Raises
    ------
    ValueError
        If ``inputdf`` has fewer than two columns, because then there is no
        column subset to group on.
    """
    # using method from : https://stackoverflow.com/questions/35268817/unique-combinations-of-values-in-selected-columns-in-pandas-data-frame-and-count

    cols = inputdf.columns
    frames = []
    nRows = 0
    for i in np.arange(1, len(cols)):
        # get all the combinations of i columns
        itr = list(itertools.combinations(cols, i))
        print(i, len(itr))

        # these columns will be used in groupby while others will be anything
        for combo in itr:
            useColList = list(combo)
            g = inputdf.groupby(useColList).size().reset_index(name = 'count')

            # add the missing column(s) as NaN
            for cc in cols:
                if cc not in useColList:
                    g.insert(0, cc, np.nan)

            # remove any rows that are all nans (excluding count)
            g = g.dropna(how = 'all', subset = useColList)

            # move the count column to be first
            count = g.pop('count')
            g.insert(0, 'count', count)

            frames.append(g)
            nRows += len(g)

        # running total of group rows collected so far
        print(nRows)

    if not frames:
        raise ValueError('getDemographics needs at least two columns to build groups')

    # a single concat avoids re-copying the accumulated table for every subset
    groupdf = pd.concat(frames)

    # convert any entry with a space or blank entry to nan
    groupdf = groupdf.replace(r'^\s*$', np.nan, regex = True)

    # remove duplicates
    groupdf = groupdf.drop_duplicates(keep = 'first')

    # sort
    groupdf = groupdf.sort_values(by = 'count', ascending = False)

    # add a column that has the fraction of the rows that were passed in
    # (this must use inputdf, not a notebook-level variable)
    groupdf.insert(1, 'fraction', groupdf['count']/len(inputdf))

    # add a column to count the number of non-nan entries in each row (excluding "count" and "fraction")
    groupdf.insert(2, 'nAxes', groupdf.count(axis = 1) - 2)

    # remove any rows with nAxes == 0
    groupdf = groupdf.loc[groupdf['nAxes'] > 0].reset_index(drop = True)

    # take only the rows with > 5 people in the group
    groupdfTrim = groupdf.loc[groupdf['count'] > 5]

    # combine the groups into a single column, and output a condensed table
    groups = []
    for _, row in groupdf.iterrows():
        values = [str(x) for x in row[cols].dropna().values]
        # skip empty / whitespace-only entries
        groups.append('; '.join(x for x in values if x.strip()))

    outdf = groupdf[['count','fraction','nAxes']].copy()
    outdf['group'] = groups

    return groupdf, groupdfTrim, outdf

In [None]:
def multiReplacer(inputdf, c, m):
    """Translate cells holding one or more comma-separated codes into labels.

    Each non-missing cell of ``inputdf[c]`` is split on commas and every piece
    is parsed as a number and looked up in ``m``.  The labels found are joined
    as ``'label, label, '`` (each label once, trailing separator included).
    Pieces are handled one at a time because a plain string replace would
    confuse single and double digit codes (e.g. ``1`` and ``21``).

    Parameters
    ----------
    inputdf : pandas.DataFrame
        Table containing column ``c``.  It is not modified.
    c : str
        Name of the column to translate.
    m : mapping
        Integer code -> label.  Codes mapped to NaN are dropped.

    Returns
    -------
    numpy.ndarray
        Object array with one string per row of ``inputdf``.  Missing cells
        become ``'Did not respond (<c>)'``.  A piece that cannot be parsed or
        is not in ``m`` replaces whatever was collected so far for that cell
        with the raw piece.
    """
    missingLabel = 'Did not respond (' + c + ')'
    # failures expected when a piece is not a known numeric code
    lookupErrors = (ValueError, OverflowError, KeyError, IndexError, TypeError)

    column = inputdf[c]
    # fill a fresh array by position so the result does not depend on the
    # index labels and the caller's data is never written to
    replacer = np.empty(len(column), dtype = object)

    for pos, value in enumerate(column.values):
        if pd.isna(value):
            replacer[pos] = missingLabel
            continue

        pieces = []
        seen = []
        for v in filter(None, str(value).split(',')): # remove empty strings
            try:
                val = m[int(float(v))]
                # de-duplicate on the exact label (a substring test would drop
                # e.g. 'Man' after 'Woman')
                if pd.isna(val) or val in seen:
                    continue
                pieces.append(val + ', ')
                seen.append(val)
            except lookupErrors:
                pieces = [v]
                seen = []

        replacer[pos] = ''.join(pieces)

    return replacer

In [None]:
def getNodeFractions(inputdf):
    """Count the occurrences of each distinct value in a one-column table.

    Parameters
    ----------
    inputdf : pandas.DataFrame
        Only the first column is used.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-NaN value, in order of first appearance, with
        the value (under the original column name), an integer ``count`` and a
        float ``fraction`` = count / total number of rows (NaN rows included
        in the total).
    """
    # assume that this df has only 1 column
    col = inputdf.columns[0]
    values = inputdf[col].dropna()
    unique_values = values.unique()

    # explicit integer dtype: deriving the array from the labels themselves
    # would give it the labels' dtype instead of a numeric one
    count = np.array([(values == name).sum() for name in unique_values], dtype = int)

    outdf = pd.DataFrame()
    outdf[col] = unique_values
    outdf['count'] = count
    outdf['fraction'] = count/len(inputdf)
    # TODO: add error

    return outdf

In [None]:
def addNodesToTree(base, cols, i, nodes, inputdf):
    """Recursively attach one child node per distinct value of ``cols[i]``.

    Each child of ``base`` is named `` <value> [<rows>, <percent>%] `` where
    the percentage is relative to the rows of ``inputdf``, and its ``support``
    attribute is set to the row count.  The child is also stored in ``nodes``
    under its name.  While further columns remain, the function recurses into
    each child with the matching rows only.  Nothing is returned.
    """
    column = cols[i]
    if column not in inputdf.columns:
        return

    total = len(inputdf)
    hasDeeperLevel = i + 1 < len(cols)

    # one node per distinct (non-NaN) value in this column
    for value in inputdf[column].dropna().unique():
        subset = inputdf.loc[inputdf[column] == value]
        nRows = len(subset)
        if nRows <= 0:
            continue

        name = ' ' + value + ' [' + str(nRows) + ', {:.1f}%] '.format(nRows/total*100.)
        child = base.add_child(name = name)
        nodes[name] = child
        child.support = nRows

        # recursively move down the tree
        if hasDeeperLevel:
            addNodesToTree(child, cols, i + 1, nodes, subset)

In [None]:
# https://github.com/etetoolkit/ete/issues/219
def my_tree_layout(node):
    """Layout function: show the node's name as a text face to the right of its branch."""
    label = TextFace(node.name, tight_text = True)
    add_face_to_node(label, node, column = 0, position = "branch-right")
        
# http://etetoolkit.org/docs/latest/faqs/#how-do-i-visualize-internal-node-names
# def my_layout(node):
#     if node.is_leaf():
#         # If terminal node, draws its name
#         name_face = AttrFace("name")
#     else:
#         # If internal node, draws label with smaller font size
#         name_face = AttrFace("name", fsize=10)
#     # Adds the name face to the image at the preferred position
#     faces.add_face_to_node(name_face, node, column=0, position="branch-right")

# Shared style object passed to tree.render() for every tree in this notebook
tree_style = TreeStyle()

# Do not add leaf names automatically
tree_style.show_leaf_name = False

# increase the y spacing
tree_style.branch_vertical_margin = 10

# remove the scale bar at the bottom
tree_style.show_scale = False

# Use my custom layout
tree_style.layout_fn = my_tree_layout

In [None]:
def createSingleColTree(df, filename):
    """Render a tree of the value counts in ``df`` to ``filename``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns define the tree levels (called here with a
        single-column frame).
    filename : str
        Output path, relative to the current working directory.  The parent
        directory is created when it does not exist yet.
    """
    # create the tree
    tree = Tree()
    nodes = {}
    addNodesToTree(tree, df.columns, 0, nodes, df)

    fname = os.path.join(os.getcwd(), filename)

    # make sure the target folder exists so the render does not fail on a fresh checkout
    os.makedirs(os.path.dirname(fname), exist_ok = True)

    # seems like we need to remove the file first or else it doesn't get written
    if os.path.isfile(fname):
        os.remove(fname)

    # write the file
    _ = tree.render(fname, w = 11, units = "in", tree_style = tree_style)

In [None]:
def compileCircleData(df):
    """Collapse identical rows of ``df`` into one row each, with a count.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per respondent.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        The first occurrence of every distinct row (keeping its original
        index label and the order of first appearance) plus a ``count`` column
        with the number of rows in ``df`` that carry the same values.  Missing
        values are treated as equal to each other.
    """
    cols = df.columns
    remaining = df.copy()

    firstRows = []
    count = []

    while len(remaining) > 0:
        # take the first row that has not been assigned to a group yet
        row = remaining.iloc[0]

        # positional mask of the rows with matching demographics; NaN has to
        # be matched explicitly (NaN != NaN), otherwise a row with a missing
        # value would match nothing and the loop would never finish
        match = np.ones(len(remaining), dtype = bool)
        for c in cols:
            if pd.isna(row[c]):
                match &= np.asarray(remaining[c].isna(), dtype = bool)
            else:
                match &= np.asarray(remaining[c] == row[c], dtype = bool)

        # count them, keep the representative row and remove the whole group
        count.append(int(match.sum()))
        firstRows.append(row)
        remaining = remaining.loc[~match]

    # DataFrame.append no longer exists in pandas 2, so build the table in one go
    outDf = pd.DataFrame(firstRows, columns = cols)
    outDf['count'] = count

    return outDf

In [None]:
def createCircleInput(circleDf, cols):
    """Build the list of entries consumed by the circle visualization.

    For every row of ``circleDf`` that has at least one non-null value in
    ``cols``, one dict is produced.  The first non-null column supplies the
    entry's ``name`` (``<column>.<value>.person<index>``); the remaining
    non-null ``<column>.<value>`` strings go to ``other_demographics`` (with
    the person suffix) and ``full_demographics`` (others first, then the
    naming one).  ``size`` is the row's ``count``.
    """
    # do I need to expand this to have multiple rows per person, one for each demographic category?
    circle = []

    for i, row in circleDf.iterrows():
        # every non-null demographic of this row as '<column>.<value>'
        demo_list = [
            key + '.' + value
            for key, value in row[cols].to_dict().items()
            if not pd.isnull(value)
        ]

        # just take one entry per "person": the first column that has a value
        first = next((c for c in cols if not pd.isnull(row[c])), None)
        if first is None:
            continue

        person = 'person' + str(i).zfill(3)
        demo = first + '.' + row[first]
        other_demo_list = demo_list.copy()
        other_demo_list.remove(demo)

        circle.append({
            'name': demo + '.' + person,
            'other_demographics': [v + '.' + person for v in other_demo_list],
            'full_demographics': other_demo_list + [demo],
            'size': row['count'],
        })

    return circle

In [None]:
def construct_contingency(usedf, key, d1, d2, i):
    """Build the contingency table for the category in row ``i`` of ``usedf``.

    ``usedf`` holds one row per category (named in column ``key``) with the
    columns ``count_<d1>`` and ``count_<d2>``.  The result is indexed by
    ``d1``, ``d2`` and ``'total'`` and has three columns: the category itself,
    ``not_<category>`` (the column sum minus the category) and ``total``.
    """
    label = usedf[key].iloc[i]
    col = ['date', label, 'not_' + label, 'total']

    countCol1 = 'count_' + d1
    countCol2 = 'count_' + d2

    # counts for the selected category and for all categories together
    row = usedf.iloc[i]
    in1 = row[countCol1]
    in2 = row[countCol2]
    all1 = np.sum(usedf[countCol1])
    all2 = np.sum(usedf[countCol2])

    contingency = pd.DataFrame(columns = col)
    contingency[col[0]] = [d1, d2, 'total']
    contingency[col[1]] = [in1, in2, in1 + in2]
    contingency[col[2]] = [all1 - in1, all2 - in2, all1 - in1 + all2 - in2]
    contingency[col[3]] = contingency[col[1]] + contingency[col[2]]
    contingency.set_index('date', drop = True, inplace = True)

    return contingency

In [None]:
# collapsing categories (see demographicsBreakdown.ipynb for full categories)
# I am going to ignore "Other" answers

# Survey code -> role label; codes 2 and 3 are collapsed into one label.
roleMap = {
    1:'Faculty member, lecturer, instructor, or adjunct faculty',
    2:'Graduate student or Postdoctoral scholar',
    3:'Graduate student or Postdoctoral scholar',
    4:'Staff member',
    5:'Other (role)',
    np.nan:'Did not respond (role)'

}
# Survey code -> discipline label (one label per code, nothing collapsed).
disciplineMap = {
    1:'Agriculture and natural resource sciences',
    2:'Arts',
    3:'Biological and life sciences',
    4:'Business and management science',
    5:'Chemistry',
    6:'Computer, information, and technological sciences',
    7:'Earth, environmental, atmospheric, and ocean sciences',
    8:'Education',
    9:'Engineering',
    10:'Humanities',
    11:'Law',
    12:'Mathematics and Statistics',
    13:'Medical sciences',
    14:'Physical sciences',
    15:'Psychology',
    16:'Social, behavioral, and economic sciences (not including psychology)',
    17:'Other (discipline)',
    np.nan:'Did not respond (discipline)'
}
# Same discipline codes as disciplineMap, collapsed to 'STEM' / 'non-STEM'.
# Code 13 (Medical sciences) is kept as its own category rather than being
# assigned to either side, and 'Other' / no response keep their own labels.
disciplineSTEMMap = {
    1:'STEM',
    2:'non-STEM',
    3:'STEM',
    4:'non-STEM',
    5:'STEM',
    6:'STEM',
    7:'STEM',
    8:'non-STEM',
    9:'STEM',
    10:'non-STEM',
    11:'non-STEM',
    12:'STEM',
#    13:'non-STEM',
    13:'Medical sciences',
    14:'STEM',
    15:'STEM',
    16:'STEM',
    17:'Other (discipline)',
    np.nan:'Did not respond (discipline)'
}
# Survey code -> institution label, with the codes 1 and 7-11
# (applied to the institution_march22 column below).
institutionMap = {
    1:'Community college / 2-year institution',
    7:'Comprehensive or Regional University',
    8:'Liberal arts college',
    9:'Research University',
    10:'Technical college', 
    11:'Other (institution)',
    np.nan:'Did not respond (institution)'

}
# Same institution labels as institutionMap but with the consecutive codes 1-6
# (applied to the institution_oct21 and institution_prefall22 columns below).
institutionMap_oct21 = {
    1:'Community college / 2-year institution',
    2:'Comprehensive or Regional University',
    3:'Liberal arts college',
    4:'Research University',
    5:'Technical college', 
    6:'Other (institution)',
    np.nan:'Did not respond (institution)'

}
# Earlier version of genderMap, kept for reference (not used):
# genderMap = {
#     1:'Non-binary, gender queer, self-identify', 
#     8:'Cis-Man/Trans-Man',
#     9:'Non-binary, gender queer, self-identify',
#     10:'Cis-Man/Trans-Man',
#     14:'Cis-Woman/Trans-Woman', 
#     11:'Cis-Woman/Trans-Woman', 
#     12:'Non-binary, gender queer, self-identify',
#     13:'I prefer not to respond (gender)',
#     np.nan:'Did not respond (gender)'
# }
    
# Survey code -> collapsed gender label, with the codes 1 and 8-14
# (applied to the gender_march22 column below).
genderMap = {
    1:'Non-binary, gender queer, self-identify', 
    8:'Man',
    9:'Non-binary, gender queer, self-identify',
    10:'Transgender',
    11:'Woman', 
    12:'Non-binary, gender queer, self-identify', 
    13:'I prefer not to respond (gender)',
    14:'Transgender', 
    np.nan:'Did not respond (gender)'
}
    
# Gender labels for the consecutive codes 1-7 (applied to the gender_oct21 column below).
genderMap_oct21 = {
    1:'Non-binary, gender queer, self-identify', 
    2:'Man',
    3:'Non-binary, gender queer, self-identify',
    4:'Transgender',
    5:'Woman', 
    6:'Non-binary, gender queer, self-identify', 
    7:'I prefer not to respond (gender)',
    np.nan:'Did not respond (gender)'
}

# Gender labels for the consecutive codes 1-8 (applied to the gender_prefall22 column below).
genderMap_prefall22 = {
    1:'Non-binary, gender queer, self-identify', 
    2:'Man',
    3:'Non-binary, gender queer, self-identify',
    4:'Transgender',
    5:'Transgender', 
    6:'Woman', 
    7:'Non-binary, gender queer, self-identify',
    8:'I prefer not to respond (gender)',
    np.nan:'Did not respond (gender)'
}
# these are checkboxes so I will keep each individual column
# Survey code -> institution designation; every code except 10 (PWI) and
# 13 (not sure) is collapsed into 'Minority-focussed institution'.
# The missing-value label is capitalized like in all the other maps and like
# the fill value used for the institution_msi columns.
institutionTypeMap = {
    1:'Minority-focussed institution',
    8:'Minority-focussed institution', 
    9:'Minority-focussed institution', 
    10:'Predominantly White Institution (PWI)',
    11:'Minority-focussed institution',
    12:'Minority-focussed institution',
    13:'I am not sure (institution type)',
    np.nan:'Did not respond (institution type)'
}
# Survey code -> collapsed race label, with the codes 1 and 14-25
# (applied to the 'race' column of the spring22 and fall21 files below).
# Several codes are collapsed into 'Asian' and into 'other POC'.
raceMap = {
    1:'Alaska Native, American Indian, Native American or Indigenous',
    14:'Asian',
    15:'Black or African American',
    16:'Asian',
    17:'Latina/o/x or Hispanic',
    18:'other POC',
    19:'other POC',
    20:'Asian',
    21:'Asian',
    22:'White',
    23:'Multiracial',
    24:'I self-describe as (race)',
    25:'I prefer not to respond (race)',
    np.nan:'Did not respond (race)'
}
# Same race labels with the consecutive codes 1-13
# (applied to the race_prefall22 column below).
raceMap_prefall22 = {
    1:'Alaska Native, American Indian, Native American or Indigenous',
    2:'Asian',
    3:'Black or African American',
    4:'Asian',
    5:'Latina/o/x or Hispanic',
    6:'other POC',
    7:'other POC',
    8:'Asian',
    9:'Asian',
    10:'White',
    11:'Multiracial',
    12:'I self-describe as (race)',
    13:'I prefer not to respond (race)',
    np.nan:'Did not respond (race)'
}
# not included in these files
# Survey code -> collapsed faculty-status label; code 15 is mapped to NaN.
tenureMap = {
    7:'Tenured and tenure-track',
    19:'Tenured and tenure-track',
    12:'Full-time teaching/instructional or research',
    20:'Full-time teaching/instructional or research',
    23:'Part-time teaching/instructional',
    22:'Full-time teaching/instructional or research',
    21:'Full-time teaching/instructional or research',
    15:np.nan,
    np.nan:'Did not respond (tenure)'

}

## First file
```data/ISTP_demographics_spring22_Aaron.csv```

In [None]:
# Load the spring22 demographics CSV and display it.
df = pd.read_csv('data/ISTP_demographics_spring22_Aaron.csv')
df

In [None]:
# List the columns available in the spring22 file.
df.columns

In [None]:
# Columns to analyse, in this order: role, discipline, institution MSI/PWI
# designation, gender, institution category, race
# (no faculty status in this file?)
# NOTE(review): the designation column carries an 'oct22' suffix while the
# other columns are 'march22' -- confirm this is the intended column.
useCols = [
    'primerole_march22', 'discipline_march22', 'institution_msi_oct22', 
    'gender_march22', 'institution_march22','race'
]

In [None]:
#usedf = df[useCols]#.dropna(how = 'all').reset_index(drop = True)#.fillna(0)
# Keep only the rows flagged with completion_binary == 1, restricted to the
# demographic columns, and renumber them from 0.
completedRows = df['completion_binary'] == 1
usedf = df.loc[completedRows, useCols].reset_index(drop = True)
# add an additional column for STEM (a copy of the discipline codes, relabelled separately later)
usedf = usedf.assign(STEM_march22 = usedf['discipline_march22'])
usedf

In [None]:
# checking for indigenous (race == 1)
# A cell can hold several comma-separated codes, and a plain substring search
# for '1' would also match codes such as 14 or 21, so match the whole code only.
col = usedf['race']
foo = pd.DataFrame(col[col.str.contains(r'(?:^|,)\s*1(?:\.0+)?\s*(?:,|$)', regex = True, na = False)])
foo

In [None]:
# Display the raw role codes.
usedf['primerole_march22']

In [None]:
# Count the missing entries in each raw column before any relabelling:
# real NaN values, the literal text 'nan', and blank / whitespace-only strings.
Nnan = []
for c in useCols:
    N1 = int(usedf[c].isna().sum())
    try:
        # na = False keeps missing / non-string cells out of the match, so the
        # counts are not silently dropped for columns that contain NaN
        N2 = int(usedf[c].str.contains(r'^\s*nan\s*$', regex = True, na = False).sum())
        N3 = int(usedf[c].str.contains(r'^\s*$', regex = True, na = False).sum())
    except AttributeError:
        # the .str accessor only exists for string columns
        N2 = 0
        N3 = 0

    Nnan.append(N1 + N2 + N3)
    print(f'# NaN in {c} = {N1 + N2 + N3}')
print('')

# replace the entries

# replace numbers with values
replacements = {
    'primerole_march22':roleMap,
    'discipline_march22':disciplineMap,
    'STEM_march22':disciplineSTEMMap,
    'institution_march22':institutionMap,
    'gender_march22':genderMap,
#     'Q35.1':institutionTypeMap,
#     'race':raceMap
}

usedfHuman = usedf.replace(replacements)


# treat the cells with multiple entries a bit differently
usedfHuman['institution_msi_oct22'] = multiReplacer(usedfHuman, 'institution_msi_oct22', institutionTypeMap)

# for race we will put them in the 'Multiracial' category
# (assign the result back: an in-place fillna on a selected column does not
# reliably update the parent frame in recent pandas versions)
usedfHuman['race'] = usedfHuman['race'].fillna('Did not respond (race)')
foo = usedfHuman.loc[usedfHuman['race'].str.contains(',')]
print('number of people with multiple race entries = ', len(foo))
usedfHuman.loc[usedfHuman['race'].str.contains(','), 'race'] = 'Multiracial' 
usedfHuman['race'] = multiReplacer(usedfHuman, 'race', raceMap)

# remove any extra commas (every cell becomes a string, so NaN turns into 'nan')
usedfHuman = usedfHuman.astype(str).apply(lambda s: s.str.rstrip(', '))

# fix any lingering nan values
usedfHuman = usedfHuman.replace('nan', np.nan)
usedfHuman = usedfHuman.replace(r'^\s*$', np.nan, regex = True)

usedfHuman = usedfHuman.fillna(value = {
    'primerole_march22':'Did not respond (role)',
    'discipline_march22':'Did not respond (discipline)',
    'STEM_march22':'Did not respond (discipline)',
    'institution_march22':'Did not respond (institution)',
    'gender_march22':'Did not respond (gender)',
    'institution_msi_oct22':'Did not respond (institution type)',
})
# already filled race above


# print number of people with did not respond values in all the columns
# (all labels containing 'respond' minus the entries that were missing in the raw data)
print('')
for i, c in enumerate(useCols):
    N1 = int(usedfHuman[c].str.contains('respond').sum())
    print(f'# marked "prefer not to respond" {c} = {N1 - Nnan[i]}')
    
    
usedfHuman_march22 = usedfHuman

usedfHuman

In [None]:
# Spot-check a single respondent (positional row 17) after the relabelling.
usedfHuman.iloc[17]

In [None]:
# Raw codes at the same position (row 17), for comparison.
usedf.iloc[17]

In [None]:
# Build the group tables for this file: full, trimmed (count > 5) and condensed.
groupdf, groupdfTrim, outdf = getDemographics(usedfHuman)

In [None]:
# Display the groups with more than 5 people.
groupdfTrim

In [None]:
# Display the condensed table (one 'group' string per row).
outdf

In [None]:
# Write the condensed and the full group tables to the analysis folder.
for groupTable, csvName in (
    (outdf, 'ISTP_demographics_march22_Aaron_demographicsGroupsCondensed.csv'),
    (groupdf, 'ISTP_demographics_march22_Aaron_demographicsGroupsFull.csv'),
):
    groupTable.to_csv(os.path.join('analysis', csvName), index = False)

In [None]:
# Render one single-column tree per demographic axis (column -> output file).
treeFiles = {
    'gender_march22': 'ISTP_gender_march22_tree.pdf',
    'race': 'ISTP_race_march22_tree.pdf',
    'institution_march22': 'ISTP_institution_march22_tree.pdf',
    'primerole_march22': 'ISTP_role_march22_tree.pdf',
    'STEM_march22': 'ISTP_STEMmed_march22_tree.pdf',
}
for treeCol, treeFile in treeFiles.items():
    createSingleColTree(usedfHuman[treeCol].to_frame(), os.path.join('analysis', 'figures', treeFile))

## Second file
```data/ISTP_demographics_fall21_Aaron.csv```

In [None]:
# Load the fall21 demographics CSV (replaces the previous df) and display it.
df = pd.read_csv('data/ISTP_demographics_fall21_Aaron.csv')
df

In [None]:
# List the columns available in the fall21 file.
df.columns

In [None]:
# Columns to analyse, in this order: role, discipline, institution MSI/PWI
# designation, gender, institution category, race
# (no faculty status in this file?)
useCols = [
    'primerole_oct21', 'discipline_oct21', 'institution_msi_oct21', 
    'gender_oct21', 'institution_oct21','race'
]

In [None]:
#usedf = df[useCols]#.dropna(how = 'all').reset_index(drop = True)#.fillna(0)
# Keep only the rows flagged with completion_binary_oct21 == 1, restricted to
# the demographic columns, and renumber them from 0.
completedRows = df['completion_binary_oct21'] == 1
usedf = df.loc[completedRows, useCols].reset_index(drop = True)
# add an additional column for STEM (a copy of the discipline codes, relabelled separately later)
usedf = usedf.assign(STEM_oct21 = usedf['discipline_oct21'])
usedf

In [None]:
# Count the missing entries in each raw column before any relabelling:
# real NaN values, the literal text 'nan', and blank / whitespace-only strings.
Nnan = []
for c in useCols:
    N1 = int(usedf[c].isna().sum())
    try:
        # na = False keeps missing / non-string cells out of the match, so the
        # counts are not silently dropped for columns that contain NaN
        N2 = int(usedf[c].str.contains(r'^\s*nan\s*$', regex = True, na = False).sum())
        N3 = int(usedf[c].str.contains(r'^\s*$', regex = True, na = False).sum())
    except AttributeError:
        # the .str accessor only exists for string columns
        N2 = 0
        N3 = 0

    Nnan.append(N1 + N2 + N3)
    print(f'# NaN in {c} = {N1 + N2 + N3}')
print('')

# replace numbers with values
replacements = {
    'primerole_oct21':roleMap,
    'discipline_oct21':disciplineMap,
    'STEM_oct21':disciplineSTEMMap,
    'institution_oct21':institutionMap_oct21,
    'gender_oct21':genderMap_oct21,
#     'Q35.1':institutionTypeMap,
#     'race':raceMap
}

usedfHuman = usedf.replace(replacements)

# treat the cells with multiple entries a bit differently
usedfHuman['institution_msi_oct21'] = multiReplacer(usedfHuman, 'institution_msi_oct21', institutionTypeMap)

# for race we will put them in the 'Multiracial' category
# (assign the result back: an in-place fillna on a selected column does not
# reliably update the parent frame in recent pandas versions)
usedfHuman['race'] = usedfHuman['race'].fillna('Did not respond (race)')
foo = usedfHuman.loc[usedfHuman['race'].str.contains(',')]
print('number of people with multiple race entries = ', len(foo))
usedfHuman.loc[usedfHuman['race'].str.contains(','),'race'] = 'Multiracial' 
usedfHuman['race'] = multiReplacer(usedfHuman, 'race', raceMap)

# remove any extra commas (every cell becomes a string, so NaN turns into 'nan')
usedfHuman = usedfHuman.astype(str).apply(lambda s: s.str.rstrip(', '))

# fix any lingering nan values
usedfHuman = usedfHuman.replace('nan', np.nan)
usedfHuman = usedfHuman.replace(r'^\s*$', np.nan, regex = True)

usedfHuman = usedfHuman.fillna(value = {
    'primerole_oct21':'Did not respond (role)',
    'discipline_oct21':'Did not respond (discipline)',
    'STEM_oct21':'Did not respond (discipline)',
    'institution_oct21':'Did not respond (institution)',
    'gender_oct21':'Did not respond (gender)',
    'institution_msi_oct21':'Did not respond (institution type)',
})
# already filled race above

# print number of people with did not respond values in all the columns
# (all labels containing 'respond' minus the entries that were missing in the raw data)
print('')
for i, c in enumerate(useCols):
    N1 = int(usedfHuman[c].str.contains('respond').sum())
    print(f'# marked "prefer not to respond" {c} = {N1 - Nnan[i]}')
usedfHuman_oct21 = usedfHuman

usedfHuman

In [None]:
# Build the group tables for this file: full, trimmed (count > 5) and condensed.
groupdf, groupdfTrim, outdf = getDemographics(usedfHuman)

In [None]:
# Display the groups with more than 5 people.
groupdfTrim

In [None]:
# Display the condensed table (one 'group' string per row).
outdf

In [None]:
# Write the condensed and the full group tables to the analysis folder.
for groupTable, csvName in (
    (outdf, 'ISTP_demographics_oct21_Aaron_demographicsGroupsCondensed.csv'),
    (groupdf, 'ISTP_demographics_oct21_Aaron_demographicsGroupsFull.csv'),
):
    groupTable.to_csv(os.path.join('analysis', csvName), index = False)

In [None]:
# Render one single-column tree per demographic axis (column -> output file).
treeFiles = {
    'gender_oct21': 'ISTP_gender_oct21_tree.pdf',
    'race': 'ISTP_race_oct21_tree.pdf',
    'institution_oct21': 'ISTP_institution_oct21_tree.pdf',
    'primerole_oct21': 'ISTP_role_oct21_tree.pdf',
    'STEM_oct21': 'ISTP_STEMmed_oct21_tree.pdf',
}
for treeCol, treeFile in treeFiles.items():
    createSingleColTree(usedfHuman[treeCol].to_frame(), os.path.join('analysis', 'figures', treeFile))

## Third file
```data/updated_pre.post.consent.modules.completion_FA22.dta```

In [None]:
# Load the Stata file (replaces the previous df), turn empty / whitespace-only
# cells into NaN, and display it.
df = pd.read_stata('data/updated_pre.post.consent.modules.completion_FA22.dta')
df.replace(r'^\s*$', np.nan, regex = True, inplace = True)
df

In [None]:
# Print the full list of column names in this file.
print(df.columns.to_list())

In [None]:
# Columns to analyse, in this order: role, discipline, institution MSI/PWI
# designation, gender, institution category, race
# (no faculty status in this file?)
useCols = [
    'primerole_prefall22', 'discipline_prefall22', 'institution_msi_prefall22', 
    'gender_prefall22', 'institution_prefall22','race_prefall22'
]

In [None]:
#usedf = df[useCols]#.dropna(how = 'all').reset_index(drop = True)#.fillna(0)
# Keep only the rows flagged with completionbinary == 1, restricted to the
# demographic columns, and renumber them from 0.
completedRows = df['completionbinary'] == 1
usedf = df.loc[completedRows, useCols].reset_index(drop = True)
# add an additional column for STEM (a copy of the discipline codes, relabelled separately later)
usedf = usedf.assign(STEM_prefall22 = usedf['discipline_prefall22'])
usedf

In [None]:
# checking for indigenous (race == 1)
# A cell can hold several comma-separated codes, and a plain substring search
# for '1' would also match codes such as 10 or 13, so match the whole code only.
pd.set_option('display.max_rows', 20)
col = usedf['race_prefall22']
foo = pd.DataFrame(col[col.str.contains(r'(?:^|,)\s*1(?:\.0+)?\s*(?:,|$)', regex = True, na = False)])
foo

In [None]:
# Count the missing entries in each raw column before any relabelling:
# real NaN values, the literal text 'nan', and blank / whitespace-only strings.
Nnan = []
for c in useCols:
    N1 = int(usedf[c].isna().sum())
    try:
        # na = False keeps missing / non-string cells out of the match, so the
        # counts are not silently dropped for columns that contain NaN
        N2 = int(usedf[c].str.contains(r'^\s*nan\s*$', regex = True, na = False).sum())
        N3 = int(usedf[c].str.contains(r'^\s*$', regex = True, na = False).sum())
    except AttributeError:
        # the .str accessor only exists for string columns
        N2 = 0
        N3 = 0

    Nnan.append(N1 + N2 + N3)
    print(f'# NaN in {c} = {N1 + N2 + N3}')
print('')

# replace numbers with values
replacements = {
    'primerole_prefall22':roleMap,
    'discipline_prefall22':disciplineMap,
    'STEM_prefall22':disciplineSTEMMap,
    'institution_prefall22':institutionMap_oct21,
    'gender_prefall22':genderMap_prefall22,
#     'Q35.1':institutionTypeMap,
#     'race':raceMap
}

usedfHuman = usedf.replace(replacements)

# treat the cells with multiple entries a bit differently
usedfHuman['institution_msi_prefall22'] = multiReplacer(usedfHuman, 'institution_msi_prefall22', institutionTypeMap)

# for race we will put them in the 'Multiracial' category
# (assign the result back: an in-place fillna on a selected column does not
# reliably update the parent frame in recent pandas versions)
usedfHuman['race_prefall22'] = usedfHuman['race_prefall22'].fillna('Did not respond (race)')
foo = usedfHuman.loc[usedfHuman['race_prefall22'].str.contains(',')]
print('number of people with multiple race entries = ', len(foo))
usedfHuman.loc[usedfHuman['race_prefall22'].str.contains(','), 'race_prefall22'] = 'Multiracial' 
usedfHuman['race_prefall22'] = multiReplacer(usedfHuman, 'race_prefall22', raceMap_prefall22)

# remove any extra commas (every cell becomes a string, so NaN turns into 'nan')
usedfHuman = usedfHuman.astype(str).apply(lambda s: s.str.rstrip(', '))

# fix any lingering nan values
usedfHuman = usedfHuman.replace('nan', np.nan)
usedfHuman = usedfHuman.replace(r'^\s*$', np.nan, regex = True)

usedfHuman = usedfHuman.fillna(value = {
    'primerole_prefall22':'Did not respond (role)',
    'discipline_prefall22':'Did not respond (discipline)',
    'STEM_prefall22':'Did not respond (discipline)',
    'institution_prefall22':'Did not respond (institution)',
    'gender_prefall22':'Did not respond (gender)',
    'institution_msi_prefall22':'Did not respond (institution type)',
})
# already filled race above

# print number of people with did not respond values in all the columns
# (all labels containing 'respond' minus the entries that were missing in the raw data)
print('')
for i, c in enumerate(useCols):
    N1 = int(usedfHuman[c].str.contains('respond').sum())
    print(f'# marked "prefer not to respond" {c} = {N1 - Nnan[i]}')
    
usedfHuman_prefall22 = usedfHuman

usedfHuman

In [None]:
# Show the respondents whose relabelled race contains 'Indigenous'.
col = usedfHuman['race_prefall22']
isIndigenous = col.str.contains('Indigenous', na = False)
foo = pd.DataFrame(col[isIndigenous])
foo

In [None]:
# Build the group tables for this file: full, trimmed (count > 5) and condensed.
groupdf, groupdfTrim, outdf = getDemographics(usedfHuman)

In [None]:
# Display the groups with more than 5 people.
groupdfTrim

In [None]:
# Display the condensed table (one 'group' string per row).
outdf

In [None]:
# Write the condensed and the full group tables to the analysis folder.
for groupTable, csvName in (
    (outdf, 'ISTP_demographics_prefall22_Aaron_demographicsGroupsCondensed.csv'),
    (groupdf, 'ISTP_demographics_prefall22_Aaron_demographicsGroupsFull.csv'),
):
    groupTable.to_csv(os.path.join('analysis', csvName), index = False)

In [None]:
# Render one single-column tree per demographic axis (column -> output file).
treeFiles = {
    'gender_prefall22': 'ISTP_gender_prefall22_tree.pdf',
    'race_prefall22': 'ISTP_race_prefall22_tree.pdf',
    'institution_prefall22': 'ISTP_institution_prefall22_tree.pdf',
    'primerole_prefall22': 'ISTP_role_prefall22_tree.pdf',
    'STEM_prefall22': 'ISTP_STEMmed_prefall22_tree.pdf',
}
for treeCol, treeFile in treeFiles.items():
    createSingleColTree(usedfHuman[treeCol].to_frame(), os.path.join('analysis', 'figures', treeFile))

## Create an input data file for the circle vis

In [None]:
# grab the data that we want and rename columns
#cols0 = ['primerole_oct21','institution_oct21','gender_oct21','race']
cols0 = ['primerole_oct21','institution_oct21','gender_oct21','STEM_oct21','race']

# one row per distinct demographic combination, largest groups first
circleDfOut = compileCircleData(usedfHuman_oct21[cols0])
circleDfOut = circleDfOut.sort_values(by = 'count', ascending = False).reset_index(drop = True)

# display names used by the visualization
displayNames = {
    'primerole_oct21':'Role',
    'institution_oct21':'Institution Type',
    'gender_oct21':'Gender',
    'STEM_oct21':'Primary Field',
    'race':'Race',
}
circleDf = circleDfOut[cols0 + ['count']].rename(columns = displayNames)

# create the input for the circle visualization
circle = createCircleInput(circleDf, ['Role', 'Institution Type', 'Gender', 'Primary Field', 'Race'])
circleFile = os.path.join('demographics_circle_plot', 'src','data','ISTP_demographics_oct21_circle_data.json')
with open(circleFile, 'w') as outfile:
    json_object = json.dumps(circle)
    outfile.write(json_object)

circleDf

In [None]:
# show the rows whose primary field is 'Medical sciences'
circleDf[circleDf['Primary Field'] == 'Medical sciences']

In [None]:
# March 2022: pick the columns for the circle visualization and pair each one
# with its display name (same order in both lists)
cols0 = ['primerole_march22','institution_march22','gender_march22','STEM_march22','race']
hierarchy = ['Role', 'Institution Type', 'Gender', 'Primary Field','Race']

# compileCircleData / createCircleInput are defined elsewhere in this notebook
circleDfOut = compileCircleData(usedfHuman_march22[cols0])
circleDfOut = circleDfOut.sort_values(by = 'count', ascending = False).reset_index(drop = True)

circleDf = circleDfOut[cols0 + ['count']].rename(columns = dict(zip(cols0, hierarchy)))

# create the input for the circle visualization and save it as JSON
circle = createCircleInput(circleDf, hierarchy)
outPath = os.path.join('demographics_circle_plot', 'src','data','ISTP_demographics_march22_circle_data.json')
with open(outPath, 'w') as outfile:
    outfile.write(json.dumps(circle))

circleDf

In [None]:
# Pre-fall 2022: pick the columns for the circle visualization and map them to
# the display names used in the plot
cols0 = ['primerole_prefall22','institution_prefall22','gender_prefall22','STEM_prefall22','race_prefall22']
displayNames = {
    'primerole_prefall22': 'Role',
    'institution_prefall22': 'Institution Type',
    'gender_prefall22': 'Gender',
    'STEM_prefall22': 'Primary Field',
    'race_prefall22': 'Race',
}

# compileCircleData / createCircleInput are defined elsewhere in this notebook
circleDfOut = compileCircleData(usedfHuman_prefall22[cols0])
circleDfOut = circleDfOut.sort_values(by = 'count', ascending = False).reset_index(drop = True)

circleDf = circleDfOut[cols0 + ['count']].rename(columns = displayNames)

# create the input for the circle visualization and save it as JSON
circle = createCircleInput(circleDf, list(displayNames.values()))
outPath = os.path.join('demographics_circle_plot', 'src','data','ISTP_demographics_prefall22_circle_data.json')
with open(outPath, 'w') as outfile:
    outfile.write(json.dumps(circle))

circleDf

In [None]:
# combine all the circle data sets
# every survey is reduced to the same five columns, renamed (in list order) to
# a shared set of display names so the three tables can be stacked
sharedNames = ['Role', 'Institution Type', 'Gender', 'Primary Field', 'Race']

cols0 = ['primerole_oct21','institution_oct21','gender_oct21','STEM_oct21','race']
oct21 = usedfHuman_oct21[cols0].rename(columns = dict(zip(cols0, sharedNames)))

cols0 = ['primerole_march22','institution_march22','gender_march22','STEM_march22','race']
march22 = usedfHuman_march22[cols0].rename(columns = dict(zip(cols0, sharedNames)))

cols0 = ['primerole_prefall22','institution_prefall22','gender_prefall22','STEM_prefall22','race_prefall22']
prefall22 = usedfHuman_prefall22[cols0].rename(columns = dict(zip(cols0, sharedNames)))

combined = pd.concat([oct21, march22, prefall22]).reset_index(drop = True)

# compileCircleData / createCircleInput are defined elsewhere in this notebook
circleDfOut = compileCircleData(combined)
circleDfOut = circleDfOut.sort_values(by = 'count', ascending = False).reset_index(drop = True)

# create the input for the circle visualization and save it as JSON
circle = createCircleInput(circleDfOut, sharedNames)
outPath = os.path.join('demographics_circle_plot', 'src','data','ISTP_demographics_combined_circle_data.json')
with open(outPath, 'w') as outfile:
    outfile.write(json.dumps(circle))

circleDfOut

In [None]:
# show the rows with a "Did not respond" entry in any column except the last one
# (nothing is removed here; the filtering is coded up in the javascript instead)
categoryCols = circleDfOut.columns[:-1]
mask = np.column_stack(
    [circleDfOut[col].str.contains(r"Did not respond", na = False) for col in categoryCols]
)
circleDfOut.loc[mask.any(axis = 1)]

In [None]:
# create a file that only has the STEM, R1, faculty and will plot gender and Race
combined = pd.concat([oct21, march22, prefall22]).reset_index(drop = True)

# one boolean selector per criterion, reused for the counts and the final subset
isFaculty = combined['Role'] == 'Faculty member, lecturer, instructor, or adjunct faculty'
isSTEM = combined['Primary Field'] == 'STEM'
isResearchU = combined['Institution Type'] == 'Research University'

# report how many respondents satisfy each criterion and each pair of criteria
print('all:', len(combined))
subsetCounts = [
    ('faculty:', isFaculty),
    ('STEM:', isSTEM),
    ('research U:', isResearchU),
    ('faculty + STEM:', isFaculty & isSTEM),
    ('faculty + research U:', isFaculty & isResearchU),
    ('STEM + research U:', isSTEM & isResearchU),
]
for label, selector in subsetCounts:
    print(label, len(combined.loc[selector]))

usedf = combined.loc[isFaculty & isSTEM & isResearchU]

print('faculty + research U + STEM:',len(usedf))

# create the input for the circle visualization
# (compileCircleData / createCircleInput are defined elsewhere in this notebook)
circleDfOut = compileCircleData(usedf)
circleDfOut = circleDfOut.sort_values(by = 'count', ascending = False).reset_index(drop = True)

# only Gender and Race are passed on for this subset
circle = createCircleInput(circleDfOut, ['Gender','Race'])
outPath = os.path.join('demographics_circle_plot', 'src','data','ISTP_demographics_combined_circle_data_STEMR1faculty.json')
with open(outPath, 'w') as outfile:
    outfile.write(json.dumps(circle))

# Compare different files

In [None]:
# per-survey gender tables from getNodeFractions (defined elsewhere in this
# notebook), with the survey-specific column renamed to a common 'gender' key
gender_comparison_prefall22 = getNodeFractions(
    usedfHuman_prefall22['gender_prefall22'].to_frame()
).rename(columns = {'gender_prefall22':'gender'})
gender_comparison_march22 = getNodeFractions(
    usedfHuman_march22['gender_march22'].to_frame()
).rename(columns = {'gender_march22':'gender'})
gender_comparison_oct21 = getNodeFractions(
    usedfHuman_oct21['gender_oct21'].to_frame()
).rename(columns = {'gender_oct21':'gender'})

# join the three surveys side by side; 'count' and 'fraction' get a survey suffix
# NOTE(review): merge defaults to an inner join, so a category missing from any
# one survey is dropped from this table -- confirm that is intended
gender = (
    gender_comparison_march22
    .merge(gender_comparison_oct21, on = 'gender', suffixes = ('_march22','_oct21'))
    .merge(gender_comparison_prefall22, on = 'gender')
    .rename(columns = {'count':'count_prefall22','fraction':'fraction_prefall22'})
    .sort_values('count_march22', ascending = False)
    .reset_index(drop = True)
)
gender.to_csv(os.path.join('analysis','gender_fractions_compared.csv'), index = False)
gender

In [None]:
# per-survey role tables from getNodeFractions (defined elsewhere in this
# notebook), with the survey-specific column renamed to a common 'primerole' key
primerole_comparison_prefall22 = getNodeFractions(
    usedfHuman_prefall22['primerole_prefall22'].to_frame()
).rename(columns = {'primerole_prefall22':'primerole'})
primerole_comparison_march22 = getNodeFractions(
    usedfHuman_march22['primerole_march22'].to_frame()
).rename(columns = {'primerole_march22':'primerole'})
primerole_comparison_oct21 = getNodeFractions(
    usedfHuman_oct21['primerole_oct21'].to_frame()
).rename(columns = {'primerole_oct21':'primerole'})

# join the three surveys side by side; 'count' and 'fraction' get a survey suffix
# NOTE(review): merge defaults to an inner join, so a category missing from any
# one survey is dropped from this table -- confirm that is intended
primerole = (
    primerole_comparison_march22
    .merge(primerole_comparison_oct21, on = 'primerole', suffixes = ('_march22','_oct21'))
    .merge(primerole_comparison_prefall22, on = 'primerole')
    .rename(columns = {'count':'count_prefall22','fraction':'fraction_prefall22'})
    .sort_values('count_march22', ascending = False)
    .reset_index(drop = True)
)
primerole.to_csv(os.path.join('analysis','primerole_fractions_compared.csv'), index = False)
primerole

In [None]:
# per-survey institution tables from getNodeFractions (defined elsewhere in this
# notebook), with the survey-specific column renamed to a common 'institution' key
institution_comparison_prefall22 = getNodeFractions(
    usedfHuman_prefall22['institution_prefall22'].to_frame()
).rename(columns = {'institution_prefall22':'institution'})
institution_comparison_march22 = getNodeFractions(
    usedfHuman_march22['institution_march22'].to_frame()
).rename(columns = {'institution_march22':'institution'})
institution_comparison_oct21 = getNodeFractions(
    usedfHuman_oct21['institution_oct21'].to_frame()
).rename(columns = {'institution_oct21':'institution'})

# join the three surveys side by side; 'count' and 'fraction' get a survey suffix
# NOTE(review): merge defaults to an inner join, so a category missing from any
# one survey is dropped from this table -- confirm that is intended
institution = (
    institution_comparison_march22
    .merge(institution_comparison_oct21, on = 'institution', suffixes = ('_march22','_oct21'))
    .merge(institution_comparison_prefall22, on = 'institution')
    .rename(columns = {'count':'count_prefall22','fraction':'fraction_prefall22'})
    .sort_values('count_march22', ascending = False)
    .reset_index(drop = True)
)
institution.to_csv(os.path.join('analysis','institution_fractions_compared.csv'), index = False)
institution

In [None]:
# per-survey race tables from getNodeFractions (defined elsewhere in this
# notebook); only the pre-fall-2022 column needs renaming to the common 'race' key
race_comparison_prefall22 = getNodeFractions(
    usedfHuman_prefall22['race_prefall22'].to_frame()
).rename(columns = {'race_prefall22':'race'})
race_comparison_march22 = getNodeFractions(usedfHuman_march22['race'].to_frame())
race_comparison_oct21 = getNodeFractions(usedfHuman_oct21['race'].to_frame())

# join the three surveys side by side; 'count' and 'fraction' get a survey suffix
# NOTE(review): merge defaults to an inner join, so a category missing from any
# one survey is dropped from this table -- confirm that is intended
race = (
    race_comparison_march22
    .merge(race_comparison_oct21, on = 'race', suffixes = ('_march22','_oct21'))
    .merge(race_comparison_prefall22, on = 'race')
    .rename(columns = {'count':'count_prefall22','fraction':'fraction_prefall22'})
    .sort_values('count_march22', ascending = False)
    .reset_index(drop = True)
)
race.to_csv(os.path.join('analysis','race_fractions_compared.csv'), index = False)
race

In [None]:
# per-survey STEM tables from getNodeFractions (defined elsewhere in this
# notebook), with the survey-specific column renamed to a common 'STEM' key
STEM_comparison_prefall22 = getNodeFractions(
    usedfHuman_prefall22['STEM_prefall22'].to_frame()
).rename(columns = {'STEM_prefall22':'STEM'})
STEM_comparison_march22 = getNodeFractions(
    usedfHuman_march22['STEM_march22'].to_frame()
).rename(columns = {'STEM_march22':'STEM'})
STEM_comparison_oct21 = getNodeFractions(
    usedfHuman_oct21['STEM_oct21'].to_frame()
).rename(columns = {'STEM_oct21':'STEM'})

# join the three surveys side by side; 'count' and 'fraction' get a survey suffix
# NOTE(review): merge defaults to an inner join, so a category missing from any
# one survey is dropped from this table -- confirm that is intended
STEM = (
    STEM_comparison_march22
    .merge(STEM_comparison_oct21, on = 'STEM', suffixes = ('_march22','_oct21'))
    .merge(STEM_comparison_prefall22, on = 'STEM')
    .rename(columns = {'count':'count_prefall22','fraction':'fraction_prefall22'})
    .sort_values('count_march22', ascending = False)
    .reset_index(drop = True)
)
STEM.to_csv(os.path.join('analysis','STEM_fractions_compared.csv'), index = False)
STEM

In [None]:
# check to see if any are distinct
# For every row of every comparison table, run a chi-square test between each
# pair of surveys and print the pairs with p < 0.05.

dates = ['march22', 'oct21', 'prefall22']
keys = ['gender', 'primerole','institution','race', 'STEM']
dfs = [gender, primerole, institution, race, STEM]

for usedf, key in zip(dfs, keys):
    for i, row in usedf.iterrows():
        # each unordered pair of surveys once; the survey that comes later in
        # `dates` is passed as d1
        for d2, d1 in itertools.combinations(dates, 2):
            # construct a contingency table (construct_contingency is defined
            # elsewhere in this notebook)
            # https://medium.com/mlearning-ai/how-to-perform-chi-square-tests-in-python-e1eabb98ef25
            contingency = construct_contingency(usedf, key, d1, d2, i)
            # perform the chi2 test
            c, p, dof, expected = stats.chi2_contingency(contingency)
            if (p < 0.05):
                print(key, d1, d2, row[key], p)

# Class Data

In [None]:
# load the class roster CSV for the "Class Data" section and display it;
# note that this rebinds the notebook-wide name df
df = pd.read_csv('data/NoNames PHYSICS  130-3 - 02  College Physics - SPR 2023.csv')
df

In [None]:
# split the student program and plan into two columns
# Split on the first '-' only (n = 1): a value containing a second hyphen would
# otherwise expand into three or more columns and the two-column assignment
# below would fail. Anything after the first '-' stays together in 'Major'.
df[['College or School','Major']] = df['Student Program and Plan'].str.split('-', n = 1, expand = True)
df

In [None]:
# list the available column names (useful for choosing useCols below)
print(df.columns.to_list())

In [None]:
# columns of the class data that are kept for the circle visualization;
# 'College or School' is the column created by the split in the cell above
useCols = [
    'College or School', 'Acad Level', 
    'Race/Ethnicity', 'Gender','1st Gen College Student'
]

In [None]:
# work on a copy of the selected columns so df itself is left untouched
usedfHuman = df[useCols].copy()
# in the '1st Gen College Student' column, shorten the long label to 'Yes'
usedfHuman.replace({'1st Gen College Student' : 'First Generation College Student'}, 'Yes', inplace=True)
# entries listing several races (comma separated) go in the 'Multiracial' category;
# na = False makes missing entries count as "no comma" -- without it the mask
# contains NaN and the .loc assignment raises
hasSeveralRaces = usedfHuman['Race/Ethnicity'].str.contains(',', na = False)
usedfHuman.loc[hasSeveralRaces, 'Race/Ethnicity'] = 'Multiracial'
usedfHuman

In [None]:
# compile the class data for the circle visualization, largest groups first
# (compileCircleData / createCircleInput are defined elsewhere in this notebook)
circleDfOut = compileCircleData(usedfHuman)
circleDfOut = circleDfOut.sort_values(by = 'count', ascending = False).reset_index(drop = True)

circleDf = circleDfOut.copy()

# create the input for the circle visualization from every column except the
# last one, and save it as JSON
hierarchy = list(circleDfOut.columns)[:-1]
circle = createCircleInput(circleDf, hierarchy)
outPath = os.path.join('demographics_circle_plot', 'src','data','PHY130-3-02_SPR2023_circle_data.json')
with open(outPath, 'w') as outfile:
    outfile.write(json.dumps(circle))

circleDf

# Facilitator data (circle vis)

In [None]:
# load the cleaned facilitator survey export and display it;
# note that this rebinds the notebook-wide name df
df = pd.read_csv('data/Cleaned_ISTP_Facilitator_Data.csv')
df

In [None]:
# give the survey question columns descriptive names (easier to remember)
questionNames = {
    'Q31': 'primerole',
    'Q33': 'discipline',
    'Q34': 'institution',
    'Q36': 'gender',
    'Q35': 'institutionType',
    'Q37': 'race',
}
df.rename(columns = questionNames, inplace = True)

In [None]:
# columns of the facilitator data used in the rest of this section
# (names as assigned by the rename in the cell above)
# important columns (trying for same as in the first file)
# this does not appear to include the faculty status
# They appear to come from the SP22_RQ2_Participant_ISTP.docx file
# Q35 (institution designation), Q37 (race) need to be split
# role, discipline, institution type, gender, institution designation, race
useCols = [
    'primerole', 'discipline', 'institution', 'gender', 'institutionType', 'race'
]

In [None]:
# keep the chosen columns, discard rows where every one of them is missing,
# and renumber the rows
usedf = df[useCols].dropna(how = 'all').reset_index(drop = True)
# 'STEM' starts as a copy of 'discipline'; it gets its own mapping later on
usedf = usedf.assign(STEM = usedf['discipline'])
usedf

In [None]:
# checking for indigenous (race == 1): show the race entries containing a '1'
# NOTE(review): this is a substring match, so any code containing the digit 1
# (e.g. a two-digit code) would also be listed -- confirm against raceMap
col = usedf['race']
col[col.str.contains('1', na = False)]

In [None]:
def _countMatches(series, pattern):
    """Return how many entries of `series` match the regex `pattern`.

    Missing values count as non-matches (na = False). Without that, the boolean
    mask contains NaN whenever the column has a missing value, and using it to
    index raises. A column that does not hold strings has no `.str` accessor
    and contributes 0.
    """
    try:
        return int(series.str.contains(pattern, regex = True, na = False).sum())
    except AttributeError:
        return 0


# count the NaN values in all the columns:
# real missing values + entries containing the text 'nan' + blank/whitespace-only entries
Nnan = []
for c in useCols:
    N1 = int(usedf[c].isna().sum())
    N2 = _countMatches(usedf[c], 'nan')
    N3 = _countMatches(usedf[c], r'^\s*$')

    Nnan.append(N1 + N2 + N3)
    print(f'# NaN in {c} = {N1 + N2 + N3}')
print('')


# replace the entries

# replace numbers with values; the *Map dictionaries are defined elsewhere in
# this notebook. institutionType and race are not in this dict because they go
# through multiReplacer below.
replacements = {
    'primerole':roleMap,
    'discipline':disciplineMap,
    'STEM':disciplineSTEMMap,
    'institution':institutionMap,
    'gender':genderMap,
}


usedfHuman = usedf.replace(replacements)

# treat the cells with multiple entries a bit differently
usedfHuman['institutionType'] = multiReplacer(usedfHuman, 'institutionType', institutionTypeMap)

# for race we will put them in the 'Multiracial' category
# (the filled column is assigned back rather than using a chained
# fillna(inplace = True), which does not update the frame under pandas
# Copy-on-Write)
usedfHuman['race'] = usedfHuman['race'].fillna('Did not respond (race)')
hasSeveralRaces = usedfHuman['race'].str.contains(',', na = False)
foo = usedfHuman.loc[hasSeveralRaces]
print('number of people with multiple race entries = ', len(foo))
usedfHuman.loc[hasSeveralRaces, 'race'] = 'Multiracial'
usedfHuman['race'] = multiReplacer(usedfHuman, 'race', raceMap)


# remove any extra commas: every cell becomes a string with trailing commas
# and spaces stripped
# NOTE: DataFrame.applymap is deprecated in pandas >= 2.1 in favour of
# DataFrame.map; it is kept here so the cell still runs on older pandas
usedfHuman = usedfHuman.applymap(lambda x: str(x).rstrip(', '))

# fix any lingering nan values (str() above turned missing values into 'nan')
usedfHuman.replace('nan',np.nan, inplace = True)
usedfHuman.replace(r'^\s*$', np.nan, regex = True, inplace = True)

# label the remaining missing answers explicitly. 'STEM' is included so that it
# is handled like 'discipline', as is done for the pre-fall-2022 data; otherwise
# missing values would be passed on to the circle data.
didNotRespond = {
    'primerole': 'Did not respond (role)',
    'discipline': 'Did not respond (discipline)',
    'STEM': 'Did not respond (discipline)',
    'institution': 'Did not respond (institution)',
    'gender': 'Did not respond (gender)',
    'institutionType': 'Did not respond (institution type)',
}
for colName, label in didNotRespond.items():
    usedfHuman[colName] = usedfHuman[colName].fillna(label)
# already filled race above

# print number of people with did not respond values in all the columns
# (the missing-value count from above is subtracted so that only explicit
# "prefer not to respond" answers remain)
print('')
for i, c in enumerate(useCols):
    N1 = len(usedfHuman[usedfHuman[c].str.contains('respond')])
    print(f'# marked "prefer not to respond" {c} = {N1 - Nnan[i]}')

# keep a named handle on this data set; later cells rebind usedfHuman
usedfHuman_facilitators = usedfHuman

usedfHuman

In [None]:
# list the respondents whose race entry mentions 'Indigenous'
col = usedfHuman['race']
mentionsIndigenous = col.str.contains('Indigenous', na = False)
foo = col[mentionsIndigenous].to_frame()
foo

In [None]:
# facilitators: pick the columns for the circle visualization and pair each one
# with its display name (same order in both lists)
cols0 = ['primerole','institution','gender','STEM','race']
hierarchy = ['Role', 'Institution Type', 'Gender', 'Primary Field', 'Race']

# compileCircleData / createCircleInput are defined elsewhere in this notebook
circleDfOut = compileCircleData(usedfHuman_facilitators[cols0])
circleDfOut = circleDfOut.sort_values(by = 'count', ascending = False).reset_index(drop = True)

circleDf = circleDfOut[cols0 + ['count']].rename(columns = dict(zip(cols0, hierarchy)))

# create the input for the circle visualization and save it as JSON
circle = createCircleInput(circleDf, hierarchy)
outPath = os.path.join('demographics_circle_plot', 'src','data','ISTP_demographics_facilitator_circle_data.json')
with open(outPath, 'w') as outfile:
    outfile.write(json.dumps(circle))

circleDf

# Participant data (circle vis)

In [None]:
# load the cleaned participant survey export and show its column names;
# note that this rebinds the notebook-wide name df
df = pd.read_csv('data/Cleaned_ISTP_Participant_Data.csv')
df.columns

In [None]:
# give the survey question columns descriptive names (easier to remember)
questionNames = {
    'Q31': 'primerole',
    'Q33.1': 'discipline',
    'Q34.1': 'institution',
    'Q36': 'gender',
    'Q35.1': 'institutionType',
    'Q37': 'race',
}
df.rename(columns = questionNames, inplace = True)

In [None]:
# columns of the participant data used in the rest of this section
# (names as assigned by the rename in the cell above)
# important columns (trying for same as in the first file)
# this does not appear to include the faculty status
# They appear to come from the SP22_RQ2_Participant_ISTP.docx file
# Q35 (institution designation), Q37 (race) need to be split
# role, discipline, institution type, gender, institution designation, race
useCols = [
    'primerole', 'discipline', 'institution', 'gender', 'institutionType', 'race'
]

In [None]:
# keep the chosen columns, discard rows where every one of them is missing,
# and renumber the rows
usedf = df[useCols].dropna(how = 'all').reset_index(drop = True)
# 'STEM' starts as a copy of 'discipline'; it gets its own mapping later on
usedf = usedf.assign(STEM = usedf['discipline'])
usedf

In [None]:
# checking for indigenous (race == 1): show the race entries containing a '1'
# NOTE(review): this is a substring match, so any code containing the digit 1
# (e.g. a two-digit code) would also be listed -- confirm against raceMap
col = usedf['race']
col[col.str.contains('1', na = False)]

In [None]:
def _countMatches(series, pattern):
    """Return how many entries of `series` match the regex `pattern`.

    Missing values count as non-matches (na = False). Without that, the boolean
    mask contains NaN whenever the column has a missing value, and using it to
    index raises. A column that does not hold strings has no `.str` accessor
    and contributes 0.
    """
    try:
        return int(series.str.contains(pattern, regex = True, na = False).sum())
    except AttributeError:
        return 0


# count the NaN values in all the columns:
# real missing values + entries containing the text 'nan' + blank/whitespace-only entries
Nnan = []
for c in useCols:
    N1 = int(usedf[c].isna().sum())
    N2 = _countMatches(usedf[c], 'nan')
    N3 = _countMatches(usedf[c], r'^\s*$')

    Nnan.append(N1 + N2 + N3)
    print(f'# NaN in {c} = {N1 + N2 + N3}')
print('')


# replace the entries

# replace numbers with values; the *Map dictionaries are defined elsewhere in
# this notebook. institutionType and race are not in this dict because they go
# through multiReplacer below.
replacements = {
    'primerole':roleMap,
    'discipline':disciplineMap,
    'STEM':disciplineSTEMMap,
    'institution':institutionMap,
    'gender':genderMap,
}


usedfHuman = usedf.replace(replacements)

# treat the cells with multiple entries a bit differently
usedfHuman['institutionType'] = multiReplacer(usedfHuman, 'institutionType', institutionTypeMap)

# for race we will put them in the 'Multiracial' category
# (the filled column is assigned back rather than using a chained
# fillna(inplace = True), which does not update the frame under pandas
# Copy-on-Write)
usedfHuman['race'] = usedfHuman['race'].fillna('Did not respond (race)')
hasSeveralRaces = usedfHuman['race'].str.contains(',', na = False)
foo = usedfHuman.loc[hasSeveralRaces]
print('number of people with multiple race entries = ', len(foo))
usedfHuman.loc[hasSeveralRaces, 'race'] = 'Multiracial'
usedfHuman['race'] = multiReplacer(usedfHuman, 'race', raceMap)


# remove any extra commas: every cell becomes a string with trailing commas
# and spaces stripped
# NOTE: DataFrame.applymap is deprecated in pandas >= 2.1 in favour of
# DataFrame.map; it is kept here so the cell still runs on older pandas
usedfHuman = usedfHuman.applymap(lambda x: str(x).rstrip(', '))

# fix any lingering nan values (str() above turned missing values into 'nan')
usedfHuman.replace('nan',np.nan, inplace = True)
usedfHuman.replace(r'^\s*$', np.nan, regex = True, inplace = True)

# label the remaining missing answers explicitly. 'STEM' is included so that it
# is handled like 'discipline', as is done for the pre-fall-2022 data; otherwise
# missing values would be passed on to the circle data.
didNotRespond = {
    'primerole': 'Did not respond (role)',
    'discipline': 'Did not respond (discipline)',
    'STEM': 'Did not respond (discipline)',
    'institution': 'Did not respond (institution)',
    'gender': 'Did not respond (gender)',
    'institutionType': 'Did not respond (institution type)',
}
for colName, label in didNotRespond.items():
    usedfHuman[colName] = usedfHuman[colName].fillna(label)
# already filled race above

# print number of people with did not respond values in all the columns
# (the missing-value count from above is subtracted so that only explicit
# "prefer not to respond" answers remain)
print('')
for i, c in enumerate(useCols):
    N1 = len(usedfHuman[usedfHuman[c].str.contains('respond')])
    print(f'# marked "prefer not to respond" {c} = {N1 - Nnan[i]}')

# keep a named handle on this data set; later cells rebind usedfHuman
usedfHuman_participants = usedfHuman

usedfHuman

In [None]:
# list the respondents whose race entry mentions 'Indigenous'
col = usedfHuman['race']
mentionsIndigenous = col.str.contains('Indigenous', na = False)
foo = col[mentionsIndigenous].to_frame()
foo

In [None]:
# participants: pick the columns for the circle visualization and pair each one
# with its display name (same order in both lists)
cols0 = ['primerole','institution','gender','STEM','race']
hierarchy = ['Role', 'Institution Type', 'Gender', 'Primary Field', 'Race']

# Read from the usedfHuman_participants snapshot rather than the working name
# usedfHuman: usedfHuman is rebound by every cleaning cell, so after re-running
# an earlier section it could hold a different data set and this file would be
# written with the wrong data.
# (compileCircleData / createCircleInput are defined elsewhere in this notebook)
circleDfOut = compileCircleData(usedfHuman_participants[cols0])
circleDfOut = circleDfOut.sort_values(by = 'count', ascending = False).reset_index(drop = True)

circleDf = circleDfOut[cols0 + ['count']].rename(columns = dict(zip(cols0, hierarchy)))

# create the input for the circle visualization and save it as JSON
circle = createCircleInput(circleDf, hierarchy)
outPath = os.path.join('demographics_circle_plot', 'src','data','ISTP_demographics_participant_circle_data.json')
with open(outPath, 'w') as outfile:
    outfile.write(json.dumps(circle))

circleDf

In [None]:
# create a file that only has the STEM, R1, faculty and will plot gender and Race
cols0 = ['primerole','institution','gender','STEM','race']
displayNames = {
    'primerole': 'Role',
    'institution': 'Institution Type',
    'gender': 'Gender',
    'STEM': 'Primary Field',
    'race': 'Race',
}
# Read from the usedfHuman_participants snapshot rather than the working name
# usedfHuman: usedfHuman is rebound by every cleaning cell, so after re-running
# an earlier section it could hold a different data set.
# NOTE: this rebinds `combined`, which an earlier cell uses for the three
# stacked surveys.
combined = usedfHuman_participants[cols0].rename(columns = displayNames)

# one boolean selector per criterion, reused for the counts and the final subset
isFaculty = combined['Role'] == 'Faculty member, lecturer, instructor, or adjunct faculty'
isSTEM = combined['Primary Field'] == 'STEM'
isResearchU = combined['Institution Type'] == 'Research University'

# report how many respondents satisfy each criterion and each pair of criteria
print('all:', len(combined))
subsetCounts = [
    ('faculty:', isFaculty),
    ('STEM:', isSTEM),
    ('research U:', isResearchU),
    ('faculty + STEM:', isFaculty & isSTEM),
    ('faculty + research U:', isFaculty & isResearchU),
    ('STEM + research U:', isSTEM & isResearchU),
]
for label, selector in subsetCounts:
    print(label, len(combined.loc[selector]))

usedf = combined.loc[isFaculty & isSTEM & isResearchU]

print('faculty + research U + STEM:',len(usedf))

# create the input for the circle visualization
# (compileCircleData / createCircleInput are defined elsewhere in this notebook)
circleDfOut = compileCircleData(usedf)
circleDfOut = circleDfOut.sort_values(by = 'count', ascending = False).reset_index(drop = True)

# only Gender and Race are passed on for this subset
circle = createCircleInput(circleDfOut, ['Gender','Race'])
outPath = os.path.join('demographics_circle_plot', 'src','data','ISTP_demographics_participant_circle_data_STEMR1faculty.json')
with open(outPath, 'w') as outfile:
    outfile.write(json.dumps(circle))
