# ISTP demographics

Data is from the ```ISTP Pro/Post Survey - Demographics (Vanessa)``` folder on Drive

I will create files and tree plots.

Running in the ```ete3``` conda environment (in WSL).

In [None]:
import pandas as pd
import numpy as np
import itertools
import os

In [None]:
from ete3 import Tree, faces, AttrFace, TreeStyle, TextFace, add_face_to_node

In [None]:
# cap the number of rows pandas prints when a DataFrame is displayed
pd.set_option('display.max_rows', 20)

In [None]:
def getDemographics(inputdf):
    """Count how many rows share each combination of demographic values.

    For every subset of 1 .. (number of columns - 1) columns, the rows of
    ``inputdf`` are grouped by that subset and counted.  Columns that are not
    part of a subset are left as NaN (meaning "any value") in the result.

    Parameters
    ----------
    inputdf : pandas.DataFrame
        One row per respondent and one column per demographic axis.

    Returns
    -------
    groupdf : pandas.DataFrame
        Columns ``count``, ``fraction`` (count / number of rows in
        ``inputdf``), ``nAxes`` (number of axes that are set in the row) and
        one column per input column, sorted by ``count`` descending.
    groupdfTrim : pandas.DataFrame
        The rows of ``groupdf`` with ``count`` > 5.
    outdf : pandas.DataFrame
        ``count``, ``fraction``, ``nAxes`` plus a ``group`` column holding the
        set values of each row joined with ``'; '``.

    Raises
    ------
    ValueError
        If ``inputdf`` has fewer than two columns (there is nothing to group).
    """

    # using method from : https://stackoverflow.com/questions/35268817/unique-combinations-of-values-in-selected-columns-in-pandas-data-frame-and-count

    cols = inputdf.columns
    # the fractions are relative to the frame that was passed in, not to
    # whatever global dataframe happens to exist in the notebook
    nTotal = len(inputdf)

    # NOTE(review): the loop stops at len(cols) - 1, so the combination that
    # uses *all* columns at once is never tabulated -- confirm this is intended
    frames = []
    for i in range(1, len(cols)):
        # get all the combinations of i columns
        itr = list(itertools.combinations(cols, i))
        print(i, len(itr))

        # these columns will be used in groupby while others will be anything
        for useColList in itr:
            useColList = list(useColList)
            g = inputdf.groupby(useColList).size().reset_index(name = 'count')

            # add the missing column(s) as NaN
            for cc in cols:
                if (cc not in useColList):
                    g.insert(0, cc, np.nan)

            # remove any rows that are all nans (excluding count)
            g = g.dropna(how = 'all', subset = useColList)

            # move the count column to be first
            count = g.pop('count')
            g.insert(0, 'count', count)

            frames.append(g)

        print(sum(len(f) for f in frames))

    if (not frames):
        raise ValueError('getDemographics needs at least two columns to group by')

    # a single concat (instead of one per combination) avoids re-copying the
    # growing table on every iteration
    groupdf = pd.concat(frames, ignore_index = True)

    # convert any entry with a space or blank entry to nan
    groupdf = groupdf.replace(r'^\s*$', np.nan, regex = True)

    # remove duplicates
    groupdf = groupdf.drop_duplicates(keep = 'first')

    # sort
    groupdf = groupdf.sort_values(by = 'count', ascending = False)

    # add a column that has the fraction of total
    groupdf.insert(1, 'fraction', groupdf['count']/nTotal)

    # add a column to count the number of non-nan entries in each row (excluding "count" and "fraction")
    groupdf.insert(2, 'nAxes', groupdf.count(axis = 1) - 2)

    # remove any rows with nAxes == 0
    groupdf = groupdf.loc[groupdf['nAxes'] > 0].reset_index(drop = True)

    # take only the rows with > 5 people in the group (already sorted above)
    groupdfTrim = groupdf.loc[groupdf['count'] > 5]

    # combine groups into a single column, and output a condensed file
    groups = []
    for _, row in groupdf.iterrows():
        labels = row[cols].dropna().values
        # str() so that a non-string value cannot break the join
        groups.append('; '.join(str(x) for x in labels if str(x).strip()))

    outdf = groupdf[['count','fraction','nAxes']].copy()
    outdf['group'] = groups

    return groupdf, groupdfTrim, outdf

In [None]:
def multiReplacer(inputdf, c, m):
    """Translate a column of comma-separated numeric codes into labels.

    Parameters
    ----------
    inputdf : pandas.DataFrame
        Frame holding the column; it is not modified.
    c : str
        Name of the column to translate.
    m : mapping
        Integer code -> label.  A code whose label is NaN is dropped.

    Returns
    -------
    numpy.ndarray (dtype object)
        One string per row, in row order.  Every mapped label is followed by
        ``', '`` and repeated labels are written once.  A cell that holds
        something other than known codes (e.g. an already-written label) is
        returned as that text.  Missing cells become
        ``'Did not respond (<c>)'``.
    """
    missingLabel = 'Did not respond (' + c + ')'

    # build a fresh array and fill it by position, so the result does not
    # depend on the frame's index labels and the caller's data is untouched
    replacer = np.empty(len(inputdf), dtype = object)

    for pos, cell in enumerate(inputdf[c].tolist()):
        if (pd.isna(cell)):
            replacer[pos] = missingLabel
            continue

        # split into individual codes; can't use a simple string replace
        # because there are single and double digits (e.g., 1 vs 21)
        labels = []
        replace = ''
        for v in filter(None, str(cell).split(',')): # remove empty strings
            try:
                val = m[int(float(v))]
                # compare whole labels (not substrings) when de-duplicating
                if (not pd.isna(val) and val not in labels):
                    labels.append(val)
                    replace += val + ', '
            except (ValueError, KeyError, OverflowError, TypeError):
                # not a known numeric code: keep the raw text.  As before,
                # this replaces anything collected so far for this cell.
                labels = []
                replace = v

        replacer[pos] = replace

    return replacer

In [None]:
# collapsing categories (see demographicsBreakdown.ipynb for full categories)
# I am going to ignore "Other" answers
#
# Each map goes from the numeric survey code to the label used in the outputs.
# Several codes may share a label (that is the "collapsing"), and the np.nan key
# gives the label for a missing answer.

# primary role; codes 2 and 3 are merged into one category
roleMap = {
    1:'Faculty member, lecturer, instructor, or adjunct faculty',
    2:'Graduate student or Postdoctoral scholar',
    3:'Graduate student or Postdoctoral scholar',
    4:'Staff member',
    5:'Other (role)',
    np.nan:'Did not respond (role)'

}
# discipline, one label per code
disciplineMap = {
    1:'Agriculture and natural resource sciences',
    2:'Arts',
    3:'Biological and life sciences',
    4:'Business and management science',
    5:'Chemistry',
    6:'Computer, information, and technological sciences',
    7:'Earth, environmental, atmospheric, and ocean sciences',
    8:'Education',
    9:'Engineering',
    10:'Humanities',
    11:'Law',
    12:'Mathematics and Statistics',
    13:'Medical sciences',
    14:'Physical sciences',
    15:'Psychology',
    16:'Social, behavioral, and economic sciences (not including psychology)',
    17:'Other (discipline)',
    np.nan:'Did not respond (discipline)'
}
# same codes as disciplineMap, collapsed to 'STEM' / 'non-STEM'
disciplineSTEMMap = {
    1:'STEM',
    2:'non-STEM',
    3:'STEM',
    4:'non-STEM',
    5:'STEM',
    6:'STEM',
    7:'STEM',
    8:'non-STEM',
    9:'STEM',
    10:'non-STEM',
    11:'non-STEM',
    12:'STEM',
    13:'non-STEM',
    14:'STEM',
    15:'STEM',
    16:'STEM',
    17:'Other (discipline)',
    np.nan:'Did not respond (discipline)'
}
# institution kind; this coding is applied to the 'institution_march22' column
institutionMap = {
    1:'Community college / 2-year institution',
    7:'Comprehensive or Regional University',
    8:'Liberal arts college',
    9:'Research University',
    10:'Technical college', 
    11:'Other (institution)',
    np.nan:'Did not respond (institution)'

}
# same labels with the codes 1-6; applied to the 'institution_oct21' column
institutionMap_oct21 = {
    1:'Community college / 2-year institution',
    2:'Comprehensive or Regional University',
    3:'Liberal arts college',
    4:'Research University',
    5:'Technical college', 
    6:'Other (institution)',
    np.nan:'Did not respond (institution)'

}
# gender; this coding is applied to the 'gender_march22' column
genderMap = {
    1:'Non-binary, gender queer, self-identify', 
    8:'Cis-Man/Trans-Man',
    9:'Non-binary, gender queer, self-identify',
    10:'Cis-Man/Trans-Man',
    14:'Cis-Woman/Trans-Woman', 
    11:'Cis-Woman/Trans-Woman', 
    12:'Non-binary, gender queer, self-identify',
    13:'I prefer not to respond (gender)',
    np.nan:'Did not respond (gender)'
}

# gender with the codes 1-7 and a different set of labels; applied to the 'gender_oct21' column
genderMap_oct21 = {
    1:'Non-binary, gender queer, self-identify', 
    2:'Man',
    3:'Non-binary, gender queer, self-identify',
    4:'Transgender',
    5:'Woman', 
    6:'Non-binary, gender queer, self-identify', 
    7:'I prefer not to respond (gender)',
    np.nan:'Did not respond (gender)'
}
# these are checkboxes so I will keep each individual column
# (a cell can therefore hold several comma-separated codes)
# codes 1, 8, 9, 11 and 12 are merged into a single minority-focussed category
institutionTypeMap = {
    1:'Minority-focussed institution',
    8:'Minority-focussed institution', 
    9:'Minority-focussed institution', 
    10:'Predominantly White Institution (PWI)',
    11:'Minority-focussed institution',
    12:'Minority-focussed institution',
    13:'I am not sure (institution type)',
    # capitalised like the missing-answer label of every other map
    np.nan:'Did not respond (institution type)'
}
# race / ethnicity code -> label; several codes share the 'Asian' and 'other POC' labels
raceMap = {
    1:'Alaska Native, American Indian, Native American or Indigenous',
    14:'Asian',
    15:'Black or African American',
    16:'Asian',
    17:'Latina/o/x or Hispanic',
    18:'other POC',
    19:'other POC',
    20:'Asian',
    21:'Asian',
    22:'White',
    23:'Multiracial',
    24:'I self-describe as (race)',
    25:'I prefer not to respond (race)',
    np.nan:'Did not respond (race)'

}
# not included in these files
# (faculty status; code 15 maps to NaN, i.e. it has no label)
tenureMap = {
    7:'Tenured and tenure-track',
    19:'Tenured and tenure-track',
    12:'Full-time teaching/instructional or research',
    20:'Full-time teaching/instructional or research',
    23:'Part-time teaching/instructional',
    22:'Full-time teaching/instructional or research',
    21:'Full-time teaching/instructional or research',
    15:np.nan,
    np.nan:'Did not respond (tenure)'

}

## First file
```data/ISTP_demographics_spring22_Aaron.csv```

In [None]:
# load the spring-22 survey export and display it
df = pd.read_csv('data/ISTP_demographics_spring22_Aaron.csv')
df

In [None]:
# list the column labels of the loaded file
df.columns

In [None]:
# columns of the spring-22 file that are used for the demographic breakdown:
# role, discipline, institution type, gender,  institution designation, race
# (no faculty status in this file?),
# NOTE(review): 'institution_msi_oct22' has an oct22 suffix while the other
# columns end in march22 -- confirm this is the intended column
useCols = [
    'primerole_march22', 'discipline_march22', 'institution_msi_oct22', 
    'gender_march22', 'institution_march22','race'
]

In [None]:
# keep only the rows flagged as complete (completion_binary == 1), restricted
# to the demographic columns, and renumber the rows from 0
usedf = df.loc[df['completion_binary'] == 1, useCols].reset_index(drop = True)
# add an additional column for STEM; it starts out as a copy of the discipline codes
usedf = usedf.assign(STEM_march22 = usedf['discipline_march22'])
usedf

In [None]:
#checking for indigenous (race == 1)
# a race cell can list several codes separated by commas, so match 1 only as a
# whole code; a plain substring test for '1' would also hit 14-19 and 21
col = usedf['race']
foo = pd.DataFrame(col[col.astype(str).str.contains(r'(?<!\d)1(?!\d)', regex = True, na = False)])
foo

In [None]:
# replace the entries

# replace numbers with values
# (single-answer columns: each code maps straight to its label)
replacements = {
    'primerole_march22':roleMap,
    'discipline_march22':disciplineMap,
    'STEM_march22':disciplineSTEMMap,
    'institution_march22':institutionMap,
    'gender_march22':genderMap,
#     'Q35.1':institutionTypeMap,
#     'race':raceMap
}

usedfHuman = usedf.replace(replacements)

# treat the cells with multiple entries a bit differently
usedfHuman['institution_msi_oct22'] = multiReplacer(usedfHuman, 'institution_msi_oct22', institutionTypeMap)

# for race we will put them in the 'Multiracial' category
# (the filled column is assigned back: calling fillna(inplace = True) on a
# selected column does not update the frame under pandas copy-on-write)
usedfHuman['race'] = usedfHuman['race'].fillna('Did not respond (race)')
# astype(str) keeps the mask strictly boolean even if a cell is not a string
multiRace = usedfHuman['race'].astype(str).str.contains(',', regex = False)
usedfHuman.loc[multiRace, 'race'] = 'Multiracial'
usedfHuman['race'] = multiReplacer(usedfHuman, 'race', raceMap)

# remove any extra commas
# (element-wise via Series.map; DataFrame.applymap is deprecated in recent pandas)
usedfHuman = usedfHuman.apply(lambda s: s.map(lambda x: str(x).rstrip(', ')))

# fix any lingering nan values
usedfHuman = usedfHuman.replace('nan', np.nan)
usedfHuman = usedfHuman.replace(r'^\s*$', np.nan, regex = True)

usedfHuman = usedfHuman.fillna({
    'primerole_march22':'Did not respond (role)',
    'discipline_march22':'Did not respond (discipline)',
    'STEM_march22':'Did not respond (discipline)',
    'institution_march22':'Did not respond (institution)',
    'gender_march22':'Did not respond (gender)',
    'institution_msi_oct22':'Did not respond (institution type)',
})
# already filled race above

usedfHuman

In [None]:
# spot check: row 17 after the codes were replaced by labels
usedfHuman.iloc[17]

In [None]:
# spot check: the same row 17 with the original codes
usedf.iloc[17]

In [None]:
# build the group tables for the spring-22 data: full, count > 5 only, and condensed
groupdf, groupdfTrim, outdf = getDemographics(usedfHuman)

In [None]:
# display the groups with more than 5 people
groupdfTrim

In [None]:
# display the condensed table (one text label per group)
outdf

In [None]:
# write the condensed and the full group tables to the analysis folder (no index column)
outdf.to_csv(os.path.join('analysis','ISTP_demographics_march22_Aaron_demographicsGroupsCondensed.csv'), index = False)
groupdf.to_csv(os.path.join('analysis','ISTP_demographics_march22_Aaron_demographicsGroupsFull.csv'), index = False)

## Make the trees

In [None]:
def addNodesToTree(base, cols, i, nodes, inputdf):
    """Attach one child per distinct value of column ``cols[i]`` to ``base``.

    Each child is labelled with the value, the number of matching rows and the
    share of ``inputdf`` they make up, and gets that row count as ``support``.
    The function then recurses into ``cols[i+1]`` using only the matching rows,
    so every level of the tree splits its parent's rows by the next column.

    base    : node to attach to (needs an ``add_child(name=...)`` method)
    cols    : ordered column names, one per tree level
    i       : position in ``cols`` handled by this call
    nodes   : dict filled with label -> created node
    inputdf : rows that belong to ``base``
    """
    c = cols[i]
    if c not in inputdf.columns:
        return

    nParent = len(inputdf)
    hasNextLevel = i + 1 < len(cols)

    for value in inputdf[c].dropna().unique():
        subset = inputdf.loc[inputdf[c] == value]
        nRows = len(subset)
        if nRows <= 0:
            continue

        # label: " value [rows, percent-of-parent%] "
        name = ' ' + value + ' [' + str(nRows) + ', {:.1f}%] '.format(nRows/nParent*100.)
        child = base.add_child(name = name)
        child.support = nRows
        nodes[name] = child

        # recursively move down the tree
        if hasNextLevel:
            addNodesToTree(child, cols, i+1, nodes, subset)

In [None]:
# https://github.com/etetoolkit/ete/issues/219
def my_layout(node):
    # draw the node's name as tight text to the right of its branch
    add_face_to_node(TextFace(node.name, tight_text = True), node, column = 0, position = "branch-right")
        
# http://etetoolkit.org/docs/latest/faqs/#how-do-i-visualize-internal-node-names
# def my_layout(node):
#     if node.is_leaf():
#         # If terminal node, draws its name
#         name_face = AttrFace("name")
#     else:
#         # If internal node, draws label with smaller font size
#         name_face = AttrFace("name", fsize=10)
#     # Adds the name face to the image at the preferred position
#     faces.add_face_to_node(name_face, node, column=0, position="branch-right")

# style shared by every tree figure rendered in this notebook
tree_style = TreeStyle()

# Do not add leaf names automatically
tree_style.show_leaf_name = False

# increase the y spacing
tree_style.branch_vertical_margin = 10

# remove the scale bar at the bottom
tree_style.show_scale = False

# Use my custom layout
tree_style.layout_fn = my_layout

In [None]:
def createSingleColTree(df, filename):
    """Render a tree of the value counts in ``df`` and save it to ``filename``.

    df       : DataFrame whose columns become the tree levels, in column order
    filename : output path, taken relative to the current working directory

    The figure is rendered 11 inches wide with the module-level ``tree_style``.
    The output folder is created if it does not exist yet.
    """

    # create the tree
    tree = Tree()
    nodes = {}
    addNodesToTree(tree, df.columns, 0, nodes, df)

    fname = os.path.join(os.getcwd(),filename)

    # make sure the target folder exists, otherwise the figure cannot be written
    os.makedirs(os.path.dirname(fname), exist_ok = True)

    # seems like to need to remove the file first or else it doesn't get written
    if os.path.isfile(fname):
        os.remove(fname)

    # write the file
    _ = tree.render(fname, w=11, units="in", tree_style=tree_style)

In [None]:
# one single-column tree figure per demographic axis of the spring-22 data
for column, pdfName in [
    ('gender_march22', 'ISTP_gender_march22_tree.pdf'),
    ('race', 'ISTP_race_march22_tree.pdf'),
    ('institution_march22', 'ISTP_institution_march22_tree.pdf'),
    ('primerole_march22', 'ISTP_role_march22_tree.pdf'),
    ('STEM_march22', 'ISTP_STEM_march22_tree.pdf'),
]:
    createSingleColTree(usedfHuman[column].to_frame(), os.path.join('analysis','figures',pdfName))

## Second file
```data/ISTP_demographics_fall21_Aaron.csv```

In [None]:
# load the fall-21 survey export and display it
# (this rebinds df, replacing the spring-22 data loaded earlier)
df = pd.read_csv('data/ISTP_demographics_fall21_Aaron.csv')
df

In [None]:
# list the column labels of the fall-21 file
df.columns

In [None]:
# columns of the fall-21 file that are used for the demographic breakdown:
# role, discipline, institution type, gender,  institution designation, race
# (no faculty status in this file?),
useCols = [
    'primerole_oct21', 'discipline_oct21', 'institution_msi_oct21', 
    'gender_oct21', 'institution_oct21','race'
]

In [None]:
# keep only the rows flagged as complete (completion_binary_oct21 == 1),
# restricted to the demographic columns, and renumber the rows from 0
usedf = df.loc[df['completion_binary_oct21'] == 1, useCols].reset_index(drop = True)
# add an additional column for STEM; it starts out as a copy of the discipline codes
usedf = usedf.assign(STEM_oct21 = usedf['discipline_oct21'])
usedf

In [None]:
# replace the entries

# replace numbers with values
# (single-answer columns: each code maps straight to its label)
replacements = {
    'primerole_oct21':roleMap,
    'discipline_oct21':disciplineMap,
    'STEM_oct21':disciplineSTEMMap,
    'institution_oct21':institutionMap_oct21,
    'gender_oct21':genderMap_oct21,
#     'Q35.1':institutionTypeMap,
#     'race':raceMap
}

usedfHuman = usedf.replace(replacements)

# treat the cells with multiple entries a bit differently
usedfHuman['institution_msi_oct21'] = multiReplacer(usedfHuman, 'institution_msi_oct21', institutionTypeMap)

# for race we will put them in the 'Multiracial' category
# (the filled column is assigned back: calling fillna(inplace = True) on a
# selected column does not update the frame under pandas copy-on-write)
usedfHuman['race'] = usedfHuman['race'].fillna('Did not respond (race)')
# astype(str) keeps the mask strictly boolean even if a cell is not a string
multiRace = usedfHuman['race'].astype(str).str.contains(',', regex = False)
usedfHuman.loc[multiRace, 'race'] = 'Multiracial'
usedfHuman['race'] = multiReplacer(usedfHuman, 'race', raceMap)

# remove any extra commas
# (element-wise via Series.map; DataFrame.applymap is deprecated in recent pandas)
usedfHuman = usedfHuman.apply(lambda s: s.map(lambda x: str(x).rstrip(', ')))

# fix any lingering nan values
usedfHuman = usedfHuman.replace('nan', np.nan)
usedfHuman = usedfHuman.replace(r'^\s*$', np.nan, regex = True)

usedfHuman = usedfHuman.fillna({
    'primerole_oct21':'Did not respond (role)',
    'discipline_oct21':'Did not respond (discipline)',
    'STEM_oct21':'Did not respond (discipline)',
    'institution_oct21':'Did not respond (institution)',
    'gender_oct21':'Did not respond (gender)',
    'institution_msi_oct21':'Did not respond (institution type)',
})
# already filled race above

usedfHuman

In [None]:
# build the group tables for the fall-21 data: full, count > 5 only, and condensed
groupdf, groupdfTrim, outdf = getDemographics(usedfHuman)

In [None]:
# display the groups with more than 5 people
groupdfTrim

In [None]:
# display the condensed table (one text label per group)
outdf

In [None]:
# write the condensed and the full group tables to the analysis folder (no index column)
outdf.to_csv(os.path.join('analysis','ISTP_demographics_oct21_Aaron_demographicsGroupsCondensed.csv'), index = False)
groupdf.to_csv(os.path.join('analysis','ISTP_demographics_oct21_Aaron_demographicsGroupsFull.csv'), index = False)

In [None]:
# one single-column tree figure per demographic axis of the fall-21 data
for column, pdfName in [
    ('gender_oct21', 'ISTP_gender_oct21_tree.pdf'),
    ('race', 'ISTP_race_oct21_tree.pdf'),
    ('institution_oct21', 'ISTP_institution_oct21_tree.pdf'),
    ('primerole_oct21', 'ISTP_role_oct21_tree.pdf'),
    ('STEM_oct21', 'ISTP_STEM_oct21_tree.pdf'),
]:
    createSingleColTree(usedfHuman[column].to_frame(), os.path.join('analysis','figures',pdfName))

## Create an input data file for the bundling vis

In [None]:
import json

In [None]:
# Build the records for the bundling vis from the trimmed (count > 5) groups.
# Every group row yields one record per demographic axis that is set in it;
# the record lists the row's other set axes so the vis can link them.
bundling = []

cols1 = ['primerole_oct21','institution_oct21','gender_oct21','race','count']
bundlingDf = groupdfTrim[cols1].rename(columns = {'primerole_oct21':'Role',
                   'institution_oct21':'Institution Type',
                  'gender_oct21':'Gender',
                  'race':'Race'})
cols = ['Role', 'Institution Type', 'Gender', 'Race']

# do I need to expand this to have multiple rows per person, one for each demographic category?

for i, row in bundlingDf.iterrows():
    # the row label, zero-padded, is used as the id of this group
    person = 'person' + str(i).zfill(3)

    # "axis.value" for every axis that has a value in this row
    demo_list = [c + '.' + row[c] for c in cols if not pd.isnull(row[c])]

    for demo in demo_list:
        bundling.append({
            'name':demo + '.' + person,
            'other_demographics':[v + '.' + person for v in demo_list if v != demo],
            'size': row['count']
        })

# I need to combine duplicates!

In [None]:
# save the bundling records as JSON for the circle-bundling plot
bundlingPath = os.path.join('demographics_circle_bundling_plot', 'src','data','ISTP_demographics_oct21_bundling.json')
with open(bundlingPath, 'w') as outfile:
    json.dump(bundling, outfile)

In [None]:
# list the distinct gender labels that ended up in the bundling table
bundlingDf['Gender'].unique()