# Tree diagram of demographics

This looks like a good package to try: http://etetoolkit.org/. However, it apparently only works on Linux and Mac (not Windows), so I will try it in WSL.

```
conda create -n ete3 python=3 jupyter pandas numpy
conda activate ete3
conda install -c etetoolkit ete3 ete_toolchain 
ete3 build check
```

That build check reported that muscle and slr were "missing" (though they are installed, so the installed versions must not be compatible). I tried following the recommendation to run `upgrade-external-tools`, but that didn't work.  Maybe I don't need them...

```
conda install -c conda-forge cxx-compiler
ete3 upgrade-external-tools
```



I need to envision how this tree would look.  I suppose the root breaks into man vs. woman and on each side I repeat the breakdown while there are still people in the category.
- How should I deal with multiple races and other multiple-answer columns? (Currently they each get their own branch.)


In [None]:
from ete3 import Tree, faces, AttrFace, TreeStyle

In [None]:
import pandas as pd
import numpy as np
import itertools

In [None]:
# mapping for the answer from numbers to words from the SP22_RQ2_Participant_ISTP.docx file
# I am going to ignore "Other" answers
# Role codes -> human-readable labels; the "Other" code is mapped to NaN.
roleMap = {
    1:'Faculty member, lecturer, instructor, or adjunct faculty',
    2:'Graduate student',
    3:'Postdoctoral scholar',  # spelling fixed (was 'Posdoctoral'); this text is shown in the tree
    4:'Staff member',
#    5:'Other (role)'
    5:np.nan
}
# Discipline codes -> human-readable labels; the "Other" code (17) is mapped to NaN.
disciplineMap = {
    1:'Agriculture and natural resource sciences',
    2:'Arts',
    3:'Biological and life sciences',
    4:'Business and management science',
    5:'Chemistry',
    6:'Computer, information, and technological sciences',
    7:'Earth, environmental, atmospheric, and ocean sciences',
    8:'Education',
    9:'Engineering',
    10:'Humanities',
    11:'Law',
    12:'Mathematics and Statistics',
    13:'Medical sciences',
    14:'Physical sciences',
    15:'Psychology',
    16:'Social, behavioral, and economic sciences (not including psychology)',
#    17:'Other (discipline)'
    17:np.nan
}
# Institution codes -> human-readable labels; the "Other" code (11) is mapped to NaN.
# NOTE: the codes are not contiguous (1, 7, 8, 9, 10, 11).
institutionMap = {
    1:'Community college / 2-year institution',
    7:'Comprehensive or Regional University (e.g., smaller state school, schools that offer mostly bachelor or masters degrees)',
    8:'Liberal arts college',
    9:'Research University',
    10:'Technical college', 
#    11:'Other (institution)'
    11:np.nan
}
# Gender codes -> human-readable labels; the self-describe (12) and
# prefer-not-to-respond (13) codes are mapped to NaN.
genderMap = {
    1:'Gender queer or gender non-conforming', 
    8:'Man',
    9:'Nonbinary',
    10:'Transman',
    14:'Transwoman', 
    11:'Woman', 
#    12:'I self-describe as (gender)',
#    13:'I prefer not to respond (gender).'
    12:np.nan,
    13:np.nan
}
# Single-code maps for first-generation and veteran flags.
# NOTE(review): neither is referenced in the visible cells of this notebook.
firstgenMap = {1:'first gen'}
armyMap = {1:'veteran'}

# these are checkboxes so I will keep each individual column
# Institution-designation codes -> labels; "I am not sure" (13) is mapped to NaN.
# A single cell may hold several comma-separated codes, so this map is applied
# through multiReplacer rather than DataFrame.replace.
institutionTypeMap = {
    1:'Asian American and Pacific Islander Serving Institution (AAPISI)',
    8:'Hispanic Serving Institution (HSI)', 
    9:'Historically Black College and University (HBCU)', 
    10:'Predominantly White Institution (PWI)',
    11:'Tribal College/University',
    12:'Other Minority Serving Institution (MSI)',
#    13:'I am not sure (institution)'
    13: np.nan
}
# Race codes -> human-readable labels; the self-describe (24) and
# prefer-not-to-respond (25) codes are mapped to NaN.  Like institutionTypeMap,
# this map is applied through multiReplacer.
raceMap = {
    1:'Alaska Native, American Indian, Native American or Indigenous',
    14:'Asian American',
    15:'Black or African American',
    16:'East Asian',
    17:'Latina/o/x or Hispanic',
    18:'Middle Eastern or Northern African',
    19:'Pacific Islander',
    20:'South Asian',
    21:'Southeast Asian',
    22:'White',
    23:'Multiracial',
#    24:'I self-describe as (race):',
#    25:'I prefer not to respond (race).'
    24:np.nan,
    25:np.nan
}
# Same race labels keyed by short string names instead of numeric codes.
# NOTE(review): presumably these keys are column names from another data file;
# raceMap2 is not referenced in the visible cells -- confirm before relying on it.
raceMap2 = {
    'nativA':'Alaska Native, American Indian, Native American or Indigenous',
    'asianA':'Asian American',
    'africanA':'Black or African American',
    'asianE':'East Asian',
    'latinx':'Latina/o/x or Hispanic',
    'MENA':'Middle Eastern or Northern African',
    'pi':'Pacific Islander',
    'asianS':'South Asian',
    'asianSE':'Southeast Asian',
    'white':'White',
    'multi':'Multiracial',
}
# Faculty-status (tenure) codes -> human-readable labels.
# NOTE(review): code 15 is mapped to NaN; presumably it is the "Other" answer,
# as in the other maps -- TODO confirm against the survey document.
tenureMap = {
    7:'Tenured (associate or full professor status)',
    19:'Tenure-track (assistant professor status)',
    12:'Full-time teaching or instructional track on a fixed-term renewable contract',
    20:'Full-time teaching or instructional on a fixed-term, non-renewable contract',
    23:'Part-time teaching or instructional on a fixed-term, non-renewable contract',
    22:'Research faculty on a fixed-term, renewable contract',
    21:'Research faculty on a fixed-term, non-renewable contract',
    15:np.nan
}

In [None]:
def multiReplacer(usedfHuman, c, m):
    """Translate a column of comma-separated numeric codes into label text.

    Each cell is split on commas and every code is looked up individually.
    A plain string replace on the whole column is not safe because single-
    and double-digit codes overlap (e.g. "1," also matches the end of "21,").

    Parameters
    ----------
    usedfHuman : pandas.DataFrame
        Frame holding the column to translate.  It is NOT modified.
    c : str
        Name of the column whose cells hold one or more codes, e.g. ``"1,21"``.
    m : dict
        Mapping from integer code to label.  Codes mapped to NaN are dropped
        from the output.

    Returns
    -------
    numpy.ndarray
        Object array with one entry per row, in row order.  Missing cells stay
        NaN; every other cell becomes the labels joined as ``"label, label, "``
        (each label followed by ``", "``), or ``""`` when every code in the
        cell maps to NaN.

    Raises
    ------
    KeyError
        If a cell contains a code that is not a key of ``m``.
    """
    def _translate(cell):
        # Use pd.isna rather than an identity test against np.nan: missing
        # values are not guaranteed to be the np.nan singleton.
        if pd.isna(cell):
            return cell

        pieces = []
        for code in str(cell).split(','):
            code = code.strip()
            if not code:
                continue  # empty fragment from a trailing/double comma
            # int(float(...)) also accepts codes that pandas parsed as floats
            # (e.g. "21.0" when the column has no multi-valued cells).
            label = m[int(float(code))]
            if not pd.isna(label):
                pieces.append(label + ', ')
        return ''.join(pieces)

    # Build a fresh array by position so the result does not depend on the
    # frame's index labels and the caller's data is left untouched.
    return np.array([_translate(cell) for cell in usedfHuman[c]], dtype=object)

In [None]:
def addNodesToTree(base, cols, i, nodes, inputdf):
    """Recursively grow a tree with one level per column of ``inputdf``.

    For column ``cols[i]``, one child of ``base`` is created for every distinct
    non-missing value.  The child is named ``" <value> [<row count>] "`` and
    its ``support`` attribute is set to the row count.  The rows matching that
    value are then passed down to build the next level from ``cols[i + 1]``.

    Parameters
    ----------
    base : node object providing ``add_child(name=...)``
        Parent under which the new children are attached.
    cols : sequence of str
        Column names, in the order they should appear as tree levels.
    i : int
        Position in ``cols`` of the level being built.
    nodes : dict
        Filled in place with ``name -> node``; a later node with the same name
        replaces an earlier entry.
    inputdf : pandas.DataFrame
        Rows that belong to the branch ending at ``base``.
    """
    column = cols[i]

    # Nothing to add (and no deeper levels are visited) if the column is absent.
    if column not in inputdf.columns:
        return

    has_deeper_level = i + 1 < len(cols)

    for value in inputdf[column].dropna().unique():
        subset = inputdf.loc[inputdf[column] == value]
        count = len(subset)
        if count == 0:
            continue

        label = ' ' + value + ' [' + str(count) + '] '
        child = base.add_child(name = label)
        child.support = count
        nodes[label] = child

        # descend into the rows of this branch only
        if has_deeper_level:
            addNodesToTree(child, cols, i + 1, nodes, subset)

In [None]:
# https://github.com/etetoolkit/ete/issues/219
def my_layout(node):
    """Layout function: draw each node's name to the right of its branch.

    Assigned to ``TreeStyle.layout_fn``, so it is invoked once per node.
    Both leaf and internal nodes get a text label built from ``node.name``.
    """
    # Only `faces` (not TextFace / add_face_to_node) is imported from ete3 in
    # this notebook, so both are reached through that module.
    name_face = faces.TextFace(node.name, tight_text = True)
    faces.add_face_to_node(name_face, node, column = 0, position = "branch-right")
        
# http://etetoolkit.org/docs/latest/faqs/#how-do-i-visualize-internal-node-names
# def my_layout(node):
#     if node.is_leaf():
#         # If terminal node, draws its name
#         name_face = AttrFace("name")
#     else:
#         # If internal node, draws label with smaller font size
#         name_face = AttrFace("name", fsize=10)
#     # Adds the name face to the image at the preferred position
#     faces.add_face_to_node(name_face, node, column=0, position="branch-right")

# Style object passed to both tree.show() and tree.render() below
tree_style = TreeStyle()

# Do not add leaf names automatically
tree_style.show_leaf_name = False

# increase the y spacing
tree_style.branch_vertical_margin = 10

# Remove the scale bar at the bottom (branch lengths carry no meaning here)
tree_style.show_scale = False

# Use my custom layout
tree_style.layout_fn = my_layout

## Second file
`data/Cleaned_ISTP_Participant_Data.csv`

In [None]:
# Load the cleaned participant data; the bare `df` displays it as the cell output.
df = pd.read_csv('data/Cleaned_ISTP_Participant_Data.csv')
df

In [None]:
# important columns (trying for same as in the first file)
# this does include the faculty status
# They appear to come from the SP22_RQ2_Participant_ISTP.docx file
# Q35.1 (institution designation), Q37 (race) need to be split
# role, discipline, institution type, gender, faculty status, institution designation, race
# In list order: Q36 = gender, Q37 = race, Q31 = role, Q33.1 = discipline,
# Q34.1 = institution, Q35 = tenure / faculty status, Q35.1 = institution designation.
# This order is also the order of the tree levels, because the tree is built
# from the dataframe's column order.
useCols = [
    'Q36', 'Q37', 'Q31', 'Q33.1', 'Q34.1', 'Q35', 'Q35.1'
]

In [None]:
# Keep only the selected columns, drop rows where every one of them is missing,
# and renumber the remaining rows from 0.  The bare `usedf` displays the result.
usedf = df[useCols].dropna(how = 'all').reset_index(drop = True)#.fillna(0)
usedf

In [None]:
# replace the entries

# replace numbers with values
# (single-answer columns only; the multi-answer columns are handled below)
replacements = {
    'Q31':roleMap,
    'Q33.1':disciplineMap,
    'Q34.1':institutionMap,
    'Q36':genderMap,
    'Q35':tenureMap,
#     'Q35.1':institutionTypeMap,
#     'Q37':raceMap
}


usedfHuman = usedf.replace(replacements)

# treat the cells with multiple entries a bit differently
usedfHuman['Q35.1'] = multiReplacer(usedfHuman, 'Q35.1', institutionTypeMap)
usedfHuman['Q37'] = multiReplacer(usedfHuman, 'Q37', raceMap)

# also replace the column names
usedfHuman = usedfHuman.rename(columns = {'Q31': 'Q31-role',
                                          'Q33.1': 'Q33.1-discipline',
                                          'Q34.1' : 'Q34.1-institution',
                                          'Q36':'Q36-gender',
                                          'Q35':'Q35-tenure',
                                          'Q35.1':'Q35.1-institutionType',
                                          'Q37':'Q37-race'})

# remove any extra commas
# (DataFrame.applymap is deprecated in recent pandas; a per-column Series.map
# does the same element-wise work on every pandas version)
usedfHuman = usedfHuman.apply(lambda col: col.map(lambda x: str(x).rstrip(', ')))

# fix any lingering nan values
# (str() above turned missing values into the text 'nan'; blank strings come
# from cells whose codes all mapped to NaN)
usedfHuman = usedfHuman.replace('nan', np.nan)
usedfHuman = usedfHuman.replace(r'^\s*$', np.nan, regex = True)

usedfHuman

In [None]:
# Start from an empty root and add one tree level per column of usedfHuman,
# in column order; `nodes` collects the created nodes by name.
tree = Tree()
nodes = {}

addNodesToTree(tree, usedfHuman.columns, 0, nodes, usedfHuman)

In [None]:
# Display the tree using the custom style defined above
tree.show(tree_style = tree_style)

In [None]:
# Write the tree to test_tree.pdf, 11 inches wide; the return value is discarded
_ = tree.render("test_tree.pdf", w=11, units="in", tree_style=tree_style)