# Determine the demographic breakdown of the participants

We want to find the most common demographic categories among the participants, including every combination (overlap) of categories.

In [None]:
import pandas as pd
import numpy as np
import itertools

In [None]:
def getDemographics(inputdf, minCount = 5, nTotal = None):
    """
    Count the participants in every combination ("overlap") of demographic categories.

    For every non-empty subset of the columns of `inputdf` (single columns, pairs, ...,
    up to all columns together) the rows are grouped by that subset and counted.  A row
    only contributes to a group when it has an answer in every column of the subset:
    NaN and blank (empty or whitespace-only) cells are treated as "no answer".
    Grouping method from :
    https://stackoverflow.com/questions/35268817/unique-combinations-of-values-in-selected-columns-in-pandas-data-frame-and-count

    While running, the function prints, for each subset size, the number of column
    subsets of that size and the running number of group rows found so far.

    Parameters
    ----------
    inputdf : pandas.DataFrame
        One row per participant and one column per demographic axis.  Not modified.
    minCount : int, optional
        A group needs strictly more than this many participants to be kept in the
        trimmed table (default 5).
    nTotal : int, optional
        Denominator for the 'fraction' column.  Defaults to the number of rows of
        `inputdf`.

    Returns
    -------
    groupdf : pandas.DataFrame
        One row per group, sorted by decreasing count, with the columns 'count',
        'fraction', 'nAxes' (number of columns that define the group) followed by the
        columns of `inputdf` (NaN in the columns that are not part of the group).
    groupdfTrim : pandas.DataFrame
        The rows of `groupdf` with count > `minCount`.
    outdf : pandas.DataFrame
        Condensed version of `groupdf` : 'count', 'fraction', 'nAxes' and a 'group'
        column in which the values defining the group are joined by '; '.
    """

    cols = list(inputdf.columns)
    if (nTotal is None):
        # use the frame that was passed in, not a notebook-level global
        nTotal = len(inputdf)

    # convert any blank entry to nan *before* grouping : groupby leaves out rows with a
    # nan key, so a blank answer can never show up as a group of its own
    data = inputdf.replace(r'^\s*$', np.nan, regex = True)

    frames = []
    nRows = 0
    # i is the number of columns that are combined; the upper limit is inclusive so that
    # the overlap of all columns together is counted as well
    for i in range(1, len(cols) + 1):
        # get all the combinations of i columns
        itr = list(itertools.combinations(cols, i))
        print(i, len(itr))

        # these columns will be used in groupby while the others will be anything (nan)
        for useColList in itr:
            g = data.groupby(list(useColList)).size().reset_index(name = 'count')

            # count first, then every input column (the unused ones are filled with nan)
            frames.append(g.reindex(columns = ['count'] + cols))
            nRows += len(g)

        print(nRows)

    # a single concat at the end avoids re-copying the growing table on every iteration
    if (frames):
        groupdf = pd.concat(frames, ignore_index = True)
    else:
        groupdf = pd.DataFrame(columns = ['count'] + cols).astype({'count':'int64'})

    # remove duplicates
    groupdf = groupdf.drop_duplicates(keep = 'first')

    # sort (stable, so groups with equal counts keep the order in which they were found)
    groupdf = groupdf.sort_values(by = 'count', ascending = False, kind = 'stable')

    # add a column that has the fraction of total
    groupdf.insert(1, 'fraction', groupdf['count']/nTotal)

    # add a column to count the number of non-nan entries in each row (excluding "count" and "fraction")
    groupdf.insert(2, 'nAxes', groupdf[cols].notna().sum(axis = 1))

    # remove any rows with nAxes == 0
    groupdf = groupdf.loc[groupdf['nAxes'] > 0].reset_index(drop = True)

    # take only the rows with > minCount people in the group (already sorted)
    groupdfTrim = groupdf.loc[groupdf['count'] > minCount]

    # combine the values that define each group into a single column for a condensed output;
    # str() keeps this working for values that are not strings (e.g. unmapped numeric codes)
    groups = [
        '; '.join(str(v) for v in values if pd.notna(v) and str(v).strip() != '')
        for values in groupdf[cols].itertuples(index = False, name = None)
    ]

    outdf = groupdf[['count','fraction','nAxes']].copy()
    outdf['group'] = groups

    return groupdf, groupdfTrim, outdf

In [None]:
# mapping for the answer from numbers to words from the SP22_RQ2_Participant_ISTP.docx file
# I am going to ignore "Other" answers : they are mapped to nan
roleMap = {
    1:'Faculty member, lecturer, instructor, or adjunct faculty',
    2:'Graduate student',
    3:'Postdoctoral scholar',
    4:'Staff member',
    5:np.nan # 'Other (role)'
}
# Lookup tables : survey answer code -> human-readable label.
# Answers that are ignored in the analysis are mapped to nan; the label of an
# ignored answer is given in the comment next to it where it is known.

disciplineMap = {
    1: 'Agriculture and natural resource sciences',
    2: 'Arts',
    3: 'Biological and life sciences',
    4: 'Business and management science',
    5: 'Chemistry',
    6: 'Computer, information, and technological sciences',
    7: 'Earth, environmental, atmospheric, and ocean sciences',
    8: 'Education',
    9: 'Engineering',
    10: 'Humanities',
    11: 'Law',
    12: 'Mathematics and Statistics',
    13: 'Medical sciences',
    14: 'Physical sciences',
    15: 'Psychology',
    16: 'Social, behavioral, and economic sciences (not including psychology)',
    17: np.nan,  # 'Other (discipline)'
}

institutionMap = {
    1: 'Community college / 2-year institution',
    7: 'Comprehensive or Regional University (e.g., smaller state school, schools that offer mostly bachelor or masters degrees)',
    8: 'Liberal arts college',
    9: 'Research University',
    10: 'Technical college',
    11: np.nan,  # 'Other (institution)'
}

genderMap = {
    1: 'Gender queer or gender non-conforming',
    8: 'Man',
    9: 'Nonbinary',
    10: 'Transman',
    14: 'Transwoman',
    11: 'Woman',
    12: np.nan,  # 'I self-describe as (gender)'
    13: np.nan,  # 'I prefer not to respond (gender).'
}

firstgenMap = {1: 'first gen'}

armyMap = {1: 'veteran'}

# these are checkboxes so I will keep each individual column
institutionTypeMap = {
    1: 'Asian American and Pacific Islander Serving Institution (AAPISI)',
    8: 'Hispanic Serving Institution (HSI)',
    9: 'Historically Black College and University (HBCU)',
    10: 'Predominantly White Institution (PWI)',
    11: 'Tribal College/University',
    12: 'Other Minority Serving Institution (MSI)',
    13: np.nan,  # 'I am not sure (institution)'
}

raceMap = {
    1: 'Alaska Native, American Indian, Native American or Indigenous',
    14: 'Asian American',
    15: 'Black or African American',
    16: 'East Asian',
    17: 'Latina/o/x or Hispanic',
    18: 'Middle Eastern or Northern African',
    19: 'Pacific Islander',
    20: 'South Asian',
    21: 'Southeast Asian',
    22: 'White',
    23: 'Multiracial',
    24: np.nan,  # 'I self-describe as (race):'
    25: np.nan,  # 'I prefer not to respond (race).'
}

# same race labels, keyed by the short name used inside the race_*_march22 column names
raceMap2 = {
    'nativA': 'Alaska Native, American Indian, Native American or Indigenous',
    'asianA': 'Asian American',
    'africanA': 'Black or African American',
    'asianE': 'East Asian',
    'latinx': 'Latina/o/x or Hispanic',
    'MENA': 'Middle Eastern or Northern African',
    'pi': 'Pacific Islander',
    'asianS': 'South Asian',
    'asianSE': 'Southeast Asian',
    'white': 'White',
    'multi': 'Multiracial',
}

tenureMap = {
    7: 'Tenured (associate or full professor status)',
    19: 'Tenure-track (assistant professor status)',
    12: 'Full-time teaching or instructional track on a fixed-term renewable contract',
    20: 'Full-time teaching or instructional on a fixed-term, non-renewable contract',
    23: 'Part-time teaching or instructional on a fixed-term, non-renewable contract',
    22: 'Research faculty on a fixed-term, renewable contract',
    21: 'Research faculty on a fixed-term, non-renewable contract',
    15: np.nan,  # ignored answer
}

## First file

`data/Consent_Pre_Post_MERGE.sav`

In [None]:
# this is a file that combines results from multiple surveys. 
# the challenge is going to be identifying the useful columns and what questions they belong to!
df = pd.read_spss('data/Consent_Pre_Post_MERGE.sav')
df

In [None]:
cols = df.columns.values
print(cols)

In [None]:
# I think these are the important columns
# They appear to come from the SP22_RQ2_Participant_ISTP.docx file
# missing faculty status question
useCols = [
    'primerole_march22', 'discipline_march22', 'institution_march22', 'gender_march22',
    #'firstgen0322', 'army',
    'institution1', 'institution8', 'institution9', 'institution10', 'institution11', 'institution12',# 'institution13',
    'race_nativA_march22', 'race_asianA_march22',
    'race_africanA_march22', 'race_asianE_march22', 'race_latinx_march22',
    'race_MENA_march22', 'race_pi_march22', 'race_asianS_march22',
    'race_asianSE_march22', 'race_white_march22', 'race_multi_march22'
]

In [None]:
usedf = df[useCols].dropna(how = 'all').reset_index(drop = True)#.fillna(0)
usedf

In [None]:
def findRange(col):
    """Return [min, max] of column `col` of the notebook-level `usedf`, with missing values counted as 0."""
    filled = usedf[col].fillna(0).values
    return [np.min(filled), np.max(filled)]

In [None]:
# print the [min, max] found in every selected column
for colName in useCols:
    print(colName, findRange(colName))

In [None]:
# The checkbox questions (institution designation and race) come as one column per option.
# Iterating over all of those columns would take way too long, so each set of checkbox
# columns is combined into a single column that lists all of the non-nan results.
# This is also a good time to convert everything to a human-readable format.

def _joinChecked(frame, checkboxCols):
    """For each row of `frame`, join the non-nan entries of `checkboxCols` with ', '."""
    return [
        ', '.join([entry for entry in row if str(entry) != 'nan'])
        for row in frame[checkboxCols].values.tolist()
    ]

# replace numbers with values : one lookup table per column
replacements = {
    'primerole_march22':roleMap,
    'discipline_march22':disciplineMap,
    'institution_march22':institutionMap,
    'gender_march22':genderMap,
    'firstgen0322':firstgenMap,
    'army':armyMap
}
# in a checkbox column a 1 means that the option was selected
replacements.update({'institution' + str(key): {1:value} for key, value in institutionTypeMap.items()})
replacements.update({'race_' + key + '_march22': {1:value} for key, value in raceMap2.items()})

usedfHuman = usedf.replace(replacements)

# now combine the institution and race columns into single columns, respectively
institutionCols = ['institution1', 'institution8', 'institution9', 'institution10', 'institution11', 'institution12']
institutionType = _joinChecked(usedfHuman, institutionCols)
usedfHuman = usedfHuman.drop(columns = institutionCols)
usedfHuman['institutionType_march22'] = institutionType

raceCols = [
    'race_nativA_march22', 'race_asianA_march22',
    'race_africanA_march22', 'race_asianE_march22', 'race_latinx_march22',
    'race_MENA_march22', 'race_pi_march22', 'race_asianS_march22',
    'race_asianSE_march22', 'race_white_march22', 'race_multi_march22',
]
raceType = _joinChecked(usedfHuman, raceCols)
usedfHuman = usedfHuman.drop(columns = raceCols)
usedfHuman['race_march22'] = raceType

usedfHuman

In [None]:
groupdf, groupdfTrim, outdf = getDemographics(usedfHuman)

In [None]:
groupdfTrim

In [None]:
outdf

In [None]:
outdf.to_csv('data/Consent_Pre_Post_MERGE_demographicsGroupsCondensed.csv', index = False)
groupdf.to_csv('data/Consent_Pre_Post_MERGE_demographicsGroupsFull.csv', index = False)

## Second file
`data/Cleaned_ISTP_Participant_Data.csv`

In [None]:
df = pd.read_csv('data/Cleaned_ISTP_Participant_Data.csv')
df

In [None]:
# important columns (trying for same as in the first file)
# this does include the faculty status
# They appear to come from the SP22_RQ2_Participant_ISTP.docx file
# Q35.1 (institution designation), Q37 (race) need to be split
# role, discipline, institution type, gender, faculty status, institution designation, race
useCols = [
    'Q31', 'Q33.1', 'Q34.1', 'Q36', 'Q35', 'Q35.1','Q37'
]

In [None]:
usedf = df[useCols].dropna(how = 'all').reset_index(drop = True)#.fillna(0)
usedf

In [None]:
# replace the entries

def _decodeMultiSelect(cell, mapping):
    """
    Translate one multiple-answer cell (codes separated by commas, e.g. "10,12") into labels.

    Every comma-separated token is looked up in `mapping` as a whole number, so a code such
    as 11 or 21 can not be mistaken for the code 1 followed by a comma.  Codes that are
    mapped to nan are dropped, tokens that are not a key of `mapping` are kept unchanged,
    and the resulting labels are joined with ', '.  A missing cell stays nan.
    """
    if (pd.isna(cell)):
        return np.nan
    labels = []
    for token in str(cell).split(','):
        token = token.strip()
        if (token == ''):
            continue
        try:
            number = float(token)
        except ValueError:
            number = None
        if (number is not None and number.is_integer() and int(number) in mapping):
            label = mapping[int(number)]
            if (pd.isna(label)):
                # ignored answer
                continue
        else:
            label = token
        labels.append(label)
    return ', '.join(labels)

# replace numbers with values (questions with a single answer)
replacements = {
    'Q31':roleMap,
    'Q33.1':disciplineMap,
    'Q34.1':institutionMap,
    'Q36':genderMap,
    'Q35':tenureMap,
}

usedfHuman = usedf.replace(replacements)

# treat the cells with multiple entries a bit differently :
# split each cell on the commas and translate the codes one by one
c = 'Q35.1'
m = institutionTypeMap
for key, value in m.items():
    print(key, '' if pd.isna(value) else value)
usedfHuman[c] = [_decodeMultiSelect(cell, m) for cell in usedfHuman[c]]

c = 'Q37'
m = raceMap
for key, value in m.items():
    print(key, '' if pd.isna(value) else value)
usedfHuman[c] = [_decodeMultiSelect(cell, m) for cell in usedfHuman[c]]

# also replace the column names
usedfHuman = usedfHuman.rename(columns = {'Q31': 'Q31-role',
                                          'Q33.1': 'Q33.1-discipline',
                                          'Q34.1' : 'Q34.1-institution',
                                          'Q36':'Q36-gender',
                                          'Q35':'Q35-tenure',
                                          'Q35.1':'Q35.1-institutionType',
                                          'Q37':'Q37-race'})

# convert every cell to a string and remove any extra commas / spaces at the end
# (column-wise Series.map is used because DataFrame.applymap is deprecated in recent pandas)
usedfHuman = usedfHuman.apply(lambda column: column.map(lambda x: str(x).rstrip(', ')))

# fix any lingering nan values (nan became the string 'nan' in the previous step)
usedfHuman = usedfHuman.replace('nan', np.nan)

usedfHuman

In [None]:
groupdf, groupdfTrim, outdf = getDemographics(usedfHuman)

In [None]:
groupdfTrim

In [None]:
outdf

In [None]:
outdf.to_csv('data/Cleaned_ISTP_Participant_Data_demographicsGroupsCondensed.csv', index = False)
groupdf.to_csv('data/Cleaned_ISTP_Participant_Data_demographicsGroupsFull.csv', index = False)

## Third file
`data/Cleaned_ISTP_Facilitator_Data.csv`

In [None]:
df = pd.read_csv('data/Cleaned_ISTP_Facilitator_Data.csv')
df

In [None]:
# important columns (trying for same as in the first file)
# this does not appear to include the faculty status
# They appear to come from the SP22_RQ2_Participant_ISTP.docx file
# Q35 (institution designation), Q37 (race) need to be split
# role, discipline, institution type, gender, institution designation, race
useCols = [
    'Q31', 'Q33', 'Q34', 'Q36', 'Q35', 'Q37'
]

In [None]:
usedf = df[useCols].dropna(how = 'all').reset_index(drop = True)#.fillna(0)
usedf

In [None]:
# replace the entries

def _decodeMultiSelect(cell, mapping):
    """
    Translate one multiple-answer cell (codes separated by commas, e.g. "10,12") into labels.

    Every comma-separated token is looked up in `mapping` as a whole number, so a code such
    as 11 or 21 can not be mistaken for the code 1 followed by a comma.  Codes that are
    mapped to nan are dropped, tokens that are not a key of `mapping` are kept unchanged,
    and the resulting labels are joined with ', '.  A missing cell stays nan.
    """
    if (pd.isna(cell)):
        return np.nan
    labels = []
    for token in str(cell).split(','):
        token = token.strip()
        if (token == ''):
            continue
        try:
            number = float(token)
        except ValueError:
            number = None
        if (number is not None and number.is_integer() and int(number) in mapping):
            label = mapping[int(number)]
            if (pd.isna(label)):
                # ignored answer
                continue
        else:
            label = token
        labels.append(label)
    return ', '.join(labels)

# replace numbers with values (questions with a single answer)
replacements = {
    'Q31':roleMap,
    'Q33':disciplineMap,
    'Q34':institutionMap,
    'Q36':genderMap,
}

usedfHuman = usedf.replace(replacements)

# treat the cells with multiple entries a bit differently :
# split each cell on the commas and translate the codes one by one
c = 'Q35'
m = institutionTypeMap
for key, value in m.items():
    print(key, '' if pd.isna(value) else value)
usedfHuman[c] = [_decodeMultiSelect(cell, m) for cell in usedfHuman[c]]

c = 'Q37'
m = raceMap
usedfHuman[c] = [_decodeMultiSelect(cell, m) for cell in usedfHuman[c]]

# also replace the column names (in this file Q35 is the institution designation, not the tenure status)
usedfHuman = usedfHuman.rename(columns = {'Q31': 'Q31-role',
                                          'Q33': 'Q33-discipline',
                                          'Q34' : 'Q34-institution',
                                          'Q36':'Q36-gender',
                                          'Q35':'Q35-institutionType',
                                          'Q37':'Q37-race'})

# convert every cell to a string and remove any extra commas / spaces at the end
# (column-wise Series.map is used because DataFrame.applymap is deprecated in recent pandas)
usedfHuman = usedfHuman.apply(lambda column: column.map(lambda x: str(x).rstrip(', ')))

# fix any lingering nan values (nan became the string 'nan' in the previous step)
usedfHuman = usedfHuman.replace('nan', np.nan)

usedfHuman

In [None]:
groupdf, groupdfTrim, outdf = getDemographics(usedfHuman)

In [None]:
groupdfTrim

In [None]:
outdf

In [None]:
outdf.to_csv('data/Cleaned_ISTP_Facilitator_Data_demographicsGroupsCondensed.csv', index = False)
groupdf.to_csv('data/Cleaned_ISTP_Facilitator_Data_demographicsGroupsFull.csv', index = False)

## Scratch below

In [None]:
# this file does not have enough demographics info
df = pd.read_spss('data/ParticipantProfile.sav')
df

In [None]:
cols = df.columns.values
print(cols)

In [None]:
# it looks like this data is already included in the other file, and it is no more helpful for telling which question is which
df = pd.read_spss('data/Pre_Survey_Oct21.sav')
df

In [None]:
cols = df.columns.values
print(cols)