The following notebook documents the procedures for integrating Qualtrics data into PeopleGrove.

In [None]:
# install and load packages
import os

# pip install datetime
from datetime import datetime, timedelta

# pip install pandas
import pandas as pd

# pip install numpy
import numpy as np

# pip install QualtricsAPI 
from QualtricsAPI import Credentials
from QualtricsAPI.Survey import Responses

# call the qualtrics_api_credentials() to authenticate account
Credentials().qualtrics_api_credentials(token = "Jc9vraFFrDBsaiJQGjvn32yjtMOrehmT9uYWjkT6", data_center = "iad1")

# pip install unidecode
import unidecode

# pip install logging boto3
import logging
import boto3
from botocore.exceptions import ClientError

## Import Data

In [80]:
# survey id can be found in Qualtrics > Profile > Survey Ids
surveyid = "" # add survey id as string

# import data from the Qualtrics intake form
intake = Responses().get_survey_responses(survey = surveyid, useLabels = True)

# cast to a pandas dataframe
intake = pd.DataFrame(intake)

# remove the second row of data
# this contains metadata from Qualtrics
intake = intake.drop([0,1], axis = "index")

# view the first 10 cases
#intake.head(10)

Format the intake form according to PeopleGrove Import File Format.xlsx. Start by getting a subset of columns and rows.

In [82]:
intake = intake[['RecordedDate', 'ResponseId', 'DistributionChannel', 
                 'name_drop', 'firstname_text', 'lastname_text', 'email', 
                 'status', 'gradfac', 'gradfac_3_TEXT', 
                 'match_type_1', 'match_type_2', 'nmatch', 'nmatch_4_TEXT', 
                 'broad', 'broad_26_TEXT', 'specific', 
                 'scale_1', 'scale_2', 'scale_3', 'scale_4', 'scale_5',
                 'apply_program_1', 'apply_program_2', 'apply_program_4', 'apply_program_4_TEXT', 
                 'apply_when', 'apply_when_5_TEXT', 
                 'broad2', 'broad2_26_TEXT', 
                 'broad_all_14', 'broad_all_15', 'broad_all_16',
                 'broad_all_17', 'broad_all_18', 'broad_all_19', 'broad_all_20',
                 'broad_all_21', 'broad_all_22', 'broad_all_23', 'broad_all_24',
                 'broad_all_25', 'broad_all_26', 'broad_all_26_TEXT', 'specific2',
                 'firstgen_1', 'firstgen_2',
                 'international', 'international_3_TEXT', 
                 'race_1', 'race_2', 'race_3', 'race_4', 'race_5', 'race_6', 
                 'race_7', 'race_8', 'race_8_TEXT',
                 'gender', 'gender_4_TEXT', 
                 'disability', 'disability_3_TEXT',
                 'id_matching_1', 'id_matching_2', 'id_matching_3', 'id_matching_4',
                 'id_matching_5', 'id_matching_6', 'id_matching_7', 'id_matching_7_TEXT', 
                 'id_matching_text']]

Remove cases before the last load date and time. Loads occur daily at 23:59:59. The code below will only keep cases that happened after the last load so after 24 hours ago. 

In [83]:
# create a function to 1
# recast from a string to datetime format 2
# return the string with the new format 3
def cast_datetime(string): # 1
    string_recast = datetime.strptime(string, '%Y-%m-%d %H:%M:%S') # 2
    return string_recast # 3

# apply the function to the date column
intake['date'] = intake['RecordedDate'].apply(cast_datetime)

In [84]:
# get the current date
current_datetime = datetime.now()

# get the time change to 24 ago
last_24 = timedelta(hours = 24)

# print the date & time for 24 hours ago in the same structure as current date and intake['date']
last_24_datetime = datetime.strftime(current_datetime - last_24, '%Y-%m-%d %H:%M:%S')

# only keep cases that came after 24 hours ago
intake = intake[intake['date'] > last_24_datetime]

Remove preview. Responses where DistributionChannel = "preview" were submitted as a part of survey testing and are not valid cases. 

In [85]:
# remove preview cases
intake = intake[intake["DistributionChannel"] != "preview"]

## Data Cleaning

### Name
Clean name_text. Change to title case, remove accents (unicode text), and remove honorifics.

In [86]:
# recast to a string
intake.firstname_text = intake.firstname_text.astype(str)
intake.lastname_text = intake.lastname_text.astype(str)

# create a function to 1
# make title case 2
# remove any accents 3
# return the cleaned text 4
def clean(text): # 1
    text = text.title() # 2
    text = unidecode.unidecode(text) # 3
    return text # 4

# apply the function to the name_text column
intake.firstname_text = intake.firstname_text.apply(clean)
intake.lastname_text = intake.lastname_text.apply(clean)

# remove honorifics
intake.firstname_text = intake.firstname_text.str.replace(r"^(Mr|Mrs|Ms|Miss|Dr)(\. | )", "", case = False)
intake.lastname_text = intake.lastname_text.str.replace(r"(, | )(Ph.D.|Ph.D|PhD|M.A.|M.A|MA|M.S.|M.S|MS)", "", case = False)

  intake.name_text = intake.name_text.str.replace(r"^(Mr|Mrs|Ms|Miss|Dr)(\. | )", "", case = False)
  intake.name_text = intake.name_text.str.replace(r"(, | )(Ph.D.|Ph.D|PhD|M.A.|M.A|MA|M.S.|M.S|MS)", "", case = False)


Combine name_drop and name_text. name_drop is a variable with names of past mentors and mentees. If their name does not appear in name_drop, then they proceed to a free response question, name_text, where they can input their name. 

Split name_drop into first and last.

In [88]:
# cast name to string
intake.name = intake.name_drop.astype(str)

# create first and last name colums
intake["first_name"] = ""
intake["last_name"] = ""

# for each row 1
# set an object called "name" equal to the name 2
# set an object called "split_name" equal to the split of name (split on space 1 time) 3
# update the first_name column with the first part of the split 4
# na's will only have one part when split, so only update last name for cases that split into 2 parts 5
# update the last_name column with the second part of the split 6

for row in intake.index: # 1
    name = intake.name[row] # 2
    split_name = name.split(" ", maxsplit = 1) # 3
    intake.first_name[row] = split_name[0] # 4
    if len(split_name) == 2: # 5
        intake.last_name[row] = split_name[1] # 6

# cast first and last name to string
intake.first_name = intake.first_name.astype(str)
intake.last_name = intake.last_name.astype(str)

# change nan's to blanks
intake.first_name = intake.first_name.str.replace("^NaN$", "", case = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake.first_name[row] = split_name[0] # 4
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake.last_name[row] = split_name[1] # 6
  intake.first_name = intake.first_name.str.replace("^NaN$", "", case = False)


In [None]:
# for each row 1
# set an object called "firstname" equal to the first name text entry 2
# set an object called "lastname" equal to the last name text entry 3
# if intake.first_name (the split from the dropdown) is "Not listed" 4
# then set first_name equal to the text entry (firstname) 5
# if intake.last_name (the split from the dropdown) is "Not listed" 6
# then set last_name equal to the text entry (lastname) 7

for row in intake.index: # 1
    firstname = intake.firstname_text[row] # 2
    lastname = intake.lastname_text[row] # 3
    if intake.first_name[row] == "Not listed": # 4
        intake.first_name[row] = firstname # 5
    if intake.last_name[row] == "Not listed": # 6
        intake.last_name[row] = lastname # 7


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake.name[row] = name # 5


### Email
Email contains the email addresses of respondents. There is a validation setting on this item, meaning the response must contain an “@” symbol and a valid domain format.

In [89]:
# cast to string
intake.email = intake.email.astype(str)

# change nan's to blanks
intake.email = intake.email.str.replace("^NaN$", "", case = False)

  intake.email = intake.email.str.replace("^NaN$", "", case = False)


### Graduate Student or Faculty
Mentors can be either graduate students or faculty members. The column gradfac indicates which type of mentor the respondent is.

In [90]:
# for each row 1
# if the text response is not empty 2
# update gradfac with the text 3

for row in intake.index: # 1
    if intake.gradfac_3_TEXT is not np.nan: # 2
        intake.gradfac[row] == intake.gradfac_3_TEXT # 3

### Match type
Mentors can elect into formal, drop-in mentoring, or both. The column match_type combines the following columns into a list:
- match_type_1: formal mentoring
- match_type_2: drop-in mentoring

In [91]:
# create a new column to save the combined list 1
# reset the index (i.e., row numbers) 2
# for each row in the index 3
# create an object called broad_all_list containing a list of all the broad all columns for that row # 4
# set the exclusion criteria  # 5
# update broad_all_list to exclude all cases that match the exclusion criteria # 6
# set broad_all for that row to the contents of broad_all_list # 7

intake["match_type"] = "" # 1
intake = intake.reset_index(drop = True) # 2

for row in intake.index: # 3
    match_list = intake[["match_type_1", "match_type_2"]].iloc[row].tolist() # 4
    exclude = [np.nan] # 5 
    match_list = list(filter(lambda x:x not in exclude, match_list)) # 6
    intake["match_type"][row] = match_list # 7

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["match_type"][row] = match_list # 7
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["match_type"][row] = match_list # 7
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["match_type"][row] = match_list # 7
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["match_type"][row] = match_list 

### Number of matches
The column nmatch, contains the number of mentees that mentors are willing to be formally matched with.

In [92]:
# for each row 1
# if nmatch is 4+ 2
# and if the open response accompanying 4+ is not missing 3
# then replace 4+ with the text 4
# if the text is missing 5
# then just use 4 6

for row in intake.index: # 1
    if intake.nmatch[row] == "4+": # 2
        if intake.nmatch_4_TEXT[row] is not [np.nan]: # 3
            intake.nmatch[row] == intake.nmatch_4_TEXT # 4
        elif intake.nmatch_4_TEXT[row] is [np.nan]: # 5
            intake.nmatch[row] == 4 # 6

### Broad interest
Mentors are asked to select their broad interest area. Mentees were forced to choose one top broad area of interest. This makes formal matching a little easier. The column broad combines mentors area of interest and mentee's top area of interest. Mentees were also allowed to select other areas of interest in the broad_all columns.

In [93]:
# update other cases with the free response text

# for each row 1
# if broad (mentors) is other 2
# then set broad equal to broad text 3
# if broad (mentees) is other 4
# then set broad equal to broad text 5

for row in intake.index: # 1
    if intake.broad[row] == "Other": # 2
        intake.broad[row] = intake.broad_26_TEXT # 3
    if intake.broad2[row] == "Other": # 4
        intake.broad2[row] = intake.broad2_26_TEXT # 5

In [94]:
# for each row 1
# if they're a mentee 2
# update broad with the mentee broad question 3

for row in intake.index: # 1
    if intake.status[row] == "Mentee": # 2
        intake.broad[row] = intake.broad2[row] # 3

# cast to string
intake.broad = intake.broad.astype(str)

# change nan's to blanks
intake.broad = intake.broad.str.replace(r"^NaN$", "", case = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake.broad[row] = intake.broad2[row] # 3
  intake.broad = intake.broad.str.replace(r"^NaN$", "", case = False)


Combining broad_all for mentees.

In [95]:
# for each row 1
# if the text response is not missing (this will only be the case if broad_all_26 is selected & they provided a text response) 2
# then update broad_all_26 so instead of other, it has the text response 3

for row in intake.index: # 1
    if intake.broad_all_26_TEXT[row] is not np.nan: # 2
        intake.broad_all_26[row] = intake.broad_all_26_TEXT[row] # 3

In [96]:
# create a new column to save the combined list 1
# reset the index (i.e., row numbers) 2
# for each row in the index 3
# create an object called broad_all_list containing a list of all the broad all columns for that row # 4
# set the exclusion criteria  # 5
# update broad_all_list to exclude all cases that match the exclusion criteria # 6
# set broad_all for that row to the contents of broad_all_list # 7

intake["broad_all"] = "" # 1
intake = intake.reset_index(drop = True) # 2

for row in intake.index: # 3
    broad_all_list = intake[['broad_all_14', 'broad_all_15', 'broad_all_16',
                             'broad_all_17', 'broad_all_18', 'broad_all_19', 
                             'broad_all_20', 'broad_all_21', 'broad_all_22', 
                             'broad_all_23', 'broad_all_24', 'broad_all_25', 
                             'broad_all_26']].iloc[row].tolist() # 4
    exclude = [np.nan] # 5 
    broad_all_list = list(filter(lambda x:x not in exclude, broad_all_list)) # 6
    intake["broad_all"][row] = broad_all_list # 7


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["broad_all"][row] = broad_all_list # 7
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["broad_all"][row] = broad_all_list # 7
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["broad_all"][row] = broad_all_list # 7
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["broad_all"][row] = bro

### Sepecific interest
This is a free response question prompting respondents to provide the specific research area they are interested in e.g., psycholinguistics, language processing, spoken word recognition.

In [97]:
# for each row 1
# if they're a mentee 2
# update broad with the mentee specifc question 3

for row in intake.index: # 1
    if intake.status[row] == "Mentee": # 2
        intake.specific[row] = intake.specific2[row] # 3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake.specific[row] = intake.specific2[row] # 3


### Graduate School Readiness Scale
The scale includes 5 items prompting mentees to consider what stage they're at in the application process. Mentees respond on a 7-point agreement scale. The questions are as follows:
- scale_1: "I know my long-term career plan"
- scale_2: "I know I want to go to graduate school"
- scale_3: "I know which research area(s) I want to study"
- scale_4: "I know which school(s) I want to apply to"
- scale_5: "I know the next steps I need to take in preparing my application"

Create a total score by combining responses. The scale is weighted according to the items that are most important for matching.

In [98]:
# create a function defining how to recode the scale responses
def scale_recode(series):
    if series == "Strongly disagree":
        return 0
    elif series == "Disagree":
        return 1
    elif series == "Somewhat disagree":
        return 2
    elif series == "Neither agree nor disagree":
        return 3
    elif series == "Somewhat agree":
        return 4
    elif series == "Agree":
        return 5
    elif series == "Strongly agree":
        return 6

# create a list of the columns to recode
scale_columns = intake[['scale_1', 'scale_2', 'scale_3', 'scale_4', 'scale_5']]

# for each column 1
# apply the function 2
for column in scale_columns: # 1
    intake[column] = intake[column].apply(scale_recode) # 2


In [99]:
intake["gsr_scale"] = ""

intake.gsr_scale = (3 * intake.scale_2) + (2 * intake.scale_3) + (2 * intake.scale_4) + (1 * intake.scale_1) + (1 * intake.scale_5)

### MA or PhD program
The column apply_program contains responses to the question, "Are you planning to apply to a Master's or Ph.D. program? (Select all that apply)"

In [100]:
# for each row 1
# if the text response is not missing (this will only be the case if apply_program_4 is selected & they provided a text response) 2
# then update apply_program_4 so instead of other, it has the text response 3

for row in intake.index: # 1
    if intake.apply_program_4_TEXT[row] is not np.nan: # 2
        intake.apply_program_4[row] = intake.apply_program_4_TEXT[row] # 3

In [101]:
# create a new column to save the combined list 1
# for each row in the index 2
# create an object called broad_all_list containing a list of all the broad all columns for that row # 3
# set the exclusion criteria  # 4
# update broad_all_list to exclude all cases that match the exclusion criteria # 5
# set broad_all for that row to the contents of broad_all_list # 6

intake["apply_program"] = "" # 1

for row in intake.index: # 2
    apply_list = intake[['apply_program_1', 'apply_program_2', 'apply_program_4']].iloc[row].tolist() # 3
    exclude = [np.nan] # 4
    apply_list = list(filter(lambda x:x not in exclude, apply_list)) # 5
    intake["apply_program"][row] = apply_list # 6

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["apply_program"][row] = apply_list # 6
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["apply_program"][row] = apply_list # 6
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["apply_program"][row] = apply_list # 6
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["apply_program"][row] =

### Start grad school
The column apply_when contains responses to the question, "When do you plan to start graduate school?"

In [102]:
# for each row 1
# if the text response is not missing (this will only be the case if apply_when is selected & they provided a text response) 2
# then update apply_when so instead of other, it has the text response 3

for row in intake.index: # 1
    if intake.apply_when_5_TEXT[row] is not np.nan: # 2
        intake.apply_when[row] = intake.apply_when_5_TEXT[row] # 3

### First-generation college student
The column firstgen indicates whether the person is a first-generation college student. To qualify as a first gen student, the person must have at least 1 parent or guardian who has had some college experience. 

In [103]:
# initate the variable 1
# for each row 2
# if parent/guardian 1 did not go to college & parent/guardian 2 didn't go, is N/A or missing (np.nan) 3
# then mark as "Yes" to first gen 4
# if parent/guardian 2 did not go to college & parent/guardian 1 didn't go, is N/A or missing (np.nan) 5
# then mark as "Yes" to first gen 6
# if either parent went to attended, but did not graduate or graduated 7
# then mark as "No" to first gen 8

intake["firstgen"] = "" # 1

for row in intake.index: # 2
    if (intake["firstgen_1"][row] == "No college") and (intake["firstgen_2"][row] == "No college" or intake["firstgen_2"][row] == "NA" or intake["firstgen_2"][row] == np.nan): # 3
         intake["firstgen"][row] = "Yes" # 4
    elif (intake["firstgen_2"][row] == "No college") and (intake["firstgen_1"][row] == "No college" or intake["firstgen_1"][row] == "NA" or intake["firstgen_1"][row] == np.nan): # 5
         intake["firstgen"][row] = "Yes" # 6
    elif intake["firstgen_1"][row] == "Attended, but did not graduate" or intake["firstgen_1"][row] == "Graduated college" or intake["firstgen_2"][row] == "Attended, but did not graduate" or intake["firstgen_2"][row] == "Graduated college": # 7
          intake["firstgen"][row] = "No" # 8



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["firstgen"][row] = "Yes" # 4
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["firstgen"][row] = "No" # 8


### International student
The international column contains responses to the question, "Do you identify as an international student?

In [104]:
# for each row 1
# if the text response is not missing (this will only be the case if international is selected & they provided a text response) 2
# then update international so instead of other, it has the text response 3

for row in intake.index: # 1
    if intake.international_3_TEXT[row] is not np.nan: # 2
        intake.international[row] = intake.international_3_TEXT[row] # 3

### Race/Ethnicity
The columns for race contain responses to the question, "What race or ethnicity do you identify with?" Respondents were able to select as many responses as applied and were also given an text-response option to self-describe their identity.
- race_1: American Indian or Alaska Native
- race_2: Asian
- race_3: Black or African American
- race_4: Hispanic, Latinx, or Spanish
- race_5: Middle Eastern or North African
- race_6: Native Hawaiian or Pacific Islander
- race_7: White
- race_8: Prefer to self-describe

In [105]:
# for each row 1
# if the text response is not missing (this will only be the case if international is selected & they provided a text response) 2
# then update international so instead of other, it has the text response 3

for row in intake.index: # 1
    if intake.race_8_TEXT[row] is not np.nan: # 2
        intake.race_8[row] = intake.race_8_TEXT[row] # 3

In [106]:
# create a new column to save the combined list 1
# for each row in the index 2
# create an object called race_list containing a list of all the race columns for that row # 3
# set the exclusion criteria  # 4
# update race_list to exclude all cases that match the exclusion criteria # 5
# set race for that row to the contents of race_list # 6

intake["race"] = "" # 1

for row in intake.index: # 2
    race_list = intake[['race_1', 'race_2', 'race_3', 'race_4',
                             'race_5', 'race_6', 'race_7', 'race_8']].iloc[row].tolist() # 3
    exclude = [np.nan] # 4
    race_list = list(filter(lambda x:x not in exclude, race_list)) # 5
    intake["race"][row] = race_list # 6


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["race"][row] = race_list # 6
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["race"][row] = race_list # 6
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["race"][row] = race_list # 6
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake["race"][row] = race_list # 6
A value is trying to be 

### Gender
The column gender contains responses to the question, "What gender do you identify with?" Respondents were able to select "Male", "Female", "Non-binary", or "Prefer to self-describe" where they were given an opportunity to freely describe in a text response.

In [107]:
# for each row 1
# if the text response is not missing (this will only be the case if "prefer to self describe" is selected & they provided a text response) 2
# then update gender so instead it has the text response 3

for row in intake.index: # 1
    if intake.gender_4_TEXT[row] is not np.nan: # 2
        intake.gender[row] = intake.gender_4_TEXT[row] # 3

### Disability
The column disability contains responses to the question, "Do you identiy as having a disability?" Respondents were able to select "Yes", "No", or "Prefer to self-describe" where they were given an opportunity to freely describe in a text response.

### Matching on identity
The id_matching columns contain the scale responses to the question "After your research area, which aspects of your identity are most important to consider when matching you with a mentor?" Each item is rated on a 5 point scale ranging from "Not important" to "Essential." The items are as follows,
- id_matching_1: Broad research area
- id_matching_2: First-gen status
- id_matching_3: International student status
- id_matching_4: Race/ethnicity
- id_matching_5: Gender identity
- id_matching_6: Disability status
- id_matching_7: Other

id_matching_2, id_matching_3, id_matching_6 are conditionally based on responses to prior questions in the survey. These 3 items are only shown if mentees affirm that they belong to the respective identity groups. 

In [108]:
# for each row 1
# if the text response is not missing (this will only be the case if "prefer to self describe" is selected & they provided a text response) 2
# then update disability so instead it has the text response 3

for row in intake.index: # 1
    if intake.id_matching_7_TEXT[row] is not np.nan: # 2
        intake.id_matching_7[row] = intake.id_matching_7_TEXT[row] # 3

In [109]:
# create a function defining how to recode the scale responses
def important_recode(series):
    if series == "Not important":
        return 0
    elif series == "Slightly important":
        return 1
    elif series == "Moderately important":
        return 2
    elif series == "Very important":
        return 3
    elif series == "Essential":
        return 4

# create a list of the columns to recode
scale_columns = intake[['id_matching_1', 'id_matching_2', 'id_matching_3', 
                        'id_matching_4', 'id_matching_5', 'id_matching_6',
                        'id_matching_7']]

# for each column 1
# apply the function 2
for column in scale_columns: # 1
    intake[column] = intake[column].apply(important_recode) # 2

## PeopleGrove Load

In [110]:
# subset of cleaned variables to load
intake = intake[['first_name', 'last_name', 'email', 
                 'db_key', 'status', 'gradfac', 
                 'match_type', 'nmatch',
                 'broad', 'broad_all', 'specific',
                 'scale_1', 'scale_2', 'scale_3', 'scale_4', 'scale_5', 'gsr_scale',
                 'apply_program', 'apply_when',
                 'firstgen', 'international', 'race', 'gender', 'disability',
                 'id_matching_1', 'id_matching_2', 'id_matching_3', 'id_matching_4',
                 'id_matching_5', 'id_matching_6', 'id_matching_7', 'id_matching_7_TEXT',
                 'id_matching_text']]

In [111]:
# rename variables required by PeopleGrove
intake.rename(columns = {'first_name':'First Name', 'last_name':'Last Name', 'email':'Email',
                         'db_key':'Data Base Key','status':'User Type',
                         'id_matching_1': 'match_on_broad', 'id_matching_2': 'match_on_firstgen', 
                         'id_matching_3': 'match_on_international', 'id_matching_4': 'match_on_race',
                         'id_matching_5': 'match_on_gender', 'id_matching_6': 'match_on_disability', 
                         'id_matching_7': 'match_on_other', 'id_matching_7_TEXT': 'match_on_other_text',
                         'id_matching_text': 'match_on_text'}, inplace = True)

Save the file to a .csv

In [117]:
file_path = "./Dropbox/MAGIC Mentor Network/9. Platform Management/Load Data/"
file_name = "Qualtrics_to_PeopleGrove.csv"

intake.to_csv(file_path + file_name, header = True, index = False)

In [None]:
def upload_file(file_name, bucket, object_name = None):
   """Upload a file to an S3 bucket

   :param file_name: File to upload
   :param bucket: Bucket to upload to
   :param object_name: S3 object name. If not specified then file_name is used
   :return: True if file was uploaded, else False
   """
   AWS_ACCESS_KEY_ID = ""
   AWS_SECRET_ACCESS_KEY = ""

   session = boto3.Session(
     aws_access_key_id = AWS_ACCESS_KEY_ID,
     aws_secret_access_key = AWS_SECRET_ACCESS_KEY,
   )

   # If S3 object_name was not specified, use file_name
   if object_name is None:
       object_name = os.path.basename(file_name)

   # Upload the file
   s3_client = session.client("s3")

   # s3_client = boto3.client("s3")
   try:
      response = s3_client.upload_file(file_name, bucket, object_name)
      # print(dir(response))
   except ClientError as e:
       logging.error(e)
   return False
   return True

In [None]:
# set the parameters
# file_name is the same as was used to save to .csv
# object name
bucket_name = ""

# here yourfile.csv is the file on local machine
# yourfile-upload.csv is the file name with which it will be present on s3
upload_file(file_name, bucket_name, file_name + "-upload.csv")