In [1]:
"""
Author:
Cameron Smither, American Institutes for Research

Purpose:
Use institution 3175's student, progression, and cost data to attempt to build 
"Progression Profiles" for CBE students using behavioral information (course
and cost). We will (attempt to) use cluster analysis to see what natural 
patterns appear in enrollment data related to student outcomes with the 
ultimate goal of creating "Profiles" of students (or, a process to build them) 
that institutions could use to better understand and support their students.

This work will connect with the work of institutions using non-behavioral data
(e.g. students' intentions) to attempt to profile.


Change log:
#--------------------------------------------------------------------------------
# 2018-06-04.1 | CS | Initial file created
#--------------------------------------------------------------------------------
"""

'\nAuthor:\nCameron Smither, American Institutes for Research\n\nPurpose:\nUse institution 3175\'s student, progression, and cost data to attempt to build \n"Progression Profiles" for CBE students using behavioral information (course\nand cost). We will (attempt to) use cluster analysis to see what natural \npatterns appear in enrollment data related to student outcomes with the \nultimate goal of creating "Profiles" of students (or, a process to build them) \nthat institutions could use to better understand and support their students.\n\nThis work will connect with the work of institutions using non-behavioral data\n(e.g. students\' intentions) to attempt to profile.\n\n\nChange log:\n#--------------------------------------------------------------------------------\n# 2018-06-04.1 | CS | Initial file created\n#--------------------------------------------------------------------------------\n'

In [85]:
# Import needed modules
import os
import pandas as pd
import numpy as np
import sklearn as skl

In [3]:
# Global references shared by every cell below; update both values when
# pointing the notebook at a subsequent institution.
inst_dir = 'Z:\\Lumina - CBE Evaluation\\02 Data\\Analysis\\Capella-3175'
inst_id = "3175"

# Work from the directory where the institution's data are stored. The bare
# getcwd() call is the last expression in the cell, so the notebook echoes the
# new working directory as the cell output.
os.chdir(inst_dir)
os.getcwd()

'Z:\\Lumina - CBE Evaluation\\02 Data\\Analysis\\Capella-3175'

In [40]:
## Prep inst's Student data
# Load the institution's student-level Stata file.
stud_path = inst_dir + '\\Student\\' + inst_id + '_Student_v1_cap.dta'
stud_df = pd.read_stata(stud_path)

# A zero-row slice prints just the column names: the elements before trimming.
print(stud_df.head(0))

# Elements we don't want/need downstream.
# BE CAREFUL WITH RACETH: the original note says Capella shared raceth as
# numbers like we wanted, so raceth_int is not needed -- yet this list drops
# 'raceth' and leaves 'raceth_int' in the frame.
# NOTE(review): confirm which of the two race/ethnicity elements should stay.
unneeded_cols = [
    'inst_id', 'sex', 'myob', 'forres_ind', 'raceth',
    'prior_pse_amttype', 'prior_pse_credential',
    'mil_res_ae', 'vet_ae', 'ftft_flag', 'employed_ae',
    'remed', 'pell_elig', 'pell_recip',
    'debt_ae', 'unemploy', 'status', 'trnsfroutcbe', 'trnsfrincbe',
]
stud_df = stud_df.drop(columns=unneeded_cols)

# The elements that remain after trimming.
print(stud_df.head(0))

Empty DataFrame
Columns: [inst_id, stud_id, cohort, sex, myob, zip, forres_ind, raceth, prior_pse_amt, prior_pse_amttype, prior_pse_credential, start, mil_res_ae, vet_ae, ftft_flag, employed_ae, remed, pell_elig, pell_recip, debt_ae, unemploy, status, trnsfroutcbe, trnsfrincbe, z_sex, z_start, age, age_bin, z_forres, raceth_int, z_white, z_black, z_hisp, z_asian, z_2plus, z_missrace, z_orace, prior_pse_amt_bin, z_prior_cred, z_milres, z_vet, z_ftft, z_employed, z_remed, z_pelle, z_pellr, debt_ae_bin]
Index: []

[0 rows x 47 columns]
Empty DataFrame
Columns: [stud_id, cohort, zip, prior_pse_amt, start, z_sex, z_start, age, age_bin, z_forres, raceth_int, z_white, z_black, z_hisp, z_asian, z_2plus, z_missrace, z_orace, prior_pse_amt_bin, z_prior_cred, z_milres, z_vet, z_ftft, z_employed, z_remed, z_pelle, z_pellr, debt_ae_bin]
Index: []

[0 rows x 28 columns]


In [63]:
## Prep inst's Progression data
# Load the institution's master progression Stata file.
prog_path = inst_dir + '\\Analysis\\' + inst_id + '_progression_MASTER.dta'
prog_df = pd.read_stata(prog_path)

# Column layout of the progression file, by position (author's notes):
# columns 0-6   (stud_id ... age) are student characteristics
# columns 7-8   (retain*) are retention indicators
# columns 9-18  (avail_p*) describe the number of attempted units, by period
# columns 19-28 (earn_p*) describe the number of earned units, by period
# columns 29-34 (prior_group ... tt_cred) reflect share of program completed in
#               units at entry and time to milestone values
# columns 35-44 (ratio_p*) describe the ratio of units earned vs attempted
# columns 45-69 (p*) describe the running cumulative share of their program
#               accumulated by+during that period
# columns 70-93 (earned*) describe the share of their program accumulated in
#               *each individual* period
# columns 94-99 (ever_cred ... inst_id) some additional elements
#
# The p*/earned* boundary above is corrected from the original note
# (45-70 / 71-93): keeping positions 0 and 70-93 yields stud_id plus
# earned1..earned24, so position 70 is earned1.
# NOTE(review): the original note said to keep ever_cred (completion flag) and
# ETC (elapsed time to credential), but positions 94-99 are all dropped below.
# Confirm whether those two elements should be retained.

# Positions to delete: everything except stud_id (0) and earned* (70-93).
del_list = list(range(1, 70)) + list(range(94, 100))

# Index the column labels with the flat list of positions. Wrapping the list in
# a second pair of brackets builds a 2-D indexer, which current pandas/NumPy
# reject for an Index.
prog_df = prog_df.drop(columns=prog_df.columns[del_list])

# A zero-row slice prints just the names of the elements that remain.
print(prog_df.head(0))


Empty DataFrame
Columns: [stud_id, earned1, earned2, earned3, earned4, earned5, earned6, earned7, earned8, earned9, earned10, earned11, earned12, earned13, earned14, earned15, earned16, earned17, earned18, earned19, earned20, earned21, earned22, earned23, earned24]
Index: []

[0 rows x 25 columns]


In [66]:
## Prep inst's Cost data
# Load the calculated cost file. Every backslash in the path is escaped
# explicitly: a lone backslash before 'C' is an invalid escape sequence that
# Python only tolerates with a warning.
cost_path = inst_dir + '\\Analysis\\Cost analyses\\cost_calculated.dta'
cost_df = pd.read_stata(cost_path)

# Keep only the student key and the share-of-charges elements we want/need.
keep_list = ['stud_id', 'pct_charges_grants', 'pct_charges_loans', 'pct_charges_unknown']
cost_df = cost_df[keep_list]

# A zero-row slice prints just the names of the elements that remain.
print(cost_df.head(0))

Empty DataFrame
Columns: [stud_id, pct_charges_grants, pct_charges_loans, pct_charges_unknown]
Index: []


In [80]:
## Create a master df
# Key each frame on stud_id so the joins align on the student identifier.
stud_by_id = stud_df.set_index('stud_id')
prog_by_id = prog_df.set_index('stud_id')
cost_by_id = cost_df.set_index('stud_id')

# Student + progression, then (student + progression) + cost. The student
# frame is the left side of both joins.
master_df = stud_by_id.join(prog_by_id).join(cost_by_id)

# A zero-row slice prints just the names of the combined elements.
print(master_df.head(0))

Empty DataFrame
Columns: [cohort, zip, prior_pse_amt, start, z_sex, z_start, age, age_bin, z_forres, raceth_int, z_white, z_black, z_hisp, z_asian, z_2plus, z_missrace, z_orace, prior_pse_amt_bin, z_prior_cred, z_milres, z_vet, z_ftft, z_employed, z_remed, z_pelle, z_pellr, debt_ae_bin, earned1, earned2, earned3, earned4, earned5, earned6, earned7, earned8, earned9, earned10, earned11, earned12, earned13, earned14, earned15, earned16, earned17, earned18, earned19, earned20, earned21, earned22, earned23, earned24, pct_charges_grants, pct_charges_loans, pct_charges_unknown]
Index: []

[0 rows x 54 columns]


In [None]:
## SCI-KIT ATTEMPTS