In [30]:
"""
Author:
Cameron Smither, American Institutes for Research

Purpose:
Use institution 3175's student, progression, and cost data to attempt to build 
"Progression Profiles" for CBE students using behavioral information (course
and cost). We will (attempt to) use cluster analysis to see what natural 
patterns appear in enrollment data related to student outcomes with the 
ultimate goal of creating "Profiles" of students (or, a process to build them) 
that institutions could use to better understand and support their students.

This work will connect with the work of institutions that use non-behavioral data
(e.g. students' intentions) to attempt to profile their students.


Change log:
#--------------------------------------------------------------------------------
# 2018-06-04.1 | CS | Initial file created
#--------------------------------------------------------------------------------
"""

'\nAuthor: Cameron Smither, American Institutes for Research\n\nPurpose:\nUse institution 3175\'s student, progression, and cost data to attempt to construct\nsome form of "CBE progression profiles". This will be an interative process.\n\n\nChange log:\n#--------------------------------------------------------------------------------\n# 2018-06-04.1 | CS | Initial file created\n#--------------------------------------------------------------------------------\n'

In [2]:
# Import needed modules
import os
import pandas as pd
import numpy as np


In [3]:
# Institution-level settings referenced by the cells below.
# Update BOTH values when pointing this notebook at a different institution.
inst_id = '3175'
inst_dir = "Z:\\Lumina - CBE Evaluation\\02 Data\\Analysis\\Capella-3175"

# Make the institution's data folder the working directory, then echo the
# resulting location as the cell output so the switch can be confirmed.
# (Uncomment os.listdir() to also see what the folder contains.)
os.chdir(inst_dir)
#os.listdir()
os.getcwd()

'Z:\\Lumina - CBE Evaluation\\02 Data\\Analysis\\Capella-3175'

In [44]:
## Load the institution's Student file (Stata .dta) into a DataFrame
# The path is assembled with a literal backslash separator so it is exactly
# <inst_dir>\Student\<inst_id>_Student_v1_cap.dta
stud_df = pd.read_stata(
    '\\'.join([inst_dir, 'Student', inst_id + '_Student_v1_cap.dta'])
)

# Optional quick looks at the data (uncomment as needed):
#stud_df.head()
#stud_df.describe()

# Print a zero-row view of the frame so that only the column names are listed
print(stud_df.head(0))

Empty DataFrame
Columns: [inst_id, stud_id, cohort, sex, myob, zip, forres_ind, raceth, prior_pse_amt, prior_pse_amttype, prior_pse_credential, start, mil_res_ae, vet_ae, ftft_flag, employed_ae, remed, pell_elig, pell_recip, debt_ae, unemploy, status, trnsfroutcbe, trnsfrincbe, z_sex, z_start, age, age_bin, z_forres, raceth_int, z_white, z_black, z_hisp, z_asian, z_2plus, z_missrace, z_orace, prior_pse_amt_bin, z_prior_cred, z_milres, z_vet, z_ftft, z_employed, z_remed, z_pelle, z_pellr, debt_ae_bin]
Index: []

[0 rows x 47 columns]


In [8]:
## Load the institution's Progression master file (Stata .dta) into a DataFrame
# The path is assembled with a literal backslash separator so it is exactly
# <inst_dir>\Analysis\<inst_id>_progression_MASTER.dta
prog_df = pd.read_stata(
    '\\'.join([inst_dir, 'Analysis', inst_id + '_progression_MASTER.dta'])
)

# Optional quick looks at the data (uncomment as needed):
#prog_df.head()
#print (prog_df[:0]) #view elements

# Column layout of the file as loaded (0-based positions):
#   0-6    (stud_id ... age)          student characteristics
#   7-8    (retain*)                  retention indicators
#   9-18   (avail_p*)                 number of attempted units, by period
#   19-28  (earn_p*)                  number of earned units, by period
#   29-34  (prior_group ... tt_cred)  share of program completed in units at entry
#                                     and time to milestone values
#   35-44  (ratio_p*)                 ratio of units earned vs attempted
#   45-69  (p0 ... p24)               running cumulative share of their program
#                                     accumulated by+during that period
#   70-93  (earned1 ... earned24)     share of their program accumulated in
#                                     *each individual* period
#   94-99  (ever_cred ... inst_id)    some additional elements. Keep only ever_cred
#                                     (completion flag) and ETC (elapsed time to credential)
# NOTE(review): positions are derived from the column listing this cell prints
# (25 p* columns, 24 earned* columns) -- re-check if the file layout changes.

# Handy checks while exploring (uncomment as needed):
#print (prog_df.iloc[0:0,94:]) # view specific element names
#prog_df['ETC'].value_counts() # check specific element values

# Discard the elements we don't want/need; the frame was loaded fresh above,
# so rebinding the name is equivalent to dropping in place.
prog_df = prog_df.drop(columns=['uniqueid', 'z_sex', 'raceth_int', 'inst_id'])

# Print a zero-row view of the frame so that only the remaining column names are listed
print(prog_df.head(0))


Empty DataFrame
Columns: [stud_id, cohort, sex, raceth, pell_elig, pell_recip, age, retain_91, retain_1yr, avail_p1, avail_p2, avail_p3, avail_p4, avail_p5, avail_p6, avail_p7, avail_p8, avail_p9, avail_p10, earn_p1, earn_p2, earn_p3, earn_p4, earn_p5, earn_p6, earn_p7, earn_p8, earn_p9, earn_p10, prior_group, ttm_25, ttm_50, ttm_75, ttm_100, tt_cred, ratio_p1, ratio_p2, ratio_p3, ratio_p4, ratio_p5, ratio_p6, ratio_p7, ratio_p8, ratio_p9, ratio_p10, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23, p24, earned1, earned2, earned3, earned4, earned5, earned6, earned7, earned8, earned9, earned10, earned11, earned12, earned13, earned14, earned15, earned16, earned17, earned18, earned19, earned20, earned21, earned22, earned23, earned24, ever_cred, ETC]
Index: []

[0 rows x 96 columns]


In [152]:
## Prep inst's Cost data
