In [1]:
import pandas as pd
import json
#import the lti_context_id to root_guid_id report
guid_to_context = pd.read_csv('lti_context_to_outcome_guid.csv',encoding='utf-8')
#only these two columns are needed; .copy() makes the result an independent frame rather than a slice of the report
guid_to_context = guid_to_context[['RootOutcomeGuid','LtiContextId']].copy()
guid_to_context.index = guid_to_context.LtiContextId   #assign LtiContextId as the index (to easily match values)

df = pd.read_csv('Spring_Assessment_Data.csv')
#keep summative questions and only the rows with attempt == 0 (treated as each student's first attempt);
#.copy() so the column assignments below act on an independent frame instead of a slice of df
#(assigning columns on a filtered slice is what triggers pandas' SettingWithCopyWarning)
summative = df[(df.quiz_type == 'summative') & (df.attempt == 0)].copy()

#"<outcome_guid>:<question_id>" identifies one question within one outcome
summative['outquest_map'] = summative.outcome_guid.astype(str) + ':' + summative.question_id.astype(str)

#give every distinct outcome/question pair a short string id ('0', '1', ...) in order of first appearance
unique_outquests = summative.outquest_map.unique()
outquest_to_id_default = pd.DataFrame(pd.Series(range(len(unique_outquests))).astype(str).values,index=unique_outquests,columns=['id'])

#assigns the new id to each question attempt record; mapping through the id Series does one
#index-aligned lookup instead of one .loc call per row
summative['outquest'] = summative['outquest_map'].map(outquest_to_id_default['id'])
def get_root_guid(cell, mapping=None):
    """Return the RootOutcomeGuid mapped to one LtiContextId value.

    Parameters
    ----------
    cell : the LtiContextId value to look up; it is converted with str()
        before being matched against the index of the mapping table.
    mapping : optional DataFrame indexed by LtiContextId with a
        'RootOutcomeGuid' column. Defaults to the module-level
        guid_to_context table, so existing one-argument calls are unchanged.

    Returns
    -------
    The guid as a string, or None when the cell is missing, the context id
    is not in the mapping, or the mapped guid itself is missing.
    """
    if mapping is None:
        mapping = guid_to_context

    #missing input (None / NaN) has nothing to look up
    if cell is None or pd.isna(cell):
        return None

    key = str(cell)
    if key not in mapping.index:   #the LtiContextId is not in the mapping file
        return None

    result = mapping.loc[key, 'RootOutcomeGuid']

    #a context id that appears on several rows comes back as a Series; take the
    #first non-missing guid regardless of how many rows matched (the previous
    #code only coped with exactly two matches and otherwise returned the
    #string form of the whole Series)
    if isinstance(result, pd.Series):
        result = result.dropna()
        if result.empty:
            return None
        result = result.iloc[0]

    #a missing guid in the mapping is reported as None rather than the text 'nan'
    if pd.isna(result):
        return None

    return str(result)

#run the get_root_guid function to get RootOutcomeGuid from Lti_Context_Id
summative['root_guid'] = summative['lti_context_id'].map(get_root_guid)

#read in the all-outcomes text file; the with-block closes the file handle as soon as
#its contents have been read (it was previously left open for the life of the kernel)
with open('outcomes.txt') as outcomes1_txt:
    outcomes1_str = outcomes1_txt.read()
outcomes1_json = json.loads(outcomes1_str)
outcomes1 = pd.DataFrame(outcomes1_json)

#keep the rows whose root_guid does not compare equal to itself, i.e. rows where it is missing
#NOTE(review): presumably these are the course-level rows (no parent root) - confirm against outcomes.txt
courses_df = outcomes1[outcomes1.root_guid != outcomes1.root_guid]

#per-course IRT/CFA result files
dfBus = pd.read_csv('Introduction to Business_c948_final_IRT-CFA_data.csv')
dfMac = pd.read_csv('Macroeconomics _32d6_final_IRT-CFA_data.csv')
dfMic = pd.read_csv('Microeconomics _ec9a_final_IRT-CFA_data.csv')
##

In [2]:
#distinct outcomes per course among the rows whose model_fit value is below 3
dfBus_outcomes = dfBus.loc[dfBus['model_fit'] < 3, 'outcome'].unique()
dfMac_outcomes = dfMac.loc[dfMac['model_fit'] < 3, 'outcome'].unique()
dfMic_outcomes = dfMic.loc[dfMic['model_fit'] < 3, 'outcome'].unique()
##

In [3]:
#outcome column of each course's "no_converge_outcomes" file, as a plain array
dfBusOutcomes = pd.read_csv('Introduction to Business_c948_no_converge_outcomes.csv')['outcome'].values
dfMacOutcomes = pd.read_csv('Macroeconomics _32d6_no_converge_outcomes.csv')['outcome'].values
dfMicOutcomes = pd.read_csv('Microeconomics _ec9a_no_converge_outcomes.csv')['outcome'].values
##

In [4]:
#per course: the model_fit < 3 outcomes followed by the outcomes from the no_converge file
business_outcomes = [*dfBus_outcomes, *dfBusOutcomes]
macro_outcomes = [*dfMac_outcomes, *dfMacOutcomes]
micro_outcomes = [*dfMic_outcomes, *dfMicOutcomes]
##

In [5]:
# efa_trial_outcomes = dfBus[dfBus.model_fit == 1].outcome.value_counts().index[:3]
# outcome = efa_trial_outcomes[0]

In [6]:
#pick the course to analyse: guid and short title of the row at position 4 of courses_df
from subprocess import call

root_guid = courses_df['guid'].values[4]
course_name = courses_df['short_title'].values[4]

#bookkeeping for the outcome loop: outcomes whose run failed, outcomes that ran, and parsed results
problem_didnt_run, it_totally_ran, results = [], [], []

import re   #hoisted: previously imported (and get_number_out re-defined) on every loop iteration

#Mplus input template; the three slots are the data-file stem, the NAMES list and the USEVARIABLES list
MPLUS_EFA_TEMPLATE = '''

DATA:
  FILE IS {}.csv;

VARIABLE:

  NAMES ARE
  {}
  ;

  USEVARIABLES ARE
  {}
  ;

  MISSING ARE ALL (-999);

ANALYSIS:
    TYPE = efa 1 3;
    COVERAGE = 0.001;
    ROTATION = varimax;

PLOT: TYPE = PLOT2;

!CFA code
!MODEL:
!outcome by a1003
!a1034
!a705
!a825
!a844;
'''

#first loading on a row; the optional minus keeps the sign of negative loadings
#(the previous pattern started at the first digit and silently dropped it)
LOADING_PATTERN = r'-?[0-9]\.[0-9]'


def get_number_out(string,starta,enda,startb,endb):
    """Cut a piece of text out of `string` using two nested pairs of regex markers.

    The outer window starts at the first match of `starta` and ends 3 characters
    past the start of the first later match of `enda`. Inside that window the
    returned text starts at the first match of `startb` and stops just before
    the first later match of `endb`.

    Raises ValueError when any of the four patterns cannot be found.
    """
    outer_start = re.search(starta,string)
    if outer_start is None:
        raise ValueError('start marker {!r} not found'.format(starta))
    start1 = outer_start.start(0)

    outer_end = re.search(enda,string[start1:])
    if outer_end is None:
        raise ValueError('end marker {!r} not found after {!r}'.format(enda, starta))
    #the 3 extra characters keep the line break that follows the row inside the window
    string2 = string[start1:outer_end.start(0) + start1 + 3]

    inner_start = re.search(startb,string2)
    if inner_start is None:
        raise ValueError('value pattern {!r} not found after {!r}'.format(startb, starta))
    start2 = inner_start.start(0)

    inner_end = re.search(endb,string2[start2:])
    if inner_end is None:
        raise ValueError('terminator {!r} not found after {!r}'.format(endb, starta))
    return string2[start2:inner_end.start(0) + start2]


def _parse_fit_row(fit_text, label):
    """Return the non-empty space-separated tokens of the `label` row (e.g. '1-factor'), or None if absent."""
    try:
        #NOTE(review): the two-digit pattern skips the digit inside the label itself, but it
        #would also skip a leading single-digit value on the row - confirm against real output
        row = get_number_out(fit_text,label,'\n','[0-9][0-9]','\n')
    except ValueError:
        return None
    tokens = pd.Series(row.split(' '))
    return tokens[tokens != ''].values


def _parse_loadings(output, n_factors, names):
    """Return a DataFrame with one row per variable (name, loading 1..n_factors), or None.

    None means the output has no 'EXPLORATORY FACTOR ANALYSIS WITH <n> FACTOR(S):'
    section. A section that is present but cannot be parsed raises ValueError.
    """
    header = 'EXPLORATORY FACTOR ANALYSIS WITH {} FACTOR(S):'.format(n_factors)
    if header not in output:
        return None

    #the 1-factor section is read from the estimated loadings table, the others from the varimax table
    table_marker = 'ESTIMATED FACTOR LOADINGS' if n_factors == 1 else 'VARIMAX ROTATED LOADINGS'
    section = output[output.find(header):]
    section = section[section.find(table_marker):]

    names = list(names)
    rows = []
    for position, name in enumerate(names):
        #a row ends where the next variable's row starts; the last row ends at the residual variances table
        if position == len(names) - 1:
            stop = 'ESTIMATED RESIDUAL VARIANCES'
        else:
            stop = re.escape(names[position + 1])
        values = get_number_out(section,re.escape(name),stop,LOADING_PATTERN,'\n').split()
        if len(values) < n_factors:
            raise ValueError('expected {} loadings for {} but found {}'.format(n_factors, name, len(values)))
        rows.append([name] + values[:n_factors])
    return pd.DataFrame(rows)


for outcome in macro_outcomes:

    #students x questions table of scores for this outcome within the selected course
    outcome_rows = summative[(summative.outcome_guid == outcome)&(summative.root_guid == root_guid)]
    pivot_table = outcome_rows.pivot_table(index='lti_user_id', columns='outquest', values='score')

    #keep only the questions with more than response_threshold responses
    response_threshold = 100
    question_count = pivot_table.count()
    greater_than_100_questions = question_count[question_count.values > response_threshold].index
    pivot_table = pivot_table[greater_than_100_questions]

    #drop students with no response at all, then code the remaining gaps as -999
    #(the value the input template declares as missing)
    pivot_table = pivot_table.dropna(axis=0,thresh=1)
    pivot_table = pivot_table.fillna(-999)

    #prefix every question id with "A" to form the variable names written to the input file
    pivot_table = pivot_table.rename(columns=lambda name: "A"+str(name))

    pivot_table.to_csv('{}.csv'.format(outcome),header=False,index=False)

    variables = '\n    '.join(pivot_table.columns)
    input_file_name = '{}.inp'.format(outcome)
    with open(input_file_name, "w") as text_file:
        text_file.write(MPLUS_EFA_TEMPLATE.format(outcome, variables, variables))

    output_file_name = '{}.out'.format(outcome)
    store = call(['mplus',input_file_name,output_file_name])
    #any non-zero exit status counts as a failed run (previously only status 1 was checked)
    if store != 0:
        problem_didnt_run.append(outcome)
        continue
    it_totally_ran.append(outcome)

    with open(output_file_name, 'r') as f:
        output = f.read()

    #read the fit rows from the model fit summary onwards when that heading exists
    #(this slice used to be computed but never used); otherwise search the whole output
    fit_start = output.find('SUMMARY OF MODEL FIT INFORMATION')
    output2 = output[fit_start:] if fit_start != -1 else output
    factor_one_results = _parse_fit_row(output2,'1-factor')
    factor_two_results = _parse_fit_row(output2,'2-factor')
    factor_three_results = _parse_fit_row(output2,'3-factor')

    #loadings per solution; None when the output has no section for that number of factors
    factor1df = _parse_loadings(output,1,pivot_table.columns)
    factor2df = _parse_loadings(output,2,pivot_table.columns)
    factor3df = _parse_loadings(output,3,pivot_table.columns)

    results.append({
        'Outcome':outcome,
        '1FactorResults':factor_one_results,
        '1FactorLoadings':factor1df,
        '2FactorResults':factor_two_results,
        '2FactorLoadings':factor2df,
        '3FactorResults':factor_three_results,
        '3FactorLoadings':factor3df
    })

#one row per outcome that was run and parsed; naming the columns explicitly keeps the
#'Outcome' merge key present even when results is empty (otherwise the merge raises KeyError)
#NOTE(review): this rebinds df, which earlier held the raw Spring_Assessment_Data.csv frame
result_columns = ['Outcome',
                  '1FactorResults','1FactorLoadings',
                  '2FactorResults','2FactorLoadings',
                  '3FactorResults','3FactorLoadings']
df = pd.DataFrame(results, columns=result_columns)

#one row per distinct (outcome, short_title, model_fit) with model_fit below 3, keyed like the results
dfMac_fit = dfMac[['outcome','short_title','model_fit']].drop_duplicates().rename(columns={'outcome':'Outcome'})
dfMac_fit = dfMac_fit[dfMac_fit.model_fit < 3]

#outer merge keeps outcomes that appear on only one side
df = df.merge(dfMac_fit,on='Outcome',how='outer')

df.to_csv('{}_EFA_Results.csv'.format(course_name),index=False)

In [None]:
#outcomes that sit under one specific parent outcome in the Macroeconomics results
tarah_outcomes = dfMac[dfMac.parent_outcome == '479cce89-e456-4fa0-9bc0-debe586390ac'].outcome.unique()

#summative responses for those outcomes, restricted to one root guid
test = summative[(summative.outcome_guid.isin(tarah_outcomes))&(summative.root_guid == '32d69033-0515-4d54-b615-c05e74f88bf5')]

#write one students x questions score table per outcome; iterating the distinct outcomes
#does each pivot/write once (looping over the raw column repeated it for every response row)
for outcome in test.outcome_guid.unique():
    test_df = test[test.outcome_guid == outcome]
    final_data = test_df.pivot_table(index='lti_user_id', columns='outquest', values='score')
    final_data.to_csv('{}_TBKIkahihifo.csv'.format(outcome),index=False)
    

dfMac.groupby('parent_outcome').sum()