In [3]:
import csv
import random

random.seed(10)

from sklearn.decomposition import PCA
from copy import deepcopy

import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# Column order of the event-sequence csv files (looked up by name via .index()).
columns = [
    "TestSubject", "Event", "TimeStamp", "Duration",
    "Location", "GameTime", "Target",
    "HowIsItGoingLikert", "ProgressPlanSummary",
    "SolutionApproachSummary", "DifferentProblemApproachSummary",
    "Name", "Title", "NPC", "NpcSpokeCount",
    "ConceptMatrixEditCount", "ConceptMatrixAnsweredCorrectly",
    "ObjectScanned", "TestingFor", "ReasonForTesting",
    "TotalFieldsModified", "WorksheetSubmitResult",
]

# Column order of the activity-summary csv files.
activity_columns = [
    "TestSubject", "ActivityUri", "ActivityStarts", "ActivityRestarts",
    "Age", "Gender", "Race",
    "VideoGamePlayingFrequency", "VideoGamePlayingSkill",
    "VideoGamePlayingHoursPerWeek", "VideoGamesPlayed",
    "PreTestScore", "PostTestScore", "LearningGain", "MysterySolved",
    "TotalPromptOpens", "MeanPromptResponseDuration", "MeanHowIsItGoingLikert",
    "TotalWorksheetSubmits",
    "SolutionDisease", "InfectionType", "SolutionObject", "SolutionTreatment",
    "StartTime", "StopTime", "Duration", "PlotPointsActivated",
    "TotalComplexTextDuration", "MeanComplexTextDuration",
    "TotalPosterOpenDuration", "MeanPosterOpenDuration",
    "TotalScanningDuration", "MeanScanningDuration",
    "TotalDiagnosisWorksheetDuration", "MeanDiagnosisWorksheetDuration",
    "TotalDialogSelections", "TotalComplexTextOpens", "TotalPostersOpens",
    "TotalScans", "TotalDiagnosisWorksheetOpens", "TotalBackpackOpens",
]

# Column order of the labeled-reflections csv file.
reflect_columns = [
    "StudentId", "Instance", "PromptId", "GameTime(sec)",
    "QuestionId", "Response",
    "ReflectionRating1", "ReflectionRating2", "ReflectionRatingAvg",
    "PreTestScore", "PostTestScore",
    "NormalizedLearningGain", "NLG_revised",
]

# Event types that are kept in the logs; any other event is treated as movement.
events_without_movement = [
    "Conversation", "BooksAndArticles", "Worksheet", "Prompts",
    "PlotPoint", "Posters", "Scanner", "WorksheetSubmit",
]

# Plot-point names; a name's position here is its slot in the goal vector.
plot_names = [
    "IntroFromKim", "TutorialComplete", "CompletedGOT", "TalkedToBryce",
    "SecondaryPatientSymptoms", "TalkedToQuentin", "LearnFoodHistory",
    "TalkedToElise", "TestObject", "TalkedToTheresa",
    "PrimaryPatientSymptoms", "TalkedToSam", "TalkedToGreg",
    "TestContaminatedObject", "SubmittedDiagnosis", "TalkedToRobert",
    "LearnAboutBacteria", "SolvedMystery", "TalkedToFord",
    "LearnAboutViruses",
]

# Event-sequence csv per study year.
filenames = {
    2018: "Datasets/EventSequence2018.csv",
    2019: "Datasets/EventSequence2019.csv",
}

# Activity-summary csv per study year.
summary_filenames = {
    2018: "Datasets/ActivitySummary2018.csv",
    2019: "Datasets/ActivitySummary2019.csv",
}

# Reflection data csv filename
reflect_file = "Datasets/LabeledReflections.csv"

# Location of the ELMo TF-Hub module on disk.
elmo_module_locn = "module/module_elmo2/"

In [None]:
def embed_elmo2(module):
    """Build a TF1-style graph around a TF-Hub module and return an embed callable.

    Args:
        module: Handle passed straight to ``hub.Module`` (here a local path).

    Returns:
        A function ``f(x)`` that feeds ``x`` into the string placeholder and
        returns the result of running the module's output in the session.

    Note:
        The ``MonitoredSession`` created here is captured by the returned
        lambda and is never closed by this function.
    """
    with tf.Graph().as_default():
        # Graph inputs/outputs: a string placeholder fed through the hub module.
        sentences = tf.compat.v1.placeholder(tf.string)
        embed = hub.Module(module)
        embeddings = embed(sentences)
        # Session is created inside the graph context so it binds to this graph.
        session = tf.compat.v1.train.MonitoredSession()
    return lambda x: session.run(embeddings, {sentences: x})

# Load the module from the same location through the hub.load API.
elmo = hub.load(elmo_module_locn)
# Session-backed embedding callable built from the same module location.
embed_fn = embed_elmo2(elmo_module_locn)

In [None]:
def get_embedding(text, pca=None, pca_dim=32, emb_dim=1024):
    """Return the sentence-level embedding of ``text``, optionally PCA-reduced.

    Args:
        text: Response string. A blank string yields a random placeholder
            vector instead of a model embedding.
        pca: Fitted model exposing ``transform``; when ``None`` the raw
            embedding is returned.
        pca_dim: Length of the random placeholder when ``pca`` is given.
        emb_dim: Length of the random placeholder when ``pca`` is ``None``.
            The default of 1024 is assumed to match the ELMo sentence vector
            produced by ``embed_fn`` -- confirm if a different module is used.

    Returns:
        For blank text: a list of floats drawn uniformly from [-1, 1].
        Otherwise: the embedding as a list (no PCA) or the first row of
        ``pca.transform([embedding])``.
    """
    if text == "":  # blank response: fall back to a random vector
        dim = emb_dim if pca is None else pca_dim
        return [random.uniform(-1, 1) for _ in range(dim)]

    # embed_fn returns one vector per input sentence; unwrap the single row.
    sentence_vec = embed_fn([text])[0].tolist()

    if pca is None:  # no PCA model provided, return embedding directly
        return sentence_vec

    return pca.transform([sentence_vec])[0]  # reduced embedding

In [None]:
def get_scores(s_id, study_number):
    """Look up a student's pre- and post-test scores in the activity summaries.

    Every file in ``summary_filenames`` is searched in order; the search stops
    after the first file that contains the student.

    Args:
        s_id: Student identifier, compared against the 'TestSubject' column.
        study_number: Accepted for interface compatibility; not used to
            restrict the search.

    Returns:
        ``(pretest, posttest)`` as floats, or ``(-1, -1)`` when the student is
        missing or has no row with both scores filled in (such entries are
        ignored in the final dataset).
    """
    # Column positions are fixed, so resolve them once instead of per row.
    subject_col = activity_columns.index('TestSubject')
    pretest_col = activity_columns.index('PreTestScore')
    posttest_col = activity_columns.index('PostTestScore')
    min_width = max(subject_col, pretest_col, posttest_col) + 1

    for summary_path in summary_filenames.values():
        student_found = False
        # newline='' lets the csv module handle embedded newlines itself.
        with open(summary_path, newline='') as csv_file:
            for row in csv.reader(csv_file, delimiter=','):
                # Blank or truncated lines cannot hold the needed columns.
                if len(row) < min_width or row[subject_col] != s_id:
                    continue
                student_found = True
                pretest_txt = row[pretest_col]
                posttest_txt = row[posttest_col]
                if pretest_txt != "" and posttest_txt != "":
                    return float(pretest_txt), float(posttest_txt)
        if student_found:
            # Student exists in this study but has no complete score pair;
            # do not fall through to the other study files.
            break

    return -1, -1

In [None]:
def get_next_rating(s_id, response_number):
    """Return the average reflection rating of the student's next response.

    Scans ``reflect_file`` for rows of ``s_id`` whose 'Instance' equals
    ``response_number + 1``. The value of the last matching row is returned
    exactly as read from the csv; ``-1`` is returned when nothing matches
    (entries with next rating -1 are ignored in the final dataset).
    """
    rating = -1

    with open(reflect_file) as handle:
        student_col = reflect_columns.index('StudentId')
        instance_col = reflect_columns.index('Instance')
        rating_col = reflect_columns.index('ReflectionRatingAvg')
        for record in csv.reader(handle, delimiter=','):
            if record[student_col] != s_id:
                continue  # another student's row (or the header)
            if float(record[instance_col]) == response_number + 1:
                rating = record[rating_col]  # keep scanning: last match wins

    return rating

In [None]:
def get_logs(s_id, data_row, prev_log, pca=None, pca_dim=32, tutorial_complete=False):
    """Build the log entry for one event row, carrying state over from ``prev_log``.

    Args:
        s_id: Student identifier, forwarded to ``get_next_rating``.
        data_row: One row of an event-sequence csv, indexed via ``columns``.
        prev_log: The student's previous log dict, or a falsy value if none.
        pca: Optional PCA model forwarded to ``get_embedding``.
        pca_dim: Length of the initial random response embedding; also
            forwarded to ``get_embedding``.
        tutorial_complete: Flag value to return unless this row is the
            'TutorialComplete' plot point.

    Returns:
        ``(log, tutorial_complete)``. ``log`` is ``{}`` when the row's event is
        not in ``events_without_movement``; otherwise it has the keys
        'event', 'goal', 'response_emb', 'next_rating' and 'response_number'.

    Raises:
        ValueError: If a 'PlotPoint' row names a plot point not in ``plot_names``.
    """
    log = {}
    
    # Defaults used when there is no previous log for this student.
    event_vec = [0 for e in range(len(events_without_movement))]
    goal_vec = [0 for g in range(len(plot_names))]
    # The first two plot points start out as already achieved.
    goal_vec[plot_names.index('IntroFromKim')] = 1
    goal_vec[plot_names.index('TutorialComplete')] = 1
    # NOTE: drawn on every call, before the early return below, so the random
    # stream advances even for rows that are skipped or that reuse prev_log.
    emb = [random.uniform(-1,1) for e in range(pca_dim)]
    next_rating = -1
    response_number = 0
    if prev_log: 
        # if previous action exists, maintain prev goals, response emb, next rating
        # and current response number
        goal_vec = deepcopy(prev_log['goal'])
        emb = deepcopy(prev_log['response_emb'])
        next_rating = deepcopy(prev_log['next_rating'])
        response_number = deepcopy(prev_log['response_number'])
    
    event_col = columns.index('Event')
    if data_row[event_col] in events_without_movement:
        event_vec[events_without_movement.index(data_row[event_col])] = 1 # set current event to 1
    else:
        return {}, tutorial_complete # event is movement, and hence is not added to logs
    
    if data_row[event_col] == 'PlotPoint': # if new goal achieved
        plot_col = columns.index('Name')
        goal_vec[plot_names.index(data_row[plot_col])] = 1 # set new goal to 1
        if data_row[plot_col] == 'TutorialComplete':
            tutorial_complete = True
            
    if data_row[event_col] == 'Prompts':
        progress_plan_col = columns.index('ProgressPlanSummary')
        solution_col = columns.index('SolutionApproachSummary')
        diff_solution_col = columns.index('DifferentProblemApproachSummary')
        # combine all response types to construct response
        # (plain concatenation, no separator between the three fields)
        response = data_row[progress_plan_col] + data_row[solution_col] + data_row[diff_solution_col]
        emb = get_embedding(response, pca=pca, pca_dim=pca_dim)
        response_number += 1 # update current response count for student
        next_rating = get_next_rating(s_id, response_number) # get response rating for next response
    elif next_rating == -1:
        # No rating known yet: look it up again for the current response count.
        next_rating = get_next_rating(s_id, response_number) # get response rating for next response
    
    if prev_log:
        # Running total: add this row's one-hot event to the previous counts.
        log['event'] = [a + b for a, b in zip(prev_log['event'], event_vec)] # event vector is a count vec
    else:
        log['event'] = event_vec
    log['goal'] = goal_vec
    log['response_emb'] = emb
    log['next_rating'] = next_rating
    log['response_number'] = response_number
    
    return log, tutorial_complete

In [None]:
def get_pca_model(pca_dim):
    """Fit and return a PCA model on the embeddings of all written reflections.

    Every non-blank 'Response' in ``reflect_file`` (header row excluded) is
    embedded with ``get_embedding`` (no PCA) and the resulting vectors are
    used to fit a ``PCA(pca_dim)`` model.
    """
    model = PCA(pca_dim)

    with open(reflect_file) as handle:
        reader = csv.reader(handle, delimiter=',')
        next(reader, None)  # ignore column headers
        response_col = reflect_columns.index('Response')
        # Embed each written response; blank responses are left out.
        vectors = [
            get_embedding(record[response_col])
            for record in reader
            if record[response_col] != ""
        ]

    model.fit(vectors)

    return model

In [None]:
def get_game_logs(pca_dim=32, movement=False):
    """Assemble per-student game logs from every event-sequence file.

    Args:
        pca_dim: Dimensionality of the PCA-reduced response embeddings.
        movement: Currently unused.

    Returns:
        Dict keyed by student id. Each value holds 'pretest', 'posttest',
        'tutorial_complete' and 'logs' (list of log dicts from ``get_logs``).
        Students with a missing pre- or post-test score (-1) keep an empty
        'logs' list.
    """
    # PCA model used to reduce the response embeddings
    pca = get_pca_model(pca_dim)

    data = {}  # student logs, keyed by student id

    for study, path in filenames.items():  # study: 2018/2019
        with open(path) as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            next(reader, None)  # skip header
            subject_col = columns.index('TestSubject')

            for row in reader:
                s_id = row[subject_col]  # ID of current student

                if s_id not in data:  # first row seen for this student
                    data[s_id] = {}
                    pretest, posttest = get_scores(s_id, study)
                    data[s_id]['pretest'] = pretest
                    data[s_id]['posttest'] = posttest
                    data[s_id]['tutorial_complete'] = False
                    data[s_id]['logs'] = []  # no actions logged yet
                student = data[s_id]

                # Students without both scores are never logged.
                if student['posttest'] == -1 or student['pretest'] == -1:
                    continue

                history = student['logs']
                prev_log = history[-1] if len(history) > 0 else {}  # last action, if any

                log, student['tutorial_complete'] = get_logs(
                    s_id,
                    row,
                    prev_log,
                    pca=pca,
                    pca_dim=pca_dim,
                    tutorial_complete=student['tutorial_complete'],
                )

                # Keep only non-movement events once the tutorial is complete.
                if log and student['tutorial_complete']:
                    history.append(log)

    return data