In [1]:
import numpy as np
import pandas as pd
import featuretools as ft
# Display the featuretools version this notebook was executed with (see output below).
ft.__version__

'0.1.17'

# About
This notebook shows how to create an `EntitySet` from a dataset that follows the LearnLab structure. The function defined here, `learnlab_to_entityset`, is also available in [utilities](utilities.py) so that it can be imported as a module in the [demo](Datashop%20CMU%20-%20Geometry.ipynb) notebook.

In [2]:
# Load the tab-separated transaction export. The separator is passed by
# keyword: pandas deprecated positional arguments after the path and newer
# releases reject them outright.
data = pd.read_csv('data/data.txt', sep='\t')
# Show the available columns as a quick sanity check of the schema.
data.columns

Index(['Row', 'Sample Name', 'Transaction Id', 'Anon Student Id', 'Session Id',
       'Time', 'Time Zone', 'Duration (sec)', 'Student Response Type',
       'Student Response Subtype', 'Tutor Response Type',
       'Tutor Response Subtype', 'Level (Unit)', 'Level (Section)',
       'Problem Name', 'Problem View', 'Problem Start Time', 'Step Name',
       'Attempt At Step', 'Is Last Attempt', 'Outcome', 'Selection', 'Action',
       'Input', 'Feedback Text', 'Feedback Classification', 'Help Level',
       'Total Num Hints', 'Condition Name', 'Condition Type', 'KC (Default)',
       'KC Category (Default)', 'KC (Single-KC)', 'KC Category (Single-KC)',
       'KC (Unique-step)', 'KC Category (Unique-step)', 'School', 'Class'],
      dtype='object')

In [28]:
import featuretools.variable_types as vtypes

def learnlab_to_entityset(data):
    """Build a featuretools EntitySet named 'Dataset' from a LearnLab transaction table.

    The input frame is not modified; all preprocessing happens on a copy.

    Preprocessing applied before the EntitySet is built:
      * the frame is indexed by 'Transaction Id' (the column is kept as well),
      * the 'Row' column is dropped,
      * rows whose 'Duration (sec)' equals the string '.' are removed,
      * 'Outcome' is mapped to 0 ('INCORRECT') / 1 ('CORRECT'); any other
        value becomes NaN,
      * an 'End Time' column is added as 'Time' + 'Duration (sec)' seconds.

    Args:
        data: pandas DataFrame holding one transaction per row. It must
            contain at least the columns referenced in this function:
            'Row', 'Transaction Id', 'Time', 'Duration (sec)', 'Outcome',
            'Is Last Attempt', 'Step Name', 'Problem Name', 'Session Id',
            'Anon Student Id', 'Class', 'School' and 'Attempt At Step'.
            Columns whose name starts with 'KC ' or 'CF ' are moved to the
            'problem_steps' entity.

    Returns:
        The populated EntitySet with entities 'transactions',
        'problem_steps', 'problems', 'sessions', 'students', 'classes',
        'schools' and 'attempts'.
    """
    # Make an EntitySet called Dataset with the following structure
    #
    # schools       students     problems
    #        \        |         /
    #   classes   sessions   problem steps
    #          \     |       /
    #           transactions  -- attempts
    #

    # set_index/drop return new frames, so the caller's DataFrame keeps its
    # original index and columns.
    transactions = data.set_index('Transaction Id', drop=False).drop(['Row'], axis=1)
    # '.' marks a missing duration in the raw file; those rows cannot get an
    # End Time. The explicit copy makes the column assignments below write to
    # an independent frame instead of a view of the filtered slice.
    transactions = transactions[transactions['Duration (sec)'] != '.'].copy()

    # Knowledge-component / custom-field columns plus the problem name
    # are attached to the problem step rather than to each transaction.
    kc_and_cf_cols = [col for col in transactions.columns
                      if col.startswith(('KC ', 'CF '))]
    kc_and_cf_cols.append('Problem Name')

    transactions['Outcome'] = transactions['Outcome'].map({'INCORRECT': 0, 'CORRECT': 1})
    # NOTE(review): 'Duration (sec)' itself is left as loaded; only a numeric
    # view of it is used to compute End Time.
    duration = pd.to_timedelta(pd.to_numeric(transactions['Duration (sec)']), 's')
    transactions['End Time'] = pd.to_datetime(transactions['Time']) + duration

    es = ft.EntitySet('Dataset')
    # The listed columns are tied to 'End Time' rather than 'Time' --
    # presumably because they are only known once the transaction finishes.
    es.entity_from_dataframe(entity_id='transactions',
                             index='Transaction Id',
                             dataframe=transactions,
                             variable_types={'Outcome': vtypes.Boolean},
                             time_index='Time',
                             secondary_time_index={'End Time': ['Outcome', 'Is Last Attempt', 'Duration (sec)']})

    # Two entities associated to problems
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='problem_steps',
                        index='Step Name',
                        additional_variables=kc_and_cf_cols,
                        make_time_index=False)

    es.normalize_entity(base_entity_id='problem_steps',
                        new_entity_id='problems',
                        index='Problem Name',
                        make_time_index=False)

    # Two entities associated to students
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='sessions',
                        index='Session Id',
                        additional_variables=['Anon Student Id'],
                        make_time_index=True)

    es.normalize_entity(base_entity_id='sessions',
                        new_entity_id='students',
                        index='Anon Student Id',
                        make_time_index=True)

    # Two entities associated to a school
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='classes',
                        index='Class',
                        additional_variables=['School'],
                        make_time_index=False)

    es.normalize_entity(base_entity_id='classes',
                        new_entity_id='schools',
                        index='School',
                        make_time_index=False)

    # An entity associated to attempts
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='attempts',
                        index='Attempt At Step',
                        additional_variables=[],
                        make_time_index=False)
    return es

In [31]:
# Build the EntitySet from the loaded transactions.
es = learnlab_to_entityset(data)
# Display the EntitySet summary (entities and relationships).
es

Entityset: Dataset
  Entities:
    transactions (shape = [19998, 29])
    problem_steps (shape = [9665, 8])
    problems (shape = [1861, 1])
    sessions (shape = [204, 3])
    students (shape = [54, 2])
    ...And 3 more
  Relationships:
    transactions.Step Name -> problem_steps.Step Name
    problem_steps.Problem Name -> problems.Problem Name
    transactions.Session Id -> sessions.Session Id
    sessions.Anon Student Id -> students.Anon Student Id
    transactions.Class -> classes.Class
    ...and 2 more