In [2]:
import numpy as np
import pandas as pd
import featuretools as ft
# Display the installed featuretools version (last expression of the cell).
ft.__version__

'0.1.17'

# About
This notebook shows how to create an EntitySet from a dataset with the LearnLab structure. The function defined here, `learnlab_to_entityset`, is also available in [utilities](utilities.py) so that it can be imported in the [demo](Demo%20-%20LearnLab.ipynb) notebook.

In [3]:
import featuretools.variable_types as vtypes

def learnlab_to_entityset(data):
    """Build a featuretools EntitySet named 'Dataset' from LearnLab transactions.

    The caller's DataFrame is left untouched: all clean-up is done on a
    derived frame.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per transaction. The columns read here are 'Transaction Id',
        'Time', 'Duration (sec)', 'Outcome', 'Is Last Attempt', 'Step Name',
        'Problem Name', 'Session Id', 'Anon Student Id', 'Class', 'School'
        and 'Attempt At Step', plus every column whose name starts with
        'KC ' or 'CF '. A 'Row' column is dropped when present.

    Returns
    -------
    featuretools.EntitySet
        The EntitySet with the entities `transactions`, `problem_steps`,
        `problems`, `sessions`, `students`, `classes`, `schools` and
        `attempts`.
    """
    # Make an EntitySet called Dataset with the following structure
    #
    # schools       students     problems
    #        \        |         /
    #   classes   sessions   problem steps
    #          \     |       /
    #           transactions  -- attempts
    #

    # Drop the redundant 'Row' counter (if the export has one) and index the
    # table by 'Transaction Id' while keeping it as a column. Both calls
    # return new frames, so the DataFrame passed in by the caller is never
    # re-indexed or otherwise modified.
    data = data.drop(['Row'], axis=1, errors='ignore')
    data = data.set_index('Transaction Id', drop=False)

    # Encode the outcome as 0/1; any value other than 'INCORRECT'/'CORRECT'
    # is left missing by `map`.
    data['Outcome'] = data['Outcome'].map({'INCORRECT': 0, 'CORRECT': 1})

    # Make a new 'End Time' column which is start_time + duration.
    # This is /super useful/ because you shouldn't be using the outcome data at
    # any point before the End Time since it's not known.
    start_time = pd.to_datetime(data['Time'])
    duration = pd.to_timedelta(pd.to_numeric(data['Duration (sec)']), unit='s')
    data['End Time'] = start_time + duration

    # Make a list of all KC and CF columns present; together with
    # 'Problem Name' they are moved onto the problem_steps entity.
    kc_and_cf_cols = [x for x in data.columns
                      if x.startswith('KC ') or x.startswith('CF ')]
    kc_and_cf_cols.append('Problem Name')

    # Now we start making an entityset. We make 'End Time' a time index for
    # 'Outcome' (and the other columns only known once the transaction is
    # over), even though our primary time index for a row is 'Time'.
    es = ft.EntitySet('Dataset')
    es.entity_from_dataframe(entity_id='transactions',
                             index='Transaction Id',
                             dataframe=data,
                             variable_types={'Outcome': vtypes.Boolean},
                             time_index='Time',
                             secondary_time_index={'End Time': ['Outcome', 'Is Last Attempt', 'Duration (sec)']})

    # Every transaction has a 'problem step', which is associated to a 'problem'
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='problem_steps',
                        index='Step Name',
                        additional_variables=kc_and_cf_cols,
                        make_time_index=False)

    es.normalize_entity(base_entity_id='problem_steps',
                        new_entity_id='problems',
                        index='Problem Name',
                        make_time_index=False)

    # Every transaction has a 'session' which is associated to a 'student'
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='sessions',
                        index='Session Id',
                        additional_variables=['Anon Student Id'],
                        make_time_index=True)

    es.normalize_entity(base_entity_id='sessions',
                        new_entity_id='students',
                        index='Anon Student Id',
                        make_time_index=True)

    # Every transaction has a 'class' which is associated to a 'school'
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='classes',
                        index='Class',
                        additional_variables=['School'],
                        make_time_index=False)

    es.normalize_entity(base_entity_id='classes',
                        new_entity_id='schools',
                        index='School',
                        make_time_index=False)

    # And we might be interested in grouping by attempts,
    # so make a table of those as well
    es.normalize_entity(base_entity_id='transactions',
                        new_entity_id='attempts',
                        index='Attempt At Step',
                        additional_variables=[],
                        make_time_index=False)
    return es

In [4]:
# Load the tab-separated LearnLab transaction export and build the EntitySet.
# `sep` is passed by keyword: recent pandas releases no longer accept it
# positionally.
data = pd.read_csv('data/ds2174_tx_All_Data_3991_2017_1128_123859.txt', sep='\t')
es = learnlab_to_entityset(data)
es

Entityset: Dataset
  Entities:
    transactions (shape = [6778, 26])
    problem_steps (shape = [78, 49])
    problems (shape = [20, 1])
    sessions (shape = [59, 3])
    students (shape = [59, 2])
    ...And 3 more
  Relationships:
    transactions.Step Name -> problem_steps.Step Name
    problem_steps.Problem Name -> problems.Problem Name
    transactions.Session Id -> sessions.Session Id
    sessions.Anon Student Id -> students.Anon Student Id
    transactions.Class -> classes.Class
    ...and 2 more

# Summary
In total we have made 8 entities. At the base is `transactions`, and each of the following chains of one-to-many relationships ends in `transactions`:
1. `problems` -> `problem_steps` -> `transactions`  
2. `students` -> `sessions`  -> `transactions`
3. `schools` -> `classes` -> `transactions`
4. `attempts` -> `transactions`

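Our base entity also has a time index `Time` and a secondary time index `End Time` for columns which can only be known when the event is over (`Outcome`, `Is Last Attempt` and `Duration (sec)`). This allows us to use `Outcome` in our feature matrix: a transaction's outcome only becomes available once its `End Time` has passed, so it is only ever used for later events.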