In [1]:
import numpy as np
import pandas as pd
import featuretools as ft
ft.__version__

'0.1.17'

In [13]:
data = pd.read_csv('data/data.txt', '\t')
data.index = data['Transaction Id']
data = data.drop(['Row'], axis=1)
print("This data has {} columns:".format(len(data.columns)))
print(data.columns)

This data has 75 columns:
Index(['Sample Name', 'Transaction Id', 'Anon Student Id', 'Session Id',
       'Time', 'Time Zone', 'Duration (sec)', 'Student Response Type',
       'Student Response Subtype', 'Tutor Response Type',
       'Tutor Response Subtype', 'Level (Unit)', 'Problem Name',
       'Problem View', 'Problem Start Time', 'Step Name', 'Attempt At Step',
       'Is Last Attempt', 'Outcome', 'Selection', 'Action', 'Input',
       'Feedback Text', 'Feedback Classification', 'Help Level',
       'Total Num Hints', 'KC (Geometry)', 'KC Category (Geometry)',
       'KC (Textbook)', 'KC Category (Textbook)', 'KC (Single-KC)',
       'KC Category (Single-KC)', 'KC (Unique-step)',
       'KC Category (Unique-step)', 'KC (NewModel)', 'KC Category (NewModel)',
       'KC (NNEWWW)', 'KC Category (NNEWWW)', 'KC (New)', 'KC Category (New)',
       'KC (MyKC)', 'KC Category (MyKC)', 'KC (MJB-SQRECT-Merge)',
       'KC Category (MJB-SQRECT-Merge)', 'KC (KRE_circle_area)',
       'KC Cate

# Phase 1: Creating a useful dataset structure
Since we have so many categorical columns, it's worth taking a moment to think about how this data is structured. At the base level we have `transactions`, every event that is recorded in the data. The columns of those transactions have variables that can be grouped together. As an example, there are only 78 distinct `problem_steps` for the 6778 transactions we have. Associated to each such problem step, we have a variety of knowledge components (KC) and custom fields (CF) associated to that step.

Because we can expect those columns to be consistently named, we can pull them all out at once using ordinary python list comprehensions and `normalize_entity` from featuretools. These steps will be consistent for any dataset of this format.

In [3]:
#               students     problems
#                 |         /
#   attempts   sessions   problem steps
#          \     |       /
#           transactions


import featuretools.variable_types as vtypes

kc_and_cf_cols = [x for x in data.columns if (x.startswith('KC ') or x.startswith('CF '))]
kc_and_cf_cols.append('Problem Name')
data['Outcome'] = data['Outcome'].map({'INCORRECT': 0, 'CORRECT': 1})
data['End Time'] = pd.to_datetime(data['Time'])+pd.to_timedelta(data['Duration (sec)'], 's')

es = ft.EntitySet('Geometry Dataset')
es.entity_from_dataframe(entity_id='transactions', 
                         index='Transaction Id', 
                         dataframe=data,
                         variable_types={'Outcome': vtypes.Boolean},
                         time_index='Time',
                         secondary_time_index={'End Time': ['Outcome', 'Is Last Attempt', 'Duration (sec)']})

es.normalize_entity(base_entity_id='transactions',
                    new_entity_id='problem_steps',
                    index='Step Name',
                    additional_variables=kc_and_cf_cols,
                    make_time_index=False)

es.normalize_entity(base_entity_id='problem_steps',
                    new_entity_id='problems',
                    index='Problem Name',
                    make_time_index=False)

es.normalize_entity(base_entity_id='transactions',
                    new_entity_id='sessions',
                    index='Session Id',
                    additional_variables=['Anon Student Id'],
                    make_time_index=False)

es.normalize_entity(base_entity_id='sessions',
                    new_entity_id='students',
                    index='Anon Student Id',
                    make_time_index=False)

es.normalize_entity(base_entity_id='transactions',
                    new_entity_id='attempts',
                    index='Attempt At Step',
                    additional_variables=[],
                    make_time_index=True)

es

Entityset: Geometry Dataset
  Entities:
    transactions (shape = [6778, 27])
    problem_steps (shape = [78, 49])
    problems (shape = [20, 1])
    sessions (shape = [59, 2])
    students (shape = [59, 1])
    ...And 1 more
  Relationships:
    transactions.Step Name -> problem_steps.Step Name
    problem_steps.Problem Name -> problems.Problem Name
    transactions.Session Id -> sessions.Session Id
    sessions.Anon Student Id -> students.Anon Student Id
    transactions.Attempt At Step -> attempts.Attempt At Step

In [10]:
es['transactions'].df.columns

Index(['Sample Name', 'Transaction Id', 'Session Id', 'Time', 'Time Zone',
       'Duration (sec)', 'Student Response Type', 'Student Response Subtype',
       'Tutor Response Type', 'Tutor Response Subtype', 'Level (Unit)',
       'Problem View', 'Problem Start Time', 'Step Name', 'Attempt At Step',
       'Is Last Attempt', 'Outcome', 'Selection', 'Action', 'Input',
       'Feedback Text', 'Feedback Classification', 'Help Level',
       'Total Num Hints', 'School', 'Class', 'End Time'],
      dtype='object')

## Building Features
We create a custom primitive: `Prob`, which calculates the likelihood that a boolean variable is false. It's worth noting that the opposite of this primitive is built in to Featuretools: `PercentTrue`. One of the many advantages in defining custom primitives is that we can define the name and input types as we would like. If you're interested in creating your own custom primitives for this dataset, copy and modify this code as necessary.

In [5]:
from featuretools.primitives import make_agg_primitive
def probability(boolean):
    numtrue = len([x for x in boolean if x==1])
    return 1 - numtrue/len(boolean)

Prob = make_agg_primitive(probability,
                          input_types=[vtypes.Boolean],
                          name='failure_rate',
                          description='Calculates likelihood a boolean is false over a region',
                          return_type=vtypes.Numeric)


Here we calculate a feature matrix on the `transactions` entity to try to predict the outcome of a given transaction. It's at this step that our previous setup pays off: we can automatically calculate features as if at a given point in time using Deep Feature Synthesis. Furthermore, we can guarentee that future values for `Outcome` won't be used for any calculations because we set the time index of that value to be after the cutoff time.

Lastly, we can automatically apply `Prob` while grouping by any of the entities we created before.

In [6]:
# Automatically generate features on collected data
from featuretools.primitives import Sum, Mean, Median, Count, Hour 
cutoff_times = data[['Transaction Id', 'Time', 'Outcome']][500:]
cutoff_times['Time'] = pd.to_datetime(cutoff_times['Time'])
fm, features = ft.dfs(entityset=es, 
                      target_entity='transactions',
                      agg_primitives=[Prob],
                      trans_primitives=[],
                      seed_features=[],
                      max_depth=3,
                      approximate='1m',
                      cutoff_time=cutoff_times,
                      verbose=True)
print('Created {} features'.format(len(features)))

Building features: 143it [00:00, 6821.87it/s]
Progress: 100%|██████████| 120/120 [01:33<00:00,  1.29cutoff time/s]
Created 74 features


In [7]:
features[-8:]

[<Feature: problem_steps.CF (Factor trapezoid-part)>,
 <Feature: problem_steps.Problem Name>,
 <Feature: sessions.Anon Student Id>,
 <Feature: problem_steps.FAILURE_RATE(transactions.Outcome)>,
 <Feature: sessions.FAILURE_RATE(transactions.Outcome)>,
 <Feature: attempts.FAILURE_RATE(transactions.Outcome)>,
 <Feature: problem_steps.problems.FAILURE_RATE(transactions.Outcome)>,
 <Feature: sessions.students.FAILURE_RATE(transactions.Outcome)>]

Let's parse a couple of features. The feature `problem_steps.FAILURE_RATE(transactions.Outcome)` is exactly the percent of students who did not succeed on a given `problem_step` as calculated at a given time. Similarly, the `attempts.FAILURE_RATE(transactions.Outcome)` is the failure rate as grouped by the problem attempt (i.e. more students miss the questions on an earlier attempts than later ones).

In [8]:
from featuretools.selection import remove_low_information_features
fm_enc, _ = ft.encode_features(fm, features)
fm_enc = fm_enc.fillna(0)
fm_enc = remove_low_information_features(fm_enc)
labels = fm.pop('Outcome')


In [9]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score


splitter = TimeSeriesSplit(n_splits=5, max_train_size=None)
i=0
for train_index, test_index in splitter.split(fm):
    clf = RandomForestClassifier()
    X_train, X_test = fm_enc.iloc[train_index], fm_enc.iloc[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    score = round(roc_auc_score(preds, y_test), 2)
    print("AUC score on time split {} is {}".format(i, score))
    feature_imps = [(imp, fm_enc.columns[i]) for i, imp in enumerate(clf.feature_importances_)]
    feature_imps.sort()
    feature_imps.reverse()
    print(feature_imps[0:5])
    i += 1

AUC score on time split 0 is 0.6
[(0.1337026297426933, 'sessions.students.FAILURE_RATE(transactions.Outcome)'), (0.10700496004502816, 'sessions.FAILURE_RATE(transactions.Outcome)'), (0.10637038180553655, 'attempts.FAILURE_RATE(transactions.Outcome)'), (0.0837125374049934, 'problem_steps.FAILURE_RATE(transactions.Outcome)'), (0.07826273247899355, 'problem_steps.problems.FAILURE_RATE(transactions.Outcome)')]
AUC score on time split 1 is 0.6
[(0.15051363545733915, 'sessions.students.FAILURE_RATE(transactions.Outcome)'), (0.15014320148753196, 'sessions.FAILURE_RATE(transactions.Outcome)'), (0.12471825254376057, 'attempts.FAILURE_RATE(transactions.Outcome)'), (0.08880605139516665, 'problem_steps.FAILURE_RATE(transactions.Outcome)'), (0.08468644597671429, 'problem_steps.problems.FAILURE_RATE(transactions.Outcome)')]
AUC score on time split 2 is 0.61
[(0.14266583800988802, 'sessions.FAILURE_RATE(transactions.Outcome)'), (0.13988039617382023, 'sessions.students.FAILURE_RATE(transactions.Outcom