<a href="https://colab.research.google.com/github/allen44/riiid-test-answer-prediction/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%cd /content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/
%pwd

/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction


'/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction'

## Preprocessed data

In [32]:
import pickle
from pathlib import Path

# Define data paths.
# All preprocessed frames live in the same intermediate directory, so the base
# location is spelled out once and the file names are joined onto it.
INTERMEDIATE_DIR = Path('/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/intermediate')
df_train_preprocessed_path = INTERMEDIATE_DIR / 'df_train_preprocessed.pkl.gzip'
df_lectures_preprocessed_path = INTERMEDIATE_DIR / 'df_lectures_preprocessed.pkl.gzip'
df_questions_preprocessed_path = INTERMEDIATE_DIR / 'df_questions_preprocessed.pkl.gzip'

Using the insights gained from the EDA, we can load the preprocessed data from the pickle files defined above, ready for feature engineering.

In [33]:
def _unpickle(path):
  """Open `path` in binary mode and return the single pickled object it holds."""
  # NOTE: pickle.load can execute arbitrary code; only use it on files you created
  with open(path, 'rb') as handle:
    return pickle.load(handle)


df_train = _unpickle(df_train_preprocessed_path)
df_lectures = _unpickle(df_lectures_preprocessed_path)
df_questions = _unpickle(df_questions_preprocessed_path)

# The id columns that will be linked to each other must share one dtype
lecture_id_dtype = df_lectures['lecture_id'].dtype
assert df_train['content_id'].dtype == lecture_id_dtype
assert df_questions['question_id'].dtype == lecture_id_dtype

df_train.shape, df_lectures.shape, df_questions.shape

((101230331, 9), (418, 4), (13522, 192))

In [34]:
# Use a subset of df_train: keep only the rows whose index label is a multiple
# of 10000. The .copy() makes the subset an independent frame, so assigning a
# column to it later does not raise SettingWithCopyWarning or write into a
# slice of the full frame.
df_train = df_train[df_train.index % 10000 == 0].copy()


## Manually add features

In [35]:
# Build a one-column frame holding each distinct user_id found in the df_train subset
df_users = pd.DataFrame(df_train['user_id'].unique(), columns=['user_id'])
df_users

Unnamed: 0,user_id
0,115
1,91216
2,220268
3,650467
4,1084314
...,...
10052,2146686585
10053,2146925942
10054,2147012157
10055,2147192385


In [36]:
# Give df_train and df_users the same user_id categories: the union of both.
# Index.union de-duplicates and (by default, where the values are comparable)
# sorts the result, so the category order is deterministic; a plain `set` has
# no defined order.
categories = df_train['user_id'].dtype.categories.union(
    df_users['user_id'].dtype.categories)

df_train['user_id'] = pd.Categorical(df_train['user_id'], categories=categories)
df_users['user_id'] = pd.Categorical(df_users['user_id'], categories=categories)

# Confirm that the dtypes are the same
assert df_users['user_id'].dtype == df_train['user_id'].dtype



## Install and import featuretools

In [37]:
# % pip install featuretools dask distributed tornado
import featuretools as ft

# Prepare data

First, we create an `EntitySet` and add each of our dataframes to it as an entity.

In [38]:
# Define entity set
es = ft.EntitySet()

# Add df_train to entity set.
# df_train has no 'row_id' column, so featuretools creates a new integer index
# column under that name. make_index=True requests this explicitly instead of
# relying on the implicit fallback, which emits an
# "index row_id not found in dataframe, creating new integer column" warning.
# NOTE(review): if the original row ids are stored in the DataFrame index they
# are not carried into the entity -- confirm that this is intended.
es.entity_from_dataframe(entity_id='train',
                         dataframe=df_train,
                         index='row_id',
                         make_index=True,
                         time_index='timestamp')

es['train'].variables

  "integer column".format(index))


[<Variable: row_id (dtype = index)>,
 <Variable: timestamp (dtype: datetime_time_index, format: None)>,
 <Variable: user_id (dtype = categorical)>,
 <Variable: content_id (dtype = categorical)>,
 <Variable: content_type_id (dtype = categorical)>,
 <Variable: task_container_id (dtype = categorical)>,
 <Variable: user_answer (dtype = categorical)>,
 <Variable: answered_correctly (dtype = categorical)>,
 <Variable: prior_question_elapsed_time (dtype = numeric)>,
 <Variable: prior_question_had_explanation (dtype = boolean)>]

In [39]:
# Add df_lectures to entity set, indexed by lecture_id (no time index is given)
es.entity_from_dataframe(entity_id='lectures', dataframe=df_lectures, index='lecture_id')

# Show the variable types featuretools assigned to each column
es['lectures'].variables

[<Variable: lecture_id (dtype = index)>,
 <Variable: tag (dtype = categorical)>,
 <Variable: part (dtype = categorical)>,
 <Variable: type_of (dtype = categorical)>]

In [40]:
# Add df_questions to entity set, indexed by question_id (no time index is given)
es.entity_from_dataframe(entity_id='questions', dataframe=df_questions, index='question_id')

# Show the variable types featuretools assigned to each column
es['questions'].variables

[<Variable: question_id (dtype = index)>,
 <Variable: bundle_id (dtype = categorical)>,
 <Variable: correct_answer (dtype = categorical)>,
 <Variable: part (dtype = categorical)>,
 <Variable: 0 (dtype = boolean)>,
 <Variable: 1 (dtype = boolean)>,
 <Variable: 10 (dtype = boolean)>,
 <Variable: 100 (dtype = boolean)>,
 <Variable: 101 (dtype = boolean)>,
 <Variable: 102 (dtype = boolean)>,
 <Variable: 103 (dtype = boolean)>,
 <Variable: 104 (dtype = boolean)>,
 <Variable: 105 (dtype = boolean)>,
 <Variable: 106 (dtype = boolean)>,
 <Variable: 107 (dtype = boolean)>,
 <Variable: 108 (dtype = boolean)>,
 <Variable: 109 (dtype = boolean)>,
 <Variable: 11 (dtype = boolean)>,
 <Variable: 110 (dtype = boolean)>,
 <Variable: 111 (dtype = boolean)>,
 <Variable: 112 (dtype = boolean)>,
 <Variable: 113 (dtype = boolean)>,
 <Variable: 114 (dtype = boolean)>,
 <Variable: 115 (dtype = boolean)>,
 <Variable: 116 (dtype = boolean)>,
 <Variable: 117 (dtype = boolean)>,
 <Variable: 118 (dtype = boolean)>

In [41]:
# Add df_users to entity set, indexed by user_id (its only column)
es.entity_from_dataframe(entity_id='users', dataframe=df_users, index='user_id')

# Show the variable types featuretools assigned to each column
es['users'].variables

[<Variable: user_id (dtype = index)>]

In [42]:
import gc

# The frames were already added to the entity set above; drop the
# notebook-level names and run a garbage-collection pass
del df_train, df_lectures, df_questions

gc.collect()

1243

In [43]:
# Each Relationship takes the parent variable first and the child variable second.
# train.content_id is used as the child key for both lectures and questions.
r_lectures_train= ft.Relationship(es["lectures"]["lecture_id"],
                                          es["train"]["content_id"])

r_questions_train = ft.Relationship(es["questions"]["question_id"],
                                    es["train"]["content_id"])

# train.user_id is the child key pointing at the users entity
r_users_train = ft.Relationship(es["users"]["user_id"],
                                    es["train"]["user_id"])

# Register the relationships; add_relationship's return value is rebound to `es`
es = es.add_relationship(r_lectures_train)
es = es.add_relationship(r_questions_train)
es = es.add_relationship(r_users_train)
# Display the entity set summary (entities and relationships)
es

Entityset: None
  Entities:
    train [Rows: 10124, Columns: 10]
    lectures [Rows: 418, Columns: 4]
    questions [Rows: 13522, Columns: 192]
    users [Rows: 10057, Columns: 1]
  Relationships:
    train.content_id -> lectures.lecture_id
    train.content_id -> questions.question_id
    train.user_id -> users.user_id

In [44]:
# Mark both outcomes of answered_correctly as interesting values so that the
# where primitives can build conditional features such as
# "... WHERE answered_correctly = 1"
es['train']['answered_correctly'].interesting_values = [0, 1]

In [60]:
# Preview how many features DFS would define for each target entity.
# features_only=True returns only the feature definitions (no matrix is computed).
# feature_defs is rebound on every iteration, so after the loop it holds the
# definitions for the last entity in the list ('users').
for entity in ['train', 'lectures', 'questions', 'users']:
  feature_defs = ft.dfs(entityset=es, target_entity=entity, 
                      where_primitives = ['sum', 'mean'],
                      max_depth=2, features_only=True)

  print(f'This will generate {len(feature_defs)} features for {entity}.\n')

This will generate 260 features for train.

This will generate 227 features for lectures.

This will generate 227 features for questions.

This will generate 230 features for users.



  where_primitives: ['mean', 'sum']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


In [47]:
import random; random.seed(42)

random.sample(feature_defs, 10)

[<Feature: PERCENT_TRUE(train.questions.40)>,
 <Feature: MODE(train.questions.correct_answer)>,
 <Feature: MODE(train.content_type_id)>,
 <Feature: PERCENT_TRUE(train.questions.64)>,
 <Feature: PERCENT_TRUE(train.questions.125)>,
 <Feature: PERCENT_TRUE(train.questions.118)>,
 <Feature: PERCENT_TRUE(train.questions.113)>,
 <Feature: NUM_UNIQUE(train.lectures.tag)>,
 <Feature: PERCENT_TRUE(train.questions.63)>,
 <Feature: MODE(train.lectures.type_of)>]

### Aggregation Primitives

In [48]:
all_p = ft.list_primitives()
trans_p = all_p.loc[all_p['type'] == 'transform'].copy()
agg_p = all_p.loc[all_p['type'] == 'aggregation'].copy()

pd.options.display.max_colwidth = 100
# agg_p

In [49]:
# Specify aggregation primitives
agg_primitives = ['sum', 'time_since_last', 'avg_time_between', 'all', 'mode', 'num_unique', 'min', 'last', 
                  'mean', 'percent_true', 'max', 'std', 'count']

## Transform Primitives

In [50]:
# trans_p

In [51]:
# Specify transformation primitives
trans_primitives = ['cum_sum', 'diff', 'time_since_previous']

### Where Primitives

These primitives are applied to the `interesting_values` to build conditional features. 

In [52]:
# Specify where primitives
where_primitives = ['sum', 'mean', 'percent_true', 'all', 'any']

## Custom Primitives

For this problem, I wrote a custom primitive that calculates the sum of a value in the month prior to the cutoff time.

The second custom primitive finds the time since a previous true value. It simply finds the time between True examples.

In [56]:
def total_previous_month(numeric, datetime, time):
    """Sum the `numeric` values dated in the calendar month before `time`.

    `numeric` and `datetime` are paired columns of values and their dates;
    `time` is the reference timestamp. Rows are kept when both the month and
    the year of their date match the month preceding `time`.
    """
    # Work out the target (year, month), rolling January back to December
    if time.month == 1:
        target_year, target_month = time.year - 1, 12
    else:
        target_year, target_month = time.year, time.month - 1

    frame = pd.DataFrame({'value': numeric, 'date': datetime})
    in_target_month = (frame['date'].dt.month == target_month) & \
                      (frame['date'].dt.year == target_year)

    return frame.loc[in_target_month, 'value'].sum()

def time_since_true(boolean, datetime):
    """Seconds elapsed since the most recent true value, for every row.

    Parameters
    ----------
    boolean : sequence of bool-like
        Flag per row; a row counts as true when its value equals 1.
    datetime : sequence of datetime-like
        Timestamp per row, paired with `boolean` by position.

    Returns
    -------
    list of float
        One entry per input row, in the SAME order as the input. Each entry is
        the number of seconds between the row's timestamp and the latest true
        timestamp on or before it (0 for the true rows themselves), or NaN
        when no true value has occurred yet.
    """
    n_rows = len(boolean)
    is_true = np.asarray(list(boolean)) == 1

    # Handle case with no true values
    if not is_true.any():
        return [np.nan for _ in range(n_rows)]

    all_dates = pd.to_datetime(list(datetime)).to_numpy()

    # Sorted timestamps of the true rows (rows without a timestamp are ignored)
    true_dates = all_dates[is_true]
    true_dates = np.sort(true_dates[~np.isnat(true_dates)])

    # For every row, position of the latest true timestamp that is <= its own
    # timestamp; -1 means there is no such timestamp. Looking the value up per
    # row (instead of sorting the rows) keeps the result aligned with the input.
    latest_true = np.searchsorted(true_dates, all_dates, side='right') - 1
    has_previous = latest_true >= 0

    elapsed = np.full(n_rows, np.nan)
    deltas = all_dates[has_previous] - true_dates[latest_true[has_previous]]
    elapsed[has_previous] = deltas / np.timedelta64(1, 's')

    return elapsed.tolist()

### Custom Primitive Implementation

Making a custom primitive is simple: first we define a function (`total_previous_month`) and then we `make_agg_primitive` with `input_type[s]`, a `return_type`, and whether or not the primitive requires the `cutoff_time` through `uses_calc_time`. 

This primitive is an aggregation primitive because it takes in multiple numbers - transactions for the previous month - and returns a single number - the total of the transactions. 

In [57]:
from featuretools.primitives import make_agg_primitive

# Takes in a number and outputs a number
total_previous = make_agg_primitive(total_previous_month, 
                                    input_types = [ft.variable_types.Numeric,
                                                   ft.variable_types.Datetime],
                                    return_type = ft.variable_types.Numeric, 
                                    uses_calc_time = True)

In [58]:
from featuretools.primitives import make_trans_primitive

# Specify the inputs and return
time_since = make_trans_primitive(time_since_true, 
                                  input_types = [ft.variable_types.Boolean, 
                                                  ft.variable_types.Datetime],
                                  return_type = ft.variable_types.Numeric)

Now we just have to pass this in as another aggregation primitive for Featuretools to use it in calculations.



Let's add the two custom primitives to the respective lists. In the final version of feature engineering, I did not use the `time_since` primitive. I ran into problems with the implementation but would encourage anyone to try and fix it or build their own custom primitive[s].

In [59]:
# Register the custom primitives alongside the built-in ones.
# The membership checks make this cell safe to re-run: without them every
# re-execution would append the same primitive to the list again.
if total_previous not in agg_primitives:
    agg_primitives.append(total_previous)
if time_since not in trans_primitives:
    trans_primitives.append(time_since)

## Deep Feature Synthesis with Specified Primitives

We'll again run Deep Feature Synthesis to make the feature definitions this time using the selected primitives and the custom primitives. 

In [65]:
# Feature definitions per target entity, keyed by entity name
feature_defs = {}

# features_only=True: only the feature definitions are built, no feature matrix
for entity in ['train', 'lectures', 'questions', 'users']:
  feature_defs[entity] = ft.dfs(entityset=es, target_entity=entity, 
                        # cutoff_time = cutoff_times, 
                        agg_primitives = agg_primitives,
                        trans_primitives = trans_primitives,
                        where_primitives = where_primitives,
                        chunk_size = 100, #len(cutoff_times), 
                        # cutoff_time_in_index = True,
                        max_depth = 2, 
                        features_only = True)

  print(f'This will generate {len(feature_defs[entity])} features for {entity}.')

  where_primitives: ['all', 'any', 'mean', 'percent_true', 'sum']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


This will generate 481 features for train.


  where_primitives: ['any']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


This will generate 1443 features for lectures.
This will generate 503 features for questions.
This will generate 1449 features for users.


In [107]:
# feature_defs is now a dict keyed by entity name, and random.sample needs a
# sequence, so sample from one entity's list of feature definitions
random.sample(feature_defs['users'], 15)

[<Feature: NUM_UNIQUE(train.user_answer)>,
 <Feature: LAST(train.user_answer)>,
 <Feature: SUM(train.prior_question_elapsed_time WHERE answered_correctly = 0)>,
 <Feature: NUM_UNIQUE(train.task_container_id)>,
 <Feature: MIN(train.prior_question_elapsed_time)>,
 <Feature: MODE(train.task_container_id)>,
 <Feature: PERCENT_TRUE(train.prior_question_had_explanation WHERE answered_correctly = 1)>,
 <Feature: ALL(train.prior_question_had_explanation WHERE answered_correctly = 1)>,
 <Feature: NUM_UNIQUE(train.content_type_id)>,
 <Feature: NUM_UNIQUE(train.content_id)>,
 <Feature: COUNT(train)>,
 <Feature: NUM_UNIQUE(train.answered_correctly)>,
 <Feature: TIME_SINCE_LAST(train.timestamp)>,
 <Feature: LAST(train.content_type_id)>,
 <Feature: MEAN(train.prior_question_elapsed_time WHERE answered_correctly = 1)>]

# Run Deep Feature Synthesis

Once we're happy with the features that will be generated, we can run deep feature synthesis to make the actual features. We need to change `features_only` to `False` and then we're good to go.

In [66]:
from timeit import default_timer as timer

# Results per target entity, keyed by entity name
feature_matrix = {}
feature_defs = {}

for entity in ('train', 'lectures', 'questions', 'users'):
  started_at = timer()

  # With features_only=False, dfs returns the computed feature matrix together
  # with the list of feature definitions
  feature_matrix[entity], feature_defs[entity] = ft.dfs(
      entityset=es,
      target_entity=entity,
      agg_primitives=agg_primitives,
      trans_primitives=trans_primitives,
      where_primitives=where_primitives,
      max_depth=2,
      features_only=False,
      verbose=1,
      chunk_size=1000,
  )

  print(f'{round(timer() - started_at)} seconds elapsed.')

Built 481 features
Elapsed: 00:00 | Progress:   0%|          

  where_primitives: ['all', 'any', 'mean', 'percent_true', 'sum']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


Elapsed: 03:42 | Progress: 100%|██████████
223 seconds elapsed.
Built 1443 features
Elapsed: 00:00 | Progress:   0%|          

  where_primitives: ['any']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


Elapsed: 01:18 | Progress: 100%|██████████
79 seconds elapsed.
Built 503 features
Elapsed: 00:00 | Progress:   3%|▎         

  where_primitives: ['any']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


Elapsed: 04:10 | Progress: 100%|██████████
251 seconds elapsed.
Built 1449 features
Elapsed: 00:00 | Progress:   0%|          

  where_primitives: ['any']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


Elapsed: 22:27 | Progress: 100%|██████████
1348 seconds elapsed.


The `chunk_size` is a parameter that may need to be adjusted to optimize the calculation. I suggest playing around with this parameter to find the optimal value. Generally I've found that a large value makes the calculation proceed quicker although it depends on the machine in use and the number of unique cutoff times. 

In [72]:
feature_defs['users']

[<Feature: ALL(train.prior_question_had_explanation)>,
 <Feature: AVG_TIME_BETWEEN(train.timestamp)>,
 <Feature: COUNT(train)>,
 <Feature: LAST(train.answered_correctly)>,
 <Feature: LAST(train.content_id)>,
 <Feature: LAST(train.content_type_id)>,
 <Feature: LAST(train.prior_question_elapsed_time)>,
 <Feature: LAST(train.prior_question_had_explanation)>,
 <Feature: LAST(train.row_id)>,
 <Feature: LAST(train.task_container_id)>,
 <Feature: LAST(train.user_answer)>,
 <Feature: MAX(train.prior_question_elapsed_time)>,
 <Feature: MEAN(train.prior_question_elapsed_time)>,
 <Feature: MIN(train.prior_question_elapsed_time)>,
 <Feature: MODE(train.answered_correctly)>,
 <Feature: MODE(train.content_id)>,
 <Feature: MODE(train.content_type_id)>,
 <Feature: MODE(train.task_container_id)>,
 <Feature: MODE(train.user_answer)>,
 <Feature: NUM_UNIQUE(train.answered_correctly)>,
 <Feature: NUM_UNIQUE(train.content_id)>,
 <Feature: NUM_UNIQUE(train.content_type_id)>,
 <Feature: NUM_UNIQUE(train.task_

In [67]:
feature_matrix['users']

Unnamed: 0_level_0,ALL(train.prior_question_had_explanation),AVG_TIME_BETWEEN(train.timestamp),COUNT(train),LAST(train.answered_correctly),LAST(train.content_id),LAST(train.content_type_id),LAST(train.prior_question_elapsed_time),LAST(train.prior_question_had_explanation),LAST(train.row_id),LAST(train.task_container_id),LAST(train.user_answer),MAX(train.prior_question_elapsed_time),MEAN(train.prior_question_elapsed_time),MIN(train.prior_question_elapsed_time),MODE(train.answered_correctly),MODE(train.content_id),MODE(train.content_type_id),MODE(train.task_container_id),MODE(train.user_answer),NUM_UNIQUE(train.answered_correctly),NUM_UNIQUE(train.content_id),NUM_UNIQUE(train.content_type_id),NUM_UNIQUE(train.task_container_id),NUM_UNIQUE(train.user_answer),PERCENT_TRUE(train.prior_question_had_explanation),STD(train.prior_question_elapsed_time),SUM(train.prior_question_elapsed_time),TIME_SINCE_LAST(train.timestamp),"TOTAL_PREVIOUS_MONTH(train.prior_question_elapsed_time, timestamp)",ALL(train.prior_question_had_explanation WHERE answered_correctly = 0),ALL(train.prior_question_had_explanation WHERE answered_correctly = 1),ALL(train.questions.0),ALL(train.questions.0 WHERE answered_correctly = 0),ALL(train.questions.0 WHERE answered_correctly = 1),ALL(train.questions.1),ALL(train.questions.1 WHERE answered_correctly = 0),ALL(train.questions.1 WHERE answered_correctly = 1),ALL(train.questions.10),ALL(train.questions.10 WHERE answered_correctly = 0),ALL(train.questions.10 WHERE answered_correctly = 1),...,SUM(train.prior_question_elapsed_time WHERE answered_correctly = 0),SUM(train.prior_question_elapsed_time WHERE answered_correctly = 1),"TOTAL_PREVIOUS_MONTH(train.CUM_SUM(prior_question_elapsed_time), timestamp)","TOTAL_PREVIOUS_MONTH(train.DIFF(prior_question_elapsed_time), timestamp)","TOTAL_PREVIOUS_MONTH(train.TIME_SINCE_PREVIOUS(timestamp), timestamp)","TOTAL_PREVIOUS_MONTH(train.TIME_SINCE_TRUE(prior_question_had_explanation, timestamp), 
timestamp)",CUM_SUM(AVG_TIME_BETWEEN(train.timestamp)),CUM_SUM(COUNT(train)),CUM_SUM(LAST(train.prior_question_elapsed_time)),CUM_SUM(MAX(train.prior_question_elapsed_time)),CUM_SUM(MEAN(train.prior_question_elapsed_time)),CUM_SUM(MIN(train.prior_question_elapsed_time)),CUM_SUM(NUM_UNIQUE(train.answered_correctly)),CUM_SUM(NUM_UNIQUE(train.content_id)),CUM_SUM(NUM_UNIQUE(train.content_type_id)),CUM_SUM(NUM_UNIQUE(train.task_container_id)),CUM_SUM(NUM_UNIQUE(train.user_answer)),CUM_SUM(PERCENT_TRUE(train.prior_question_had_explanation)),CUM_SUM(STD(train.prior_question_elapsed_time)),CUM_SUM(SUM(train.prior_question_elapsed_time)),CUM_SUM(TIME_SINCE_LAST(train.timestamp)),"CUM_SUM(TOTAL_PREVIOUS_MONTH(train.prior_question_elapsed_time, timestamp))",DIFF(AVG_TIME_BETWEEN(train.timestamp)),DIFF(COUNT(train)),DIFF(LAST(train.prior_question_elapsed_time)),DIFF(MAX(train.prior_question_elapsed_time)),DIFF(MEAN(train.prior_question_elapsed_time)),DIFF(MIN(train.prior_question_elapsed_time)),DIFF(NUM_UNIQUE(train.answered_correctly)),DIFF(NUM_UNIQUE(train.content_id)),DIFF(NUM_UNIQUE(train.content_type_id)),DIFF(NUM_UNIQUE(train.task_container_id)),DIFF(NUM_UNIQUE(train.user_answer)),DIFF(PERCENT_TRUE(train.prior_question_had_explanation)),DIFF(STD(train.prior_question_elapsed_time)),DIFF(SUM(train.prior_question_elapsed_time)),DIFF(TIME_SINCE_LAST(train.timestamp)),"DIFF(TOTAL_PREVIOUS_MONTH(train.prior_question_elapsed_time, timestamp))","TIME_SINCE_TRUE(ALL(train.prior_question_had_explanation), LAST(train.timestamp))","TIME_SINCE_TRUE(LAST(train.prior_question_had_explanation), LAST(train.timestamp))"
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
115,True,,1,1,5692,0,,True,0,1,3,,,,1,5692,0,1,3,1,1,1,1,1,1.0,,0.0,1.000395e+08,0.0,,True,False,,False,False,,False,False,,False,...,0.0,0.0,0.0,0.0,0.0,0.0,,1,,,,,1,1,1,1,1,1.0,,0.0,1.000395e+08,0.0,,,,,,,,,,,,,,,,,0.000,0.000
91216,True,,1,1,1219,0,17000.0,True,1,780,1,17000.0,17000.0,17000.0,1,1219,0,780,1,1,1,1,1,1,1.0,,17000.0,9.717089e+07,0.0,,True,False,,False,False,,False,False,,False,...,0.0,17000.0,0.0,0.0,0.0,0.0,,2,17000.0,17000.0,17000.0,17000.0,2,2,2,2,2,2.0,,17000.0,1.972104e+08,0.0,,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,17000.0,-2.868613e+06,0.0,0.000,0.000
220268,True,,1,1,10656,0,19000.0,True,2,1317,0,19000.0,19000.0,19000.0,1,10656,0,1317,0,1,1,1,1,1,1.0,,19000.0,8.535296e+07,0.0,,True,False,,False,False,,False,True,,True,...,0.0,19000.0,0.0,0.0,0.0,0.0,,3,36000.0,36000.0,36000.0,36000.0,3,3,3,3,3,3.0,,36000.0,2.825634e+08,0.0,,0.0,2000.0,2000.0,2000.0,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,,2000.0,-1.181793e+07,0.0,0.000,0.000
650467,True,,1,1,6857,0,11750.0,True,3,243,3,11750.0,11750.0,11750.0,1,6857,0,243,3,1,1,1,1,1,1.0,,11750.0,9.895601e+07,0.0,,True,False,,False,False,,False,False,,False,...,0.0,11750.0,0.0,0.0,0.0,0.0,,4,47750.0,47750.0,47750.0,47750.0,4,4,4,4,4,4.0,,47750.0,3.815194e+08,0.0,,0.0,-7250.0,-7250.0,-7250.0,-7250.0,0.0,0.0,0.0,0.0,0.0,0.0,,-7250.0,1.360305e+07,0.0,0.000,0.000
1084314,False,,1,1,6798,0,9000.0,False,4,1515,0,9000.0,9000.0,9000.0,1,6798,0,1515,0,1,1,1,1,1,0.0,,9000.0,8.461257e+07,0.0,,False,False,,False,True,,True,False,,False,...,0.0,9000.0,0.0,0.0,0.0,0.0,,5,56750.0,56750.0,56750.0,56750.0,5,5,5,5,5,4.0,,56750.0,4.661319e+08,0.0,,0.0,-2750.0,-2750.0,-2750.0,-2750.0,0.0,0.0,0.0,0.0,0.0,-1.0,,-2750.0,-1.434345e+07,0.0,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2146686585,True,,1,0,383,0,20000.0,True,10119,300,0,20000.0,20000.0,20000.0,0,383,0,300,0,1,1,1,1,1,1.0,,20000.0,9.888097e+07,0.0,True,,False,False,,False,False,,False,False,,...,20000.0,0.0,0.0,0.0,0.0,0.0,,10120,251923532.0,252502631.0,251966673.5,251430716.0,10086,10120,10056,10120,10108,8931.0,,253660529.0,9.290061e+11,0.0,,0.0,,,,,0.0,0.0,0.0,0.0,0.0,1.0,,20000.0,1.071665e+05,0.0,1309537.989,1309537.989
2146925942,True,,1,1,11859,0,26000.0,True,10120,163,1,26000.0,26000.0,26000.0,1,11859,0,163,1,1,1,1,1,1,1.0,,26000.0,9.077006e+07,0.0,,True,False,,False,False,,False,True,,True,...,0.0,26000.0,0.0,0.0,0.0,0.0,,10121,251949532.0,252528631.0,251992673.5,251456716.0,10087,10121,10057,10121,10109,8932.0,,253686529.0,9.290968e+11,0.0,,0.0,6000.0,6000.0,6000.0,6000.0,0.0,0.0,0.0,0.0,0.0,0.0,,6000.0,-8.110902e+06,0.0,1358541.796,0.000
2147012157,True,,1,1,1202,0,15000.0,True,10121,1506,1,15000.0,15000.0,15000.0,1,1202,0,1506,1,1,1,1,1,1,1.0,,15000.0,9.543003e+07,0.0,,True,False,,False,False,,False,False,,False,...,0.0,15000.0,0.0,0.0,0.0,0.0,,10122,251964532.0,252543631.0,252007673.5,251471716.0,10088,10122,10058,10122,10110,8933.0,,253701529.0,9.291923e+11,0.0,,0.0,-11000.0,-11000.0,-11000.0,-11000.0,0.0,0.0,0.0,0.0,0.0,0.0,,-11000.0,4.659965e+06,0.0,0.000,0.000
2147192385,False,,1,0,6880,0,44000.0,False,10122,14,2,44000.0,44000.0,44000.0,0,6880,0,14,2,1,1,1,1,1,0.0,,44000.0,1.000388e+08,0.0,False,,False,False,,False,False,,False,False,,...,44000.0,0.0,0.0,0.0,0.0,0.0,,10123,252008532.0,252587631.0,252051673.5,251515716.0,10089,10123,10059,10123,10111,8933.0,,253745529.0,9.292923e+11,0.0,,0.0,29000.0,29000.0,29000.0,29000.0,0.0,0.0,0.0,0.0,0.0,-1.0,,29000.0,4.608782e+06,0.0,0.000,0.000


In [68]:
feature_matrix['train']

Unnamed: 0_level_0,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,CUM_SUM(prior_question_elapsed_time),DIFF(prior_question_elapsed_time),TIME_SINCE_PREVIOUS(timestamp),"TIME_SINCE_TRUE(prior_question_had_explanation, timestamp)",lectures.tag,lectures.part,lectures.type_of,questions.bundle_id,questions.correct_answer,questions.part,questions.0,questions.1,questions.10,questions.100,questions.101,questions.102,questions.103,questions.104,questions.105,questions.106,questions.107,questions.108,questions.109,questions.11,questions.110,questions.111,questions.112,questions.113,questions.114,questions.115,questions.116,questions.117,...,"TIME_SINCE_TRUE(questions.63, timestamp)","TIME_SINCE_TRUE(questions.64, timestamp)","TIME_SINCE_TRUE(questions.65, timestamp)","TIME_SINCE_TRUE(questions.66, timestamp)","TIME_SINCE_TRUE(questions.67, timestamp)","TIME_SINCE_TRUE(questions.68, timestamp)","TIME_SINCE_TRUE(questions.69, timestamp)","TIME_SINCE_TRUE(questions.7, timestamp)","TIME_SINCE_TRUE(questions.70, timestamp)","TIME_SINCE_TRUE(questions.71, timestamp)","TIME_SINCE_TRUE(questions.72, timestamp)","TIME_SINCE_TRUE(questions.73, timestamp)","TIME_SINCE_TRUE(questions.74, timestamp)","TIME_SINCE_TRUE(questions.75, timestamp)","TIME_SINCE_TRUE(questions.76, timestamp)","TIME_SINCE_TRUE(questions.77, timestamp)","TIME_SINCE_TRUE(questions.78, timestamp)","TIME_SINCE_TRUE(questions.79, timestamp)","TIME_SINCE_TRUE(questions.8, timestamp)","TIME_SINCE_TRUE(questions.80, timestamp)","TIME_SINCE_TRUE(questions.81, timestamp)","TIME_SINCE_TRUE(questions.82, timestamp)","TIME_SINCE_TRUE(questions.83, timestamp)","TIME_SINCE_TRUE(questions.84, timestamp)","TIME_SINCE_TRUE(questions.85, timestamp)","TIME_SINCE_TRUE(questions.86, timestamp)","TIME_SINCE_TRUE(questions.87, timestamp)","TIME_SINCE_TRUE(questions.88, timestamp)","TIME_SINCE_TRUE(questions.89, 
timestamp)","TIME_SINCE_TRUE(questions.9, timestamp)","TIME_SINCE_TRUE(questions.90, timestamp)","TIME_SINCE_TRUE(questions.91, timestamp)","TIME_SINCE_TRUE(questions.92, timestamp)","TIME_SINCE_TRUE(questions.93, timestamp)","TIME_SINCE_TRUE(questions.94, timestamp)","TIME_SINCE_TRUE(questions.95, timestamp)","TIME_SINCE_TRUE(questions.96, timestamp)","TIME_SINCE_TRUE(questions.97, timestamp)","TIME_SINCE_TRUE(questions.98, timestamp)","TIME_SINCE_TRUE(questions.99, timestamp)"
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
0,115,5692,0,1,3,1,,True,,,,0.000,,,,5692,3,5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,,,,0.000000e+00,,,,,,,,0.000000e+00,,,,,,,0.000000e+00,0.000000e+00,0.000,,,,,,,,0.000000e+00,,,,0.000,0.000000e+00,,,,,,
543,116101507,5109,0,0,0,1,,True,,,0.000,0.000,,,,5109,0,5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,,,,0.000000e+00,,,,,,,,0.000000e+00,,,,,,,0.000000e+00,0.000000e+00,0.000,,,,,,,,0.000000e+00,,,,0.000,0.000000e+00,,,,,,
883,192321976,3747,0,0,0,1,,True,,,0.000,0.000,,,,3747,0,5,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,...,,,,0.000000e+00,,,,,,,,0.000000e+00,,,,,,,0.000000e+00,0.000000e+00,0.000,,,,,,,,0.000000e+00,,,,0.000,0.000000e+00,,,,,,
1091,237419764,5235,0,0,0,1,,True,,,0.000,0.000,,,,5235,0,5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,,,,0.000000e+00,,,,,,,,0.000000e+00,,,,,,,0.000000e+00,0.000000e+00,0.000,,,,,,,,0.000000e+00,,,,0.000,0.000000e+00,,,,,,
1495,322530642,5649,0,0,1,1,,True,,,0.000,0.000,,,,5649,1,5,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,,,,0.000000e+00,,,,,,,,0.000000e+00,,,,,,,0.000000e+00,0.000000e+00,0.000,,,,,,,,0.000000e+00,,,,0.000,0.000000e+00,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8731,1857664183,6987,0,1519,3,1,50500.0,False,253642797.0,34500.0,1309537.989,1309537.989,,,,6986,3,7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,6.702435e+07,3.305096e+07,4.051366e+07,2.528836e+07,1.613624e+07,5.064067e+07,1.319432e+07,1.712957e+07,4.603327e+07,2.114258e+07,2.224572e+07,2.659774e+06,3.064040e+07,2.877893e+07,5.319391e+07,0.000,4.314024e+07,1.471483e+07,8.149375e+06,3.254465e+07,1309537.989,7.861691e+06,1.767598e+07,1.236057e+07,4.508863e+07,7.227331e+07,3.674011e+07,2.543894e+07,3.055806e+07,5.686853e+06,1.790306e+07,4.022957e+07,1309537.989,1.647102e+07,3.019870e+07,5.137775e+07,1.741080e+07,0.000,1.965629e+07,3.691242e+07
907,197212993,3273,0,7550,1,1,10333.0,True,253653130.0,-40167.0,49003.807,0.000,,,,3273,1,4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,6.707335e+07,3.309996e+07,4.056266e+07,2.533736e+07,1.618525e+07,5.068968e+07,1.324332e+07,1.717857e+07,4.608228e+07,2.119159e+07,2.229472e+07,2.708778e+06,0.000000e+00,2.882794e+07,5.324291e+07,49003.807,4.318924e+07,1.476384e+07,8.198379e+06,3.259365e+07,0.000,7.910695e+06,1.772498e+07,1.240957e+07,4.513763e+07,7.232231e+07,3.678911e+07,2.548794e+07,3.060707e+07,5.735857e+06,1.795206e+07,4.027858e+07,1358541.796,1.652002e+07,3.024771e+07,5.142676e+07,1.745981e+07,49003.807,1.970529e+07,3.696143e+07
6991,1486864723,1606,0,329,3,1,24333.0,True,253677463.0,14000.0,733096.111,0.000,,,,1604,3,3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,6.780645e+07,3.383306e+07,4.129576e+07,2.607046e+07,1.691834e+07,5.142277e+07,1.397642e+07,1.791167e+07,4.681537e+07,2.192468e+07,2.302782e+07,3.441874e+06,7.330961e+05,2.956103e+07,5.397601e+07,782099.918,4.392234e+07,1.549693e+07,8.931475e+06,3.332675e+07,0.000,8.643791e+06,1.845808e+07,1.314267e+07,4.587073e+07,7.305541e+07,3.752221e+07,2.622104e+07,3.134016e+07,6.468953e+06,1.868516e+07,4.101167e+07,0.000,1.725312e+07,3.098080e+07,5.215985e+07,1.819290e+07,782099.918,2.043839e+07,3.769452e+07
2431,519974433,10409,0,206,1,0,59400.0,True,253736863.0,35067.0,3112419.292,0.000,,,,10409,3,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,7.091887e+07,3.694548e+07,4.440817e+07,2.918288e+07,2.003076e+07,5.453519e+07,1.708884e+07,2.102408e+07,4.992779e+07,2.503710e+07,2.614023e+07,6.554294e+06,3.845515e+06,3.267345e+07,5.708843e+07,3894519.210,4.703475e+07,1.860935e+07,1.204389e+07,3.643917e+07,0.000,1.175621e+07,2.157050e+07,1.625509e+07,4.898315e+07,7.616783e+07,4.063462e+07,2.933346e+07,3.445258e+07,9.581372e+06,2.179758e+07,4.412409e+07,3112419.292,2.036554e+07,3.409322e+07,5.527227e+07,2.130532e+07,3894519.210,2.355081e+07,4.080694e+07


In [70]:
# Display the feature matrix stored under the 'lectures' key (notebook rich output).
feature_matrix['lectures']

Unnamed: 0_level_0,tag,part,type_of,ALL(train.prior_question_had_explanation),AVG_TIME_BETWEEN(train.timestamp),COUNT(train),LAST(train.answered_correctly),LAST(train.content_type_id),LAST(train.prior_question_elapsed_time),LAST(train.prior_question_had_explanation),LAST(train.row_id),LAST(train.task_container_id),LAST(train.user_answer),LAST(train.user_id),MAX(train.prior_question_elapsed_time),MEAN(train.prior_question_elapsed_time),MIN(train.prior_question_elapsed_time),MODE(train.answered_correctly),MODE(train.content_type_id),MODE(train.task_container_id),MODE(train.user_answer),MODE(train.user_id),NUM_UNIQUE(train.answered_correctly),NUM_UNIQUE(train.content_type_id),NUM_UNIQUE(train.task_container_id),NUM_UNIQUE(train.user_answer),NUM_UNIQUE(train.user_id),PERCENT_TRUE(train.prior_question_had_explanation),STD(train.prior_question_elapsed_time),SUM(train.prior_question_elapsed_time),TIME_SINCE_LAST(train.timestamp),"TOTAL_PREVIOUS_MONTH(train.prior_question_elapsed_time, timestamp)",ALL(train.prior_question_had_explanation WHERE answered_correctly = 0),ALL(train.prior_question_had_explanation WHERE answered_correctly = 1),ALL(train.questions.0),ALL(train.questions.0 WHERE answered_correctly = 0),ALL(train.questions.0 WHERE answered_correctly = 1),ALL(train.questions.1),ALL(train.questions.1 WHERE answered_correctly = 0),ALL(train.questions.1 WHERE answered_correctly = 1),...,SUM(train.prior_question_elapsed_time WHERE answered_correctly = 0),SUM(train.prior_question_elapsed_time WHERE answered_correctly = 1),"TOTAL_PREVIOUS_MONTH(train.CUM_SUM(prior_question_elapsed_time), timestamp)","TOTAL_PREVIOUS_MONTH(train.DIFF(prior_question_elapsed_time), timestamp)","TOTAL_PREVIOUS_MONTH(train.TIME_SINCE_PREVIOUS(timestamp), timestamp)","TOTAL_PREVIOUS_MONTH(train.TIME_SINCE_TRUE(prior_question_had_explanation, timestamp), 
timestamp)",CUM_SUM(AVG_TIME_BETWEEN(train.timestamp)),CUM_SUM(COUNT(train)),CUM_SUM(LAST(train.prior_question_elapsed_time)),CUM_SUM(MAX(train.prior_question_elapsed_time)),CUM_SUM(MEAN(train.prior_question_elapsed_time)),CUM_SUM(MIN(train.prior_question_elapsed_time)),CUM_SUM(NUM_UNIQUE(train.answered_correctly)),CUM_SUM(NUM_UNIQUE(train.content_type_id)),CUM_SUM(NUM_UNIQUE(train.task_container_id)),CUM_SUM(NUM_UNIQUE(train.user_answer)),CUM_SUM(NUM_UNIQUE(train.user_id)),CUM_SUM(PERCENT_TRUE(train.prior_question_had_explanation)),CUM_SUM(STD(train.prior_question_elapsed_time)),CUM_SUM(SUM(train.prior_question_elapsed_time)),CUM_SUM(TIME_SINCE_LAST(train.timestamp)),"CUM_SUM(TOTAL_PREVIOUS_MONTH(train.prior_question_elapsed_time, timestamp))",DIFF(AVG_TIME_BETWEEN(train.timestamp)),DIFF(COUNT(train)),DIFF(LAST(train.prior_question_elapsed_time)),DIFF(MAX(train.prior_question_elapsed_time)),DIFF(MEAN(train.prior_question_elapsed_time)),DIFF(MIN(train.prior_question_elapsed_time)),DIFF(NUM_UNIQUE(train.answered_correctly)),DIFF(NUM_UNIQUE(train.content_type_id)),DIFF(NUM_UNIQUE(train.task_container_id)),DIFF(NUM_UNIQUE(train.user_answer)),DIFF(NUM_UNIQUE(train.user_id)),DIFF(PERCENT_TRUE(train.prior_question_had_explanation)),DIFF(STD(train.prior_question_elapsed_time)),DIFF(SUM(train.prior_question_elapsed_time)),DIFF(TIME_SINCE_LAST(train.timestamp)),"DIFF(TOTAL_PREVIOUS_MONTH(train.prior_question_elapsed_time, timestamp))","TIME_SINCE_TRUE(ALL(train.prior_question_had_explanation), LAST(train.timestamp))","TIME_SINCE_TRUE(LAST(train.prior_question_had_explanation), LAST(train.timestamp))"
lecture_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
89,159,5,concept,True,4.040796e+06,3.0,0.0,0.0,19000.0,True,1633.0,257.0,0.0,3.509071e+08,20000.0,19333.333333,19000.0,0.0,0.0,226.0,0.0,1.617249e+09,2.0,1.0,3.0,3.0,3.0,1.000000,577.350269,58000.0,1.000389e+08,0.0,True,True,False,False,False,False,False,False,...,39000.0,19000.0,0.0,0.0,0.0,0.0,4.040796e+06,3.0,19000.0,20000.0,19333.333333,19000.0,2.0,1.0,3.0,3.0,3.0,1.000000,577.350269,58000.0,1.000389e+08,0.0,,,,,,,,,,,,,,,,,,
100,70,1,concept,False,1.058718e+05,2.0,0.0,0.0,24000.0,True,7875.0,44.0,2.0,1.676301e+09,24000.0,21000.000000,18000.0,0.0,0.0,30.0,2.0,1.676301e+09,1.0,1.0,2.0,1.0,2.0,0.500000,4242.640687,42000.0,9.599825e+07,0.0,False,,False,False,,False,False,,...,42000.0,0.0,0.0,0.0,0.0,0.0,4.146668e+06,5.0,43000.0,44000.0,40333.333333,37000.0,3.0,2.0,5.0,4.0,5.0,1.500000,4819.990956,100000.0,1.960371e+08,0.0,-3.934925e+06,-1.0,5000.0,4000.0,1666.666667,-1000.0,-1.0,0.0,-1.0,-2.0,-1.0,-0.500000,3665.290418,-16000.0,-4.040649e+06,0.0,,
185,45,6,concept,False,1.870765e+07,3.0,0.0,0.0,29750.0,False,17.0,16.0,1.0,3.637398e+06,29750.0,22583.333333,18000.0,0.0,0.0,16.0,0.0,3.637398e+06,2.0,1.0,3.0,3.0,3.0,0.333333,6286.559738,67750.0,5.147161e+07,0.0,False,True,False,False,False,False,False,False,...,47750.0,20000.0,,,,,2.285432e+07,8.0,72750.0,73750.0,62916.666667,55000.0,5.0,3.0,8.0,7.0,8.0,1.833333,11106.550694,167750.0,2.475087e+08,0.0,1.860178e+07,1.0,5750.0,5750.0,1583.333333,0.0,1.0,0.0,1.0,2.0,1.0,-0.166667,2043.919050,25750.0,-4.452664e+07,0.0,,
192,79,5,solving question,False,,1.0,-1.0,1.0,,False,7717.0,895.0,-1.0,1.642796e+09,,,,-1.0,1.0,895.0,-1.0,1.642796e+09,1.0,1.0,1.0,1.0,1.0,0.000000,,0.0,9.538097e+07,0.0,,,False,,,False,,,...,0.0,0.0,,,,,,9.0,,,,,6.0,4.0,9.0,8.0,9.0,1.833333,,167750.0,3.428897e+08,0.0,,-2.0,,,,,-1.0,0.0,-2.0,-2.0,-2.0,-0.333333,,-67750.0,4.390937e+07,0.0,,
317,156,5,solving question,True,,1.0,1.0,0.0,15000.0,True,7814.0,622.0,0.0,1.663474e+09,15000.0,15000.000000,15000.0,1.0,0.0,622.0,0.0,1.663474e+09,1.0,1.0,1.0,1.0,1.0,1.000000,,15000.0,9.646527e+07,0.0,,True,False,,False,False,,False,...,0.0,15000.0,,,,,,10.0,87750.0,88750.0,77916.666667,70000.0,7.0,5.0,10.0,9.0,10.0,2.833333,,182750.0,4.393550e+08,0.0,,0.0,,,,,0.0,0.0,0.0,0.0,0.0,1.000000,,15000.0,1.084292e+06,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32535,8,5,solving question,False,,1.0,-1.0,1.0,,False,1674.0,312.0,-1.0,3.599880e+08,,,,-1.0,1.0,312.0,-1.0,3.599880e+08,1.0,1.0,1.0,1.0,1.0,0.000000,,0.0,,,,,True,,,True,,,...,0.0,0.0,,,,,,323.0,,,,,206.0,188.0,322.0,214.0,323.0,53.358712,,3198164.0,,,,1.0,,,,,,,,,,0.000000,,0.0,,,0.000,0.000
32570,113,3,solving question,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.000000,,0.0,,,,,,,,,,,...,0.0,0.0,,,,,,323.0,,,,,,,,,,53.358712,,3198164.0,,,,-1.0,,,,,,,,,,0.000000,,0.0,,,401945.645,401945.645
32604,24,6,concept,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.000000,,0.0,,,,,,,,,,,...,0.0,0.0,,,,,,323.0,,,,,,,,,,53.358712,,3198164.0,,,,0.0,,,,,,,,,,0.000000,,0.0,,,825445.787,825445.787
32625,142,2,concept,False,,1.0,-1.0,1.0,,False,7561.0,77.0,-1.0,1.609377e+09,,,,-1.0,1.0,77.0,-1.0,1.609377e+09,1.0,1.0,1.0,1.0,1.0,0.000000,,0.0,,,,,True,,,True,,,...,0.0,0.0,,,,,,324.0,,,,,207.0,189.0,323.0,215.0,324.0,53.358712,,3198164.0,,,,1.0,,,,,,,,,,0.000000,,0.0,,,2535573.010,2535573.010


In [71]:
# Display the feature matrix stored under the 'questions' key (notebook rich output).
feature_matrix['questions']

Unnamed: 0_level_0,bundle_id,correct_answer,part,0,1,10,100,101,102,103,104,105,106,107,108,109,11,110,111,112,113,114,115,116,117,118,119,12,120,121,122,123,124,125,126,127,128,129,13,130,...,"TIME_SINCE_TRUE(65, LAST(train.timestamp))","TIME_SINCE_TRUE(66, LAST(train.timestamp))","TIME_SINCE_TRUE(67, LAST(train.timestamp))","TIME_SINCE_TRUE(68, LAST(train.timestamp))","TIME_SINCE_TRUE(69, LAST(train.timestamp))","TIME_SINCE_TRUE(7, LAST(train.timestamp))","TIME_SINCE_TRUE(70, LAST(train.timestamp))","TIME_SINCE_TRUE(71, LAST(train.timestamp))","TIME_SINCE_TRUE(72, LAST(train.timestamp))","TIME_SINCE_TRUE(73, LAST(train.timestamp))","TIME_SINCE_TRUE(74, LAST(train.timestamp))","TIME_SINCE_TRUE(75, LAST(train.timestamp))","TIME_SINCE_TRUE(76, LAST(train.timestamp))","TIME_SINCE_TRUE(77, LAST(train.timestamp))","TIME_SINCE_TRUE(78, LAST(train.timestamp))","TIME_SINCE_TRUE(79, LAST(train.timestamp))","TIME_SINCE_TRUE(8, LAST(train.timestamp))","TIME_SINCE_TRUE(80, LAST(train.timestamp))","TIME_SINCE_TRUE(81, LAST(train.timestamp))","TIME_SINCE_TRUE(82, LAST(train.timestamp))","TIME_SINCE_TRUE(83, LAST(train.timestamp))","TIME_SINCE_TRUE(84, LAST(train.timestamp))","TIME_SINCE_TRUE(85, LAST(train.timestamp))","TIME_SINCE_TRUE(86, LAST(train.timestamp))","TIME_SINCE_TRUE(87, LAST(train.timestamp))","TIME_SINCE_TRUE(88, LAST(train.timestamp))","TIME_SINCE_TRUE(89, LAST(train.timestamp))","TIME_SINCE_TRUE(9, LAST(train.timestamp))","TIME_SINCE_TRUE(90, LAST(train.timestamp))","TIME_SINCE_TRUE(91, LAST(train.timestamp))","TIME_SINCE_TRUE(92, LAST(train.timestamp))","TIME_SINCE_TRUE(93, LAST(train.timestamp))","TIME_SINCE_TRUE(94, LAST(train.timestamp))","TIME_SINCE_TRUE(95, LAST(train.timestamp))","TIME_SINCE_TRUE(96, LAST(train.timestamp))","TIME_SINCE_TRUE(97, LAST(train.timestamp))","TIME_SINCE_TRUE(98, LAST(train.timestamp))","TIME_SINCE_TRUE(99, LAST(train.timestamp))","TIME_SINCE_TRUE(ALL(train.prior_question_had_explanation), 
LAST(train.timestamp))","TIME_SINCE_TRUE(LAST(train.prior_question_had_explanation), LAST(train.timestamp))"
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,1,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,0,1,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,3,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13518,13518,3,5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,4.051366e+07,2.528836e+07,1.613624e+07,5.064067e+07,1.319432e+07,1.712957e+07,4.603327e+07,2.114258e+07,2.224572e+07,2.659774e+06,3.064040e+07,2.877893e+07,5.319391e+07,0.000,4.314024e+07,1.471483e+07,8.149375e+06,3.254465e+07,1309537.989,7.861691e+06,1.767598e+07,1.236057e+07,4.508863e+07,7.227331e+07,3.674011e+07,2.543894e+07,3.055806e+07,5.686853e+06,1.790306e+07,4.022957e+07,1309537.989,1.647102e+07,3.019870e+07,5.137775e+07,1.741080e+07,0.000,1.965629e+07,3.691242e+07,1309537.989,1309537.989
13519,13519,3,5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,4.056266e+07,2.533736e+07,1.618525e+07,5.068968e+07,1.324332e+07,1.717857e+07,4.608228e+07,2.119159e+07,2.229472e+07,2.708778e+06,0.000000e+00,2.882794e+07,5.324291e+07,49003.807,4.318924e+07,1.476384e+07,8.198379e+06,3.259365e+07,0.000,7.910695e+06,1.772498e+07,1.240957e+07,4.513763e+07,7.232231e+07,3.678911e+07,2.548794e+07,3.060707e+07,5.735857e+06,1.795206e+07,4.027858e+07,1358541.796,1.652002e+07,3.024771e+07,5.142676e+07,1.745981e+07,49003.807,1.970529e+07,3.696143e+07,0.000,0.000
13520,13520,2,5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,4.129576e+07,2.607046e+07,1.691834e+07,5.142277e+07,1.397642e+07,1.791167e+07,4.681537e+07,2.192468e+07,2.302782e+07,3.441874e+06,7.330961e+05,2.956103e+07,5.397601e+07,782099.918,4.392234e+07,1.549693e+07,8.931475e+06,3.332675e+07,0.000,8.643791e+06,1.845808e+07,1.314267e+07,4.587073e+07,7.305541e+07,3.752221e+07,2.622104e+07,3.134016e+07,6.468953e+06,1.868516e+07,4.101167e+07,0.000,1.725312e+07,3.098080e+07,5.215985e+07,1.819290e+07,782099.918,2.043839e+07,3.769452e+07,0.000,0.000
13521,13521,0,5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,...,4.440817e+07,2.918288e+07,2.003076e+07,5.453519e+07,1.708884e+07,2.102408e+07,4.992779e+07,2.503710e+07,2.614023e+07,6.554294e+06,3.845515e+06,3.267345e+07,5.708843e+07,3894519.210,4.703475e+07,1.860935e+07,1.204389e+07,3.643917e+07,0.000,1.175621e+07,2.157050e+07,1.625509e+07,4.898315e+07,7.616783e+07,4.063462e+07,2.933346e+07,3.445258e+07,9.581372e+06,2.179758e+07,4.412409e+07,3112419.292,2.036554e+07,3.409322e+07,5.527227e+07,2.130532e+07,3894519.210,2.355081e+07,4.080694e+07,0.000,0.000


We can save these feature definitions to a file, which allows us to build exactly the same features for another entityset of the same format. This is useful when the data is split into multiple partitions and we want the same features for each one: instead of re-deriving the feature definitions, we load the saved definitions and pass them to `calculate_feature_matrix`.

In [74]:
# Define paths
# One output file per entity for the feature matrix and for the feature
# definitions. The paths live in their own dicts; `feature_matrix` and
# `feature_defs` hold the computed results and must NOT be assigned to here,
# otherwise the results would be overwritten by path strings.
intermediate_dir = Path('/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/intermediate')
entity_names = ['lectures', 'questions', 'train', 'users']

feature_matrix_paths = {
  name: intermediate_dir / f'feature_matrix_{name}.pkl.gzip'
  for name in entity_names
}

feature_defs_paths = {
  name: intermediate_dir / f'feature_defs_{name}.pkl.gzip'
  for name in entity_names
}

# Save files
for key in entity_names:
  # featuretools serialises feature definitions as JSON text, so hand it the
  # path (as a str) and let it open the file itself rather than passing a
  # binary-mode handle.
  # NOTE(review): the '.pkl.gzip' suffix is kept for path compatibility even
  # though the content written here is not a gzipped pickle.
  ft.save_features(feature_defs[key], str(feature_defs_paths[key]))

  # Plain (uncompressed) pickle, mirroring how the preprocessed frames are
  # read back with `pickle.load` at the top of this notebook.
  with open(feature_matrix_paths[key], 'wb') as f:
    pickle.dump(feature_matrix[key], f)

In [None]:
with open