<a href="https://colab.research.google.com/github/allen44/riiid-test-answer-prediction/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project folder on Google Drive the working directory and echo it.
# NOTE(review): assumes Drive is already mounted at /content/drive - no mount
# call is visible in this notebook; confirm.
%cd /content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/
%pwd

/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction


'/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction'

## Preprocessed data

In [2]:
import pickle
from pathlib import Path

# Define data paths.
# Every preprocessed frame lives in the same directory, so the location is
# spelled out once and each file path is derived from it.
INTERMEDIATE_DATA_DIR = Path('/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/intermediate')

df_train_preprocessed_path = INTERMEDIATE_DATA_DIR / 'df_train_preprocessed.pkl.gzip'
df_lectures_preprocessed_path = INTERMEDIATE_DATA_DIR / 'df_lectures_preprocessed.pkl.gzip'
df_questions_preprocessed_path = INTERMEDIATE_DATA_DIR / 'df_questions_preprocessed.pkl.gzip'

Using the insights gained from the EDA, we can load the preprocessed data (saved as pickle files in the previous step) in the form best suited for feature engineering.

In [9]:
def _read_pickle(pkl_path):
  """Open `pkl_path` in binary mode and return the unpickled object.

  pickle can execute arbitrary code while loading, so this must only be
  pointed at files this project produced itself.
  """
  with open(pkl_path, 'rb') as handle:
    return pickle.load(handle)


# Load the three preprocessed frames, one after another.
df_train = _read_pickle(df_train_preprocessed_path)
df_lectures = _read_pickle(df_lectures_preprocessed_path)
df_questions = _read_pickle(df_questions_preprocessed_path)

# Show (rows, columns) for each frame as the cell output.
df_train.shape, df_lectures.shape, df_questions.shape

((101230332, 9), (418, 4), (13522, 192))

In [10]:
# The id columns of the three frames must all share one dtype; lecture_id is
# used as the reference.
# NOTE(review): presumably required so the keys can be joined when the
# relationships are defined later - confirm.
lecture_id_dtype = df_lectures['lecture_id'].dtype

assert df_train['content_id'].dtype == lecture_id_dtype
assert df_questions['question_id'].dtype == lecture_id_dtype

## Install and import featuretools

In [11]:
# Uncomment to install featuretools into the kernel's environment.
# NOTE(review): this notebook calls the pre-1.0 API
# (EntitySet.entity_from_dataframe, ft.Relationship on variables,
# Entity.variables), so the version should be pinned - confirm exact version:
# %pip install "featuretools<1.0"
import featuretools as ft

# Prepare data

First, we create an `EntitySet` and add each DataFrame in our dataset to it as an entity.

In [12]:
# Define entity set
es = ft.EntitySet()

# Add df_train to entity set.
# df_train has no 'row_id' column (it has 9 columns; the entity ends up with
# 10), so featuretools builds a fresh integer index under that name.
# make_index=True states that intent explicitly instead of relying on the
# implicit fallback, which emits an "index not found ... creating new integer
# column" warning. It also fails loudly if a 'row_id' column ever does exist,
# rather than silently switching to different index values.
es.entity_from_dataframe(entity_id='train',
                         dataframe=df_train,
                         index='row_id',
                         make_index=True)

  "integer column".format(index))


Entityset: None
  Entities:
    train [Rows: 101230332, Columns: 10]
  Relationships:
    No relationships

In [13]:
# Register the lectures frame as the 'lectures' entity, keyed by lecture_id.
# The call is the last expression so the updated entity set is displayed.
es.entity_from_dataframe(
    entity_id='lectures',
    dataframe=df_lectures,
    index='lecture_id',
)

Entityset: None
  Entities:
    train [Rows: 101230332, Columns: 10]
    lectures [Rows: 418, Columns: 4]
  Relationships:
    No relationships

In [14]:
# Register the questions frame as the 'questions' entity, keyed by question_id.
# The call is the last expression so the updated entity set is displayed.
es.entity_from_dataframe(
    entity_id='questions',
    dataframe=df_questions,
    index='question_id',
)

Entityset: None
  Entities:
    train [Rows: 101230332, Columns: 10]
    lectures [Rows: 418, Columns: 4]
    questions [Rows: 13522, Columns: 192]
  Relationships:
    No relationships

In [15]:
# List the variable type featuretools assigned to each column of 'train'.
es["train"].variables

[<Variable: row_id (dtype = index)>,
 <Variable: timestamp (dtype = numeric)>,
 <Variable: user_id (dtype = categorical)>,
 <Variable: content_id (dtype = categorical)>,
 <Variable: content_type_id (dtype = categorical)>,
 <Variable: task_container_id (dtype = categorical)>,
 <Variable: user_answer (dtype = categorical)>,
 <Variable: answered_correctly (dtype = categorical)>,
 <Variable: prior_question_elapsed_time (dtype = numeric)>,
 <Variable: prior_question_had_explanation (dtype = boolean)>]

In [16]:
# List the variable type featuretools assigned to each column of 'lectures'.
es["lectures"].variables

[<Variable: lecture_id (dtype = index)>,
 <Variable: tag (dtype = categorical)>,
 <Variable: part (dtype = categorical)>,
 <Variable: type_of (dtype = categorical)>]

In [17]:
# List the variable type featuretools assigned to each column of 'questions'.
es["questions"].variables

[<Variable: question_id (dtype = index)>,
 <Variable: bundle_id (dtype = categorical)>,
 <Variable: correct_answer (dtype = categorical)>,
 <Variable: part (dtype = categorical)>,
 <Variable: 0 (dtype = boolean)>,
 <Variable: 1 (dtype = boolean)>,
 <Variable: 10 (dtype = boolean)>,
 <Variable: 100 (dtype = boolean)>,
 <Variable: 101 (dtype = boolean)>,
 <Variable: 102 (dtype = boolean)>,
 <Variable: 103 (dtype = boolean)>,
 <Variable: 104 (dtype = boolean)>,
 <Variable: 105 (dtype = boolean)>,
 <Variable: 106 (dtype = boolean)>,
 <Variable: 107 (dtype = boolean)>,
 <Variable: 108 (dtype = boolean)>,
 <Variable: 109 (dtype = boolean)>,
 <Variable: 11 (dtype = boolean)>,
 <Variable: 110 (dtype = boolean)>,
 <Variable: 111 (dtype = boolean)>,
 <Variable: 112 (dtype = boolean)>,
 <Variable: 113 (dtype = boolean)>,
 <Variable: 114 (dtype = boolean)>,
 <Variable: 115 (dtype = boolean)>,
 <Variable: 116 (dtype = boolean)>,
 <Variable: 117 (dtype = boolean)>,
 <Variable: 118 (dtype = boolean)>

In [18]:
import gc

# Drop the notebook-level names of the source frames, then run a collection
# pass; the cell output is the value returned by gc.collect().
# NOTE(review): once this cell has run, earlier cells that use df_train,
# df_lectures or df_questions cannot be re-run without reloading the pickles.
# The entity set presumably keeps its own reference to the data, so how much
# memory is actually released is unclear - confirm.
del df_train, df_lectures, df_questions

gc.collect()

410

In [19]:
# Link both lookup entities to the interaction log through train.content_id
# (parent variable first, child variable second).
# NOTE(review): the same child column, train.content_id, is used for both
# parents. If lecture ids and question ids share values, and
# content_type_id is what tells them apart, rows will be matched to both a
# lecture and a question - verify against the dataset description.
r_lectures_train = ft.Relationship(
    es["lectures"]["lecture_id"],
    es["train"]["content_id"],
)
r_questions_train = ft.Relationship(
    es["questions"]["question_id"],
    es["train"]["content_id"],
)

# Register them in the same order as they were defined.
for relationship in (r_lectures_train, r_questions_train):
  es = es.add_relationship(relationship)

# Display the entity set with its relationships.
es

Entityset: None
  Entities:
    train [Rows: 101230332, Columns: 10]
    lectures [Rows: 418, Columns: 4]
    questions [Rows: 13522, Columns: 192]
  Relationships:
    train.content_id -> lectures.lecture_id
    train.content_id -> questions.question_id