<a href="https://colab.research.google.com/github/allen44/riiid-test-answer-prediction/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

%cd /content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/
%pwd


/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction


'/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction'

# Make Train Test Splits upon import from csv

Using our insights gained from the EDA, when can import the data from csv with an get right on to feature engineering.

In [None]:
#Choose pickle, or csv
# suffix = '.pkl.gzip'
suffix = '.csv'

pwd = '/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction'

# #Define data paths
lectures_path = f'{pwd}/data/raw/lectures{suffix}'
questions_path = f'{pwd}/data/raw/questions{suffix}'
train_path = f'{pwd}/data/raw/train{suffix}'

X_train_path = f'{pwd}/data/intermediate/train_test_splits/X_train.csv'
X_test_path = f'{pwd}/data/intermediate/train_test_splits/X_test.csv'
y_train_path = f'{pwd}/data/intermediate/train_test_splits/y_train.csv'
y_test_path = f'{pwd}/data/intermediate/train_test_splits/y_test.csv'

lectures_path, questions_path, train_path

('/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/raw/lectures.csv',
 '/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/raw/questions.csv',
 '/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/raw/train.csv')

In [None]:
from sklearn.model_selection import train_test_split
 
# Import data from csv
X = pd.read_csv(train_path)
y = X.pop('answered_correctly') 

# Make Train Test Splits and Preprocess each split
X_train, X_test, y_train, y_test = train_test_split(
                                    X, 
                                    y, 
                                    test_size=0.20, 
                                    random_state=42,
                                    stratify=y)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Make sure the shapes are right
assert X_train.shape[0]==y_train.shape[0]
assert X_test.shape[0]==y_test.shape[0]
assert X_train.shape[1]==X_test.shape[1]

# Export splits to csv file
X_train.to_csv(X_train_path)
X_test.to_csv(X_test_path)
y_train.to_csv(y_train_path)
y_test.to_csv(y_test_path)

del X_train
del X_test
del y_train
del y_test

# Check the shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

(80984265, 9) (20246067, 9) (80984265,) (20246067,)


((80984265, 9), (20246067, 9), (80984265,), (20246067,))

## Load train data from csv of the train test splits

In [573]:
def read_metadata(lectures, questions):
  # Read lectures and set dtypes
  lectures_dtypes = {
  'lecture_id': 'int16',
  'tag': 'string',
  'part': 'string',
  'type_of': 'string'}
  lectures = pd.read_csv(lectures_path,
                        usecols=lectures_dtypes.keys(),
                        dtype=lectures_dtypes,
                        index_col='lecture_id')
  
  # Read questions and set dtypes
  questions_dtypes = {
  'question_id': 'int16',
  'bundle_id': 'int16',
  'correct_answer': 'int8',
  'part': 'int8',
  'tags': 'string'}
  questions = pd.read_csv(questions_path,
                          usecols=questions_dtypes.keys(),
                          dtype=questions_dtypes,
                          index_col='question_id')

  return lectures, questions

lectures=pd.DataFrame()
questions=pd.DataFrame()
lectures, questions = read_metadata(lectures, questions)

In [574]:
def binarize_tags(lectures, questions):
  # Binarize question tags
  print('binarizing...')
  questions['tags'] = questions['tags'].str.split()
  questions.dropna(subset=['tags'], inplace=True)
  mlb = MultiLabelBinarizer()
  q_labels = pd.DataFrame(mlb.fit_transform(questions['tags']),
                              columns=mlb.classes_, 
                              index=questions['tags'].dropna().index,
                           dtype=bool)
  # questions.drop(columns=['tags'], inplace=True)
  questions = pd.concat([questions, q_labels], axis=1)
  print(f'binarizing: complete...questions.shape={questions.shape}')


  # Binarize lecture tags
  print('binarizing lectures...')
  # lectures.dropna(subset=['tag'], inplace=True)
  lectures['tag'] = lectures['tag'].apply(lambda x: list(x))
  l_labels = pd.DataFrame(mlb.transform(lectures['tag'].dropna()),
                              columns=mlb.classes_, 
                              index=lectures['tag'].dropna().index,
                           dtype=bool)
  # questions.drop(columns=['tags'], inplace=True)
  questions = pd.concat([questions, q_labels], axis=1)
  lectures = pd.concat([lectures, l_labels], axis=1)
  print(f'binarizing: complete...lectures.shape={lectures.shape}')
  questions.drop(columns=['tags'], inplace=True)


  return lectures, questions

lectures, questions = binarize_tags(lectures, questions)

binarizing...
binarizing: complete...questions.shape=(13522, 192)
binarizing lectures...
binarizing: complete...lectures.shape=(418, 191)


In [583]:
def read_events_and_target(nrows=1000):
 # Read events and set dtypes
  # events_dtypes = {
  #     'row_id': 'int32',
  #     'timestamp': 'int64',
  #     'user_id': 'int32',
  #     'content_id': 'int16',
  #     'content_type_id': 'int8',
  #     'task_container_id': 'int16',
  #     'user_answer': 'int8',
  #     'prior_question_elapsed_time': 'float32',
  #     'prior_question_had_explanation': 'boolean'
  # }
  events_dtypes = {
      'timestamp': 'int64',
      'user_id': 'category',
      'content_id': 'int16',
      'content_type_id': 'int8',
      'task_container_id': 'category',
      'user_answer': 'category',
      'prior_question_elapsed_time': 'float64',
      'prior_question_had_explanation': 'boolean'
  }
  events = pd.read_csv(X_train_path,
                      usecols=events_dtypes.keys(),
                      dtype=events_dtypes,
                      nrows=nrows)
  
  target_dtypes = {
      'answered_correctly': 'int8',
  }
  target = pd.read_csv(y_train_path,
                        usecols=target_dtypes.keys(),
                        dtype=target_dtypes,
                        nrows=nrows)
  return events, target
events, target = read_events_and_target()

In [596]:
def parse_questions(events, target):
  Xy = target.join(events)
  Xy_q = Xy[Xy['content_type_id']==0]
  Xy_l = Xy[Xy['content_type_id']==1]
  Xy_q['answered_correctly'] = Xy['answered_correctly'].astype(bool)
  return Xy_q, Xy_l

Xy_q, Xy_l = parse_questions(events, target)
Xy_q.shape, Xy_l.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


((978, 9), (22, 9))

In [597]:
def merge_questions_with_metadeta(Xy_q, questions):
  Xy_qq = Xy_q.merge(questions,
             how='left',
             left_on='content_id',
             right_on='question_id'
             )
  return Xy_qq

Xy_qq = merge_questions_with_metadeta(Xy_q, questions)
Xy_qq.shape

(978, 388)

In [598]:
Xy_qq

Unnamed: 0,answered_correctly,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,prior_question_elapsed_time,prior_question_had_explanation,bundle_id,correct_answer,part,0,1,10,100,101,102,103,104,105,106,107,108,109,11,110,111,112,113,114,115,116,117,118,119,12,120,121,122,...,63,64,65,66,67,68,69,7,70,71,72,73,74,75,76,77,78,79,8,80,81,82,83,84,85,86,87,88,89,9,90,91,92,93,94,95,96,97,98,99
0,True,181931368,2139681492,8524,0,230,2,24000.0,True,8524,2,5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,True,556867526,1391186830,6173,0,141,2,13000.0,True,6173,2,5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,10768221235,1032534391,2176,0,5088,2,29666.0,True,2174,3,3,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
3,False,3121598051,222660498,7300,0,458,3,24000.0,True,7299,0,7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,3284197938,1532090570,9026,0,410,1,18000.0,False,9026,3,5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
973,True,1894923598,2068990755,9558,0,3007,3,8000.0,True,9558,3,5,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
974,False,1567244651,1831530465,423,0,1212,0,15000.0,True,423,1,2,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
975,True,3081113728,92223077,10043,0,194,3,50500.0,True,10042,3,6,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
976,True,1519682101,1341174759,7482,0,277,0,5800.0,True,7480,0,7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [599]:
def merge_lectures_with_metadeta(Xy_l, lectures):
  Xy_l = Xy_l.merge(lectures,
             how='left',
             left_on='content_id',
             right_on='lecture_id'
             )
  return Xy_ll

Xy_ll = merge_questions_with_metadeta(Xy_l, questions)
Xy_ll.shape

(22, 388)

In [602]:
Xy_qq.dtypes

answered_correctly        bool
timestamp                int64
user_id               category
content_id               int16
content_type_id           int8
                        ...   
95                        bool
96                        bool
97                        bool
98                        bool
99                        bool
Length: 388, dtype: object

In [608]:
Xy_qq.dropna(inplace=True, axis=1)
Xy_qq.dropna(inplace=True)

# Feature encoding and transformation

In [615]:
# Relabel categories
Xy_qq['content_id'] = Xy_qq['content_id'].astype('category')
Xy_qq['content_type_id'] = Xy_qq['content_type_id'].astype(bool)
Xy_qq['task_container_id'] = Xy_qq['task_container_id'].astype('category')
Xy_qq['user_answer'] = Xy_qq['user_answer'].astype('category')
Xy_qq['bundle_id'] = Xy_qq['bundle_id'].astype('category')
Xy_qq['correct_answer'] = Xy_qq['correct_answer'].astype('category')
Xy_qq['part'] = Xy_qq['part'].astype('category')

# Seperate the categorical and numerical variables
X_cat = Xy_qq.select_dtypes(['category'])
X_num = Xy_qq.select_dtypes(['int', 'float'])
X_bool = Xy_qq.select_dtypes(bool)

#Save the column labels for later
X_cat_columns = X_cat.columns
X_num_columns = X_num.columns

X = pd.DataFrame()

X_cat.shape[1], X_num.shape[1], X_bool.shape[1]

(7, 1, 378)

#One Hot encode the categorical features

In [617]:
# Apply OneHotEncoder() on Dataframe
from sklearn.preprocessing import OneHotEncoder

# TODO: create a OneHotEncoder object, and fit it to all of X

# 1. INSTANTIATE
enc = OneHotEncoder(drop='first', 
                    dtype=bool
                    )

# 2. FIT
enc.fit(X_cat)

# 3. Transform
X_cat_encoded = enc.transform(X_cat)
X_cat_encoded

<978x3402 sparse matrix of type '<class 'numpy.bool_'>'
	with 6189 stored elements in Compressed Sparse Row format>

# Scale the numeric features

In [618]:
from sklearn.preprocessing import StandardScaler

# 1. INSTANTIATE
scaler = StandardScaler()

# 2. FIT
X_num_scaled = scaler.fit_transform(X_num)

print(f'mean: {X_num_scaled.mean()}')
print(f'std: {X_num_scaled.std()}')

mean: -1.8163157867078227e-17
std: 1.0
