<a href="https://colab.research.google.com/github/allen44/riiid-test-answer-prediction/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
from timeit import default_timer as timer

%cd /content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/
%pwd


/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction


'/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction'

# Make Train Test Splits upon import from csv

Using our insights gained from the EDA, when can import the data from csv with an get right on to feature engineering.

In [2]:
#Choose pickle, or csv
# suffix = '.pkl.gzip'
suffix = '.csv'

pwd = '/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction'

# #Define data paths
lectures_path = f'{pwd}/data/raw/lectures{suffix}'
questions_path = f'{pwd}/data/raw/questions{suffix}'
train_path = f'{pwd}/data/raw/train{suffix}'

X_train_path = f'{pwd}/data/intermediate/train_test_splits/X_train.csv'
X_test_path = f'{pwd}/data/intermediate/train_test_splits/X_test.csv'
y_train_path = f'{pwd}/data/intermediate/train_test_splits/y_train.csv'
y_test_path = f'{pwd}/data/intermediate/train_test_splits/y_test.csv'

lectures_path, questions_path, train_path

('/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/raw/lectures.csv',
 '/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/raw/questions.csv',
 '/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/raw/train.csv')

In [3]:
# Import data from csv
X = pd.read_csv(train_path, nrows=10**8)
y = X.pop('answered_correctly') 


# Make Train Test Splits and Preprocess each split
start = timer()
X_train, X_test, y_train, y_test = train_test_split(
                                    X, 
                                    y, 
                                    test_size=0.20, 
                                    random_state=42,
                                    stratify=y)
end = timer()
print(f'{round(end - start)} seconds elapsed.')
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Make sure the shapes are right
assert X_train.shape[0]==y_train.shape[0]
assert X_test.shape[0]==y_test.shape[0]
assert X_train.shape[1]==X_test.shape[1]



127 seconds elapsed.
(80000000, 9) (20000000, 9) (80000000,) (20000000,)


In [4]:
# Export splits to csv file
start = timer()

X_train.to_csv(X_train_path)
X_test.to_csv(X_test_path)
y_train.to_csv(y_train_path)
y_test.to_csv(y_test_path)

del X_train
del X_test
del y_train
del y_test

end = timer()
print(f'{round(end - start)} seconds elapsed.')

629 seconds elapsed.


# Merge events metadata with events data

In [5]:
def read_metadata_from_csv(lectures_path, questions_path):
  lectures=pd.DataFrame()
  questions=pd.DataFrame()
  
  # Read lectures and set dtypes
  lectures_dtypes = {
  'lecture_id': 'int16',
  'tag': 'string',
  'part': 'string',
  'type_of': 'string'}
  lectures = pd.read_csv(lectures_path,
                        usecols=lectures_dtypes.keys(),
                        dtype=lectures_dtypes,
                        index_col='lecture_id')
  
  # Read questions and set dtypes
  questions_dtypes = {
  'question_id': 'int16',
  'bundle_id': 'int16',
  'correct_answer': 'int8',
  'part': 'int8',
  'tags': 'string'}
  questions = pd.read_csv(questions_path,
                          usecols=questions_dtypes.keys(),
                          dtype=questions_dtypes,
                          index_col='question_id')

  return lectures, questions


# lectures, questions = read_metadata_from_csv(lectures_path, questions_path)

# lectures.shape, questions.shape

In [6]:
def binarize_tags(lectures, questions):
  from sklearn.preprocessing import MultiLabelBinarizer

  # Binarize question tags
  questions['tags'] = questions['tags'].str.split()
  questions.dropna(subset=['tags'], inplace=True)
  mlb = MultiLabelBinarizer()
  q_labels = pd.DataFrame(mlb.fit_transform(questions['tags']),
                              columns=mlb.classes_, 
                              index=questions['tags'].dropna().index,
                           dtype=bool)
  # questions.drop(columns=['tags'], inplace=True)
  questions = pd.concat([questions, q_labels], axis=1)

  # Binarize lecture tags
  l_labels = pd.DataFrame(mlb.transform(lectures['tag'].dropna()),
                              columns=mlb.classes_, 
                              index=lectures['tag'].dropna().index,
                           dtype=bool)
  questions = pd.concat([questions, q_labels], axis=1)
  lectures = pd.concat([lectures, l_labels], axis=1)
  print(f'binarizing: complete...lectures.shape={lectures.shape}')

  # Drop the now old columns
  questions.drop(columns=['tags'], inplace=True)
  lectures.drop(columns=['tag'], inplace=True)

  return lectures, questions

# lectures, questions = binarize_tags(lectures, questions)

# lectures.shape, questions.shape

In [7]:
def read_events_and_target(X_path, y_path, nrows=1000, is_test=False):
 # Read events and set dtypes
  # events_dtypes = {
  #     'row_id': 'int32',
  #     'timestamp': 'int64',
  #     'user_id': 'int32',
  #     'content_id': 'int16',
  #     'content_type_id': 'int8',
  #     'task_container_id': 'int16',
  #     'user_answer': 'int8',
  #     'prior_question_elapsed_time': 'float32',
  #     'prior_question_had_explanation': 'boolean'
  # }
  events_dtypes = {
      'timestamp': 'int64',
      'user_id': 'category',
      'content_id': 'int16',
      'content_type_id': 'int8',
      'task_container_id': 'category',
      'user_answer': 'category',
      'prior_question_elapsed_time': 'float64',
      'prior_question_had_explanation': 'boolean'
  }
  events = pd.read_csv(X_path,
                      usecols=events_dtypes.keys(),
                      dtype=events_dtypes,
                      nrows=nrows)
  
  target_dtypes = {
      'answered_correctly': 'int8',
  }
  target = pd.read_csv(y_path,
                        usecols=target_dtypes.keys(),
                        dtype=target_dtypes,
                        nrows=nrows)
  
  Xy = target.join(events)
  return Xy

# # Read a subset, nrows, of train.csv
# Xy = read_events_and_target(X_train_path, y_train_path, nrows=1000, is_test=False)
  
# Xy.shape

In [8]:
def parse_questions_and_lecture_events(Xy):

  Xy_q = Xy.loc[Xy['content_type_id']==0,:].drop(columns=['content_type_id'])
  Xy_l = Xy.loc[Xy['content_type_id']==1, ['timestamp',	'user_id', 'content_id']]
  Xy_q['answered_correctly'] = Xy['answered_correctly'].astype(bool)

  return Xy_q, Xy_l

# Xy_q, Xy_l = parse_questions_and_lecture_events(Xy)
# Xy_q.shape, Xy_l.shape

In [9]:
def merge_question_events_with_their_metadeta(Xy_q, questions):
  Xy_qq = Xy_q.merge(questions,
             how='left',
             left_on='content_id',
             right_on='question_id'
             )
  return Xy_qq

# Xy_qq = merge_question_events_with_their_metadeta(Xy_q, questions)
# Xy_qq.shape

In [10]:
def merge_lecture_events_with_their_metadeta(Xy_l, lectures):
  Xy_ll = Xy_l.merge(lectures,
             how='left',
             left_on='content_id',
             right_on='lecture_id'
             )
  return Xy_ll

# Xy_ll = merge_lecture_events_with_their_metadeta(Xy_l, lectures)
# Xy_ll.shape

# Feature encoding and transformation

In [11]:
def seperate_cat_num_bool_and_y(Xy_qq):
  # Drop NA
  Xy_qq.dropna(inplace=True)
  Xy_qq.dropna(inplace=True, axis=1)

  # Relabel categories
  cat_cols_list = ['content_id',
                   'task_container_id',
                   'user_answer',
                   'bundle_id',
                   'correct_answer',
                   'part'
                   ]
  # Xy_qq['content_id'] = Xy_qq['content_id'].astype('category')
  # Xy_qq['task_container_id'] = Xy_qq['task_container_id'].astype('category')
  # Xy_qq['user_answer'] = Xy_qq['user_answer'].astype('category')
  # Xy_qq['bundle_id'] = Xy_qq['bundle_id'].astype('category')
  # Xy_qq['correct_answer'] = Xy_qq['correct_answer'].astype('category')
  # Xy_qq['part'] = Xy_qq['part'].astype('category')

  # Seperate y
  y = Xy_qq.pop('answered_correctly')

  # Seperate the categorical and numerical variables
  # X_cat = Xy_qq.select_dtypes(['category'])
  X_cat = Xy_qq.loc[:, cat_cols_list]
  X_num = Xy_qq.select_dtypes(['int', 'float'])
  X_bool = Xy_qq.select_dtypes(bool)

  #Save the column labels for later
  X_cat_columns = X_cat.columns
  X_num_columns = X_num.columns

  X = pd.DataFrame()
  return X_cat, X_num, X_bool, y

# X_cat, X_num, X_bool, y = seperate_cat_num_bool_and_y(Xy_qq)
# X_cat.shape[1], X_num.shape[1], X_bool.shape[1], y.shape

#One Hot encode the categorical features

In [12]:
def encode_cat_feautures(X_cat, fitted_onehot=None):
  # Apply OneHotEncoder() on Dataframe
  from sklearn.preprocessing import OneHotEncoder
  # 1. INSTANTIATE
  if fitted_onehot==None: # Train data
    enc = OneHotEncoder(dtype=bool, handle_unknown='ignore')
    # 2. FIT
    enc.fit(X_cat)
  else: # Test data
    enc = fitted_onehot
  # 3. Transform
  X_cat_encoded = enc.transform(X_cat)
  # # To dataframe
  X_cat_encoded = pd.DataFrame.sparse.from_spmatrix(X_cat_encoded, 
                                                    index=X_cat.index, 
                                                    columns=enc.get_feature_names())  
  return X_cat_encoded, enc

# X_cat_encoded, onehot = encode_cat_feautures(X_cat, fitted_onehot=None)

# Scale the numeric features

In [13]:
def encode_num_feautures(X_num, fitted_scaler=None):
  from sklearn.preprocessing import StandardScaler

  start = timer()
  print('scaling')

  # 1. INSTANTIATE
  if fitted_scaler==None: # Train data
    scaler = StandardScaler()
    assert isinstance(scaler, StandardScaler)
    # 2. FIT and transform
    X_num_scaled = scaler.fit_transform(X_num)
  else: # Test Data
    assert isinstance(fitted_scaler, StandardScaler)
    scaler = fitted_scaler
    X_num_scaled = scaler.transform(X_num)
  
  end = timer()
  print(f'{round(end - start)} seconds elapsed.')

  # print(f'X_num_scaled mean: {X_num_scaled.mean()}')
  # print(f'X_num_scaled std: {X_num_scaled.std()}')

  # To dataframe
  X_num_scaled = pd.DataFrame(X_num_scaled, 
                              index=X_num.index, 
                              columns=X_num.columns)
  
  return X_num_scaled, scaler


# X_num_scaled, scaler = encode_num_feautures(X_num, fitted_scaler=None)

#  Recombine the features into one dataframe

In [14]:
def join_encoded_cat_and_scaled_num(X_cat_encoded_df, X_num_scaled_df, X_bool_df):
  # Join categorical and numerical features
  X = X_num_scaled_df.join(X_cat_encoded_df.join(X_bool_df))
  return X

# X = join_encoded_cat_and_scaled_num(X_cat_encoded, X_num_scaled, X_bool)
# X.shape

# Preprocess the test data set using the same transformers as the training data set.

In [15]:
def preprocess_data(X_path, y_path, train_cat_transformer=None, train_num_transformer=None, nrows=1000, is_test=False):
  if is_test:
    assert train_cat_transformer
    assert train_num_transformer
  lectures, questions = read_metadata_from_csv(lectures_path, questions_path)
  lectures, questions = binarize_tags(lectures, questions)
  Xy = read_events_and_target(X_path, y_path, nrows=nrows, is_test=is_test)
  Xy_q, Xy_l = parse_questions_and_lecture_events(Xy)
  Xy_qq = merge_question_events_with_their_metadeta(Xy_q, questions)
  Xy_ll = merge_lecture_events_with_their_metadeta(Xy_l, lectures)
  X_cat, X_num, X_bool, y = seperate_cat_num_bool_and_y(Xy_qq)
  X_cat_encoded, onehot = encode_cat_feautures(X_cat, fitted_onehot=train_cat_transformer)
  X_num_scaled, scaler = encode_num_feautures(X_num, fitted_scaler=train_num_transformer)
  X = join_encoded_cat_and_scaled_num(X_cat_encoded, X_num_scaled, X_bool)
  if is_test:
    return X, y
  else:
    return X, y, onehot, scaler




In [16]:
X_train, y_train, cat_enc, num_enc = preprocess_data(X_train_path, 
                                                     y_train_path, 
                                                     nrows=10**5, 
                                                     is_test=False)
X_test, y_test = preprocess_data(X_test_path, 
                                 y_test_path, 
                                 cat_enc, 
                                 num_enc, 
                                 nrows=10**4, 
                                 is_test=True)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

binarizing: complete...lectures.shape=(418, 191)
scaling
0 seconds elapsed.
binarizing: complete...lectures.shape=(418, 191)
scaling
0 seconds elapsed.


((97656, 27041), (97656,), (9779, 27041), (9779,))

In [17]:
X_train

Unnamed: 0,timestamp,prior_question_elapsed_time,x0_0,x0_1,x0_2,x0_3,x0_4,x0_5,x0_6,x0_7,x0_8,x0_9,x0_10,x0_11,x0_12,x0_13,x0_14,x0_15,x0_16,x0_17,x0_18,x0_19,x0_20,x0_21,x0_22,x0_23,x0_24,x0_25,x0_26,x0_27,x0_28,x0_29,x0_30,x0_31,x0_32,x0_33,x0_34,x0_35,x0_36,x0_37,...,63,64,65,66,67,68,69,7,70,71,72,73,74,75,76,77,78,79,8,80,81,82,83,84,85,86,87,88,89,9,90,91,92,93,94,95,96,97,98,99
0,-0.531087,-0.221087,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,-0.665738,-0.678195,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2,-0.339468,-0.322667,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,-0.496783,1.048657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,0.063815,-0.068718,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98047,0.336911,0.528062,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
98048,0.395032,0.337600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
98049,-0.644717,-0.525826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
98050,-0.664609,-0.322667,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


# Save data as binary files

In [None]:
X_train.to_pickle(f'{pwd}/data/intermediate/feature_engineered/X_train.pkl.gzip' )
y_train.to_pickle(f'{pwd}/data/intermediate/feature_engineered/y_train.pkl.gzip')

X_test.to_pickle(f'{pwd}/data/intermediate/feature_engineered/X_test.pkl.gzip')
y_test.to_pickle(f'{pwd}/data/intermediate/feature_engineered/y_test.pkl.gzip')