<a href="https://colab.research.google.com/github/allen44/riiid-test-answer-prediction/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
from timeit import default_timer as timer

# Work from the project folder on Google Drive.
# NOTE(review): no drive-mount call is visible in this notebook, so Drive is
# presumably mounted before this cell runs — confirm.
%cd /content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/
%pwd


/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction


'/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction'

# Make Train Test Splits upon import from csv

Using the insights gained from the EDA, we can import the data from CSV and get right on to feature engineering.

In [None]:
# Raw inputs can be read either as CSV or as pickles; CSV is the active choice
# (use '.pkl.gzip' instead to point at pickled copies).
suffix = '.csv'

# Project root on the mounted Drive.
pwd = '/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction'

# Raw tables, all sharing the suffix chosen above.
raw_dir = f'{pwd}/data/raw'
lectures_path = f'{raw_dir}/lectures{suffix}'
questions_path = f'{raw_dir}/questions{suffix}'
train_path = f'{raw_dir}/train{suffix}'

# The train/test splits are always exchanged as CSV, whatever `suffix` is.
splits_dir = f'{pwd}/data/intermediate/train_test_splits'
X_train_path = f'{splits_dir}/X_train.csv'
X_test_path = f'{splits_dir}/X_test.csv'
y_train_path = f'{splits_dir}/y_train.csv'
y_test_path = f'{splits_dir}/y_test.csv'

lectures_path, questions_path, train_path

('/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/raw/lectures.csv',
 '/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/raw/questions.csv',
 '/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/raw/train.csv')

In [None]:
# Row budget for this run. Kept as an int because `pd.read_csv(nrows=...)`
# is documented to take an integer number of rows.
nrows = int(1.25 * 1e6)

# Import the first `nrows` events and peel the target column off the features.
X = pd.read_csv(train_path, nrows=nrows)
y = X.pop('answered_correctly')

# Make the train/test splits (80/20, stratified on the target so both splits
# keep the same class balance). Only the split itself is timed.
start = timer()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y)
end = timer()
print(f'{round(end - start)} seconds elapsed.')
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Make sure the shapes are right: features and targets line up row-for-row,
# both splits expose the same columns, and no row was lost or duplicated.
assert X_train.shape[0]==y_train.shape[0]
assert X_test.shape[0]==y_test.shape[0]
assert X_train.shape[1]==X_test.shape[1]
assert X_train.shape[0] + X_test.shape[0] == X.shape[0]



1 seconds elapsed.
(1000000, 9) (250000, 9) (1000000,) (250000,)


In [None]:
# Echo the row budget that was set in the import cell.
nrows

1250000.0

In [None]:
# Write every split to its CSV destination, timing the whole export.
start = timer()

for split, destination in (
    (X_train, X_train_path),
    (X_test, X_test_path),
    (y_train, y_train_path),
    (y_test, y_test_path),
):
  split.to_csv(destination)

# Release the in-memory splits (and the loop's last references to them);
# the preprocessing further down reads them back from these files.
del X_train, X_test, y_train, y_test, split, destination

end = timer()
print(f'{round(end - start)} seconds elapsed.')

7 seconds elapsed.


# Merge events metadata with events data

In [None]:
def read_metadata_from_csv(lectures_path, questions_path):
  """Load the lectures and questions metadata tables from CSV.

  Only the columns named in the schemas below are read, each with the dtype
  given there, and the id column of each table becomes its index.

  Args:
    lectures_path: path of the lectures CSV.
    questions_path: path of the questions CSV.

  Returns:
    (lectures, questions): DataFrames indexed by 'lecture_id' and
    'question_id' respectively.
  """
  lecture_schema = {
      'lecture_id': 'int16',
      'tag': 'string',
      'part': 'category',
      'type_of': 'category',
  }
  question_schema = {
      'question_id': 'int16',
      'bundle_id': 'int16',
      'correct_answer': 'int8',
      'part': 'int8',
      'tags': 'string',
  }

  def _load(path, schema, id_col):
    # One read per table: restrict to the schema's columns and apply its dtypes.
    return pd.read_csv(path,
                       usecols=list(schema),
                       dtype=schema,
                       index_col=id_col)

  lectures = _load(lectures_path, lecture_schema, 'lecture_id')
  questions = _load(questions_path, question_schema, 'question_id')
  return lectures, questions


# Load both metadata tables for the exploratory cells that follow.
lectures, questions = read_metadata_from_csv(lectures_path, questions_path)

lectures.shape, questions.shape

((418, 3), (13523, 4))

In [None]:
def binarize_tags(lectures, questions):
  """Replace the tag columns by one boolean indicator column per tag.

  The tag vocabulary is learned from the questions' space-separated 'tags'
  column and reused for the lectures' 'tag' column, so both results expose
  the same indicator columns. Indicator columns are named 'tag_<tag>'.

  Args:
    lectures: DataFrame with a string column 'tag'.
    questions: DataFrame with a string column 'tags' holding
      space-separated tags.

  Returns:
    (lectures, questions): new DataFrames without 'tag'/'tags' and with the
    indicator columns appended. Questions without tags are dropped. The
    input frames are left untouched.
  """
  from sklearn.preprocessing import MultiLabelBinarizer
  import re

  # Questions: rows without tags cannot be binarized, so drop them first.
  # dropna() returns a new frame, which keeps the caller's frame intact.
  questions = questions.dropna(subset=['tags'])
  question_tag_lists = questions['tags'].str.split()
  mlb = MultiLabelBinarizer()
  q_labels = pd.DataFrame(mlb.fit_transform(question_tag_lists),
                          columns=mlb.classes_,
                          index=questions.index,
                          dtype=bool)
  questions = pd.concat([questions.drop(columns=['tags']), q_labels], axis=1)

  # Lectures: the binarizer expects an iterable of labels per row. Splitting
  # turns each tag string into a list; handing it the bare string would make
  # it iterate over the characters ('159' -> '1', '5', '9').
  lecture_tag_lists = lectures['tag'].dropna().str.split()
  l_labels = pd.DataFrame(mlb.transform(lecture_tag_lists),
                          columns=mlb.classes_,
                          index=lecture_tag_lists.index,
                          dtype=bool)
  lectures = pd.concat([lectures.drop(columns=['tag']), l_labels], axis=1)

  # Prefix every column whose name does not start with a letter (i.e. the
  # indicator columns) in a single rename instead of copying and dropping
  # them one at a time. The indicator columns are already last, so the
  # column order is unchanged.
  def _with_tag_prefix(col_name):
    if re.search('^[a-zA-Z]', col_name):
      return col_name
    return 'tag_' + col_name

  questions = questions.rename(columns=_with_tag_prefix)
  lectures = lectures.rename(columns=_with_tag_prefix)

  return lectures, questions

# NOTE(review): this rebinds `lectures`/`questions` to frames that no longer
# have the 'tag'/'tags' columns, so the cell cannot be run twice without
# re-reading the metadata first.
lectures, questions = binarize_tags(lectures, questions)

lectures.shape, questions.shape

((418, 190), (13522, 191))

In [None]:
def read_events_and_target(X_path, y_path, nrows=1000, is_test=False):
  """Read the first `nrows` events and their targets and join them row-wise.

  Args:
    X_path: CSV file holding the event columns.
    y_path: CSV file holding the 'answered_correctly' column.
    nrows: number of rows to read from each file.
    is_test: accepted for symmetry with the other pipeline steps; it is not
      used in the body.

  Returns:
    DataFrame with 'answered_correctly' first, followed by the event columns.
  """
  # Event columns to load. The dtypes are recorded for reference only: they
  # are not passed to read_csv, so pandas infers the event dtypes itself.
  events_dtypes = {
      'timestamp': 'int64',
      'user_id': 'category',
      'content_id': 'int16',
      'content_type_id': 'int8',
      'task_container_id': 'int16',
      'user_answer': 'int8',
      'prior_question_elapsed_time': 'float16',
      'prior_question_had_explanation': 'int8',
  }
  target_dtypes = {'answered_correctly': 'int8'}

  events = pd.read_csv(X_path,
                       usecols=list(events_dtypes),
                       nrows=nrows)
  target = pd.read_csv(y_path,
                       usecols=list(target_dtypes),
                       dtype=target_dtypes,
                       nrows=nrows)

  # Neither read sets an index column, so both frames carry the default
  # positional index and the join pairs row i of one file with row i of the
  # other.
  return target.join(events)

# Read a subset (the first 100,000 rows) of the exported train split,
# i.e. X_train.csv joined with y_train.csv.
Xy = read_events_and_target(X_train_path, y_train_path, nrows=100000, is_test=False)
  
Xy.shape

(100000, 9)

In [None]:
def parse_questions_and_lecture_events(Xy):
  """Split the event log into question events and lecture events.

  Rows with content_type_id == 0 are questions and keep every column except
  'content_type_id'; rows with content_type_id == 1 are lectures and keep
  only the four columns listed below.

  Returns:
    (Xy_q, Xy_l): the question rows and the lecture rows, each keeping the
    index labels of `Xy`.
  """
  content_type = Xy['content_type_id']
  lecture_columns = ['timestamp', 'user_id', 'content_id', 'task_container_id']

  question_events = Xy[content_type == 0].drop(columns='content_type_id')
  lecture_events = Xy.loc[content_type == 1, lecture_columns]

  return question_events, lecture_events

# Separate the sampled events into question rows and lecture rows.
Xy_q, Xy_l = parse_questions_and_lecture_events(Xy)
Xy_q.shape, Xy_l.shape

((97973, 8), (2027, 4))

In [None]:
def merge_question_events_with_their_metadeta(Xy_q, questions):
  """Attach to every question event the metadata of the question it refers to.

  Events are matched on `content_id` against the 'question_id' of
  `questions`; events without a match keep their row with missing metadata
  (left join).

  Returns:
    The merged DataFrame, carrying the same index as `Xy_q`.
  """
  enriched = pd.merge(Xy_q,
                      questions,
                      how='left',
                      left_on='content_id',
                      right_on='question_id')
  # Put the events' original index back on the merged rows.
  enriched.index = Xy_q.index
  return enriched

# Question events enriched with their question metadata and tag indicators.
Xy_qq = merge_question_events_with_their_metadeta(Xy_q, questions)
Xy_qq.shape

(97973, 199)

In [None]:
def merge_lecture_events_with_their_metadeta(Xy_l, lectures):
  """Attach to every lecture event the metadata of the lecture it refers to.

  Events are matched on `content_id` against the 'lecture_id' of `lectures`;
  events without a match keep their row with missing metadata (left join).

  Returns:
    The merged DataFrame, carrying the same index as `Xy_l`.
  """
  enriched = pd.merge(Xy_l,
                      lectures,
                      how='left',
                      left_on='content_id',
                      right_on='lecture_id')
  # Put the events' original index back on the merged rows.
  enriched.index = Xy_l.index
  return enriched

# Lecture events enriched with their lecture metadata and tag indicators.
Xy_ll = merge_lecture_events_with_their_metadeta(Xy_l, lectures)
Xy_ll.shape

(2027, 194)

Feature Engineering
====


In [None]:
def add_col_agg_data_and_drop_original_col(df, col, drop=True):
  """Append per-`col` aggregate features to `df`.

  For every distinct value of `col` the following statistics are computed
  over `df` itself and broadcast back onto the rows with that value:

    <col>_answered_correctly_mean
    <col>_prior_question_elapsed_time_mean
    <col>_timestamp_mean
    <col>_count   (number of 'answered_correctly' values in the group)

  NOTE(review): the statistics include each row's own 'answered_correctly',
  and they are computed from whichever frame is passed in — confirm that
  encoding the target this way is intended.

  Args:
    df: DataFrame containing `col`, 'answered_correctly',
      'prior_question_elapsed_time' and 'timestamp'.
    col: name of the grouping column.
    drop: if True, `col` itself is removed from the result.

  Returns:
    A new DataFrame with the four feature columns appended in the order
    listed above. `df` is not modified.
  """
  per_group = df.groupby(col).agg(
      {'answered_correctly': ['mean', 'count'],
       'prior_question_elapsed_time': ['mean'],
       'timestamp': ['mean']
      })

  # Flatten the (source column, statistic) labels straight into their final
  # names. Naming before the merge avoids the copy-and-drop renames on the
  # wide merged frame and cannot pick up merge suffixes.
  count_col = f'{col}_count'
  per_group.columns = [
      count_col if (source, stat) == ('answered_correctly', 'count')
      else f'{col}_{source}_{stat}'
      for source, stat in per_group.columns
  ]
  # Keep the established column order, in which the count comes last.
  stat_cols = [name for name in per_group.columns if name != count_col]
  per_group = per_group[stat_cols + [count_col]]

  df = df.merge(per_group,
                how='left',
                on=col)

  if drop:
    df = df.drop(columns=col)

  return df

# Xy_2 = add_col_agg_data_and_drop_original_col(Xy_qq, 'content_id')
# Xy_2

In [None]:
def feature_eng(df):
  """Swap each id column for its aggregate features.

  'content_id', 'bundle_id' and 'task_container_id' are processed in that
  order; each one is replaced by the aggregate columns that
  add_col_agg_data_and_drop_original_col derives from it.
  """
  for id_col in ('content_id', 'bundle_id', 'task_container_id'):
    df = add_col_agg_data_and_drop_original_col(df, id_col)
  return df

# Xy_2 = feature_eng(Xy_qq)
# # Show the feature-engineered columns
# Xy_2.iloc[:, Xy_qq.shape[1]-3:]

# Feature encoding and transformation

In [None]:
def drop_na_and_unneeded_cols(df):
  """Drop rows with missing values, then the 'user_id' column.

  The frame's shape is printed before and after each step.

  Args:
    df: DataFrame containing a 'user_id' column.

  Returns:
    A new DataFrame without incomplete rows and without 'user_id'.
  """
  print(f'shape before dropping na: {df.shape}')
  # Drop every row that has a missing value. No missing value is left
  # afterwards, so a further column-wise dropna would never remove anything
  # and is not performed.
  df = df.dropna()
  print(f'shape after dropping na: {df.shape}')

  # Drop unneeded columns
  df = df.drop(columns='user_id')
  print(f'shape after dropping user_id: {df.shape}')
  return df

# temp = drop_na_and_unneeded_cols(Xy_qq_eng)
# temp

In [None]:
def seperate_cat_num_bool_and_y(Xy_qq_eng):
  """Split the engineered frame into its feature groups and the target.

  Args:
    Xy_qq_eng: DataFrame containing 'answered_correctly', the three
      categorical columns listed below, and any number of boolean and other
      columns.

  Returns:
    (X_cat, X_num, X_bool, y) where
      X_cat  holds 'user_answer', 'correct_answer' and 'part',
      X_bool holds every bool-dtype column,
      X_num  holds whatever is left,
      y      is the 'answered_correctly' column.
  """
  # Columns treated as categorical. The id columns ('content_id',
  # 'task_container_id', 'bundle_id') are deliberately not listed here.
  cat_cols_list = ['user_answer', 'correct_answer', 'part']

  # Target first, so the remaining columns are all features.
  y = Xy_qq_eng['answered_correctly']
  features = Xy_qq_eng.drop(columns='answered_correctly')

  # Boolean features are taken out by dtype, categorical ones by name;
  # everything that remains counts as numeric.
  X_bool = features.select_dtypes(bool)
  non_bool = features.drop(columns=X_bool.columns)
  X_cat = non_bool.loc[:, cat_cols_list]
  X_num = non_bool.drop(columns=X_cat.columns)

  # Every column must land in exactly one group (+1 for the target), and
  # all groups must describe the same rows.
  assert X_cat.shape[1] + X_num.shape[1] + X_bool.shape[1] + 1 == Xy_qq_eng.shape[1]
  assert X_cat.shape[0] == X_num.shape[0]
  assert X_num.shape[0] == X_bool.shape[0]
  assert X_bool.shape[0] == y.shape[0]
  return X_cat, X_num, X_bool, y

# X_cat, X_num, X_bool, y = seperate_cat_num_bool_and_y(temp)
# X_cat.shape[1], X_num.shape[1], X_bool.shape[1], y.shape

# One-hot encode the categorical features

In [None]:
def encode_cat_feautures(X_cat, fitted_onehot=None):
  """One-hot encode the categorical columns into a dense boolean DataFrame.

  Args:
    X_cat: DataFrame of categorical features.
    fitted_onehot: an already fitted OneHotEncoder to reuse (test data), or
      None to fit a new encoder on `X_cat` (train data).

  Returns:
    (X_cat_encoded, enc): the encoded features, indexed like `X_cat` with
    columns named 'x<column position>_<category>', and the encoder used.
  """
  # Apply OneHotEncoder() on Dataframe
  from sklearn.preprocessing import OneHotEncoder

  if fitted_onehot is None:  # Train data: instantiate and fit
    # Categories unseen during fit are encoded as all-False instead of raising.
    enc = OneHotEncoder(dtype=bool, handle_unknown='ignore')
    enc.fit(X_cat)
  else:  # Test data: reuse the encoder fitted on the train data
    enc = fitted_onehot

  encoded = enc.transform(X_cat)

  # Build the output labels from the fitted categories rather than from
  # `get_feature_names()`, which is no longer available in current
  # scikit-learn releases. The labels match what that method produced
  # ('x0_<category>', 'x1_<category>', ...).
  feature_names = [
      f'x{position}_{str(category)}'
      for position, categories in enumerate(enc.categories_)
      for category in categories
  ]

  # To dataframe (densified so it can be joined with the other features)
  X_cat_encoded = pd.DataFrame.sparse.from_spmatrix(encoded,
                                                    index=X_cat.index,
                                                    columns=feature_names)
  X_cat_encoded = X_cat_encoded.sparse.to_dense()
  return X_cat_encoded, enc

# X_cat_encoded, onehot = encode_cat_feautures(X_cat, fitted_onehot=None)

# Scale the numeric features

In [None]:
def encode_num_feautures(X_num, fitted_scaler=None):
  """Standardize the numeric columns with a StandardScaler.

  Args:
    X_num: DataFrame of numeric features.
    fitted_scaler: an already fitted StandardScaler to reuse (test data), or
      None to fit a new scaler on `X_num` (train data).

  Returns:
    (X_num_scaled, scaler): the scaled features, with the index and columns
    of `X_num`, and the scaler used.
  """
  from sklearn.preprocessing import StandardScaler

  start = timer()
  print('scaling')

  if fitted_scaler is None:  # Train data: fit and transform in one pass
    scaler = StandardScaler()
    X_num_scaled = scaler.fit_transform(X_num)
  else:  # Test data: reuse the statistics learned on the train data
    assert isinstance(fitted_scaler, StandardScaler)
    scaler = fitted_scaler
    X_num_scaled = scaler.transform(X_num)

  end = timer()
  print(f'{round(end - start)} seconds elapsed.')

  # To dataframe: the scaler returns a bare array, so restore the labels.
  X_num_scaled = pd.DataFrame(X_num_scaled,
                              index=X_num.index,
                              columns=X_num.columns)

  return X_num_scaled, scaler


# X_num_scaled, scaler = encode_num_feautures(X_num, fitted_scaler=None)

#  Recombine the features into one dataframe

In [None]:
def join_encoded_cat_and_scaled_num(X_cat_encoded_df, X_num_scaled_df, X_bool_df, debug=False):
  """Recombine the three feature groups into a single DataFrame.

  The frames are joined on their index, with the columns ordered numeric,
  then encoded categorical, then boolean.

  Args:
    X_cat_encoded_df: one-hot encoded categorical features.
    X_num_scaled_df: scaled numeric features.
    X_bool_df: boolean features.
    debug: if truthy, print the joined frame and its shape.

  Returns:
    The joined DataFrame, indexed like `X_num_scaled_df`.
  """
  # Join categorical and numerical features
  X = X_num_scaled_df.join(X_cat_encoded_df.join(X_bool_df))

  if debug:
    print("\njoin_encoded_cat_and_scaled_num():")
    print(f"return X=\n{X}")
    print(f"X.shape= {X.shape}\n")

  return X

# X = join_encoded_cat_and_scaled_num(X_cat_encoded, X_num_scaled, X_bool)

# Preprocess the test data set using the same transformers as the training data set.

In [None]:
def preprocess_data(X_path, y_path, train_cat_transformer=None, train_num_transformer=None, nrows=1000, is_test=False):
  """Run the whole read, merge, engineer and encode pipeline for one split.

  Args:
    X_path: CSV file with the split's events.
    y_path: CSV file with the split's targets.
    train_cat_transformer: encoder fitted on the train split; required when
      `is_test` is True.
    train_num_transformer: scaler fitted on the train split; required when
      `is_test` is True.
    nrows: number of rows to read from each file.
    is_test: whether a test split is being processed.

  Returns:
    (X, y) for a test split; (X, y, onehot, scaler) for a train split, so
    the fitted transformers can be handed to the test-split call.

  The metadata files are located through the notebook-level `lectures_path`
  and `questions_path`.
  """
  if is_test:
    # A test split must reuse the transformers fitted on the train split.
    assert train_cat_transformer
    assert train_num_transformer

  # Metadata tables, with their tags expanded into indicator columns.
  lectures, questions = binarize_tags(
      *read_metadata_from_csv(lectures_path, questions_path))

  # Events joined with their targets, separated by content type.
  events = read_events_and_target(X_path, y_path, nrows=nrows, is_test=is_test)
  question_events, lecture_events = parse_questions_and_lecture_events(events)

  question_rows = merge_question_events_with_their_metadeta(question_events, questions)
  # The lecture events are merged as well, but nothing below uses the result.
  merge_lecture_events_with_their_metadeta(lecture_events, lectures)

  # Only the question rows go on to feature engineering and cleaning.
  cleaned = drop_na_and_unneeded_cols(feature_eng(question_rows))

  X_cat, X_num, X_bool, y = seperate_cat_num_bool_and_y(cleaned)

  X_cat_encoded, onehot = encode_cat_feautures(X_cat, fitted_onehot=train_cat_transformer)
  X_num_scaled, scaler = encode_num_feautures(X_num, fitted_scaler=train_num_transformer)
  X = join_encoded_cat_and_scaled_num(X_cat_encoded, X_num_scaled, X_bool)

  if not is_test:
    return X, y, onehot, scaler
  return X, y




In [None]:
%%time
# Preprocess the train split first so that its fitted encoder and scaler
# (cat_enc, num_enc) can be reused unchanged on the test split.

X_train, y_train, cat_enc, num_enc = preprocess_data(X_train_path, 
                                                     y_train_path, 
                                                     nrows=nrows, 
                                                     is_test=False)
X_test, y_test = preprocess_data(X_test_path, 
                                 y_test_path, 
                                 cat_enc, 
                                 num_enc, 
                                 nrows=nrows, 
                                 is_test=True)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

shape before dropping na: (980138, 208)
shape after dropping na: (976295, 208)
shape after dropping user_id: (976295, 207)
scaling
2 seconds elapsed.
shape before dropping na: (245035, 208)
shape after dropping na: (244123, 208)
shape after dropping user_id: (244123, 207)
scaling
0 seconds elapsed.
CPU times: user 11.9 s, sys: 733 ms, total: 12.6 s
Wall time: 12.7 s


# Save data as binary files

In [None]:
# From here on the outputs are pickled instead of written as CSV.
suffix = '.pkl.gzip'

# One pickle per split, named <split>_<nrows><suffix>.
# NOTE(review): the handle is opened as a plain binary file and no
# `compression=` is passed to to_pickle, so the '.gzip' in the name may not
# reflect the on-disk format — confirm before relying on it.
for split_name, split_data in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
  with open(f'{pwd}/data/intermediate/feature_engineered/{split_name}_{str(round(nrows))}{suffix}', 'wb') as f:
    split_data.to_pickle(f)
    print(f'saved to {f}')


saved to <_io.BufferedWriter name='/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/intermediate/feature_engineered/X_train_1250000.pkl.gzip'>
saved to <_io.BufferedWriter name='/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/intermediate/feature_engineered/y_train_1250000.pkl.gzip'>
saved to <_io.BufferedWriter name='/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/intermediate/feature_engineered/X_test_1250000.pkl.gzip'>
saved to <_io.BufferedWriter name='/content/drive/MyDrive/Colab Notebooks/riiid-test-answer-prediction/data/intermediate/feature_engineered/y_test_1250000.pkl.gzip'>


# Future work

I could re-implement the transformation pipeline using sklearn Pipelines.