# Feature Engineering

In [1]:
# General-purpose helpers (standard library, IPython display utilities, tqdm).
import os
import pprint
from IPython.display import display, clear_output
from tqdm import tqdm
from collections import defaultdict
from datetime import datetime
import time

# Numerical and plotting stack.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# NOTE(review): recent matplotlib releases renamed the bundled seaborn style
# sheets (e.g. to 'seaborn-v0_8-whitegrid'); confirm this name still resolves
# on the matplotlib version in use.
plt.style.use('seaborn-whitegrid')
import seaborn as sns
sns.set_style('whitegrid')

# Put the directory two levels above the current working directory on the
# import path so that the `configs` package can be imported below
# (presumably the project root -- confirm). `os.sys` is the `sys` module
# reached through `os`.
os.sys.path.append(os.path.abspath(r'../../'))
from configs.config import cfg 

In [2]:
def del_no_answer_data(df: pd.DataFrame, reset_idx: bool = True) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``answerCode`` is not -1.

    Args:
        df: Frame that has an ``answerCode`` column.
        reset_idx: If True, the result is re-indexed 0..n-1 and the old
            index is discarded; if False, the surviving rows keep their
            original index labels.

    Returns:
        A new, filtered frame. The frame passed in is left untouched.
    """
    answered = df[df.answerCode != -1]
    if not reset_idx:
        return answered
    return answered.reset_index(drop=True)

## 1. Data Correction

* Column
    * Name 
        * `testId` → `testID`
        * `assessmentItemID` → `assmtID`
        * `Timestamp` → `timestamp`
        * `KnowledgeTag` → `knowledgeTag`
    * New columns
        * `datasetType`
            * 0: Train set
            * 1: Validation set
            * 2: Test set
    * Order
        * [`userID`, `testID`, `assmtID`, `timestamp`, `knowledgeTag`, `answerCode`, `datasetType`]

In [3]:
# Raw files to normalise. Each one is read from the `_ORIGINAL` sub-directory
# and the corrected version is written, under the same file name, directly
# into the data directory.
filenames = [r'train_data.csv', r'test_data.csv']

# Compact dtypes applied while parsing the raw CSVs (keys are the raw,
# not-yet-renamed column names).
_df_dtype = {
    'userID': np.int16,
    'answerCode': np.int8,
    'KnowledgeTag': np.int16
}

# `datasetType` code per file name (0: train, 1: validation, 2: test).
_dataset_type = {
    r'train_data.csv': 0,
    r'val_data.csv': 1,
    r'test_data.csv': 2,
}

for filename in filenames:
    # Fail early with a clear message instead of hitting a KeyError on the
    # missing `datasetType` column during the column selection below.
    if filename not in _dataset_type:
        raise ValueError(f'No datasetType code is defined for {filename!r}')

    raw_path = os.path.join(cfg.path.data_dir, r'_ORIGINAL', filename)
    print(filename)

    # Read data.
    df = pd.read_csv(raw_path, dtype=_df_dtype)

    # Correct column names.
    df = df.rename(
        columns={
            'testId': 'testID',
            'assessmentItemID': 'assmtID',
            'Timestamp': 'timestamp',
            'KnowledgeTag': 'knowledgeTag'
        }
    )

    # Add new columns.
    df['datasetType'] = _dataset_type[filename]

    # Sort the column order.
    df = df[['userID', 'testID', 'assmtID', 'timestamp', 'knowledgeTag', 'answerCode', 'datasetType']]

    # Display. `DataFrame.info` prints its report itself and returns None, so
    # it is called directly; wrapping it in `display` only adds a stray "None".
    display(df)
    df.info(show_counts=True)
    print()

    # Write data.
    path = os.path.join(cfg.path.data_dir, filename)
    df.to_csv(path, index=False)

train_data.csv


Unnamed: 0,userID,testID,assmtID,timestamp,knowledgeTag,answerCode,datasetType
0,0,A060000001,A060001001,2020-03-24 00:17:11,7224,1,0
1,0,A060000001,A060001002,2020-03-24 00:17:14,7225,1,0
2,0,A060000001,A060001003,2020-03-24 00:17:22,7225,1,0
3,0,A060000001,A060001004,2020-03-24 00:17:29,7225,1,0
4,0,A060000001,A060001005,2020-03-24 00:17:36,7225,1,0
...,...,...,...,...,...,...,...
2266581,7441,A030000071,A030071005,2020-06-05 06:50:21,438,0,0
2266582,7441,A040000165,A040165001,2020-08-21 01:06:39,8836,1,0
2266583,7441,A040000165,A040165002,2020-08-21 01:06:50,8836,1,0
2266584,7441,A040000165,A040165003,2020-08-21 01:07:36,8836,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2266586 entries, 0 to 2266585
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   userID        2266586 non-null  int16 
 1   testID        2266586 non-null  object
 2   assmtID       2266586 non-null  object
 3   timestamp     2266586 non-null  object
 4   knowledgeTag  2266586 non-null  int16 
 5   answerCode    2266586 non-null  int8  
 6   datasetType   2266586 non-null  int64 
dtypes: int16(2), int64(1), int8(1), object(3)
memory usage: 80.0+ MB


None


test_data.csv


Unnamed: 0,userID,testID,assmtID,timestamp,knowledgeTag,answerCode,datasetType
0,3,A050000023,A050023001,2020-01-09 10:56:31,2626,1,2
1,3,A050000023,A050023002,2020-01-09 10:56:57,2626,1,2
2,3,A050000023,A050023003,2020-01-09 10:58:31,2625,0,2
3,3,A050000023,A050023004,2020-01-09 10:58:36,2625,0,2
4,3,A050000023,A050023006,2020-01-09 10:58:43,2623,0,2
...,...,...,...,...,...,...,...
260109,7439,A040000130,A040130001,2020-10-14 23:07:23,8832,0,2
260110,7439,A040000130,A040130002,2020-10-14 23:07:41,8832,1,2
260111,7439,A040000130,A040130003,2020-10-14 23:08:02,8244,1,2
260112,7439,A040000130,A040130004,2020-10-14 23:09:31,8244,1,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260114 entries, 0 to 260113
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   userID        260114 non-null  int16 
 1   testID        260114 non-null  object
 2   assmtID       260114 non-null  object
 3   timestamp     260114 non-null  object
 4   knowledgeTag  260114 non-null  int16 
 5   answerCode    260114 non-null  int8  
 6   datasetType   260114 non-null  int64 
dtypes: int16(2), int64(1), int8(1), object(3)
memory usage: 9.2+ MB


None




## 2. Test Set을 Validation Set으로 활용

데이터 하나하나가 정말 소중하므로, test set도 버리지 않고 함께 활용한다.

### 2-1. Concatenate Whole Dataset

In [4]:
filenames = [r'train_data.csv', r'test_data.csv']

# Load the corrected files from the data directory and stack them into one
# frame with a fresh 0..n-1 index.
# NOTE(review): the recorded output of this cell shows `knowledgeTag` as int64
# while `userID`/`answerCode` are narrow ints, so `cfg.data.df_dtype` may not
# cover the renamed `knowledgeTag` column -- confirm.
frames = [
    pd.read_csv(os.path.join(cfg.path.data_dir, filename), dtype=cfg.data.df_dtype)
    for filename in filenames
]
df = pd.concat(frames, ignore_index=True)

display(df)
# `DataFrame.info` prints its report and returns None; calling it directly
# avoids the stray "None" that `display(df.info(...))` renders.
df.info(show_counts=True)
print()
print(f'The number of user IDs: {df.userID.nunique()}')

Unnamed: 0,userID,testID,assmtID,timestamp,knowledgeTag,answerCode,datasetType
0,0,A060000001,A060001001,2020-03-24 00:17:11,7224,1,0
1,0,A060000001,A060001002,2020-03-24 00:17:14,7225,1,0
2,0,A060000001,A060001003,2020-03-24 00:17:22,7225,1,0
3,0,A060000001,A060001004,2020-03-24 00:17:29,7225,1,0
4,0,A060000001,A060001005,2020-03-24 00:17:36,7225,1,0
...,...,...,...,...,...,...,...
2526695,7439,A040000130,A040130001,2020-10-14 23:07:23,8832,0,2
2526696,7439,A040000130,A040130002,2020-10-14 23:07:41,8832,1,2
2526697,7439,A040000130,A040130003,2020-10-14 23:08:02,8244,1,2
2526698,7439,A040000130,A040130004,2020-10-14 23:09:31,8244,1,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   userID        2526700 non-null  int16 
 1   testID        2526700 non-null  object
 2   assmtID       2526700 non-null  object
 3   timestamp     2526700 non-null  object
 4   knowledgeTag  2526700 non-null  int64 
 5   answerCode    2526700 non-null  int8  
 6   datasetType   2526700 non-null  int8  
dtypes: int16(1), int64(1), int8(2), object(3)
memory usage: 86.7+ MB


None


The number of user IDs: 7442


In [5]:
# Persist the concatenated frame (rows with answerCode == -1 still included)
# as a single CSV in the data directory.
merged_csv_path = os.path.join(cfg.path.data_dir, r'data.csv')
df.to_csv(merged_csv_path, index=False)

### 2-2. `answerCode`가 -1인 데이터 제거

학습할 때는 `answerCode`가 -1인 데이터를 제거한 후 사용.

In [6]:
# Keep only rows whose answerCode is not -1; the helper also resets the index.
df = del_no_answer_data(df)

display(df)
# `DataFrame.info` prints its report and returns None; calling it directly
# avoids the stray "None" that `display(df.info(...))` renders.
df.info(show_counts=True)

Unnamed: 0,userID,testID,assmtID,timestamp,knowledgeTag,answerCode,datasetType
0,0,A060000001,A060001001,2020-03-24 00:17:11,7224,1,0
1,0,A060000001,A060001002,2020-03-24 00:17:14,7225,1,0
2,0,A060000001,A060001003,2020-03-24 00:17:22,7225,1,0
3,0,A060000001,A060001004,2020-03-24 00:17:29,7225,1,0
4,0,A060000001,A060001005,2020-03-24 00:17:36,7225,1,0
...,...,...,...,...,...,...,...
2525951,7439,A040000197,A040197006,2020-08-21 07:39:45,2132,1,2
2525952,7439,A040000130,A040130001,2020-10-14 23:07:23,8832,0,2
2525953,7439,A040000130,A040130002,2020-10-14 23:07:41,8832,1,2
2525954,7439,A040000130,A040130003,2020-10-14 23:08:02,8244,1,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2525956 entries, 0 to 2525955
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   userID        2525956 non-null  int16 
 1   testID        2525956 non-null  object
 2   assmtID       2525956 non-null  object
 3   timestamp     2525956 non-null  object
 4   knowledgeTag  2525956 non-null  int64 
 5   answerCode    2525956 non-null  int8  
 6   datasetType   2525956 non-null  int8  
dtypes: int16(1), int64(1), int8(2), object(3)
memory usage: 86.7+ MB


None

In [7]:
# df.to_csv(os.path.join(cfg.path.data_dir, r'data_wo_no-answer.csv'), index=False)