# Feature Engineering

In [1]:
import os
import pprint
from IPython.display import display, clear_output
from tqdm import tqdm
from collections import defaultdict
from datetime import datetime
import time

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.style.use('seaborn-whitegrid')
import seaborn as sns
sns.set_style('whitegrid')

os.sys.path.append(os.path.abspath(r'../../'))
from configs.config import cfg 

In [2]:
def del_no_answer_data(df: pd.DataFrame, reset_idx: bool = True) -> pd.DataFrame:
    """Return only the rows whose ``answerCode`` is not -1.

    Args:
        df: Frame with an ``answerCode`` column.
        reset_idx: When True, renumber the surviving rows 0..n-1.

    Returns:
        A new frame with the ``answerCode == -1`` rows removed; ``df`` itself is not modified.
    """
    answered = df.loc[df.answerCode != -1]
    return answered.reset_index(drop=True) if reset_idx else answered

## 1. Data Correction

* Column
    * Name 
        * `testId` → `testID`
        * `assessmentItemID` → `assmtID`
        * `Timestamp` → `timestamp`
        * `KnowledgeTag` → `knowledgeTag`
    * New columns
        * `datasetType`
            * 0: Train set
            * 1: Validation set
            * 2: Test set
    * Order
        * [`userID`, `testID`, `assmtID`, `timestamp`, `knowledgeTag`, `answerCode`]

In [3]:
filenames = [r'train_data.csv', r'test_data.csv']
_df_dtype = {
    'userID': np.int16,
    'answerCode': np.int8,
    'KnowledgeTag': np.int16
}
# datasetType code written to every row of a file (0: train set, 1: validation set, 2: test set).
_dataset_types = {
    r'train_data.csv': 0,
    r'val_data.csv': 1,
    r'test_data.csv': 2
}

for filename in filenames:
    # The source files are read from the `_ORIGINAL` sub-directory of the data directory.
    path = os.path.join(cfg.path.data_dir, r'_ORIGINAL', filename)
    print(filename)

    # Read data.
    df = pd.read_csv(path, dtype=_df_dtype)

    # Correct column names.
    df = df.rename(
        columns={
            'testId': 'testID',
            'assessmentItemID': 'assmtID',
            'Timestamp': 'timestamp',
            'KnowledgeTag': 'knowledgeTag'
        }
    )

    # Add new columns.
    # An unknown file name fails right here (KeyError) instead of later on a missing column.
    df['datasetType'] = _dataset_types[filename]

    # Sort the column order.
    df = df[['userID', 'testID', 'assmtID', 'timestamp', 'knowledgeTag', 'answerCode', 'datasetType']]

    # Display
    display(df)
    # info() prints its report itself and returns None; wrapping it in display() only adds a stray "None".
    df.info(show_counts=True)
    print()

    # Write data: the corrected file goes to the data directory itself, under the same name.
    path = os.path.join(cfg.path.data_dir, filename)
    df.to_csv(path, index=False)

train_data.csv


Unnamed: 0,userID,testID,assmtID,timestamp,knowledgeTag,answerCode,datasetType
0,0,A060000001,A060001001,2020-03-24 00:17:11,7224,1,0
1,0,A060000001,A060001002,2020-03-24 00:17:14,7225,1,0
2,0,A060000001,A060001003,2020-03-24 00:17:22,7225,1,0
3,0,A060000001,A060001004,2020-03-24 00:17:29,7225,1,0
4,0,A060000001,A060001005,2020-03-24 00:17:36,7225,1,0
...,...,...,...,...,...,...,...
2266581,7441,A030000071,A030071005,2020-06-05 06:50:21,438,0,0
2266582,7441,A040000165,A040165001,2020-08-21 01:06:39,8836,1,0
2266583,7441,A040000165,A040165002,2020-08-21 01:06:50,8836,1,0
2266584,7441,A040000165,A040165003,2020-08-21 01:07:36,8836,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2266586 entries, 0 to 2266585
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   userID        2266586 non-null  int16 
 1   testID        2266586 non-null  object
 2   assmtID       2266586 non-null  object
 3   timestamp     2266586 non-null  object
 4   knowledgeTag  2266586 non-null  int16 
 5   answerCode    2266586 non-null  int8  
 6   datasetType   2266586 non-null  int64 
dtypes: int16(2), int64(1), int8(1), object(3)
memory usage: 80.0+ MB


None


test_data.csv


Unnamed: 0,userID,testID,assmtID,timestamp,knowledgeTag,answerCode,datasetType
0,3,A050000023,A050023001,2020-01-09 10:56:31,2626,1,2
1,3,A050000023,A050023002,2020-01-09 10:56:57,2626,1,2
2,3,A050000023,A050023003,2020-01-09 10:58:31,2625,0,2
3,3,A050000023,A050023004,2020-01-09 10:58:36,2625,0,2
4,3,A050000023,A050023006,2020-01-09 10:58:43,2623,0,2
...,...,...,...,...,...,...,...
260109,7439,A040000130,A040130001,2020-10-14 23:07:23,8832,0,2
260110,7439,A040000130,A040130002,2020-10-14 23:07:41,8832,1,2
260111,7439,A040000130,A040130003,2020-10-14 23:08:02,8244,1,2
260112,7439,A040000130,A040130004,2020-10-14 23:09:31,8244,1,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260114 entries, 0 to 260113
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   userID        260114 non-null  int16 
 1   testID        260114 non-null  object
 2   assmtID       260114 non-null  object
 3   timestamp     260114 non-null  object
 4   knowledgeTag  260114 non-null  int16 
 5   answerCode    260114 non-null  int8  
 6   datasetType   260114 non-null  int64 
dtypes: int16(2), int64(1), int8(1), object(3)
memory usage: 9.2+ MB


None




## 2. Test Set을 Validation Set으로 활용

데이터 하나 하나가 정말 소듕하니까...

### 2-1. Concatenate Whole Dataset

In [4]:
filenames = [r'train_data.csv', r'test_data.csv']

# Load the files from the data directory and stack them into a single frame.
# `df` is bound to the final DataFrame only (it is never temporarily a list).
df = pd.concat(
    [pd.read_csv(os.path.join(cfg.path.data_dir, filename), dtype=cfg.data.df_dtype) for filename in filenames],
    ignore_index=True
)

display(df)
# info() prints its report itself and returns None; wrapping it in display() only adds a stray "None".
df.info(show_counts=True)
print()
print(f'The number of user IDs: {df.userID.nunique()}')

Unnamed: 0,userID,testID,assmtID,timestamp,knowledgeTag,answerCode,datasetType
0,0,A060000001,A060001001,2020-03-24 00:17:11,7224,1,0
1,0,A060000001,A060001002,2020-03-24 00:17:14,7225,1,0
2,0,A060000001,A060001003,2020-03-24 00:17:22,7225,1,0
3,0,A060000001,A060001004,2020-03-24 00:17:29,7225,1,0
4,0,A060000001,A060001005,2020-03-24 00:17:36,7225,1,0
...,...,...,...,...,...,...,...
2526695,7439,A040000130,A040130001,2020-10-14 23:07:23,8832,0,2
2526696,7439,A040000130,A040130002,2020-10-14 23:07:41,8832,1,2
2526697,7439,A040000130,A040130003,2020-10-14 23:08:02,8244,1,2
2526698,7439,A040000130,A040130004,2020-10-14 23:09:31,8244,1,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   userID        2526700 non-null  int16 
 1   testID        2526700 non-null  object
 2   assmtID       2526700 non-null  object
 3   timestamp     2526700 non-null  object
 4   knowledgeTag  2526700 non-null  int64 
 5   answerCode    2526700 non-null  int8  
 6   datasetType   2526700 non-null  int8  
dtypes: int16(1), int64(1), int8(2), object(3)
memory usage: 86.7+ MB


None


The number of user IDs: 7442


In [5]:
# Write the current frame (the concatenated train + test data from the previous cell) to data.csv.
df.to_csv(os.path.join(cfg.path.data_dir, r'data.csv'), index=False)

### 2-2. `answerCode`가 -1인 데이터 제거

학습할 때는 `answerCode`가 -1인 데이터를 제거한 후 사용.

In [6]:
# Drop the rows whose answerCode is -1 and renumber the index.
df = del_no_answer_data(df)

display(df)
# info() prints its report itself and returns None; wrapping it in display() only adds a stray "None".
df.info(show_counts=True)

Unnamed: 0,userID,testID,assmtID,timestamp,knowledgeTag,answerCode,datasetType
0,0,A060000001,A060001001,2020-03-24 00:17:11,7224,1,0
1,0,A060000001,A060001002,2020-03-24 00:17:14,7225,1,0
2,0,A060000001,A060001003,2020-03-24 00:17:22,7225,1,0
3,0,A060000001,A060001004,2020-03-24 00:17:29,7225,1,0
4,0,A060000001,A060001005,2020-03-24 00:17:36,7225,1,0
...,...,...,...,...,...,...,...
2525951,7439,A040000197,A040197006,2020-08-21 07:39:45,2132,1,2
2525952,7439,A040000130,A040130001,2020-10-14 23:07:23,8832,0,2
2525953,7439,A040000130,A040130002,2020-10-14 23:07:41,8832,1,2
2525954,7439,A040000130,A040130003,2020-10-14 23:08:02,8244,1,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2525956 entries, 0 to 2525955
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   userID        2525956 non-null  int16 
 1   testID        2525956 non-null  object
 2   assmtID       2525956 non-null  object
 3   timestamp     2525956 non-null  object
 4   knowledgeTag  2525956 non-null  int64 
 5   answerCode    2525956 non-null  int8  
 6   datasetType   2525956 non-null  int8  
dtypes: int16(1), int64(1), int8(2), object(3)
memory usage: 86.7+ MB


None

In [7]:
# df.to_csv(os.path.join(cfg.path.data_dir, r'data_wo_no-answer.csv'), index=False)

## 3. 시간 관련 정보

### 3-1. `unixTime`: Unix time (단위: second)

In [8]:
def cvt_timestamp2unix(timestamp: str) -> int:
    """Convert a ``'%Y-%m-%d %H:%M:%S'`` timestamp string to Unix time in seconds.

    The string is interpreted as UTC, so the result does not depend on the
    timezone or DST rules of the machine running the notebook. The values recorded
    in this notebook (e.g. ``'2020-03-24 00:17:11'`` -> ``1585009031``) correspond
    to this UTC interpretation.

    Args:
        timestamp: Timestamp text in ``YYYY-MM-DD HH:MM:SS`` form.

    Returns:
        Whole seconds elapsed since 1970-01-01 00:00:00.

    Raises:
        ValueError: If ``timestamp`` does not match the expected format.
    """
    parsed = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
    since_epoch = parsed - datetime(1970, 1, 1)
    # Pure integer arithmetic: avoids both local-time conversion and float rounding.
    return since_epoch.days * 86400 + since_epoch.seconds

In [9]:
df = pd.read_csv(os.path.join(cfg.path.data_dir, r'data.csv'), dtype=cfg.data.df_dtype)
# Place unixTime directly after the timestamp column it is derived from.
df.insert(df.columns.get_loc('timestamp') + 1, 'unixTime', df.timestamp.apply(cvt_timestamp2unix))

display(df)
# info() prints its report itself and returns None; wrapping it in display() only adds a stray "None".
df.info(show_counts=True)

Unnamed: 0,userID,testID,assmtID,timestamp,unixTime,knowledgeTag,answerCode,datasetType
0,0,A060000001,A060001001,2020-03-24 00:17:11,1585009031,7224,1,0
1,0,A060000001,A060001002,2020-03-24 00:17:14,1585009034,7225,1,0
2,0,A060000001,A060001003,2020-03-24 00:17:22,1585009042,7225,1,0
3,0,A060000001,A060001004,2020-03-24 00:17:29,1585009049,7225,1,0
4,0,A060000001,A060001005,2020-03-24 00:17:36,1585009056,7225,1,0
...,...,...,...,...,...,...,...,...
2526695,7439,A040000130,A040130001,2020-10-14 23:07:23,1602716843,8832,0,2
2526696,7439,A040000130,A040130002,2020-10-14 23:07:41,1602716861,8832,1,2
2526697,7439,A040000130,A040130003,2020-10-14 23:08:02,1602716882,8244,1,2
2526698,7439,A040000130,A040130004,2020-10-14 23:09:31,1602716971,8244,1,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 8 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   userID        2526700 non-null  int16 
 1   testID        2526700 non-null  object
 2   assmtID       2526700 non-null  object
 3   timestamp     2526700 non-null  object
 4   unixTime      2526700 non-null  int64 
 5   knowledgeTag  2526700 non-null  int64 
 6   answerCode    2526700 non-null  int8  
 7   datasetType   2526700 non-null  int8  
dtypes: int16(1), int64(2), int8(2), object(3)
memory usage: 106.0+ MB


None

### 3-2. `relTime2Last`: 마지막 문항을 푼 시간으로부터 해당 문항을 푼 상대적인 시각 (단위: second)

마지막으로 푼 문항의 정답 여부를 맞추는 것이 목적이므로, 시간에 대한 정보도 마지막 문항을 푼 시간에 맞춰 상대적으로 주는 것이 좋지 않을까?

※ Sliding window 등의 전처리를 사용할 경우 어차피 다시 계산해야 하므로 미리 계산해놓은 `relTime2Last`는 사용할 수 없다. 학습 직전에 선행되어야 하는 전처리 적용 후 `unixTime` 열의 모든 값에서 마지막 `timestamp`의 `unixTime`을 빼서 사용할 것.

In [10]:
def cvt_unix2rel(group):
    """Add ``relTime2Last``: each row's ``unixTime`` minus the largest ``unixTime`` in ``group``.

    The column is inserted two positions after ``timestamp``. If the insertion
    raises ``ValueError`` (e.g. the column is already present), the existing
    column is overwritten instead. ``group`` is modified in place and returned.
    """
    offset_from_last = group.unixTime - group.unixTime.max()
    position = group.columns.get_loc('timestamp') + 2
    try:
        group.insert(position, 'relTime2Last', offset_from_last)
    except ValueError:
        group.loc[:, ['relTime2Last']] = offset_from_last

    return group

In [11]:
# relTime2Last is relative to each user's own last timestamp, so compute it per user.
grouped = df.groupby('userID')
tqdm.pandas()
df = grouped.progress_apply(cvt_unix2rel)

display(df)
# info() prints its report itself and returns None; wrapping it in display() only adds a stray "None".
df.info(show_counts=True)

100%|██████████| 7442/7442 [00:13<00:00, 566.23it/s] 


Unnamed: 0,userID,testID,assmtID,timestamp,unixTime,relTime2Last,knowledgeTag,answerCode,datasetType
0,0,A060000001,A060001001,2020-03-24 00:17:11,1585009031,-23685788,7224,1,0
1,0,A060000001,A060001002,2020-03-24 00:17:14,1585009034,-23685785,7225,1,0
2,0,A060000001,A060001003,2020-03-24 00:17:22,1585009042,-23685777,7225,1,0
3,0,A060000001,A060001004,2020-03-24 00:17:29,1585009049,-23685770,7225,1,0
4,0,A060000001,A060001005,2020-03-24 00:17:36,1585009056,-23685763,7225,1,0
...,...,...,...,...,...,...,...,...,...
2526695,7439,A040000130,A040130001,2020-10-14 23:07:23,1602716843,-160,8832,0,2
2526696,7439,A040000130,A040130002,2020-10-14 23:07:41,1602716861,-142,8832,1,2
2526697,7439,A040000130,A040130003,2020-10-14 23:08:02,1602716882,-121,8244,1,2
2526698,7439,A040000130,A040130004,2020-10-14 23:09:31,1602716971,-32,8244,1,2


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2526700 entries, 0 to 2526699
Data columns (total 9 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   userID        2526700 non-null  int16 
 1   testID        2526700 non-null  object
 2   assmtID       2526700 non-null  object
 3   timestamp     2526700 non-null  object
 4   unixTime      2526700 non-null  int64 
 5   relTime2Last  2526700 non-null  int64 
 6   knowledgeTag  2526700 non-null  int64 
 7   answerCode    2526700 non-null  int8  
 8   datasetType   2526700 non-null  int8  
dtypes: int16(1), int64(3), int8(2), object(3)
memory usage: 209.1+ MB


None

In [12]:
# Overwrite data.csv with the current frame, which now also carries unixTime and relTime2Last.
df.to_csv(os.path.join(cfg.path.data_dir, r'data.csv'), index=False)

### 3-3. `elapsedTime`: 문항을 푸는데 걸린 시간 (단위: second)

문제 풀이 시간이:

* 적당했다면 잘 풀었을 가능성이 높다.
* 너무 오래 걸렸다면 해당 문항이 어려워서 그랬을 것이다. vs. 시간을 많이 투자해서 잘 풀었을 수도 있다.
* 너무 조금 걸렸다면 찍거나 막 누르는 등 제대로 안 풀었을 것이다. vs. 그만큼 해당 문항이 쉬웠을 수도 있다.

Outliers 제거

* 각 사용자가 마지막으로 푼 문제는 정확한 풀이 시간을 계산할 수 없다.
  * -1로 맵핑
* `timestamp`가 각 문항을 풀기 시작한 시간이기 때문에 데이터에 노이즈(e.g. 시작을 눌러놓고 한참 뒤에 푸는 경우 등)가 있다.
  * Max elapsed time을 초과하는 경우 비정상적인 데이터로 간주 -2로 맵핑
  * Max elapsed time: 1 hour
* 위 두 가지 경우에 해당하는 데이터는 해당 문항에 대한 풀이 시간 분포의 median 값으로 대체
    * 문제 풀이 시간별 정답률 추이가 1시간까지 점차 감소했다가 그 이후로는 규칙성이 사라지는 경향을 띤다.

In [13]:
def cal_elapsed_time(group):
    """Add ``_elapsedTime``: seconds between each row and the next row of ``group``.

    ``group`` is sorted by ``unixTime`` in place, then the gap to the following
    row is stored three positions after ``timestamp``. The last row has no
    following row, so it receives the sentinel -1. The same (mutated) frame
    is returned.
    """
    group.sort_values(by='unixTime', axis=0, inplace=True)

    # Consecutive differences of the sorted times; the final row gets -1 (no successor).
    gaps = np.diff(group.unixTime.values).tolist()
    gaps.append(-1)

    column_pos = group.columns.get_loc('timestamp') + 3
    group.insert(column_pos, '_elapsedTime', gaps)

    return group

In [14]:
# Gaps longer than this many seconds (1 hour) are treated as abnormal.
max_elapsed_time = 60 * 60

df = pd.read_csv(os.path.join(cfg.path.data_dir, r'data.csv'), dtype=cfg.data.df_dtype)

grouped = df.groupby('userID')
tqdm.pandas()
df = grouped.progress_apply(cal_elapsed_time)
df.reset_index(drop=True, inplace=True)
# Outlier: gaps above the max elapsed time are mapped to -2.
df.loc[df._elapsedTime > max_elapsed_time, ['_elapsedTime']] = -2

display(df)
# info() prints its report itself and returns None; wrapping it in display() only adds a stray "None".
df.info(show_counts=True)
# Summary of the valid gaps only (both sentinels, -1 and -2, are negative).
display(df.loc[df._elapsedTime >= 0, ['_elapsedTime']].describe())

100%|██████████| 7442/7442 [00:17<00:00, 434.33it/s]


Unnamed: 0,userID,testID,assmtID,timestamp,unixTime,relTime2Last,_elapsedTime,knowledgeTag,answerCode,datasetType
0,0,A060000001,A060001001,2020-03-24 00:17:11,1585009031,-23685788,3,7224,1,0
1,0,A060000001,A060001002,2020-03-24 00:17:14,1585009034,-23685785,8,7225,1,0
2,0,A060000001,A060001003,2020-03-24 00:17:22,1585009042,-23685777,7,7225,1,0
3,0,A060000001,A060001004,2020-03-24 00:17:29,1585009049,-23685770,7,7225,1,0
4,0,A060000001,A060001005,2020-03-24 00:17:36,1585009056,-23685763,11,7225,1,0
...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030000071,A030071005,2020-06-05 06:50:21,1591339821,-6632308,-2,438,0,0
2526696,7441,A040000165,A040165001,2020-08-21 01:06:39,1597971999,-130,11,8836,1,0
2526697,7441,A040000165,A040165002,2020-08-21 01:06:50,1597972010,-119,46,8836,1,0
2526698,7441,A040000165,A040165003,2020-08-21 01:07:36,1597972056,-73,73,8836,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 10 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   userID        2526700 non-null  int16 
 1   testID        2526700 non-null  object
 2   assmtID       2526700 non-null  object
 3   timestamp     2526700 non-null  object
 4   unixTime      2526700 non-null  int64 
 5   relTime2Last  2526700 non-null  int64 
 6   _elapsedTime  2526700 non-null  int64 
 7   knowledgeTag  2526700 non-null  int64 
 8   answerCode    2526700 non-null  int8  
 9   datasetType   2526700 non-null  int8  
dtypes: int16(1), int64(4), int8(2), object(3)
memory usage: 144.6+ MB


None

Unnamed: 0,_elapsedTime
count,2157252.0
mean,88.42425
std,255.1784
min,0.0
25%,10.0
50%,28.0
75%,69.0
max,3600.0


In [15]:
def cvt_elapsed_time_outlier2assmt_median(group):
    """Replace outlier ``elapsedTime`` values with the group's median valid ``_elapsedTime``.

    Rows whose ``_elapsedTime`` is negative (the -1 / -2 sentinels) are outliers.
    Their ``elapsedTime`` is set to the integer-truncated median of the
    non-negative ``_elapsedTime`` values in ``group``.

    If the group has no valid value at all there is nothing to impute from, so
    the frame is returned unchanged (previously ``int(NaN)`` raised ValueError).

    ``group`` is modified in place and returned.
    """
    valid = group._elapsedTime >= 0
    if not valid.any():
        # No valid sample: the median would be NaN. Keep the sentinel values as they are.
        return group

    group.loc[~valid, ['elapsedTime']] = int(group.loc[valid, '_elapsedTime'].median())

    return group

In [16]:
# elapsedTime starts as a copy of _elapsedTime; its outliers are then replaced per assessment item.
df.insert(df.columns.get_loc('timestamp') + 3, 'elapsedTime', df._elapsedTime)

grouped = df.groupby('assmtID')
tqdm.pandas()
df = grouped.progress_apply(cvt_elapsed_time_outlier2assmt_median)
df.reset_index(drop=True, inplace=True)

display(df)
# info() prints its report itself and returns None; wrapping it in display() only adds a stray "None".
df.info(show_counts=True)
display(df[['elapsedTime']].describe())

100%|██████████| 9454/9454 [00:24<00:00, 381.33it/s]


Unnamed: 0,userID,testID,assmtID,timestamp,unixTime,relTime2Last,elapsedTime,_elapsedTime,knowledgeTag,answerCode,datasetType
0,0,A060000001,A060001001,2020-03-24 00:17:11,1585009031,-23685788,3,3,7224,1,0
1,0,A060000001,A060001002,2020-03-24 00:17:14,1585009034,-23685785,8,8,7225,1,0
2,0,A060000001,A060001003,2020-03-24 00:17:22,1585009042,-23685777,7,7,7225,1,0
3,0,A060000001,A060001004,2020-03-24 00:17:29,1585009049,-23685770,7,7,7225,1,0
4,0,A060000001,A060001005,2020-03-24 00:17:36,1585009056,-23685763,11,11,7225,1,0
...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030000071,A030071005,2020-06-05 06:50:21,1591339821,-6632308,401,-2,438,0,0
2526696,7441,A040000165,A040165001,2020-08-21 01:06:39,1597971999,-130,11,11,8836,1,0
2526697,7441,A040000165,A040165002,2020-08-21 01:06:50,1597972010,-119,46,46,8836,1,0
2526698,7441,A040000165,A040165003,2020-08-21 01:07:36,1597972056,-73,73,73,8836,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 11 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   userID        2526700 non-null  int16 
 1   testID        2526700 non-null  object
 2   assmtID       2526700 non-null  object
 3   timestamp     2526700 non-null  object
 4   unixTime      2526700 non-null  int64 
 5   relTime2Last  2526700 non-null  int64 
 6   elapsedTime   2526700 non-null  int64 
 7   _elapsedTime  2526700 non-null  int64 
 8   knowledgeTag  2526700 non-null  int64 
 9   answerCode    2526700 non-null  int8  
 10  datasetType   2526700 non-null  int8  
dtypes: int16(1), int64(5), int8(2), object(3)
memory usage: 163.9+ MB


None

Unnamed: 0,elapsedTime
count,2526700.0
mean,106.9037
std,252.6768
min,0.0
25%,12.0
50%,33.0
75%,99.0
max,3600.0


In [17]:
# Overwrite data.csv with the current frame, which now also carries elapsedTime and _elapsedTime.
df.to_csv(os.path.join(cfg.path.data_dir, r'data.csv'), index=False)

## 4. Test

Test ID에 숨겨진 시험지 분류(두 번째 자리부터 네 번째 자리까지 숫자 코드) 정보를 사용하자.

* Test ID
  * 첫 자리: 항상 A
    * 미사용: 의미 없음
  * 다음 3자리: 대분류
  * 마지막 6자리: 각 분류 내 시험지 번호
    * 미사용: 분리해서 사용할 경우 중복되므로 test ID를 그대로 사용하는 것이 맞다.

### 4-1. `testCategory`: 시험지 분류

In [18]:
# DEPRECATED:
# df = pd.read_csv(os.path.join(cfg.path.data_dir, r'data.csv'), dtype=cfg.data.df_dtype)
# df.insert(df.columns.get_loc('testID') + 1, 'testCategory', df.testID.apply(lambda x: x[1:4]))
# df.insert(df.columns.get_loc('testID') + 2, 'testNum', df.testID.apply(lambda x: x[4:]))

# display(df)
# display(df.info(show_counts=True))

In [19]:
df = pd.read_csv(os.path.join(cfg.path.data_dir, r'data.csv'), dtype=cfg.data.df_dtype)
# Characters 2-4 of the test ID are the category code; the column is placed just before testID.
df.insert(df.columns.get_loc('testID'), 'testCategory', df.testID.str[1:4])

display(df)
# info() prints its report itself and returns None; wrapping it in display() only adds a stray "None".
df.info(show_counts=True)

Unnamed: 0,userID,testCategory,testID,assmtID,timestamp,unixTime,relTime2Last,elapsedTime,_elapsedTime,knowledgeTag,answerCode,datasetType
0,0,060,A060000001,A060001001,2020-03-24 00:17:11,1585009031,-23685788,3,3,7224,1,0
1,0,060,A060000001,A060001002,2020-03-24 00:17:14,1585009034,-23685785,8,8,7225,1,0
2,0,060,A060000001,A060001003,2020-03-24 00:17:22,1585009042,-23685777,7,7,7225,1,0
3,0,060,A060000001,A060001004,2020-03-24 00:17:29,1585009049,-23685770,7,7,7225,1,0
4,0,060,A060000001,A060001005,2020-03-24 00:17:36,1585009056,-23685763,11,11,7225,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,030,A030000071,A030071005,2020-06-05 06:50:21,1591339821,-6632308,401,-2,438,0,0
2526696,7441,040,A040000165,A040165001,2020-08-21 01:06:39,1597971999,-130,11,11,8836,1,0
2526697,7441,040,A040000165,A040165002,2020-08-21 01:06:50,1597972010,-119,46,46,8836,1,0
2526698,7441,040,A040000165,A040165003,2020-08-21 01:07:36,1597972056,-73,73,73,8836,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 12 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   userID        2526700 non-null  int16 
 1   testCategory  2526700 non-null  object
 2   testID        2526700 non-null  object
 3   assmtID       2526700 non-null  object
 4   timestamp     2526700 non-null  object
 5   unixTime      2526700 non-null  int64 
 6   relTime2Last  2526700 non-null  int64 
 7   elapsedTime   2526700 non-null  int64 
 8   _elapsedTime  2526700 non-null  int64 
 9   knowledgeTag  2526700 non-null  int64 
 10  answerCode    2526700 non-null  int8  
 11  datasetType   2526700 non-null  int8  
dtypes: int16(1), int64(5), int8(2), object(4)
memory usage: 183.1+ MB


None

In [20]:
# Overwrite data.csv with the current frame, which now also carries testCategory.
# NOTE(review): later outputs show testCategory as 60 rather than '060' after reloading,
# so the zero padding is presumably lost in the CSV round trip; confirm this is intended.
df.to_csv(os.path.join(cfg.path.data_dir, r'data.csv'), index=False)

## 5. Assessment Item

### 5-1. `assmtCorrectRate`: 문항별 정답률 (상대적 난이도)

시험이나 문항의 난이도가 특정한 feature로 주어지지 않았다. 대신 전체 사용자의 해당 문항에 대한 정답률을 상대적인 난이도로 활용해보자.

### 5-2. `assmtETMedian`: 문항별 풀이 소요 시간 분포의 중앙값 (상대적 난이도)

`assmtCorrectRate`과 마찬가지로 상대적 난이도로 활용 가능하다. 또한, `elapsedTime`만 단독으로 사용하는 것보다 `assmtETMedian`을 기준으로 함께 사용하면 더 효과가 좋을 것 같다.

* `elapsedTime`의 outliers는 제외하고 중앙값을 계산 

In [21]:
def cal_assmt_features(group):
    """Fill ``assmtCorrectRate`` and ``assmtETMedian`` for one assessment item's rows.

    * ``assmtCorrectRate``: mean ``answerCode`` over the rows whose ``answerCode``
      is not -1. NaN when the item has no answered row (previously this case
      raised ZeroDivisionError).
    * ``assmtETMedian``: integer-truncated median of the non-negative
      ``_elapsedTime`` values. -1 when the item has no valid value (previously
      ``int(NaN)`` raised ValueError); -1 is the placeholder the notebook uses
      when it creates this column.

    Every row of ``group`` receives the same two values. ``group`` is modified
    in place and returned.
    """
    # assmtCorrectRate
    answered = group.loc[group.answerCode != -1, 'answerCode']
    correct_rate = float(answered.mean()) if len(answered) > 0 else np.nan
    group.loc[:, ['assmtCorrectRate']] = correct_rate

    # assmtETMedian
    valid_times = group.loc[group._elapsedTime >= 0, '_elapsedTime']
    et_median = int(valid_times.median()) if len(valid_times) > 0 else -1
    group.loc[:, ['assmtETMedian']] = et_median

    return group

In [22]:
df = pd.read_csv(os.path.join(cfg.path.data_dir, r'data.csv'), dtype=cfg.data.df_dtype)
# Placeholder columns right after assmtID (NaN / -1); they are filled per assessment item below.
df.insert(df.columns.get_loc('assmtID') + 1, 'assmtCorrectRate', np.nan)
df.insert(df.columns.get_loc('assmtID') + 2, 'assmtETMedian', -1)

grouped = df.groupby('assmtID')
tqdm.pandas()
df = grouped.progress_apply(cal_assmt_features)
df.reset_index(drop=True, inplace=True)

display(df)
# info() prints its report itself and returns None; wrapping it in display() only adds a stray "None".
df.info(show_counts=True)

100%|██████████| 9454/9454 [00:48<00:00, 194.44it/s]


Unnamed: 0,userID,testCategory,testID,assmtID,assmtCorrectRate,assmtETMedian,timestamp,unixTime,relTime2Last,elapsedTime,_elapsedTime,knowledgeTag,answerCode,datasetType
0,0,60,A060000001,A060001001,0.984000,6,2020-03-24 00:17:11,1585009031,-23685788,3,3,7224,1,0
1,0,60,A060000001,A060001002,0.968000,14,2020-03-24 00:17:14,1585009034,-23685785,8,8,7225,1,0
2,0,60,A060000001,A060001003,0.916000,10,2020-03-24 00:17:22,1585009042,-23685777,7,7,7225,1,0
3,0,60,A060000001,A060001004,0.972000,11,2020-03-24 00:17:29,1585009049,-23685770,7,7,7225,1,0
4,0,60,A060000001,A060001005,0.948000,20,2020-03-24 00:17:36,1585009056,-23685763,11,11,7225,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,30,A030000071,A030071005,0.446667,401,2020-06-05 06:50:21,1591339821,-6632308,401,-2,438,0,0
2526696,7441,40,A040000165,A040165001,0.643333,17,2020-08-21 01:06:39,1597971999,-130,11,11,8836,1,0
2526697,7441,40,A040000165,A040165002,0.640000,14,2020-08-21 01:06:50,1597972010,-119,46,46,8836,1,0
2526698,7441,40,A040000165,A040165003,0.786667,79,2020-08-21 01:07:36,1597972056,-73,73,73,8836,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 14 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   userID            2526700 non-null  int16  
 1   testCategory      2526700 non-null  int8   
 2   testID            2526700 non-null  object 
 3   assmtID           2526700 non-null  object 
 4   assmtCorrectRate  2526700 non-null  float64
 5   assmtETMedian     2526700 non-null  int64  
 6   timestamp         2526700 non-null  object 
 7   unixTime          2526700 non-null  int64  
 8   relTime2Last      2526700 non-null  int64  
 9   elapsedTime       2526700 non-null  int64  
 10  _elapsedTime      2526700 non-null  int64  
 11  knowledgeTag      2526700 non-null  int64  
 12  answerCode        2526700 non-null  int8   
 13  datasetType       2526700 non-null  int8   
dtypes: float64(1), int16(1), int64(6), int8(3), object(3)
memory usage: 204.8+ MB


None

In [23]:
# Overwrite data.csv with the current frame, which now also carries assmtCorrectRate and assmtETMedian.
df.to_csv(os.path.join(cfg.path.data_dir, r'data.csv'), index=False)

### 5-3. `cumInteraction`: 사용자별 누적 풀이 문항 개수 (시간순)

Sequence 길이 제한에 의해 데이터가 일부 잘리게 될텐데, 각 `timestamp`에 여태까지 총 몇 문항을 풀었는지에 대한 정보가 함께 주어지면 좋을 것 같다.

* 시너지를 낼 수 있을 것 같은 features
  * 최근 10문제 정답률: 일종의 집중도라고도 볼 수 있다. 또한 대충 찍고 있는 건 아닌지도 알 수 있다.

In [24]:
def cal_cum_interaction(group):
    """Add ``cumInteraction``: the 0-based position of each row in ``unixTime`` order.

    ``group`` is sorted by ``unixTime`` in place; the counter column is inserted
    three positions after ``assmtID``. The same (mutated) frame is returned.
    """
    group.sort_values(by='unixTime', axis=0, inplace=True)

    n_rows = group.shape[0]
    insert_at = group.columns.get_loc('assmtID') + 3
    # After sorting, row i has i earlier rows in the group.
    group.insert(insert_at, 'cumInteraction', np.arange(n_rows))

    return group

In [25]:
df = pd.read_csv(os.path.join(cfg.path.data_dir, r'data.csv'), dtype=cfg.data.df_dtype)

# The interaction counter restarts for every user.
grouped = df.groupby('userID')
tqdm.pandas()
df = grouped.progress_apply(cal_cum_interaction)
df.reset_index(drop=True, inplace=True)

display(df)
# info() prints its report itself and returns None; wrapping it in display() only adds a stray "None".
df.info(show_counts=True)

100%|██████████| 7442/7442 [00:20<00:00, 362.63it/s]


Unnamed: 0,userID,testCategory,testID,assmtID,assmtCorrectRate,assmtETMedian,cumInteraction,timestamp,unixTime,relTime2Last,elapsedTime,_elapsedTime,knowledgeTag,answerCode,datasetType
0,0,60,A060000001,A060001001,0.984000,6,0,2020-03-24 00:17:11,1585009031,-23685788,3,3,7224,1,0
1,0,60,A060000001,A060001002,0.968000,14,1,2020-03-24 00:17:14,1585009034,-23685785,8,8,7225,1,0
2,0,60,A060000001,A060001003,0.916000,10,2,2020-03-24 00:17:22,1585009042,-23685777,7,7,7225,1,0
3,0,60,A060000001,A060001004,0.972000,11,3,2020-03-24 00:17:29,1585009049,-23685770,7,7,7225,1,0
4,0,60,A060000001,A060001005,0.948000,20,4,2020-03-24 00:17:36,1585009056,-23685763,11,11,7225,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,30,A030000071,A030071005,0.446667,401,4,2020-06-05 06:50:21,1591339821,-6632308,401,-2,438,0,0
2526696,7441,40,A040000165,A040165001,0.643333,17,5,2020-08-21 01:06:39,1597971999,-130,11,11,8836,1,0
2526697,7441,40,A040000165,A040165002,0.640000,14,6,2020-08-21 01:06:50,1597972010,-119,46,46,8836,1,0
2526698,7441,40,A040000165,A040165003,0.786667,79,7,2020-08-21 01:07:36,1597972056,-73,73,73,8836,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 15 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   userID            2526700 non-null  int16  
 1   testCategory      2526700 non-null  int8   
 2   testID            2526700 non-null  object 
 3   assmtID           2526700 non-null  object 
 4   assmtCorrectRate  2526700 non-null  float64
 5   assmtETMedian     2526700 non-null  int64  
 6   cumInteraction    2526700 non-null  int64  
 7   timestamp         2526700 non-null  object 
 8   unixTime          2526700 non-null  int64  
 9   relTime2Last      2526700 non-null  int64  
 10  elapsedTime       2526700 non-null  int64  
 11  _elapsedTime      2526700 non-null  int64  
 12  knowledgeTag      2526700 non-null  int64  
 13  answerCode        2526700 non-null  int8   
 14  datasetType       2526700 non-null  int8   
dtypes: float64(1), int16(1), int64(7), int8(3), objec

None

In [26]:
# Overwrite data.csv with the current frame, which now also carries cumInteraction.
df.to_csv(os.path.join(cfg.path.data_dir, r'data.csv'), index=False)

### 5-4. `cumCorrectRate`: 사용자별 누적 정답률 (시간순)

행복은 성적순이 아니라지만... 성적의 등락 추이는 knowledge tracing에서 매우 중요한 정보이다.

* 단순히 맞춘 문제 개수를 세서 정답률을 구하는 것이 아니라, `assmtCorrectRate`을 활용하여 쉬운 문제를 틀렸을 때와 어려운 문제를 맞췄을 때 각각에 대해 가중치를 주어 누적 정답률을 계산한다.

In [27]:
def __update_cum_correct_rate(cum_correct_rate, assmt_correct_rate, answer_code):
    """Fold one interaction into the running ``[weighted correct sum, weight sum]`` pair.

    ``cum_correct_rate`` is updated in place and also returned. This helper is
    kept for existing callers; ``cal_cum_correct_rate`` computes the same
    running sums with ``np.cumsum``.
    """
    cum_correct_rate[0] += assmt_correct_rate * answer_code
    cum_correct_rate[1] += assmt_correct_rate
    
    return cum_correct_rate


def cal_cum_correct_rate(group):
    """Fill ``cumCorrectRate``: the weighted correct rate over the rows *before* each row.

    ``group`` is sorted by ``unixTime`` in place. For each row the value is
    ``sum(assmtCorrectRate * answerCode) / sum(assmtCorrectRate)`` taken over
    the strictly earlier rows, so a row's own answer never contributes to its
    own value.

    Edge cases:
        * The first row has no history and gets the prior 0.5.
        * Rows with ``answerCode == -1`` (no answer) are excluded from both sums;
          previously such a row was subtracted from the numerator.
        * While the accumulated weight is still 0 the prior 0.5 is used;
          previously this produced NaN.

    ``group`` is modified in place and returned.
    """
    group.sort_values(by='unixTime', axis=0, inplace=True)

    weights = group['assmtCorrectRate'].to_numpy(dtype=float)
    answers = group['answerCode'].to_numpy(dtype=float)
    # Unanswered rows carry no evidence: give them zero weight in both running sums.
    weights = np.where(answers == -1, 0.0, weights)

    weighted_correct = np.cumsum(weights * answers)
    weight_total = np.cumsum(weights)

    # Row i uses the running sums up to row i - 1; 0.5 wherever there is no usable history.
    rates = np.full(len(group), 0.5)
    np.divide(
        weighted_correct[:-1],
        weight_total[:-1],
        out=rates[1:],
        where=weight_total[:-1] > 0
    )

    group['cumCorrectRate'] = rates

    return group

In [28]:
df = pd.read_csv(os.path.join(cfg.path.data_dir, r'data.csv'), dtype=cfg.data.df_dtype)
# Placeholder column (NaN) four positions after assmtID; it is filled per user below.
df.insert(df.columns.get_loc('assmtID') + 4, 'cumCorrectRate', np.nan)

grouped = df.groupby('userID')
tqdm.pandas()
df = grouped.progress_apply(cal_cum_correct_rate)
df.reset_index(drop=True, inplace=True)

display(df)
# info() prints its report itself and returns None; wrapping it in display() only adds a stray "None".
df.info(show_counts=True)

100%|██████████| 7442/7442 [01:11<00:00, 104.74it/s]


Unnamed: 0,userID,testCategory,testID,assmtID,assmtCorrectRate,assmtETMedian,cumInteraction,cumCorrectRate,timestamp,unixTime,relTime2Last,elapsedTime,_elapsedTime,knowledgeTag,answerCode,datasetType
0,0,60,A060000001,A060001001,0.984000,6,0,0.500000,2020-03-24 00:17:11,1585009031,-23685788,3,3,7224,1,0
1,0,60,A060000001,A060001002,0.968000,14,1,1.000000,2020-03-24 00:17:14,1585009034,-23685785,8,8,7225,1,0
2,0,60,A060000001,A060001003,0.916000,10,2,1.000000,2020-03-24 00:17:22,1585009042,-23685777,7,7,7225,1,0
3,0,60,A060000001,A060001004,0.972000,11,3,1.000000,2020-03-24 00:17:29,1585009049,-23685770,7,7,7225,1,0
4,0,60,A060000001,A060001005,0.948000,20,4,1.000000,2020-03-24 00:17:36,1585009056,-23685763,11,11,7225,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,30,A030000071,A030071005,0.446667,401,4,0.292486,2020-06-05 06:50:21,1591339821,-6632308,401,-2,438,0,0
2526696,7441,40,A040000165,A040165001,0.643333,17,5,0.253253,2020-08-21 01:06:39,1597971999,-130,11,11,8836,1,0
2526697,7441,40,A040000165,A040165002,0.640000,14,6,0.374161,2020-08-21 01:06:50,1597972010,-119,46,46,8836,1,0
2526698,7441,40,A040000165,A040165003,0.786667,79,7,0.460983,2020-08-21 01:07:36,1597972056,-73,73,73,8836,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 16 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   userID            2526700 non-null  int16  
 1   testCategory      2526700 non-null  int8   
 2   testID            2526700 non-null  object 
 3   assmtID           2526700 non-null  object 
 4   assmtCorrectRate  2526700 non-null  float64
 5   assmtETMedian     2526700 non-null  int64  
 6   cumInteraction    2526700 non-null  int64  
 7   cumCorrectRate    2526700 non-null  float64
 8   timestamp         2526700 non-null  object 
 9   unixTime          2526700 non-null  int64  
 10  relTime2Last      2526700 non-null  int64  
 11  elapsedTime       2526700 non-null  int64  
 12  _elapsedTime      2526700 non-null  int64  
 13  knowledgeTag      2526700 non-null  int64  
 14  answerCode        2526700 non-null  int8   
 15  datasetType       2526700 non-null  int8   
dtype

None

In [29]:
# Overwrite data.csv with the current frame, which now also carries cumCorrectRate.
df.to_csv(os.path.join(cfg.path.data_dir, r'data.csv'), index=False)

### 5-5. `retry`: 이전에 풀어본 문항을 다시 푸는 재시도 횟수 (시간순)

이전에 풀어본 문항을 다시 풀면 풀수록 정답률이 높아지지 않을까?

* 시너지를 낼 수 있을 것 같은 features
  * 가장 최근에 해당 문항을 푼 시점으로부터의 시간 간격
  * 앞서 풀었을 때 정답률 또는 맞춘 개수 (가장 최근 것만 넣을까? 전체를 넣을까?)
  
* 아예 사용하지 않는 편이 더 나을 것 같아서 다시 실험 중
  * Test set(744명)의 각 사용자가 푼 마지막 문항 중 `retry`가:
    * 1인 경우: 1.6%
    * 2인 경우: 없음
  * 분명 의미 있는 features이지만 해당 케이스가 너무 적다.

In [30]:
def cal_retry(group):
    """Number each row by how many times its item already appeared earlier in ``group``.

    Meant to be called once per user through ``df.groupby('userID').apply(...)``.
    The counter follows the row order of ``group``; rows are not re-sorted here,
    so the caller is assumed to pass them in chronological order
    (NOTE(review): confirm the upstream sort).

    Args:
        group: DataFrame with an ``assmtID`` column.

    Returns:
        A new DataFrame with the same rows and index as ``group`` whose ``retry``
        column is 0 for the first occurrence of each ``assmtID``, 1 for the
        second, and so on. An existing ``retry`` column keeps its position;
        otherwise the column is appended. ``group`` itself is left untouched,
        because mutating the object handed over by ``groupby.apply`` is not
        supported by pandas.
    """
    # cumcount() numbers the rows of each assmtID in order of appearance, which
    # is exactly the "attempts made before this one" counter; items seen only
    # once therefore get 0.
    attempts_before = group.groupby('assmtID', sort=False).cumcount()

    # to_numpy() assigns positionally, so no index alignment is involved.
    return group.assign(retry=attempts_before.to_numpy())

In [31]:
# Reload the feature table written by the previous step.
df = pd.read_csv(os.path.join(cfg.path.data_dir, r'data.csv'), dtype=cfg.data.df_dtype)
# Pre-create `retry` (default 0) five columns after `assmtID`; with the column
# layout shown in the output below this places it right after `cumCorrectRate`.
df.insert(df.columns.get_loc('assmtID') + 5, 'retry', 0)

# Fill `retry` one user at a time, with a progress bar.
grouped = df.groupby('userID')
tqdm.pandas()
df = grouped.progress_apply(cal_retry).reset_index(drop=True)

display(df)
# DataFrame.info() prints its report and returns None, so wrapping it in
# display() only rendered a stray "None" after the report.
df.info(show_counts=True)

100%|██████████| 7442/7442 [00:34<00:00, 213.58it/s] 


Unnamed: 0,userID,testCategory,testID,assmtID,assmtCorrectRate,assmtETMedian,cumInteraction,cumCorrectRate,retry,timestamp,unixTime,relTime2Last,elapsedTime,_elapsedTime,knowledgeTag,answerCode,datasetType
0,0,60,A060000001,A060001001,0.984000,6,0,0.500000,0,2020-03-24 00:17:11,1585009031,-23685788,3,3,7224,1,0
1,0,60,A060000001,A060001002,0.968000,14,1,1.000000,0,2020-03-24 00:17:14,1585009034,-23685785,8,8,7225,1,0
2,0,60,A060000001,A060001003,0.916000,10,2,1.000000,0,2020-03-24 00:17:22,1585009042,-23685777,7,7,7225,1,0
3,0,60,A060000001,A060001004,0.972000,11,3,1.000000,0,2020-03-24 00:17:29,1585009049,-23685770,7,7,7225,1,0
4,0,60,A060000001,A060001005,0.948000,20,4,1.000000,0,2020-03-24 00:17:36,1585009056,-23685763,11,11,7225,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,30,A030000071,A030071005,0.446667,401,4,0.292486,0,2020-06-05 06:50:21,1591339821,-6632308,401,-2,438,0,0
2526696,7441,40,A040000165,A040165001,0.643333,17,5,0.253253,0,2020-08-21 01:06:39,1597971999,-130,11,11,8836,1,0
2526697,7441,40,A040000165,A040165002,0.640000,14,6,0.374161,0,2020-08-21 01:06:50,1597972010,-119,46,46,8836,1,0
2526698,7441,40,A040000165,A040165003,0.786667,79,7,0.460983,0,2020-08-21 01:07:36,1597972056,-73,73,73,8836,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 17 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   userID            2526700 non-null  int16  
 1   testCategory      2526700 non-null  int8   
 2   testID            2526700 non-null  object 
 3   assmtID           2526700 non-null  object 
 4   assmtCorrectRate  2526700 non-null  float64
 5   assmtETMedian     2526700 non-null  int64  
 6   cumInteraction    2526700 non-null  int64  
 7   cumCorrectRate    2526700 non-null  float64
 8   retry             2526700 non-null  int64  
 9   timestamp         2526700 non-null  object 
 10  unixTime          2526700 non-null  int64  
 11  relTime2Last      2526700 non-null  int64  
 12  elapsedTime       2526700 non-null  int64  
 13  _elapsedTime      2526700 non-null  int64  
 14  knowledgeTag      2526700 non-null  int64  
 15  answerCode        2526700 non-null  int8   
 16  

None

In [32]:
# Save the frame, now including `retry`, back to data.csv (index column omitted).
df.to_csv(
    path_or_buf=os.path.join(cfg.path.data_dir, 'data.csv'),
    index=False,
)

## 6. Knowledge Tag

특정 문항을 맞힐 것인지 예측하는 것으로 task가 간략화되었지만, 사용자별로 knowledge tracing을 하는 것이 본래의 목적이다. 주어진 데이터셋 내에서 knowledge tracing의 기준이 되는 feature를 무엇으로 하는 것이 좋을까? 시험이나 문항보다는 '분류'의 의미로서 주어진 `knowledgeTag`를 기준으로 하는 것이 가장 타당할 것 같다. 예를 들어 대분류(국어, 영어, 수학), 소분류(덧셈, 뺄셈, 곱셈, 나눗셈)에 대해 사용자의 지식 상태를 상세하게 추정하는 것이다. 고유 항목의 개수도 시험이나 문항보다 태그의 개수가 더 적어서 적합하다.

### 6-1. `cumTag`: 각 태그가 붙은 문항에 대한 사용자별 누적 풀이 문항 개수 (시간순)

### 6-2. `cumTagCorrectRate`: 각 태그가 붙은 문항에 대한 사용자별 누적 정답률 (시간순)

In [33]:
def cal_tag_features(group):
    """Add per-tag running statistics to one user's interactions, in time order.

    Meant to be called once per user through ``df.groupby('userID').apply(...)``.

    Args:
        group: DataFrame with ``unixTime``, ``knowledgeTag`` and ``answerCode``
            columns.

    Returns:
        A new DataFrame holding the rows of ``group`` sorted by ``unixTime``
        (ties keep their incoming order) with two columns set:

        * ``cumTag``: number of earlier rows carrying the same ``knowledgeTag``.
        * ``cumTagCorrectRate``: sum of ``answerCode`` over those earlier rows
          divided by ``cumTag``, or 0.5 when there is no earlier row for the tag.

        Existing ``cumTag`` / ``cumTagCorrectRate`` columns keep their position;
        otherwise they are appended. ``group`` itself is not modified, because
        mutating the object handed over by ``groupby.apply`` is not supported
        by pandas.
    """
    # Stable sort: rows sharing a unixTime keep a deterministic relative order.
    ordered = group.sort_values(by='unixTime', axis=0, kind='mergesort')
    tags = ordered['knowledgeTag']

    # Accumulate in int64 so a narrow answerCode dtype (e.g. int8) cannot
    # overflow the running total.
    # NOTE(review): every answerCode value is summed as-is, so a -1
    # ("no answer") row would lower the count for any later row of the same
    # tag - confirm such rows are always the last ones per user.
    answers = ordered['answerCode'].astype('int64')

    # Number of earlier rows with the same tag.
    solved_before = ordered.groupby('knowledgeTag', sort=False).cumcount()
    # Inclusive running sum minus the current row = sum over earlier rows only.
    correct_before = answers.groupby(tags, sort=False).cumsum() - answers

    # Mask zero denominators to NaN, then substitute the 0.5 prior for tags
    # the user has not met yet.
    rate = (correct_before / solved_before.where(solved_before > 0)).fillna(0.5)

    # to_numpy() assigns positionally, so no index alignment is involved.
    return ordered.assign(
        cumTag=solved_before.to_numpy(),
        cumTagCorrectRate=rate.to_numpy(),
    )

In [34]:
# Reload the feature table written by the previous step.
df = pd.read_csv(os.path.join(cfg.path.data_dir, r'data.csv'), dtype=cfg.data.df_dtype)
# Pre-create the two tag columns directly after `knowledgeTag`
# (integer count defaulting to 0, float rate defaulting to 0.0).
df.insert(df.columns.get_loc('knowledgeTag') + 1, 'cumTag', 0)
df.insert(df.columns.get_loc('knowledgeTag') + 2, 'cumTagCorrectRate', 0.)

# Fill both columns one user at a time, with a progress bar.
grouped = df.groupby('userID')
tqdm.pandas()
df = grouped.progress_apply(cal_tag_features).reset_index(drop=True)

display(df)
# DataFrame.info() prints its report and returns None, so wrapping it in
# display() only rendered a stray "None" after the report.
df.info(show_counts=True)

100%|██████████| 7442/7442 [03:26<00:00, 36.03it/s] 


Unnamed: 0,userID,testCategory,testID,assmtID,assmtCorrectRate,assmtETMedian,cumInteraction,cumCorrectRate,retry,timestamp,unixTime,relTime2Last,elapsedTime,_elapsedTime,knowledgeTag,cumTag,cumTagCorrectRate,answerCode,datasetType
0,0,60,A060000001,A060001001,0.984000,6,0,0.500000,0,2020-03-24 00:17:11,1585009031,-23685788,3,3,7224,0,0.50,1,0
1,0,60,A060000001,A060001002,0.968000,14,1,1.000000,0,2020-03-24 00:17:14,1585009034,-23685785,8,8,7225,0,0.50,1,0
2,0,60,A060000001,A060001003,0.916000,10,2,1.000000,0,2020-03-24 00:17:22,1585009042,-23685777,7,7,7225,1,1.00,1,0
3,0,60,A060000001,A060001004,0.972000,11,3,1.000000,0,2020-03-24 00:17:29,1585009049,-23685770,7,7,7225,2,1.00,1,0
4,0,60,A060000001,A060001005,0.948000,20,4,1.000000,0,2020-03-24 00:17:36,1585009056,-23685763,11,11,7225,3,1.00,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,30,A030000071,A030071005,0.446667,401,4,0.292486,0,2020-06-05 06:50:21,1591339821,-6632308,401,-2,438,4,0.25,0,0
2526696,7441,40,A040000165,A040165001,0.643333,17,5,0.253253,0,2020-08-21 01:06:39,1597971999,-130,11,11,8836,0,0.50,1,0
2526697,7441,40,A040000165,A040165002,0.640000,14,6,0.374161,0,2020-08-21 01:06:50,1597972010,-119,46,46,8836,1,1.00,1,0
2526698,7441,40,A040000165,A040165003,0.786667,79,7,0.460983,0,2020-08-21 01:07:36,1597972056,-73,73,73,8836,2,1.00,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 19 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   userID             2526700 non-null  int16  
 1   testCategory       2526700 non-null  int8   
 2   testID             2526700 non-null  object 
 3   assmtID            2526700 non-null  object 
 4   assmtCorrectRate   2526700 non-null  float64
 5   assmtETMedian      2526700 non-null  int64  
 6   cumInteraction     2526700 non-null  int64  
 7   cumCorrectRate     2526700 non-null  float64
 8   retry              2526700 non-null  int16  
 9   timestamp          2526700 non-null  object 
 10  unixTime           2526700 non-null  int64  
 11  relTime2Last       2526700 non-null  int64  
 12  elapsedTime        2526700 non-null  int64  
 13  _elapsedTime       2526700 non-null  int64  
 14  knowledgeTag       2526700 non-null  int64  
 15  cumTag             2526700 non-n

None

In [35]:
# Save the frame, now including the tag features, back to data.csv (index column omitted).
df.to_csv(
    path_or_buf=os.path.join(cfg.path.data_dir, 'data.csv'),
    index=False,
)