In [4]:
import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib as mpl
import matplotlib.pyplot as plt
from collections import defaultdict

## Raw data load

In [5]:
# Load the merged interaction log and order it per user by time.
# sort_values keeps the original row labels (the index is not reset here).
raw_data = (
    pd.read_csv('/opt/ml/input/data/FE_total_data.csv')
    .sort_values(by=["userID", "Timestamp"])
)
# Independent working copy that receives the intermediate feature columns.
df = raw_data.copy()

- userCurrectAnswer : 사용자별 누적 정답 수 (현재 문제 제외)
- userTotalAnswer : 사용자별 누적 문제 풀이 수 (현재 문제 제외)

In [6]:
# Per-user history features. Every value describes the rows *before* the
# current one, so the current answer never leaks into its own features.
answers_by_user = raw_data.groupby('userID')['answerCode']

# Correct answers accumulated before the current row (NaN on a user's first row).
df['userCurrectAnswer'] = answers_by_user.transform(lambda x: x.cumsum().shift(1))
# Number of rows the user already has before the current one (0 on the first row).
df['userTotalAnswer'] = answers_by_user.cumcount()
# Ratio of the two; the first row of each user is NaN / 0 -> NaN at this point.
df['cumAccuracy'] = df['userCurrectAnswer'] / df['userTotalAnswer']

# Fill only the derived columns. A frame-wide fillna(0) would also overwrite
# missing values in the source columns with 0.
history_cols = ['userCurrectAnswer', 'userTotalAnswer', 'cumAccuracy']
df[history_cols] = df[history_cols].fillna(0)

df.head(5)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset,userCurrectAnswer,userTotalAnswer,cumAccuracy
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,1,0.0,0,0.0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1,1.0,1,1.0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,1,2.0,2,1.0
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,1,3.0,3,1.0
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,1,4.0,4,1.0


### cumCorrect : 누적 정답 수

- 현재 풀고 있는 문제 직전까지 누적된 정답의 수 (현재 문제의 정답 여부는 포함하지 않음)

In [4]:
# Attach the feature to the base frame, write the whole frame out, then remove
# the column again so raw_data keeps only its base columns.
cum_correct_csv = '/opt/ml/input/data/FE/cumCorrect.csv'
raw_data['cumCorrect'] = df['userCurrectAnswer']
raw_data.to_csv(cum_correct_csv)
del raw_data['cumCorrect']

### cumAccuracy : 누적 정답률

- 시간 순서에 따른 사용자의 누적 정답률 (현재 문제 직전까지의 풀이 기준, 첫 문제는 0)

In [5]:
# Attach the feature to the base frame, write the whole frame out, then remove
# the column again so raw_data keeps only its base columns.
cum_accuracy_csv = '/opt/ml/input/data/FE/cumAccuracy.csv'
raw_data['cumAccuracy'] = df['cumAccuracy']
raw_data.to_csv(cum_accuracy_csv)
del raw_data['cumAccuracy']

In [6]:
def set_tag_count(group):
    """Add ``tagCount``: how many earlier rows of ``group`` share the row's tag.

    Rows are ordered by ``Timestamp`` first; each row then receives the number
    of rows with the same ``KnowledgeTag`` that precede it in that order
    (0 for the first occurrence of a tag).

    Args:
        group: DataFrame with ``Timestamp`` and ``KnowledgeTag`` columns. The
            notebook calls this once per user through ``groupby('userID').apply``.

    Returns:
        A new DataFrame sorted by ``Timestamp`` with ``tagCount`` set. Row
        labels are preserved; an existing ``tagCount`` column keeps its
        position, otherwise the column is appended. ``group`` is not modified.
    """
    # Sort into a copy instead of in place: pandas does not support mutating
    # the object that groupby.apply passes in. The stable sort keeps rows with
    # equal timestamps in their incoming order.
    ordered = group.sort_values(by='Timestamp', axis=0, kind='stable')

    # cumcount numbers the rows of each tag 0, 1, 2, ... in the sorted order,
    # i.e. "how many rows with this tag came before".
    seen_before = ordered.groupby('KnowledgeTag', sort=False).cumcount()

    # Assign positionally (to_numpy) so duplicate row labels cannot break alignment.
    return ordered.assign(tagCount=seen_before.to_numpy())

### tagCount : 태그별 누적 풀이 수
- 현재 풀고 있는 문제와 같은 KnowledgeTag의 문제를 사용자가 이전까지 푼 횟수

In [7]:
# Reserve the column right after KnowledgeTag. The guard keeps the cell
# re-runnable: DataFrame.insert raises if the column already exists.
if 'tagCount' not in raw_data.columns:
    raw_data.insert(raw_data.columns.get_loc('KnowledgeTag') + 1, 'tagCount', 0) #Original dataframe

# group_keys=False keeps the original row labels on the result instead of
# prepending userID to the index.
tagged = raw_data.groupby('userID', group_keys=False).apply(set_tag_count)

# Column assignment aligns on row labels, so each count lands on its own row
# regardless of the order apply returned the rows in. (Resetting the index to
# 0..n-1 first would only be correct if raw_data's labels were already
# 0..n-1 in sorted order.)
raw_data['tagCount'] = tagged['tagCount']

raw_data

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,tagCount,dataset
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0,1
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,0,1
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,1,1
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,2,1
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,3,1
...,...,...,...,...,...,...,...,...
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,4,1
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,0,1
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,1,1
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,2,1


In [8]:
# Write the frame including tagCount, then remove the column again so
# raw_data keeps only its base columns.
tag_count_csv = '/opt/ml/input/data/FE/tagCount.csv'
raw_data.to_csv(tag_count_csv)
del raw_data['tagCount']

### recAccuracy : 최근 정답률 (직전 최대 10문제 기준, 현재 문제 제외)

In [24]:
def set_rec_accuracy(group, window=10):
    """Add ``recAccuracy``: mean ``answerCode`` over the preceding rows.

    Rows are ordered by ``Timestamp``; each row receives the mean of up to
    ``window`` answers that come immediately before it. The row's own answer
    is excluded, so the first row has no history and gets NaN.

    Args:
        group: DataFrame with ``Timestamp`` and ``answerCode`` columns. The
            notebook calls this once per user through ``groupby('userID').apply``.
        window: Maximum number of preceding answers to average. Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        A new DataFrame sorted by ``Timestamp`` with ``recAccuracy`` set. Row
        labels are preserved and ``group`` is not modified.
    """
    # Sort into a copy instead of in place: pandas does not support mutating
    # the object that groupby.apply passes in. The stable sort keeps rows with
    # equal timestamps in their incoming order.
    ordered = group.sort_values(by='Timestamp', axis=0, kind='stable')

    # Trailing mean that still includes the current row; min_periods=1 lets
    # short histories produce a value instead of NaN.
    trailing_mean = ordered['answerCode'].rolling(window, min_periods=1).mean()

    # shift(1) moves every mean one row down so a row only sees earlier answers.
    # Assign positionally (to_numpy) so duplicate row labels cannot break alignment.
    return ordered.assign(recAccuracy=trailing_mean.shift(1).to_numpy())

In [28]:
# group_keys=False keeps the original row labels on the result instead of
# prepending userID to the index.
rec_features = raw_data.groupby("userID", group_keys=False).apply(set_rec_accuracy)

# Column assignment aligns on row labels, so each value lands on its own row
# regardless of the order apply returned the rows in. (Resetting the index to
# 0..n-1 first would only be correct if raw_data's labels were already
# 0..n-1 in sorted order.)
raw_data['recAccuracy'] = rec_features['recAccuracy']
raw_data.head(10)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset,recAccuracy
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,1,1.0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1,1.0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,1,1.0
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,1,1.0
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,1,1.0
5,0,A060001007,A060000001,1,2020-03-24 00:17:47,7225,1,1.0
6,0,A060003001,A060000003,0,2020-03-26 05:52:03,7226,1,0.857143
7,0,A060003002,A060000003,1,2020-03-26 05:52:10,7226,1,0.875
8,0,A060003003,A060000003,1,2020-03-26 05:53:14,7226,1,0.888889
9,0,A060003004,A060000003,1,2020-03-26 05:53:29,7226,1,0.9


In [32]:
# Write the frame including recAccuracy, then remove the column again so
# raw_data keeps only its base columns.
rec_accuracy_csv = '/opt/ml/input/data/FE/recAccuracy.csv'
raw_data.to_csv(rec_accuracy_csv)
del raw_data['recAccuracy']