In [1]:
from collections import defaultdict
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Read the combined feature-engineering input and order the rows by user,
# then by timestamp, so that later cumulative per-user features follow
# each user's history in time order.
raw_data = pd.read_csv('/opt/ml/input/data/FE_total_data.csv').sort_values(
    by=["userID", "Timestamp"]
)
# Work on a copy so the sorted source frame stays available unchanged.
df = raw_data.copy()

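- totalAnswer : csv 파일로 관리
- testMean : csv 파일로 관리
- testSum : csv 파일로 관리
- tagMean : csv 파일로 관리
- tagSum : csv 파일로 관리
- testStd : csv 파일로 관리
- tagStd : csv 파일로 관리
- assessMean : csv 파일로 관리
- assessSum : csv 파일로 관리
- assessStd : csv 파일로 관리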

In [3]:
# Per-user running history, accumulated in row order (the frame is expected
# to already be ordered by user and time):
#   user_correct_answer - correct answers on the user's earlier rows
#                         (the shift excludes the current row; NaN on the
#                         user's first row)
#   totalAnswer         - number of the user's earlier rows
#   user_acc            - ratio of the two (NaN on the user's first row)
user_answers = df.groupby("userID")["answerCode"]
prior_correct = user_answers.transform(lambda s: s.cumsum().shift(1))
prior_attempts = user_answers.cumcount()

df["user_correct_answer"] = prior_correct
df["totalAnswer"] = prior_attempts
df["user_acc"] = prior_correct / prior_attempts

# Whole-dataset answerCode statistics (mean / sum / std) per test, per
# knowledge tag and per assessment item, each computed in a single pass.
# Per the original author's note, these tables are reused for the
# submission dataset as well.
stat_specs = (
    ("testId", ["testMean", "testSum", "testStd"]),
    ("KnowledgeTag", ["tagMean", "tagSum", "tagStd"]),
    ("assessmentItemID", ["assessMean", "assessSum", "assessStd"]),
)

stat_tables = []
for key_column, stat_names in stat_specs:
    table = df.groupby([key_column])["answerCode"].agg(["mean", "sum", "std"])
    table.columns = stat_names
    stat_tables.append(table)

# Keep the individual lookup tables available under their original names.
correct_t, correct_k, correct_a = stat_tables

# Attach the statistics to every row; left joins keep all rows of df.
for (key_column, _), table in zip(stat_specs, stat_tables):
    df = df.merge(table, on=[key_column], how="left")
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset,user_correct_answer,totalAnswer,user_acc,testMean,testSum,testStd,tagMean,tagSum,tagStd,assessMean,assessSum,assessStd
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,1,,0,,0.952667,1429,0.212422,0.957333,718,0.202239,0.984000,246,0.125727
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1,1.0,1,1.000000,0.952667,1429,0.212422,0.917067,3439,0.275818,0.968000,242,0.176353
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,1,2.0,2,1.000000,0.952667,1429,0.212422,0.917067,3439,0.275818,0.916000,229,0.277944
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,1,3.0,3,1.000000,0.952667,1429,0.212422,0.917067,3439,0.275818,0.972000,243,0.165304
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,1,4.0,4,1.000000,0.952667,1429,0.212422,0.917067,3439,0.275818,0.948000,237,0.222472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,1,1.0,4,0.250000,0.666000,999,0.471797,0.694889,3127,0.460506,0.446667,134,0.497978
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,1,1.0,5,0.200000,0.652500,783,0.478122,0.698551,2410,0.460845,0.643333,193,0.479816
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,1,2.0,6,0.333333,0.652500,783,0.478122,0.698551,2410,0.460845,0.640000,192,0.480802
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,1,3.0,7,0.428571,0.652500,783,0.478122,0.698551,2410,0.460845,0.786667,236,0.410346


In [4]:
# Write every engineered feature to its own CSV file. Each file holds the
# shared identifying columns plus exactly one feature column, and is named
# after that feature (e.g. FE/testMean.csv).
FE_DIR = Path('/opt/ml/input/data/FE')
BASE_COLUMNS = ["userID", "assessmentItemID", "testId", "answerCode", "Timestamp", "KnowledgeTag", "dataset"]
FEATURE_COLUMNS = [
    "totalAnswer",
    "testMean", "testSum", "testStd",
    "tagMean", "tagSum", "tagStd",
    "assessMean", "assessSum", "assessStd",
]

# to_csv does not create missing parent directories, so make sure the
# output folder exists; this is a no-op when it is already there.
FE_DIR.mkdir(parents=True, exist_ok=True)

for feature in FEATURE_COLUMNS:
    df.to_csv(FE_DIR / f"{feature}.csv", columns=BASE_COLUMNS + [feature], index=False)