In [41]:
import pandas as pd
import numpy as np

from tqdm import tqdm

In [42]:
# Default path settings.
# All three files live in the same directory, so that directory is named once
# here; change DATA_DIR to point the whole notebook at another location.
DATA_DIR = "/opt/ml/input/data"

# Input: the original training CSV that gets split below.
ORIGINAL_TRAIN_DATA = f"{DATA_DIR}/train_data.csv"
# Outputs: where the train / test parts of the split are written.
SPLITED_TRAIN_DATA = f"{DATA_DIR}/split_train_data.csv"
SPLITED_TEST_DATA = f"{DATA_DIR}/split_test_data.csv"

In [43]:
# Load the original train_data.csv and reorder it in one pass:
# rows are grouped by userID, and within each user ordered by Timestamp.
# The index is then renumbered 0..N-1 so that it matches row position.
original_train_df = (
    pd.read_csv(ORIGINAL_TRAIN_DATA)
    .sort_values(by=["userID", "Timestamp"])
    .reset_index(drop=True)
)
original_train_df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225
...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836


In [44]:
# Full list of users: every distinct userID, in ascending order.
user_list = sorted(original_train_df["userID"].unique())

# Total number of distinct users.
n_user = original_train_df["userID"].nunique()
print(f"전체 사용자의 수 : {n_user}")

전체 사용자의 수 : 6698


In [45]:
# Decide how many users go to each side of the split:
# 90% of the users (truncated to a whole number) for train, the remainder for test.
train_n_user = int(0.9 * n_user)
test_n_user = n_user - train_n_user

split_summary = f"train에 사용될 사용자의 수 : {train_n_user}\ntest에 사용될 사용자의 수 : {test_n_user}"
print(split_summary)

train에 사용될 사용자의 수 : 6028
test에 사용될 사용자의 수 : 670


In [46]:
# Users at positions 0 .. train_n_user-1 of user_list are used for train;
# users from position train_n_user onward are used for test
# (0 ~ 6027 and 6028 ~ 6697 for the counts printed above).
test_start_user = user_list[train_n_user]

# Smallest index label among the rows of the first test user.
# NOTE(review): this label is later used as a position with .iloc, which relies
# on original_train_df having been re-indexed 0..N-1 after sorting.
is_first_test_user = original_train_df["userID"] == test_start_user
test_start_index = min(original_train_df[is_first_test_user].index)

In [47]:
# Train part: every row positioned before the first row of the first test user.
train_rows = slice(None, test_start_index)
splited_train_df = original_train_df.iloc[train_rows]
splited_train_df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225
...,...,...,...,...,...,...
2247976,6694,A050155003,A050000155,0,2020-10-20 11:33:29,451
2247977,6694,A050155004,A050000155,0,2020-10-20 12:05:02,451
2247978,6694,A050155005,A050000155,0,2020-10-20 12:12:14,451
2247979,6694,A050155006,A050000155,0,2020-10-20 12:23:45,451


In [48]:
# Test part: the first row of the first test user and every row after it.
test_rows = slice(test_start_index, None)
splited_test_df = original_train_df.iloc[test_rows]
splited_test_df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
2247981,6695,A040186001,A040000186,1,2020-07-20 07:09:15,2128
2247982,6695,A040186002,A040000186,1,2020-07-20 07:09:22,2128
2247983,6695,A040186003,A040000186,1,2020-07-20 07:09:41,2128
2247984,6695,A040186004,A040000186,1,2020-07-20 07:09:52,2128
2247985,6695,A040186005,A040000186,1,2020-07-20 07:10:04,2128
...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836


In [49]:
# Write both parts of the split to their CSV files, train first and then test.
# index=False keeps the DataFrame index out of the written files.
for split_df, out_path in (
    (splited_train_df, SPLITED_TRAIN_DATA),
    (splited_test_df, SPLITED_TEST_DATA),
):
    split_df.to_csv(out_path, index=False)