In [2]:
from logging import getLogger
import os
import json
import pandas as pd
import time, datetime
import torch

### 데이터 로드

In [25]:
train_data = pd.read_csv('../data/train_data.csv')
test_data  = pd.read_csv('../data/test_data.csv')

데이터 통합
- GNN에서 학습되지 않은 노드에 대해 계산 할 수 없음
- 기존 데이터셋에서 테스트 데이터셋의 사용자는 학습 데이터셋에 포함되어 있지 않음
- 이에 따라 모든 데이터를 통합하여 사용하도록 함

In [26]:
data = pd.concat([train_data, test_data])
data = data.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

### 데이터 구성

- 데이터는 학습 데이터셋과 테스트 데이터셋으로 구분되어 있다.
- 각 데이터에는 userID, assessmentItemID, testId, answerCode, Timestamp, KnowledgeTag의 정보가 있다.
- 여기서 assessmentItemID는 문제의 고유 ID이며, answerCode는 사용자가 해당 문제의 정답을 맞췄는지 여부로, 맞췄으면 1, 틀렸으면 0으로 표기된다.
- 기본적인 협업 필터링 적용을 위해 본 실습에서는 userID, assessmentItemID, answerCode만을 사용한다.

In [27]:
userid, itemid = list(set(data.userID)), list(set(data.assessmentItemID))
n_user, n_item = len(userid), len(itemid)

print(f"Train dataset")
display(data.head(5))
print(f" Num. Users    : {n_user}")
print(f" Max. UserID   : {max(userid)}")
print(f" Num. Items    : {n_item}")
print(f" Num. Records  : {len(train_data)}")

Train dataset


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225


 Num. Users    : 7442
 Max. UserID   : 7441
 Num. Items    : 9454
 Num. Records  : 2266586


### 데이터 전처리

중복 레코드 제거
 - RS 모델에서는 시간에 따른 변화를 고려하지 않기 때문에 최종 성적만을 바탕으로 평가한다.
 - 사용자+문제항목을 Unique key로 하여 최종 레코드만을 보존하고 나머지 제거한다.

In [28]:
data.drop_duplicates(subset = ["userID", "assessmentItemID"], 
                     keep = "last", inplace = True)

평가 항목 제거
- 테스트 데이터셋에서 answerCode가 -1인 항목은 최종 평가시 사용되는 항목으로 여기에선 사용할 수 없다.
- 아래 결과에서와 같이 User, Item 수는 변화 없이 총 레코드 수만 변한다.

In [7]:
data_old = data.copy()
n_user_old, n_item_old = n_user, n_item

data  = data[data.answerCode>=0].copy()

userid, itemid = list(set(data.userID)), list(set(data.assessmentItemID))
n_user, n_item = len(userid), len(itemid)

display(data.tail(5))
print(f" Num. Users    : {n_user}->{n_user}")
print(f" Max. UserID   : {max(userid)}")
print(f" Num. Items    : {n_item}->{n_item}")
print(f" Num. Records  : {len(data_old)}->{len(data)}")

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836
2526699,7441,A040165004,A040000165,1,2020-08-21 01:08:49,8836


 Num. Users    : 7442->7442
 Max. UserID   : 7441
 Num. Items    : 9454->9454
 Num. Records  : 2476706->2475962


In [152]:
len(data)
#2474941

2475962

In [153]:
data[data['userID']==0]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225
...,...,...,...,...,...,...
740,0,A080129002,A080000129,1,2020-12-23 03:35:54,2723
741,0,A080129003,A080000129,0,2020-12-23 03:37:20,2725
742,0,A080129004,A080000129,1,2020-12-23 03:38:43,2725
743,0,A080129005,A080000129,0,2020-12-23 03:40:14,2725


In [154]:
data[data['userID']==1]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
745,1,A040013001,A040000013,1,2020-01-06 08:40:43,2048
746,1,A040013002,A040000013,1,2020-01-06 08:43:46,2048
747,1,A040013003,A040000013,1,2020-01-06 08:44:29,2047
748,1,A040013004,A040000013,1,2020-01-06 08:46:13,2047
749,1,A040013005,A040000013,0,2020-01-06 08:49:45,2047
...,...,...,...,...,...,...
1673,1,A090074004,A090000074,1,2020-11-13 02:42:08,10196
1674,1,A090074002,A090000074,1,2020-11-13 02:44:34,4243
1675,1,A090074003,A090000074,1,2020-11-13 02:45:04,4243
1676,1,A090074005,A090000074,1,2020-11-13 02:46:38,2648




# Augmentation

평가 항목 신규 생성
- 남은 테스트 항목 중, 각 사용자별 최종 레코드를 새로운 평가 항목으로 정한다.

In [129]:
eval_data = data.copy()
eval_data.drop_duplicates(subset = ["userID"],
                     keep = "last", inplace = True)
display(eval_data.head(5))
display(eval_data.tail(5))
print(f" Num. Records  : {len(eval_data)}")

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
744,0,A080129006,A080000129,0,2020-12-23 03:40:19,2725
1677,1,A090074006,A090000074,1,2020-11-13 02:47:20,2648
1953,2,A050139007,A050000139,0,2020-10-20 11:32:26,428
2988,3,A050133007,A050000133,0,2020-10-26 13:13:11,5289
3659,4,A070146007,A070000146,1,2020-12-27 02:47:31,9080


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
2526643,7437,A060003007,A060000003,0,2020-05-22 01:53:49,7226
2526659,7438,A030188005,A030000188,1,2020-10-19 10:28:29,1934
2526674,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244
2526690,7440,A030197005,A030000197,0,2020-10-21 08:33:20,1984
2526699,7441,A040165004,A040000165,1,2020-08-21 01:08:49,8836


 Num. Records  : 7442


평가 항목을 테스트 항목에서 제거한다.

In [131]:
data1 = data.drop(index=eval_data.index)
display(data1.tail())
print(f" Num. Records  : {len(data1)}")

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
2526694,7441,A030071004,A030000071,0,2020-06-05 06:49:57,438
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836


 Num. Records  : 2468520


data1 => Data에서 마지막 항 제거된 데이터

## 2번 적용

In [132]:
eval_data1 = data1.copy()
eval_data1.drop_duplicates(subset = ["userID"],
                     keep = "last", inplace = True)
display(eval_data1.head(5))
display(eval_data1.tail(5))
print(f" Num. Records  : {len(eval_data1)}")

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
743,0,A080129005,A080000129,0,2020-12-23 03:40:14,2725
1676,1,A090074005,A090000074,1,2020-11-13 02:46:38,2648
1952,2,A050139006,A050000139,0,2020-10-20 11:32:20,428
2987,3,A050133006,A050000133,1,2020-10-26 13:12:52,5288
3658,4,A070146006,A070000146,1,2020-12-27 02:46:51,9079


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
2526642,7437,A060003006,A060000003,1,2020-05-22 01:53:45,7226
2526658,7438,A030188004,A030000188,1,2020-10-19 10:26:36,1934
2526673,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244
2526689,7440,A030197004,A030000197,0,2020-10-21 08:33:17,1984
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836


 Num. Records  : 7442


In [133]:
data2 = data1.drop(index=eval_data1.index)
display(data2.tail())
print(f" Num. Records  : {len(data2)}")

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
2526693,7441,A030071003,A030000071,1,2020-06-05 06:49:23,438
2526694,7441,A030071004,A030000071,0,2020-06-05 06:49:57,438
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836


 Num. Records  : 2461078


In [134]:
2461078+7442

2468520

## 합치기

In [139]:
data1['userID'] = data1['userID'] +max(data['userID'])
data2['userID'] = data2['userID'] +max(data1['userID'])

In [138]:
data1.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,7441,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,7441,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,7441,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,7441,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,7441,A060001005,A060000001,1,2020-03-24 00:17:36,7225


In [140]:
datasum = pd.concat([data,data1,data2])

In [141]:
datasum.tail()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
2526693,22323,A030071003,A030000071,1,2020-06-05 06:49:23,438
2526694,22323,A030071004,A030000071,0,2020-06-05 06:49:57,438
2526695,22323,A030071005,A030000071,0,2020-06-05 06:50:21,438
2526696,22323,A040165001,A040000165,1,2020-08-21 01:06:39,8836
2526697,22323,A040165002,A040000165,1,2020-08-21 01:06:50,8836


# Augmentation 일반화 (n번 돌려버리기)

In [29]:
# 데이터 몇번 증강하실? (데이터는 약 n+1배 됨)
n = 2

In [30]:
# 원래 데이터 개수
len(data)

2476706

In [31]:
datasum = data.copy()
eval_data = pd.DataFrame()
data1 = data.copy()
print(max(data1['userID']))
for i in range(n):
    print('-'*10,f'epoch{i+1}','-'*10)
    # eval_data1 : user마다 마지막 값들을 저장
    eval_data1 = data1.copy()
    eval_data1.drop_duplicates(subset = ["userID"],
                        keep = "last", inplace = True)
    print(f" Drop counts   : {len(eval_data1)}")

    # data2: 원래 데이터에서 마지막 값들을 drop 해줌
    data2 = data1.drop(index=eval_data1.index)
    print(f" length counts : {len(data2)}")

    # 새로 augment 된 데이터 userID 안겹치게, 이전 최고 id값 더해주기
    data2['userID'] = data2['userID'] +7442 
    print(f" last user num : {max(data2['userID'])}")
        
    # concat 조져버리기
    datasum = pd.concat([datasum,data2])
    eval_data = pd.concat([eval_data,eval_data1])
    data1 = data2.copy()
print('-'*28)

7441
---------- epoch1 ----------
 Drop counts   : 7442
 length counts : 2469264
 last user num : 14883
---------- epoch2 ----------
 Drop counts   : 7442
 length counts : 2461822
 last user num : 22325
----------------------------


In [32]:
len(datasum)

7407792

In [33]:
# CSV 로 저장해 버리기
datasum.to_csv('new_train.csv',index=False)

# 혹시 몰라 넣은 evaluation data
# eval_data.to_csv('new_eval.csv',index=False)