# EDA

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import csv
import os
import sys


In [2]:
DATA_PATH = '/opt/ml/input/data/train_dataset/'
train_df = pd.read_csv(DATA_PATH+'train_data.csv', parse_dates=['Timestamp'])
train_df = train_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
test_df = pd.read_csv(DATA_PATH+'test_data.csv', parse_dates=['Timestamp'])
test_df = test_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

## TRAIN VS TEST


Train과 Test는 같은 Column을 가지고 있으며, Row의 수는 다음과 같다.
약 8.7:1의 비율이다

In [38]:
train_df.columns == test_df.columns

array([ True,  True,  True,  True,  True,  True])

In [41]:
print("train_dataset length: ", len(train_df),"vs", "test_dataset length: ", len(test_df), ", ", len(train_df)/len(test_df))

train_dataset length:  2266586 vs test_dataset length:  260114 ,  8.713817787585443


특히, 우리가 예측해야할 문제의 answerCode column은 0이나 1 대신 -1로 마스킹되어 있으며, 총 744개의 문제를 예측해야한다. 

In [42]:
test_df[test_df['answerCode'] == -1]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
1035,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289
1706,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080
3023,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660
4283,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611
4670,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422
...,...,...,...,...,...,...
260052,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615
260067,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636
260082,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402
260097,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402


Train과 Test 데이터셋의 unique feature의 수

In [111]:
print(f"""--- Train_dataset INFORMATIONS ---
userID           : {train_df.userID.nunique()}
assessmentItemID : {train_df.assessmentItemID.nunique()}
testID           : {train_df.testId.nunique()}
mean answer rate : {train_df.answerCode.sum() / train_df.shape[0] * 100:.2f}%
KnowledgeTag     : {train_df.KnowledgeTag.nunique()}
{'-'*26}""")

--- Train_dataset INFORMATIONS ---
userID           : 6698
assessmentItemID : 9454
testID           : 1537
mean answer rate : 65.44%
KnowledgeTag     : 912
--------------------------


In [112]:
print(f"""--- Test_dataset INFORMATIONS ---
userID           : {test_df.userID.nunique()}
assessmentItemID : {test_df.assessmentItemID.nunique()}
testID           : {test_df.testId.nunique()}
mean answer rate : {test_df[test_df['answerCode'] != -1]['answerCode'].sum() / test_df.shape[0] * 100:.2f}%
KnowledgeTag     : {test_df.KnowledgeTag.nunique()}
{'-'*26}""")

--- Test_dataset INFORMATIONS ---
userID           : 744
assessmentItemID : 9454
testID           : 1537
mean answer rate : 65.50%
KnowledgeTag     : 912
--------------------------


In [120]:
print(f"""--- Test_dataset INFORMATIONS ---
train에는 없는 userID           : {len([i for i in pd.unique(test_df.userID) if i not in pd.unique(train_df.userID)])}
train에는 없는 assessmentItemID : {len([i for i in pd.unique(test_df.assessmentItemID) if i not in pd.unique(train_df.assessmentItemID)])}
train에는 없는 testID           : {len([i for i in pd.unique(test_df.testID) if i not in pd.unique(train_df.testID)])}
train에는 없는 KnowledgeTag     : {len([i for i in pd.unique(test_df.KnowledgeTag) if i not in pd.unique(train_df.KnowledgeTag)])}
{'-'*26}""")

KeyboardInterrupt: 

하지만 train 데이터셋과 test 데이터셋 간의 완전히 겹치는 데이터는 없습니다.
answerCode가 -1인 Row를 빼면 학습 데이터로 활용가능할 것 같습니다.

In [77]:
df = pd.merge(train_df, test_df, on=list(test_df.columns), how='outer', indicator='Exist')
df['Exist'] = np.where(df.Exist == 'both', True, False)

In [78]:
df[df['Exist']==True]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,Exist


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
1570732,2711,A020183001,A020000183,1,2020-10-23 06:24:59,8136
280959,381,A050071002,A050000071,1,2020-07-24 03:13:56,3826
2194534,5698,A010098002,A010000098,1,2020-10-16 09:08:32,7629
396785,543,A090054002,A090000054,0,2020-10-06 03:58:33,2601
1177109,1830,A010022005,A010000022,1,2020-05-12 00:42:01,6803
1230180,1937,A030046004,A030000046,1,2020-06-02 09:52:04,373
170030,227,A020027006,A020000027,1,2020-06-24 12:00:04,7912
1693574,3028,A010079004,A010000079,1,2020-08-24 10:56:11,7620
231669,311,A060105005,A060000105,1,2020-08-28 09:10:54,708
1711079,3075,A040016003,A040000016,1,2020-07-28 10:44:13,2049
