# EDA

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

import csv
import os
import sys


In [2]:
DATA_PATH = '/opt/ml/input/data/train_dataset/'
train_df = pd.read_csv(DATA_PATH+'train_data.csv', parse_dates=['Timestamp'])
train_df = train_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
test_df = pd.read_csv(DATA_PATH+'test_data.csv', parse_dates=['Timestamp'])
test_df = test_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

## TRAIN VS TEST


Train과 Test는 같은 Column을 가지고 있으며, Row의 수는 다음과 같다.
약 8.7:1의 비율이다

In [3]:
train_df.columns == test_df.columns

array([ True,  True,  True,  True,  True,  True])

In [4]:
print("train_dataset length: ", len(train_df),"vs", "test_dataset length: ", len(test_df), ", ", len(train_df)/len(test_df))

train_dataset length:  2266586 vs test_dataset length:  260114 ,  8.713817787585443


특히, 우리가 예측해야할 문제의 answerCode column은 0이나 1 대신 -1로 마스킹되어 있으며, 총 744개의 문제를 예측해야한다. 

In [5]:
test_df[test_df['answerCode'] == -1]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
1035,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289
1706,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080
3023,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660
4283,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611
4670,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422
...,...,...,...,...,...,...
260052,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615
260067,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636
260082,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402
260097,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402


Train과 Test 데이터셋의 unique feature의 수

In [6]:
print(f"""--- Train_dataset INFORMATIONS ---
userID           : {train_df.userID.nunique()}
assessmentItemID : {train_df.assessmentItemID.nunique()}
testID           : {train_df.testId.nunique()}
mean answer rate : {train_df.answerCode.sum() / train_df.shape[0] * 100:.2f}%
KnowledgeTag     : {train_df.KnowledgeTag.nunique()}
{'-'*26}""")

--- Train_dataset INFORMATIONS ---
userID           : 6698
assessmentItemID : 9454
testID           : 1537
mean answer rate : 65.44%
KnowledgeTag     : 912
--------------------------


In [7]:
print(f"""--- Test_dataset INFORMATIONS ---
userID           : {test_df.userID.nunique()}
assessmentItemID : {test_df.assessmentItemID.nunique()}
testID           : {test_df.testId.nunique()}
mean answer rate : {test_df[test_df['answerCode'] != -1]['answerCode'].sum() / test_df.shape[0] * 100:.2f}%
KnowledgeTag     : {test_df.KnowledgeTag.nunique()}
{'-'*26}""")

--- Test_dataset INFORMATIONS ---
userID           : 744
assessmentItemID : 9454
testID           : 1537
mean answer rate : 65.50%
KnowledgeTag     : 912
--------------------------


In [8]:
train_unique_userID = pd.unique(train_df.userID)
train_unique_assessmentItemID = pd.unique(train_df.assessmentItemID)
train_unique_testId = pd.unique(train_df.testId)
train_unique_KnowledgeTag = pd.unique(train_df.KnowledgeTag)

각 column의 value의 경우, userID는 하나도 일치하지 않고 나머지 comlumn 모두 일치합니다.

In [10]:

print(f"""--- Test_dataset INFORMATIONS ---
train에는 없는 userID           : {len([i for i in tqdm(pd.unique(test_df.userID)) if i not in train_unique_userID])}
train에는 없는 assessmentItemID : {len([i for i in tqdm(pd.unique(test_df.assessmentItemID)) if i not in train_unique_assessmentItemID])}
train에는 없는 testID           : {len([i for i in tqdm(pd.unique(test_df.testId)) if i not in train_unique_testId])}
train에는 없는 KnowledgeTag     : {len([i for i in tqdm(pd.unique(test_df.KnowledgeTag)) if i not in train_unique_KnowledgeTag])}
{'-'*26}""")

  0%|          | 0/744 [00:00<?, ?it/s]

  0%|          | 0/9454 [00:00<?, ?it/s]

  0%|          | 0/1537 [00:00<?, ?it/s]

  0%|          | 0/912 [00:00<?, ?it/s]

--- Test_dataset INFORMATIONS ---
train에는 없는 userID           : 744
train에는 없는 assessmentItemID : 0
train에는 없는 testID           : 0
train에는 없는 KnowledgeTag     : 0
--------------------------


하지만 train 데이터셋과 test 데이터셋 간의 완전히 겹치는 데이터는 없습니다.
answerCode가 -1인 Row를 빼면 학습 데이터로 활용가능할 것 같습니다.

In [11]:
df = pd.merge(train_df, test_df, on=list(test_df.columns), how='outer', indicator='Exist')
df['Exist'] = np.where(df.Exist == 'both', True, False)

In [12]:
df[df['Exist']==True]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,Exist


In [24]:
len(list(test_df.groupby('userID')))

744

In [25]:
new_df = train_df.sort_values(by=['userID','Timestamp'], axis=0)
columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag']
group = new_df[columns].groupby('userID').apply(
        lambda r: (
            r['testId'].values, 
            r['assessmentItemID'].values,
            r['KnowledgeTag'].values,
            r['answerCode'].values
        )
    )


In [26]:
group

userID
0       ([A060000001, A060000001, A060000001, A0600000...
1       ([A040000013, A040000013, A040000013, A0400000...
2       ([A030000050, A030000050, A030000050, A0300000...
5       ([A080000001, A080000001, A080000001, A0800000...
6       ([A030000016, A030000016, A030000016, A0300000...
                              ...                        
7436    ([A050000095, A050000095, A050000095, A0500000...
7437    ([A040000072, A040000072, A040000072, A0400000...
7438    ([A080000002, A080000002, A080000002, A0800000...
7440    ([A050000096, A050000096, A050000096, A0500000...
7441    ([A030000071, A030000071, A030000071, A0300000...
Length: 6698, dtype: object

In [33]:
test_df[test_df['answerCode'] == -1]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
1035,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289
1706,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080
3023,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660
4283,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611
4670,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422
...,...,...,...,...,...,...
260052,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615
260067,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636
260082,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402
260097,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402


In [3]:
new_column = train_df['assessmentItemID'].apply(lambda x : x[:3])
new_column.name = "grade"

In [6]:
new_column

0          A06
1          A06
2          A06
3          A06
4          A06
          ... 
2266581    A03
2266582    A04
2266583    A04
2266584    A04
2266585    A04
Name: grade, Length: 2266586, dtype: object

In [13]:
new_train_df = train_df.head(40)

In [27]:
train_df['answerCode'].mean()

0.654378435232548

In [30]:
avgs = train_df.groupby('userID')['answerCode'].mean()

In [31]:
avgs

userID
0       0.630872
1       0.853162
2       0.612319
5       0.795918
6       0.442997
          ...   
7436    0.466667
7437    0.375000
7438    0.750000
7440    0.400000
7441    0.555556
Name: answerCode, Length: 6698, dtype: float64

In [32]:
train_df.loc[train_df['userID'],'avg'] = 

KeyError: 'userID'

In [24]:
train_df.sample(20)
    

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,avg
1422381,2356,A080058006,A080000058,1,2020-06-27 07:50:30,4965,
271743,370,A080067004,A080000067,1,2020-06-27 08:43:56,2869,
449654,626,A030015001,A030000015,1,2020-04-27 07:34:17,7310,
285322,388,A040048003,A040000048,1,2020-05-12 05:23:06,2070,
1035672,1551,A080089002,A080000089,0,2020-09-25 05:25:39,1110,
23844,32,A070159002,A070000159,0,2020-11-17 08:02:52,9121,
613425,874,A050045006,A050000045,0,2020-06-23 10:36:20,3729,
1291671,2076,A070025004,A070000025,1,2020-03-30 20:05:08,5654,
809312,1178,A060071004,A060000071,1,2020-09-02 11:13:26,9557,
1841668,3511,A080032008,A080000032,1,2020-07-06 07:35:31,4784,


In [36]:
new_train_df[lambda x : x['answerCode']==1]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225
5,0,A060001007,A060000001,1,2020-03-24 00:17:47,7225
7,0,A060003002,A060000003,1,2020-03-26 05:52:10,7226
8,0,A060003003,A060000003,1,2020-03-26 05:53:14,7226
9,0,A060003004,A060000003,1,2020-03-26 05:53:29,7226
10,0,A060003005,A060000003,1,2020-03-26 05:53:48,7226


In [37]:
len(train_df['avg'])

2266586

In [38]:
len(train_df.groupby('userID')['answerCode'].transform('mean'))

2266586

In [39]:
temp = train_df.groupby('userID')['answerCode'].transform('mean')

In [1]:
a =[1,2,3,]
b=4

In [2]:
a + b 

TypeError: can only concatenate list (not "int") to list