In [13]:
import numpy as np
import pandas as pd

import os
import random

import warnings

warnings.filterwarnings(action='ignore')

# Directory holding the competition CSVs; adjust it to your environment.
path = '../data/'

# Load both splits from the same directory in one pass.
train, test = (
    pd.read_csv(path + file_name)
    for file_name in ('train_data.csv', 'test_data.csv')
)

# 기본적인 EDA

* userID 사용자의 고유 번호입니다. 총 7,442명의 학생
* assessmentItemID 사용자가 푼 문항의 일련 번호
* testId 사용자가 푼 문항이 포함된 시험지의 일련 번호
* answerCode 사용자가 푼 문항의 정답 여부를 담고 있는 이진 (0/1) 데이터
* Timestamp 사용자가 문항을 푼 시간 정보
* KnowledgeTag 사용자가 푼 문항의 고유 태그


In [14]:
train.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225


In [15]:
test.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623


In [16]:
# Side-by-side summary of the train and test frames: number of distinct
# users, items, test papers and knowledge tags, plus the overall share of
# correct answers (answerCode is 0/1, so sum / row count is the answer rate).
print(f"""--- BASIC INFORMATIONS Train/Test Data ---
                   Train        Test
userID           : {train.userID.nunique()}         {test.userID.nunique()}
assessmentItemID : {train.assessmentItemID.nunique()}         {test.assessmentItemID.nunique()}
testID           : {train.testId.nunique()}         {test.testId.nunique()}
mean answer rate : {train.answerCode.sum() / train.shape[0] * 100:.2f}%       {test.answerCode.sum() / test.shape[0] * 100:.2f}%
KnowledgeTag     : {train.KnowledgeTag.nunique()}          {test.KnowledgeTag.nunique()}
{'-'*26}""")

--- BASIC INFORMATIONS Train/Test Data ---
                   Train        Test
userID           : 6698         744
assessmentItemID : 9454         9454
testID           : 1537         1537
mean answer rate : 65.44%       65.22%
KnowledgeTag     : 912          912
--------------------------


In [17]:
# Do train and test share any users?
# First number: users that appear only in train; second: all train users.
# If the two are equal, no train user shows up in test.
train_users = set(train['userID'].unique())
test_users = set(test['userID'].unique())
print(len(train_users.difference(test_users)))
print(len(train_users))

6698
6698


train과 test에 공통으로 존재하는 유저는 없음 (train에만 있는 유저 수 == train 전체 유저 수)

### Timestamp

In [18]:
type(train['Timestamp'][0])

str

In [19]:
# Timestamp is read from the CSV as plain strings; convert it to pandas
# datetimes so the .dt accessor can be used on it afterwards.
train['Timestamp'] = pd.to_datetime(train['Timestamp'])

In [20]:
type(train['Timestamp'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [21]:
# Calendar month of each interaction, then mean answerCode per month,
# listed from the lowest to the highest answer rate.
interaction_month = train['Timestamp'].dt.month
train['month'] = interaction_month
monthly_accuracy = train.groupby('month')['answerCode'].mean()
monthly_accuracy.sort_values()

month
12    0.496304
11    0.505544
5     0.643317
4     0.649545
6     0.651967
8     0.657599
7     0.665650
9     0.668872
10    0.670372
3     0.682643
2     0.687222
1     0.712199
Name: answerCode, dtype: float64

월별로 평균정답률에 차이 있음 -> 최대 1월 / 최소 12월

### assessmentItemID : 문항번호 -> 시험지별로

In [22]:
def percentile(s):
    """Return the sum of ``s`` divided by its length, i.e. its arithmetic mean.

    Despite the name, no percentile is computed. For a 0/1 sequence the
    result is the fraction of ones.
    """
    total = np.sum(s)
    count = len(s)
    return total / count

In [23]:
# Verify that characters 1-6 of assessmentItemID always equal characters 1-3
# of testId followed by everything from character 7 on.
# Vectorized .str slicing and Series.sum() replace the per-row lambdas and the
# Python-level builtin sum(), which iterated over every row one at a time.
item_prefix = train['assessmentItemID'].str[1:7]
paper_key = train['testId'].str[1:4] + train['testId'].str[7:]
print((item_prefix == paper_key).sum())  # rows where both keys agree
print(len(train))                        # total rows, for comparison

2266586
2266586


assessmentItemID(문항번호)의 1-6번째 번호 == testId(시험지번호)의 1-3번째 + 마지막 3자리

In [24]:
# Distinct values of characters 4-6 of testId. Uses the vectorized .str
# accessor, like the other ID-slicing cells, instead of a per-row lambda.
train['testId'].str[4:7].unique()

array(['000'], dtype=object)

testID(시험지번호) 가운데 3자리 000

In [25]:
# Frequency table of each of the first three characters of assessmentItemID.
for i in (0, 1, 2):
    char_counts = train['assessmentItemID'].str[i].value_counts()
    print(f'{i}번째 value_count')
    print(char_counts)
    print('')

0번째 value_count
A    2266586
Name: assessmentItemID, dtype: int64

1번째 value_count
0    2266586
Name: assessmentItemID, dtype: int64

2번째 value_count
7    279164
5    275773
3    273762
1    272082
2    268327
4    267323
6    264434
8    246336
9    119385
Name: assessmentItemID, dtype: int64



assessmentItemID
* 1번째 단어 A : 유일
* 2번째 단어 0 : 유일
* 3번째 단어는 1-9 : 비교적 균등 / 최소 9 최대 7

In [26]:
# The character at index 2 of assessmentItemID is used as the item category.
train['category_2'] = train['assessmentItemID'].str[2]

tem = train.groupby('category_2')['answerCode']

# One row per category: mean answerCode and number of rows,
# ordered from the best-answered category to the worst-answered one.
dc = pd.DataFrame({'mean' : tem.mean(), 'cnt':tem.count()}).reset_index()
dc = dc.sort_values('mean', ascending=False)
# Difficulty rank: 1 = highest mean answerCode, len(dc) = lowest.
# The rank range is sized from the data; a hard-coded range(1, 10) raises a
# length-mismatch ValueError as soon as the number of categories is not 9.
dc['category_difficulty'] = range(1, len(dc) + 1)
dc

Unnamed: 0,category_2,mean,cnt,category_difficulty
0,1,0.800876,272082,1
1,2,0.737593,268327,2
5,6,0.709232,264434,3
2,3,0.702238,273762,4
3,4,0.684056,267323,5
4,5,0.658208,275773,6
6,7,0.521876,279164,7
7,8,0.502598,246336,8
8,9,0.449948,119385,9


카테고리(assessmentItemID(문항번호)의 인덱스 2) 별로 평균정답률, 개수 -> 난이도??

정답률 순서대로 sort하면 난이도로 사용할 수 있을듯

In [27]:
train.groupby('testId')['answerCode'].mean()

testId
A010000001    0.926183
A010000002    0.931613
A010000003    0.846440
A010000004    0.883117
A010000005    0.855172
                ...   
A090000070    0.418605
A090000071    0.336111
A090000072    0.418519
A090000073    0.529630
A090000074    0.425926
Name: answerCode, Length: 1537, dtype: float64

시험지 카테고리가 1에서 9로 갈수록 정답률이 낮아지는 경향

-->> 카테고리 난이도를 시험지의 난이도로 대체할 수 있을까??

In [28]:
# Attach each row's category difficulty by looking up its category_2 in dc.
# A map over the (unique) category keys is used instead of
# `train = train.merge(...)`: re-running a merge cell would add
# category_difficulty_x / category_difficulty_y columns and break later cells,
# whereas this assignment just overwrites the same column.
difficulty_by_category = dc.set_index('category_2')['category_difficulty']
train['category_difficulty'] = train['category_2'].map(difficulty_by_category)
train.head()#.category_difficulty.value_counts()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,month,category_2,category_difficulty
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,3,6,3
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,3,6,3
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,3,6,3
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3,6,3
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,3,6,3


In [29]:
# Characters 2-6 of assessmentItemID identify the test paper.
paper_key = train['assessmentItemID'].str[2:7]
train['test_paper'] = paper_key

# Mean answerCode per test paper, lowest answer rate first.
accuracy_by_paper = train.groupby('test_paper')['answerCode'].mean()
ttt = accuracy_by_paper.sort_values()
ttt

test_paper
90035    0.327186
90071    0.336111
80047    0.344178
80071    0.350895
70075    0.352535
           ...   
40188    0.940647
60002    0.945205
60001    0.947683
10145    0.952077
40187    0.955474
Name: answerCode, Length: 1537, dtype: float64

시험지 별로(미션EDA 참고) 정답률 비교 -> 꽤 많이 차이가 남 : 0.327186~0.955474

난이도와 함께 보면 어떨까

    카테고리 난이도 == 시험지 난이도

In [30]:
ttt.index

Index(['90035', '90071', '80047', '80071', '70075', '80029', '80049', '90017',
       '90024', '70039',
       ...
       '10002', '10161', '10133', '20015', '60186', '40188', '60002', '60001',
       '10145', '40187'],
      dtype='object', name='test_paper', length=1537)

In [31]:
# NOTE(review): abandoned experiment, left commented out. As written it compares
# the positional index of train['userID'] (integers) with test_paper labels
# (strings such as '90035'), so the filter would never select a row.
# Presumably the intent was to copy each paper's mean answer rate from `ttt`
# onto the matching train rows - TODO confirm before reviving.
# train['paper_difficulty'] = 0

# train[train['userID'].index=='90035']['userID']
# for i in ttt.index:
#     print(train[train['userID'].index==i])
#     # train.loc[train[train['userID'].index==i].index, 'paper_difficulty'] = ttt[i]

In [32]:
def _summarize_attempts(group):
    """Return (set of category difficulties, answerCode array, mean answerCode) for one group."""
    answers = group['answerCode'].values
    difficulties = set(group['category_difficulty'].values)
    return (difficulties, answers, answers.mean())


# One summary tuple per (user, test paper) pair.
t = train.groupby(['userID', 'test_paper']).apply(_summarize_attempts)

In [33]:
t

userID  test_paper
0       20172         ({2}, [1, 1, 1, 1, 0, 0], 0.6666666666666666)
        20174                           ({2}, [1, 1, 1, 1, 0], 0.8)
        20181                        ({2}, [1, 0, 1, 1, 0, 0], 0.5)
        20182                           ({2}, [1, 1, 1, 1, 1], 1.0)
        20183                     ({2}, [1, 1, 1, 1, 1, 1, 1], 1.0)
                                          ...                      
7440    30136                           ({4}, [1, 1, 1, 0, 0], 0.6)
        30197                           ({4}, [1, 0, 0, 0, 0], 0.2)
        50096                           ({6}, [1, 0, 0, 1, 0], 0.4)
7441    30071                           ({4}, [0, 0, 1, 0, 0], 0.2)
        40165                              ({5}, [1, 1, 1, 1], 1.0)
Length: 365164, dtype: object

한 유저가 같은 시험지(같은 난이도)에 대해서 여러번 푼 적이 있음

같은 시험지여도 매번 틀리거나 매번 맞추지 않음

    유저별로 보았을때, 시험지의 난이도와 정답률의 큰 상관관계는 보이지 않음

    어느정도는 상관관계가 보이긴 함 -> 난이도가 낮을때 정답률이 높음


In [34]:
train['test_paper'].value_counts()

80128    3960
80131    3640
90015    3624
90011    3616
90009    3616
         ... 
30186     180
10166     176
10178     176
10181     176
10179     172
Name: test_paper, Length: 1537, dtype: int64

시험지 별로 푼 횟수가 생각보다 좀 차이남 -> 172~3960

많이 푼 시험지는 난이도가 낮을까??

In [35]:
def _paper_volume_and_difficulty(group):
    """Return [number of rows, mean category_difficulty] for one test paper."""
    row_count = group['userID'].count()
    mean_difficulty = group['category_difficulty'].mean()
    return [row_count, mean_difficulty]


# One [row count, mean difficulty] list per test paper.
tt = train.groupby('test_paper').apply(_paper_volume_and_difficulty)

In [36]:
tt.sort_values(ascending=False)

test_paper
80128    [3960, 8.0]
80131    [3640, 8.0]
90015    [3624, 9.0]
90009    [3616, 9.0]
90011    [3616, 9.0]
            ...     
10168     [180, 1.0]
10181     [176, 1.0]
10178     [176, 1.0]
10166     [176, 1.0]
10179     [172, 1.0]
Length: 1537, dtype: object

In [37]:
tt.sort_values(ascending=False).head(10)

test_paper
80128    [3960, 8.0]
80131    [3640, 8.0]
90015    [3624, 9.0]
90009    [3616, 9.0]
90011    [3616, 9.0]
90016    [3608, 9.0]
90003    [3600, 9.0]
90013    [3600, 9.0]
90001    [3576, 9.0]
40184    [3497, 5.0]
dtype: object

In [38]:
tt.sort_values(ascending=False).tail(10)

test_paper
20189    [188, 2.0]
30196    [184, 4.0]
10183    [184, 1.0]
10164    [184, 1.0]
30186    [180, 4.0]
10168    [180, 1.0]
10181    [176, 1.0]
10178    [176, 1.0]
10166    [176, 1.0]
10179    [172, 1.0]
dtype: object

* 수정

시험지 번호를 기준으로 본 것에서 카테고리 번호까지 포함한 시험지 자체로 판단함

    위의 결과에서 보면 가장 많이 푼 시험지들의 난이도(대부분 8~9)가 적게 푼 시험지의 난이도(대부분 1)에 비해 높음

### 시험지 안의 문제별로

In [39]:
train.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,month,category_2,category_difficulty,test_paper
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,3,6,3,60001
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,3,6,3,60001
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,3,6,3,60001
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3,6,3,60001
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,3,6,3,60001


In [40]:
# The last three characters of assessmentItemID are kept as a string column
# and used below as the question number within a test paper.
train['test_question']=train['assessmentItemID'].str[-3:]

In [41]:
train.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,month,category_2,category_difficulty,test_paper,test_question
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,3,6,3,60001,1
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,3,6,3,60001,2
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,3,6,3,60001,3
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3,6,3,60001,4
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,3,6,3,60001,5


In [42]:
def _difficulty_and_accuracy(group):
    """Return (mean category_difficulty, mean answerCode) for one group."""
    return (group['category_difficulty'].mean(), group['answerCode'].mean())


# One (difficulty, answer rate) tuple per question of each test paper.
train.groupby(['test_paper', 'test_question']).apply(_difficulty_and_accuracy)

test_paper  test_question
10001       001              (1.0, 0.9305993690851735)
            002              (1.0, 0.9589905362776026)
            003              (1.0, 0.9053627760252366)
            004              (1.0, 0.9495268138801262)
            005               (1.0, 0.886435331230284)
                                       ...            
90074       002              (9.0, 0.4444444444444444)
            003              (9.0, 0.5777777777777777)
            004              (9.0, 0.3333333333333333)
            005                             (9.0, 0.4)
            006              (9.0, 0.4666666666666667)
Length: 9454, dtype: object

시험지 내 문항별로 체크했을 경우 시험지의 난이도에 따라 정답률이 달라짐

In [43]:
# Mean answerCode and number of rows for each question number.
summary = train.groupby('test_question')['answerCode']

summary.agg(mean='mean', cnt='count').reset_index()

Unnamed: 0,test_question,mean,cnt
0,1,0.749916,371755
1,2,0.720062,370972
2,3,0.687773,371640
3,4,0.663364,369705
4,5,0.599134,360607
5,6,0.555685,197772
6,7,0.515399,127121
7,8,0.457156,67150
8,9,0.481729,18417
9,10,0.527892,5342


문항은 001~013까지

005번까지는 cnt 개수가 비슷하나 그 이후에는 감소

정답률도 점차 감소

    * 문제 번호가 커지면 난이도 상승??
    * 그래서 문제를 푼 수가 적은 것인가??

### KnowledgeTag

In [44]:
train.KnowledgeTag.value_counts()

7597    12892
7600    10734
2010    10176
7621     9709
8127     9006
        ...  
5142       47
2871       46
4978       46
7125       45
2613       44
Name: KnowledgeTag, Length: 912, dtype: int64

knowledgetag에 따라 문제 수가 다름

In [45]:
def _accuracy_and_difficulty(group):
    """Return (mean answerCode, mean category_difficulty) for one knowledge tag."""
    return (group['answerCode'].mean(), group['category_difficulty'].mean())


# One (answer rate, mean difficulty) tuple per KnowledgeTag.
tttt = train.groupby('KnowledgeTag').apply(_accuracy_and_difficulty)

KnowledgeTag 별로 평균정답률의 폭이 매우 다름



In [46]:
tttt.sort_values()

KnowledgeTag
8911      (0.1889400921658986, 7.0)
10332                   (0.25, 6.0)
9125      (0.2570093457943925, 7.0)
10590    (0.26813880126182965, 9.0)
5763     (0.27802690582959644, 7.0)
                    ...            
1580      (0.9418960244648318, 3.0)
1582      (0.9492803289924606, 3.0)
1577      (0.9504608294930875, 3.0)
7224      (0.9550224887556222, 3.0)
1878      (0.9777777777777777, 4.0)
Length: 912, dtype: object

tag별 난이도가 높을 수록 정답률이 낮게 형성 되는 경향

정답률이 가장 높은 5개 태그에서 난이도가 3, 4정도 -->> 난이도 1인건??

In [47]:
# Collect per-tag answer rates for tags whose mean category difficulty is
# exactly 1, 4, 9 or 7; every other tag is ignored.
ind = tttt.index
rates_by_difficulty = {1.0: [], 4.0: [], 9.0: [], 7.0: []}
for tag_accuracy, tag_difficulty in tttt.tolist():
    bucket = rates_by_difficulty.get(tag_difficulty)
    if bucket is not None:
        bucket.append(tag_accuracy)

# a: difficulty 1, aa: difficulty 4, b: difficulty 9, bb: difficulty 7
a, aa = rates_by_difficulty[1.0], rates_by_difficulty[4.0]
b, bb = rates_by_difficulty[9.0], rates_by_difficulty[7.0]

In [48]:
np.mean(a)  # 1

0.8082000701218859

In [49]:
np.mean(aa) # 4

0.7044329387848381

In [50]:
np.mean(b)  # 9

0.4498757809088943

In [51]:
np.mean(bb) # 7

0.521113337515962

전체적으로 봤을때

카테고리 난이도가 1일때의 정답률은 4일때의 정답률보다 크지만 정답률이 제일 큰 태그는 난이도가 4이고

카테고리 난이도가 9일때의 정답률은 7일때의 정답률보다 작지만 정답률이 제일 낮은 태그는 난이도가 7이다.



    사람마다의 차이??

### 유저별 차이

In [52]:
# Per-user answer rate and number of answered rows.
answers_by_user = train.groupby('userID')['answerCode']
user_stats = {'mean' : answers_by_user.mean(), 'cnt':answers_by_user.count()}
tem = pd.DataFrame(user_stats).reset_index()
tem

Unnamed: 0,userID,mean,cnt
0,0,0.630872,745
1,1,0.853162,933
2,2,0.612319,276
3,5,0.795918,833
4,6,0.442997,921
...,...,...,...
6693,7436,0.466667,15
6694,7437,0.375000,16
6695,7438,0.750000,16
6696,7440,0.400000,15


유저별로 푼 문제수가 다르고 정답률도 다르다

유저별 정답률도 명시하면 좋을듯

In [53]:
train

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,month,category_2,category_difficulty,test_paper,test_question
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,3,6,3,60001,001
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,3,6,3,60001,002
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,3,6,3,60001,003
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3,6,3,60001,004
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,3,6,3,60001,005
...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,6,3,4,30071,005
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,8,4,5,40165,001
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,8,4,5,40165,002
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,8,4,5,40165,003


In [54]:
# train.to_csv('../data/train_data_2.csv')

In [56]:
# Keep only the original columns plus test_question; the other derived
# EDA columns are dropped from the exported frame.
columns_to_drop = ['month', 'category_2', 'category_difficulty', 'test_paper']
t_res = train.drop(columns=columns_to_drop)
t_res

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,test_question
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,001
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,002
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,003
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,004
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,005
...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,005
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,001
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,002
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,003


In [57]:
# Write the export next to the input data. The configurable `path` from the
# load cell is reused so that changing the data directory there also moves
# this output; with the default '../data/' the target file is unchanged.
# The DataFrame index is still written as the first (unnamed) column.
t_res.to_csv(path + 'train_data_3.csv')