# 1. 데이터 불러오기

In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(color_codes=True)
import missingno as msno

In [217]:
%%time
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}   

DATA_PATH = '../data/test_data.csv'
df = pd.read_csv(DATA_PATH, dtype=dtype, parse_dates=['Timestamp'])
# DATA_PATH = '../data/train_data.csv'
# df1 = pd.read_csv(DATA_PATH, dtype=dtype, parse_dates=['Timestamp'])
# df = pd.concat([df,df1])
df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

CPU times: user 424 ms, sys: 24 ms, total: 448 ms
Wall time: 448 ms


In [62]:
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623


In [218]:
print(f"""--- BASIC INFORMATIONS ---
userID           : {df.userID.nunique()}
assessmentItemID : {df.assessmentItemID.nunique()}
testID           : {df.testId.nunique()}
mean answer rate : {df.answerCode.sum() / df.shape[0] * 100:.2f}%
KnowledgeTag     : {df.KnowledgeTag.nunique()}
{'-'*26}""")

--- BASIC INFORMATIONS ---
userID           : 744
assessmentItemID : 9454
testID           : 1537
mean answer rate : 65.22%
KnowledgeTag     : 912
--------------------------


In [106]:
presub = df[df['answerCode']==-1]
presub = presub.drop(columns=['Timestamp','assessmentItemID','userID','answerCode']).reset_index(drop=True)
presub.head()

Unnamed: 0,KnowledgeTag
0,5289
1,9080
2,9660
3,2611
4,1422


# 2. User 별 평균 점수 적용

In [64]:
def percentile(s):
    return np.sum(s) / len(s)

In [65]:
df = df.drop(columns=['testId'])
# df1 = df1.drop(columns=['testId'])

In [66]:
userdf = df.groupby("userID").agg({
    'assessmentItemID': 'count',
    'answerCode': percentile
    })

userdf = userdf.reset_index(drop=False)
userdf.head(10)

Unnamed: 0,userID,assessmentItemID,answerCode
0,3,1036,0.69112
1,4,671,0.691505
2,13,1317,0.694002
3,17,1260,0.81746
4,26,387,0.754522
5,29,854,0.845433
6,45,1084,0.687269
7,53,693,0.520924
8,58,811,0.362515
9,64,1270,0.832283


In [67]:
userRB = userdf['answerCode']

In [68]:
userRB.head()

0    0.691120
1    0.691505
2    0.694002
3    0.817460
4    0.754522
Name: answerCode, dtype: float64

In [69]:
# userRB.to_csv('userRB.csv')

# 3. 문항별 평균 점수 적용

## 3-1. 문제 대분류


In [210]:

itemdf = pd.concat([df['assessmentItemID'].apply(lambda x: x[2:3]),df['answerCode']],axis = 1)

itemdf.head(100)

Unnamed: 0,assessmentItemID,answerCode
0,5,1
1,5,1
2,5,0
3,5,0
4,5,0
...,...,...
95,2,1
96,2,1
97,2,1
98,2,1


In [211]:
itemdf.describe()

Unnamed: 0,answerCode
count,260114.0
mean,0.652172
std,0.48225
min,-1.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [175]:
itemscore = itemdf.groupby('assessmentItemID').agg({
    "answerCode":percentile
})
itemscore = itemscore.reset_index()

In [176]:
itemscore

Unnamed: 0,assessmentItemID,answerCode
0,1,0.804377
1,2,0.736795
2,3,0.697083
3,4,0.644724
4,5,0.662618
5,6,0.733989
6,7,0.515691
7,8,0.477395
8,9,0.498492


In [219]:
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623


In [225]:
presub1 = df[df['answerCode']==-1]
presub1 = presub1.drop(columns=['Timestamp','KnowledgeTag','userID','answerCode','testId']).reset_index(drop=True)
presub1 = pd.DataFrame(presub1['assessmentItemID'].apply(lambda x: x[2:3]))
presub1.describe()

Unnamed: 0,assessmentItemID
count,744
unique,9
top,7
freq,123


In [229]:
presub1.head()

Unnamed: 0,assessmentItemID
0,5
1,7
2,7
3,9
4,6


In [226]:
answer = []
for i in range(len(presub1)):
    answer.append(float(itemscore[itemscore['assessmentItemID']==presub1.iloc[i,0]]['answerCode'].values))
answer


[0.6626179514807197,
 0.5156907239318131,
 0.5156907239318131,
 0.49849164288626174,
 0.7339892217069808,
 0.73679495572477,
 0.6447237544081743,
 0.6447237544081743,
 0.5156907239318131,
 0.5156907239318131,
 0.6626179514807197,
 0.5156907239318131,
 0.8043766578249337,
 0.6626179514807197,
 0.6626179514807197,
 0.73679495572477,
 0.5156907239318131,
 0.73679495572477,
 0.7339892217069808,
 0.5156907239318131,
 0.7339892217069808,
 0.6447237544081743,
 0.6626179514807197,
 0.5156907239318131,
 0.5156907239318131,
 0.8043766578249337,
 0.8043766578249337,
 0.8043766578249337,
 0.6626179514807197,
 0.4773951745139377,
 0.6626179514807197,
 0.6447237544081743,
 0.49849164288626174,
 0.4773951745139377,
 0.5156907239318131,
 0.49849164288626174,
 0.7339892217069808,
 0.5156907239318131,
 0.73679495572477,
 0.5156907239318131,
 0.7339892217069808,
 0.4773951745139377,
 0.5156907239318131,
 0.4773951745139377,
 0.5156907239318131,
 0.5156907239318131,
 0.7339892217069808,
 0.477395174513937

In [227]:
ans = pd.DataFrame(answer, columns=['prediction'])
user = list(range(len(ans)))
user = pd.DataFrame(user,columns=['user'])

submission = pd.concat([user,ans],axis=1)

In [228]:
submission.head()

Unnamed: 0,user,prediction
0,0,0.662618
1,1,0.515691
2,2,0.515691
3,3,0.498492
4,4,0.733989


In [230]:
submission.to_csv('ItemID_RB.csv')

## 3-2 KnowledgeTag 

In [116]:
knowtagdf = df.groupby('KnowledgeTag').agg({
    'answerCode':percentile
})
knowtagdf =knowtagdf.reset_index()

In [117]:
knowtagdf.head(500)

Unnamed: 0,KnowledgeTag,answerCode
0,23,0.670000
1,24,0.714286
2,25,0.750000
3,26,0.578358
4,30,0.475410
...,...,...
495,5780,0.492063
496,5781,0.443137
497,5782,0.421818
498,5783,0.454545


In [119]:
knowtagdf[knowtagdf['KnowledgeTag']==23]['answerCode']

#평균 : 0.6544

0    0.67
Name: answerCode, dtype: float64

In [123]:
presub['KnowledgeTag']

0       5289
1       9080
2       9660
3       2611
4       1422
       ...  
739    10615
740     7636
741    10402
742    10402
743     8832
Name: KnowledgeTag, Length: 744, dtype: int16

In [146]:
answer = []
for n in range(len(presub)):
    answer.append(float(knowtagdf[knowtagdf['KnowledgeTag']==presub.iloc[n,0]]['answerCode'].values ))
answer


[0.5426621160409556,
 0.5656934306569343,
 0.44675324675324674,
 0.5142857142857142,
 0.6027667984189723,
 0.6424242424242425,
 0.6470588235294118,
 0.6650485436893204,
 0.34347826086956523,
 0.5656934306569343,
 0.7189922480620154,
 0.5485436893203883,
 0.8271889400921659,
 0.6058823529411764,
 0.732824427480916,
 0.7836644591611479,
 0.4864864864864865,
 0.8337408312958435,
 0.772020725388601,
 0.5656934306569343,
 0.7620967741935484,
 0.49427480916030536,
 0.6063348416289592,
 0.5738636363636364,
 0.5353535353535354,
 0.8271889400921659,
 0.8271889400921659,
 0.8503649635036497,
 0.5605095541401274,
 0.44545454545454544,
 0.4985754985754986,
 0.5974025974025974,
 0.3333333333333333,
 0.34210526315789475,
 0.549792531120332,
 0.42298850574712643,
 0.7223650385604113,
 0.3672316384180791,
 0.5252918287937743,
 0.6733333333333333,
 0.7012345679012346,
 0.4496124031007752,
 0.5359116022099447,
 0.5485436893203883,
 0.4152542372881356,
 0.549792531120332,
 0.7737226277372263,
 0.55052264

In [160]:
ans = pd.DataFrame(answer, columns=['prediction'])
user = list(range(len(ans)))
user = pd.DataFrame(user,columns=['user'])

submission = pd.concat([user,ans],axis=1)

In [161]:
submission.head()

Unnamed: 0,user,prediction
0,0,0.542662
1,1,0.565693
2,2,0.446753
3,3,0.514286
4,4,0.602767


In [162]:
submission.to_csv('KnowledgeTag.csv')