In [1]:
import pandas as pd
import numpy as np
import os
import random

from catboost import CatBoostClassifier,CatBoostRegressor
from xgboost import XGBClassifier
import lightgbm as lgb

In [2]:
data_dir = '/opt/ml/input/data'
train_data = os.path.join(data_dir, 'train_data.csv')
test_data = os.path.join(data_dir, 'test_data.csv')
train_df = pd.read_csv(train_data, parse_dates=['Timestamp'])
test_df = pd.read_csv(test_data, parse_dates=['Timestamp'])

#둘 중에 하나 선택
df = pd.concat([train_df, test_df])
#df = pd.concat([train_df, test_df[test_df['answerCode']!=-1]])

df = df.sort_values(by=['userID', 'Timestamp']).reset_index()
df.drop(['index'], axis=1, inplace=True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 6 columns):
 #   Column            Dtype         
---  ------            -----         
 0   userID            int64         
 1   assessmentItemID  object        
 2   testId            object        
 3   answerCode        int64         
 4   Timestamp         datetime64[ns]
 5   KnowledgeTag      int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 115.7+ MB


In [4]:
df.isna().sum()

userID              0
assessmentItemID    0
testId              0
answerCode          0
Timestamp           0
KnowledgeTag        0
dtype: int64

In [5]:
df['userID'].nunique()

7442

# CatBoost 사용을 위한 Feature engineering

-> assessmentItemID에 testID의 정보가 모두 담겨져 있으므로, 필요 없음.
-> assessmentItemID에 앞의 3자리에서 가운데 숫자가 0~9로 포진되어있음 -> 분류로 사용가능.


후보
1. 시험지 종류
2. 문제 종류
3. 과거에 그 문제를 푼 횟수
4. 과거에 그 문제를 맞춘 횟수
5. 학생 별 정답률
6. 문제 별 정답률 
7. 문제 풀이 소요 시간
8. 학생 별 평균 문제 풀이 시간
9. 문제 별 평균 문제 풀이 시간
10. 최근 문제 풀이 정답 여부(5~10개 정도)
11. 학생 별 태그 정답률
12. 태그 별 정답률
13. 과거에 그 태그를 푼 횟수
14. 과거에 그 태그를 맞춘 횟수
 

In [6]:
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225
...,...,...,...,...,...,...
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836


In [7]:
df.groupby('userID').last()

Unnamed: 0_level_0,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,A080129006,A080000129,0,2020-12-23 03:40:19,2725
1,A090074006,A090000074,1,2020-11-13 02:47:20,2648
2,A050139007,A050000139,0,2020-10-20 11:32:26,428
3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289
4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080
...,...,...,...,...,...
7437,A060003007,A060000003,0,2020-05-22 01:53:49,7226
7438,A030188005,A030000188,1,2020-10-19 10:28:29,1934
7439,A040130005,A040000130,-1,2020-10-14 23:10:03,8832
7440,A030197005,A030000197,0,2020-10-21 08:33:20,1984


시험지 번호, 문제 번호 넣기.

In [8]:
df['exam_num']=df['assessmentItemID'].str[2].astype(np.int32)
df['test_num']=df['assessmentItemID'].str[4:7].astype(np.int32)
df['question_num']=df['assessmentItemID'].str[7:].astype(np.int32)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2526700 entries, 0 to 2526699
Data columns (total 9 columns):
 #   Column            Dtype         
---  ------            -----         
 0   userID            int64         
 1   assessmentItemID  object        
 2   testId            object        
 3   answerCode        int64         
 4   Timestamp         datetime64[ns]
 5   KnowledgeTag      int64         
 6   exam_num          int32         
 7   test_num          int32         
 8   question_num      int32         
dtypes: datetime64[ns](1), int32(3), int64(3), object(2)
memory usage: 144.6+ MB


과거 해당 문제를 맞춘 횟수

In [10]:
df['user_shift'] = df.groupby(['userID','exam_num','test_num', 'question_num'])['answerCode'].shift().fillna(0)
df['user_shift'] = df['user_shift'].astype(np.int64)
df['past_correct_answer'] = df.groupby(['userID','exam_num','test_num', 'question_num'])['user_shift'].cumsum()

In [11]:
df['past_correct_answer'].unique()

array([0, 1, 2])

In [12]:
df[df['past_correct_answer']==2]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,past_correct_answer
141393,165,A090007002,A090000007,0,2020-04-23 01:11:13,4725,9,7,2,1,2
141395,165,A090007004,A090000007,0,2020-04-23 01:11:27,4697,9,7,4,1,2
141406,165,A090009001,A090000009,1,2020-04-28 01:19:28,5141,9,9,1,1,2
141407,165,A090009002,A090000009,1,2020-04-28 01:19:54,10741,9,9,2,1,2
141417,165,A090011004,A090000011,0,2020-04-29 23:30:32,9728,9,11,4,1,2
...,...,...,...,...,...,...,...,...,...,...,...
2025184,3414,A030018003,A030000018,0,2020-06-12 12:13:00,7310,3,18,3,1,2
2117859,3717,A020184001,A020000184,1,2020-09-19 09:52:46,8092,2,184,1,1,2
2117877,3717,A020188003,A020000188,1,2020-09-20 06:01:13,8096,2,188,3,1,2
2135281,3775,A020118001,A020000118,1,2020-10-23 09:03:19,8014,2,118,1,1,2


과거에 해당 태그를 맞춘 횟수

In [13]:
df['user_tag_shift'] = df.groupby(['userID', 'KnowledgeTag'])['answerCode'].shift().fillna(0)
df['user_tag_shift'] = df['user_tag_shift'].astype(np.int64)
df['past_correct_tag_answer'] = df.groupby(['userID', 'KnowledgeTag'])['user_tag_shift'].cumsum()

In [14]:
df['past_correct_tag_answer'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61])

In [15]:
df[(df['userID']==165) & (df['exam_num']==9) & (df['test_num']==7) & (df['question_num']==2)]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,past_correct_answer,user_tag_shift,past_correct_tag_answer
141201,165,A090007002,A090000007,1,2020-02-24 20:20:30,4725,9,7,2,0,0,0,0
141327,165,A090007002,A090000007,1,2020-03-20 05:20:39,4725,9,7,2,1,1,1,4
141393,165,A090007002,A090000007,0,2020-04-23 01:11:13,4725,9,7,2,1,2,0,6


In [16]:
df[(df['userID']==1) & (df['KnowledgeTag']==2111) & (df['exam_num']==4) & (df['test_num']==155) & (df['question_num']==6)]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,past_correct_answer,user_tag_shift,past_correct_tag_answer
1171,1,A040155006,A040000155,0,2020-06-21 23:01:44,2111,4,155,6,0,0,0,3
1402,1,A040155006,A040000155,1,2020-08-17 10:02:07,2111,4,155,6,0,0,0,7


In [17]:
df[(df['userID']==1) & (df['KnowledgeTag']==2111) & (df['exam_num']==4)]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,past_correct_answer,user_tag_shift,past_correct_tag_answer
1166,1,A040155001,A040000155,1,2020-06-21 22:57:14,2111,4,155,1,0,0,0,0
1167,1,A040155002,A040000155,1,2020-06-21 22:57:55,2111,4,155,2,0,0,1,1
1168,1,A040155003,A040000155,0,2020-06-21 22:58:33,2111,4,155,3,0,0,1,2
1169,1,A040155004,A040000155,1,2020-06-21 22:58:49,2111,4,155,4,0,0,0,2
1170,1,A040155005,A040000155,0,2020-06-21 23:00:10,2111,4,155,5,0,0,1,3
1171,1,A040155006,A040000155,0,2020-06-21 23:01:44,2111,4,155,6,0,0,0,3
1397,1,A040155001,A040000155,1,2020-08-17 09:56:27,2111,4,155,1,1,1,0,3
1398,1,A040155002,A040000155,1,2020-08-17 09:57:06,2111,4,155,2,1,1,1,4
1399,1,A040155003,A040000155,1,2020-08-17 09:58:03,2111,4,155,3,0,0,1,5
1400,1,A040155004,A040000155,1,2020-08-17 09:58:29,2111,4,155,4,1,1,1,6


과거에 해당 문제를 푼 횟수(틀렸는지 맞았는지 상관 X)

In [18]:
df['past_solve_problem'] = df.groupby(['userID','exam_num','test_num', 'question_num']).cumcount()

In [19]:
df[(df['userID']==165) & (df['exam_num']==9) & (df['test_num']==7) & (df['question_num']==2)]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,past_correct_answer,user_tag_shift,past_correct_tag_answer,past_solve_problem
141201,165,A090007002,A090000007,1,2020-02-24 20:20:30,4725,9,7,2,0,0,0,0,0
141327,165,A090007002,A090000007,1,2020-03-20 05:20:39,4725,9,7,2,1,1,1,4,1
141393,165,A090007002,A090000007,0,2020-04-23 01:11:13,4725,9,7,2,1,2,0,6,2


과거에 해당 태그를 푼 횟수(틀렸는지 맞았는지 상관 X)

In [20]:
df['past_solve_tag'] = df.groupby(['userID', 'KnowledgeTag']).cumcount()

In [21]:
df[df['past_solve_problem']==2]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,past_correct_answer,user_tag_shift,past_correct_tag_answer,past_solve_problem,past_solve_tag
141392,165,A090007001,A090000007,0,2020-04-23 01:11:09,4724,9,7,1,0,1,0,4,2,5
141393,165,A090007002,A090000007,0,2020-04-23 01:11:13,4725,9,7,2,1,2,0,6,2,10
141394,165,A090007003,A090000007,0,2020-04-23 01:11:24,4725,9,7,3,0,1,0,6,2,11
141395,165,A090007004,A090000007,0,2020-04-23 01:11:27,4697,9,7,4,1,2,1,7,2,12
141396,165,A090007005,A090000007,0,2020-04-23 01:11:30,4725,9,7,5,1,1,0,6,2,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2135281,3775,A020118001,A020000118,1,2020-10-23 09:03:19,8014,2,118,1,1,2,0,8,2,15
2135282,3775,A020118002,A020000118,1,2020-10-23 09:03:33,8014,2,118,2,1,2,1,9,2,16
2135283,3775,A020118003,A020000118,0,2020-10-23 09:03:50,8014,2,118,3,0,0,1,10,2,17
2135284,3775,A020118004,A020000118,1,2020-10-23 09:04:13,8014,2,118,4,1,1,0,10,2,18


In [22]:
df[df['past_solve_tag']==2]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,past_correct_answer,user_tag_shift,past_correct_tag_answer,past_solve_problem,past_solve_tag
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,6,1,4,0,0,1,2,0,2
8,0,A060003003,A060000003,1,2020-03-26 05:53:14,7226,6,3,3,0,0,1,1,0,2
16,0,A060005004,A060000005,1,2020-03-31 05:03:26,7228,6,5,4,0,0,1,2,0,2
22,0,A060007003,A060000007,1,2020-04-02 04:54:04,7229,6,7,3,0,0,1,2,0,2
35,0,A060009003,A060000009,1,2020-04-07 01:43:31,7230,6,9,3,0,0,1,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526679,7440,A050096004,A050000096,1,2020-08-19 04:58:27,5267,5,96,4,0,0,0,1,0,2
2526684,7440,A030136004,A030000136,0,2020-09-23 09:21:43,7691,3,136,4,0,0,1,2,0,2
2526688,7440,A030197003,A030000197,0,2020-10-21 08:33:15,1984,3,197,3,0,0,0,1,0,2
2526693,7441,A030071003,A030000071,1,2020-06-05 06:49:23,438,3,71,3,0,0,0,0,0,2


In [23]:
df[(df['userID']==165) & (df['KnowledgeTag']==4724) & (df['exam_num']==9) & (df['test_num']==7) & (df['question_num']==1)]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,past_correct_answer,user_tag_shift,past_correct_tag_answer,past_solve_problem,past_solve_tag
141200,165,A090007001,A090000007,1,2020-02-24 20:20:28,4724,9,7,1,0,0,0,0,0,0
141326,165,A090007001,A090000007,0,2020-03-20 05:19:58,4724,9,7,1,1,1,1,4,1,4
141392,165,A090007001,A090000007,0,2020-04-23 01:11:09,4724,9,7,1,0,1,0,4,2,5


In [24]:
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,past_correct_answer,user_tag_shift,past_correct_tag_answer,past_solve_problem,past_solve_tag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,6,1,1,0,0,0,0,0,0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,6,1,2,0,0,0,0,0,0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,6,1,3,0,0,1,1,0,1
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,6,1,4,0,0,1,2,0,2
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,6,1,5,0,0,1,3,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,3,71,5,0,0,0,1,0,4
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,4,165,1,0,0,0,0,0,0
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,4,165,2,0,0,1,1,0,1
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,4,165,3,0,0,1,2,0,2


In [25]:
df[df['answerCode']==-1]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,past_correct_answer,user_tag_shift,past_correct_tag_answer,past_solve_problem,past_solve_tag
2989,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289,5,133,8,0,0,0,9,0,11
3660,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080,7,146,8,0,0,1,2,0,3
10860,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660,7,111,8,0,0,1,2,0,6
15278,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611,9,64,6,0,0,1,5,0,5
23531,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422,6,135,7,0,0,0,4,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525938,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615,4,122,5,0,0,0,0,0,2
2526081,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636,3,111,5,0,0,1,2,0,4
2526282,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402,5,193,4,0,0,0,2,0,3
2526297,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402,5,193,4,0,0,0,2,0,3


학생 별 정답률 및 문제 별 정답률

In [26]:
df['total_problem_count'] = df.groupby(['assessmentItemID'])['answerCode'].cumcount()
df['answer_problem_count'] = df.groupby(['assessmentItemID'])['answerCode'].cumsum()

df['total_student_count'] = df.groupby(['userID'])['answerCode'].cumcount()
df['answer_student_count'] = df.groupby(['userID'])['answerCode'].cumsum()

df['total_tag_count'] = df.groupby(['KnowledgeTag'])['answerCode'].cumcount()
df['answer_tag_count'] = df.groupby(['KnowledgeTag'])['answerCode'].cumsum()


In [27]:
df[df['userID']==1]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,...,user_tag_shift,past_correct_tag_answer,past_solve_problem,past_solve_tag,total_problem_count,answer_problem_count,total_student_count,answer_student_count,total_tag_count,answer_tag_count
745,1,A040013001,A040000013,1,2020-01-06 08:40:43,2048,4,13,1,0,...,0,0,0,0,0,1,0,1,0,1
746,1,A040013002,A040000013,1,2020-01-06 08:43:46,2048,4,13,2,0,...,1,1,0,1,0,1,1,2,1,2
747,1,A040013003,A040000013,1,2020-01-06 08:44:29,2047,4,13,3,0,...,0,0,0,0,0,1,2,3,0,1
748,1,A040013004,A040000013,1,2020-01-06 08:46:13,2047,4,13,4,0,...,1,1,0,1,0,1,3,4,1,2
749,1,A040013005,A040000013,0,2020-01-06 08:49:45,2047,4,13,5,0,...,1,2,0,2,0,0,4,4,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1673,1,A090074004,A090000074,1,2020-11-13 02:42:08,10196,9,74,4,0,...,1,2,0,2,0,1,928,792,2,3
1674,1,A090074002,A090000074,1,2020-11-13 02:44:34,4243,9,74,2,0,...,0,6,0,7,0,1,929,793,7,7
1675,1,A090074003,A090000074,1,2020-11-13 02:45:04,4243,9,74,3,0,...,1,7,0,8,0,1,930,794,8,8
1676,1,A090074005,A090000074,1,2020-11-13 02:46:38,2648,9,74,5,0,...,1,3,0,4,0,1,931,795,4,4


In [28]:
df[df['assessmentItemID']=='A060001001']

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,...,user_tag_shift,past_correct_tag_answer,past_solve_problem,past_solve_tag,total_problem_count,answer_problem_count,total_student_count,answer_student_count,total_tag_count,answer_tag_count
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,6,1,1,0,...,0,0,0,0,0,1,0,1,0,1
7630,10,A060001001,A060000001,1,2020-03-05 02:55:20,7224,6,1,1,0,...,0,0,0,0,1,2,0,1,1,2
17006,20,A060001001,A060000001,1,2020-03-03 05:52:03,7224,6,1,1,0,...,0,0,0,0,2,3,0,1,4,5
28513,32,A060001001,A060000001,1,2020-03-03 23:27:08,7224,6,1,1,0,...,0,0,0,0,3,4,0,1,6,7
32252,36,A060001001,A060000001,1,2020-02-28 16:38:08,7224,6,1,1,0,...,0,0,0,0,4,5,0,1,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2487509,6297,A060001001,A060000001,1,2020-03-03 14:42:20,7224,6,1,1,0,...,0,0,0,0,245,242,0,1,745,714
2489454,6338,A060001001,A060000001,1,2020-01-16 01:25:29,7224,6,1,1,0,...,0,0,0,0,246,243,0,1,746,715
2491808,6383,A060001001,A060000001,1,2020-04-03 05:13:35,7224,6,1,1,0,...,0,0,0,0,247,244,0,1,747,716
2511268,6830,A060001001,A060000001,1,2020-03-04 01:18:06,7224,6,1,1,0,...,0,0,0,0,248,245,0,1,748,717


In [29]:
df[df['KnowledgeTag']==7224]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,user_shift,...,user_tag_shift,past_correct_tag_answer,past_solve_problem,past_solve_tag,total_problem_count,answer_problem_count,total_student_count,answer_student_count,total_tag_count,answer_tag_count
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,6,1,1,0,...,0,0,0,0,0,1,0,1,0,1
7630,10,A060001001,A060000001,1,2020-03-05 02:55:20,7224,6,1,1,0,...,0,0,0,0,1,2,0,1,1,2
9544,13,A060011001,A060000011,1,2020-02-08 05:58:57,7224,6,11,1,0,...,0,0,0,0,0,1,0,1,2,3
9554,13,A060002006,A060000002,1,2020-02-08 06:16:02,7224,6,2,6,0,...,1,1,0,1,0,1,10,11,3,4
17006,20,A060001001,A060000001,1,2020-03-03 05:52:03,7224,6,1,1,0,...,0,0,0,0,2,3,0,1,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2487509,6297,A060001001,A060000001,1,2020-03-03 14:42:20,7224,6,1,1,0,...,0,0,0,0,245,242,0,1,745,714
2489454,6338,A060001001,A060000001,1,2020-01-16 01:25:29,7224,6,1,1,0,...,0,0,0,0,246,243,0,1,746,715
2491808,6383,A060001001,A060000001,1,2020-04-03 05:13:35,7224,6,1,1,0,...,0,0,0,0,247,244,0,1,747,716
2511268,6830,A060001001,A060000001,1,2020-03-04 01:18:06,7224,6,1,1,0,...,0,0,0,0,248,245,0,1,748,717


In [30]:
P = df.groupby('userID')['answer_student_count'].last()
P

userID
0       470
1       796
2       169
3       716
4       464
       ... 
7437      6
7438     12
7439     10
7440      6
7441      5
Name: answer_student_count, Length: 7442, dtype: int64

In [31]:
Q = df.groupby('userID')['total_student_count'].last()
Q

userID
0        744
1        932
2        275
3       1035
4        670
        ... 
7437      15
7438      15
7439      15
7440      14
7441       8
Name: total_student_count, Length: 7442, dtype: int64

In [32]:
X = df.groupby('assessmentItemID')['answer_problem_count'].last()
X

assessmentItemID
A010001001    323
A010001002    336
A010001003    317
A010001004    331
A010001005    309
             ... 
A090074002     24
A090074003     29
A090074004     18
A090074005     20
A090074006     18
Name: answer_problem_count, Length: 9454, dtype: int64

In [33]:
Y = df.groupby('assessmentItemID')['total_problem_count'].last()
Y

assessmentItemID
A010001001    349
A010001002    349
A010001003    349
A010001004    349
A010001005    349
             ... 
A090074002     49
A090074003     49
A090074004     49
A090074005     49
A090074006     49
Name: total_problem_count, Length: 9454, dtype: int64

In [34]:
M = df.groupby('KnowledgeTag')['answer_tag_count'].last()
M

KnowledgeTag
23       1186
24        250
25         85
26       1649
30        318
         ... 
11253    3513
11265     337
11269    1508
11270     144
11271     271
Name: answer_tag_count, Length: 912, dtype: int64

In [35]:
N = df.groupby('KnowledgeTag')['total_tag_count'].last()
N

KnowledgeTag
23       1999
24        399
25         99
26       2699
30        599
         ... 
11253    4799
11265     499
11269    2699
11270     299
11271     599
Name: total_tag_count, Length: 912, dtype: int64

In [36]:
student_rate_dict = (P/Q).to_dict()
problem_rate_dict = (X/Y).to_dict()
tag_rate_dict = (M/N).to_dict()

In [37]:
df['student_correct_answer_rate'] = df['userID'].map(student_rate_dict)
df['problem_correct_answer_rate'] = df['assessmentItemID'].map(problem_rate_dict)
df['tag_correct_answer_rate'] = df['KnowledgeTag'].map(tag_rate_dict)


In [38]:
#df.drop(['user_shift', 'user_tag_shift'], inplace=True, axis=1)
df.drop(['user_shift', 'user_tag_shift', 'total_problem_count', 'answer_problem_count', 'total_student_count', 'answer_student_count', 'total_tag_count', 'answer_tag_count'], inplace=True, axis=1)

In [39]:
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,past_correct_answer,past_correct_tag_answer,past_solve_problem,past_solve_tag,student_correct_answer_rate,problem_correct_answer_rate,tag_correct_answer_rate
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,6,1,1,0,0,0,0,0.63172,0.987952,0.958611
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,6,1,2,0,0,0,0,0.63172,0.971888,0.917311
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,6,1,3,0,1,0,1,0.63172,0.919679,0.917311
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,6,1,4,0,2,0,2,0.63172,0.975904,0.917311
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,6,1,5,0,3,0,3,0.63172,0.951807,0.917311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,3,71,5,0,1,0,4,0.62500,0.448161,0.695043
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,4,165,1,0,0,0,0,0.62500,0.645485,0.698753
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,4,165,2,0,1,0,1,0.62500,0.642140,0.698753
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,4,165,3,0,2,0,2,0.62500,0.789298,0.698753


학생 문제풀이 평균 시간, 문제 당 평균 풀이 시간 채우기.

In [40]:
diff = df.groupby('userID')['Timestamp'].diff().fillna(pd.Timedelta(seconds=-1)) #마지막 문제는 새로 전처리 해야함.
diff = diff.apply(lambda x:x.total_seconds())

In [41]:
diff

0               -1.0
1                3.0
2                8.0
3                7.0
4                7.0
             ...    
2526695         24.0
2526696    6632178.0
2526697         11.0
2526698         46.0
2526699         73.0
Name: Timestamp, Length: 2526700, dtype: float64

In [42]:
df[740:754]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,past_correct_answer,past_correct_tag_answer,past_solve_problem,past_solve_tag,student_correct_answer_rate,problem_correct_answer_rate,tag_correct_answer_rate
740,0,A080129002,A080000129,1,2020-12-23 03:35:54,2723,8,129,2,0,1,0,1,0.63172,0.35589,0.475594
741,0,A080129003,A080000129,0,2020-12-23 03:37:20,2725,8,129,3,0,0,0,0,0.63172,0.205514,0.297061
742,0,A080129004,A080000129,1,2020-12-23 03:38:43,2725,8,129,4,0,0,0,1,0.63172,0.433584,0.297061
743,0,A080129005,A080000129,0,2020-12-23 03:40:14,2725,8,129,5,0,1,0,2,0.63172,0.403509,0.297061
744,0,A080129006,A080000129,0,2020-12-23 03:40:19,2725,8,129,6,0,1,0,3,0.63172,0.14787,0.297061
745,1,A040013001,A040000013,1,2020-01-06 08:40:43,2048,4,13,1,0,0,0,0,0.854077,0.802676,0.613674
746,1,A040013002,A040000013,1,2020-01-06 08:43:46,2048,4,13,2,0,1,0,1,0.854077,0.622074,0.613674
747,1,A040013003,A040000013,1,2020-01-06 08:44:29,2047,4,13,3,0,0,0,0,0.854077,0.795987,0.576941
748,1,A040013004,A040000013,1,2020-01-06 08:46:13,2047,4,13,4,0,1,0,1,0.854077,0.391304,0.576941
749,1,A040013005,A040000013,0,2020-01-06 08:49:45,2047,4,13,5,0,2,0,2,0.854077,0.404682,0.576941


In [43]:
df['solve_time'] = diff.shift(-1).apply(lambda x:600 if x>600 else x)
df['solve_time'] = df['solve_time'].apply(lambda x:np.nan if x==-1 else x)


In [44]:
df[df['solve_time'] != df['solve_time']].head(10)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,past_correct_answer,past_correct_tag_answer,past_solve_problem,past_solve_tag,student_correct_answer_rate,problem_correct_answer_rate,tag_correct_answer_rate,solve_time
744,0,A080129006,A080000129,0,2020-12-23 03:40:19,2725,8,129,6,0,1,0,3,0.63172,0.14787,0.297061,
1677,1,A090074006,A090000074,1,2020-11-13 02:47:20,2648,9,74,6,0,4,0,5,0.854077,0.367347,0.431438,
1953,2,A050139007,A050000139,0,2020-10-20 11:32:26,428,5,139,7,0,0,0,6,0.614545,0.120482,0.674247,
2989,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289,5,133,8,0,9,0,11,0.691787,0.53012,0.559249,
3660,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080,7,146,8,0,2,0,3,0.692537,0.563758,0.541781,
4493,5,A080138007,A080000138,1,2020-12-11 22:48:28,8431,8,138,7,0,3,0,3,0.796875,0.558233,0.466493,
5414,6,A030145005,A030000145,0,2020-10-26 09:52:14,7817,3,145,5,0,1,0,9,0.443478,0.615385,0.614803,
6226,7,A090052006,A090000052,1,2020-11-02 02:02:59,2600,9,52,6,0,0,0,1,0.607891,0.417671,0.349079,
6762,8,A070060006,A070000060,0,2020-12-25 00:28:18,1260,7,60,6,0,12,0,16,0.642991,0.329317,0.462058,
7629,9,A070158008,A070000158,1,2020-12-28 21:08:31,9797,7,158,8,0,5,0,7,0.682448,0.555556,0.539307,


In [45]:
df.groupby('userID')[['solve_time']].median()['solve_time'].to_numpy()

array([27., 86., 43., ..., 35., 27., 43.])

In [46]:
df.loc[(df['solve_time']!=df['solve_time']),'solve_time']

744       NaN
1677      NaN
1953      NaN
2989      NaN
3660      NaN
           ..
2526643   NaN
2526659   NaN
2526675   NaN
2526690   NaN
2526699   NaN
Name: solve_time, Length: 7442, dtype: float64

In [47]:
df.loc[(df['solve_time']!=df['solve_time']),'solve_time'] = df.groupby('userID')[['solve_time']].median()['solve_time'].to_numpy()

In [48]:
df.isna().sum()

userID                         0
assessmentItemID               0
testId                         0
answerCode                     0
Timestamp                      0
KnowledgeTag                   0
exam_num                       0
test_num                       0
question_num                   0
past_correct_answer            0
past_correct_tag_answer        0
past_solve_problem             0
past_solve_tag                 0
student_correct_answer_rate    0
problem_correct_answer_rate    0
tag_correct_answer_rate        0
solve_time                     0
dtype: int64

In [49]:
df[df['solve_time'] != df['solve_time']]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,past_correct_answer,past_correct_tag_answer,past_solve_problem,past_solve_tag,student_correct_answer_rate,problem_correct_answer_rate,tag_correct_answer_rate,solve_time


In [50]:
df.groupby('userID')['solve_time'].last()

userID
0       27.0
1       86.0
2       43.0
3       42.0
4       54.0
        ... 
7437    14.0
7438    81.0
7439    35.0
7440    27.0
7441    43.0
Name: solve_time, Length: 7442, dtype: float64

In [51]:
df.loc[(df['solve_time']!=df['solve_time']) & (df['answerCode']==0),'solve_time'] = 600

In [52]:
df.isna().sum()

userID                         0
assessmentItemID               0
testId                         0
answerCode                     0
Timestamp                      0
KnowledgeTag                   0
exam_num                       0
test_num                       0
question_num                   0
past_correct_answer            0
past_correct_tag_answer        0
past_solve_problem             0
past_solve_tag                 0
student_correct_answer_rate    0
problem_correct_answer_rate    0
tag_correct_answer_rate        0
solve_time                     0
dtype: int64

In [53]:
df['student_average_time'] = df['userID'].map(df.groupby('userID')['solve_time'].mean().to_dict())
df['problem_average_time'] = df['assessmentItemID'].map(df.groupby('assessmentItemID')['solve_time'].mean().to_dict())
df['tag_average_time'] = df['KnowledgeTag'].map(df.groupby('KnowledgeTag')['solve_time'].mean().to_dict())

In [54]:
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,past_correct_answer,past_correct_tag_answer,past_solve_problem,past_solve_tag,student_correct_answer_rate,problem_correct_answer_rate,tag_correct_answer_rate,solve_time,student_average_time,problem_average_time,tag_average_time
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,6,1,1,0,0,0,0,0.63172,0.987952,0.958611,3.0,118.644295,13.660000,17.728000
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,6,1,2,0,0,0,0,0.63172,0.971888,0.917311,8.0,118.644295,26.112000,135.267200
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,6,1,3,0,1,0,1,0.63172,0.919679,0.917311,7.0,118.644295,19.180000,135.267200
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,6,1,4,0,2,0,2,0.63172,0.975904,0.917311,7.0,118.644295,18.076000,135.267200
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,6,1,5,0,3,0,3,0.63172,0.951807,0.917311,11.0,118.644295,35.468000,135.267200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,3,71,5,0,1,0,4,0.62500,0.448161,0.695043,600.0,110.333333,569.653333,165.499333
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,4,165,1,0,0,0,0,0.62500,0.645485,0.698753,11.0,110.333333,46.160000,157.797826
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,4,165,2,0,1,0,1,0.62500,0.642140,0.698753,46.0,110.333333,30.363333,157.797826
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,4,165,3,0,2,0,2,0.62500,0.789298,0.698753,73.0,110.333333,106.356667,157.797826


In [55]:
df.isna().sum()

userID                         0
assessmentItemID               0
testId                         0
answerCode                     0
Timestamp                      0
KnowledgeTag                   0
exam_num                       0
test_num                       0
question_num                   0
past_correct_answer            0
past_correct_tag_answer        0
past_solve_problem             0
past_solve_tag                 0
student_correct_answer_rate    0
problem_correct_answer_rate    0
tag_correct_answer_rate        0
solve_time                     0
student_average_time           0
problem_average_time           0
tag_average_time               0
dtype: int64

년 월 일 추가.

In [56]:
df['year'] = df['Timestamp'].dt.year
df['month'] = df['Timestamp'].dt.month
df['day'] = df['Timestamp'].dt.day

In [57]:
df.groupby(['userID'])['answerCode'].count().min()

9

최근 10개 데이터 추가.

In [58]:
df['last_answerCode1'] = df.groupby('userID')['answerCode'].shift(1).fillna(method='bfill')
df['last_answerCode2'] = df.groupby("userID")['answerCode'].shift(2).fillna(method='bfill')
df['last_answerCode3'] = df.groupby("userID")['answerCode'].shift(3).fillna(method='bfill')
df['last_answerCode4'] = df.groupby("userID")['answerCode'].shift(4).fillna(method='bfill')
df['last_answerCode5'] = df.groupby("userID")['answerCode'].shift(5).fillna(method='bfill')
df['last_answerCode6'] = df.groupby("userID")['answerCode'].shift(6).fillna(method='bfill')
df['last_answerCode7'] = df.groupby("userID")['answerCode'].shift(7).fillna(method='bfill')
df['last_answerCode8'] = df.groupby("userID")['answerCode'].shift(8).fillna(method='bfill')
df['last_answerCode9'] = df.groupby("userID")['answerCode'].shift(9).fillna(method='bfill').fillna(1)
df['last_answerCode10'] = df.groupby("userID")['answerCode'].shift(10).fillna(method='bfill').fillna(1)


In [59]:
df.groupby(['userID'])['answerCode'].count().min()

9

In [60]:

df['last_answerCode1'] = df['last_answerCode1'].astype(int)
df['last_answerCode2'] = df['last_answerCode2'].astype(int)
df['last_answerCode3'] = df['last_answerCode3'].astype(int)
df['last_answerCode4'] = df['last_answerCode4'].astype(int)
df['last_answerCode5'] = df['last_answerCode5'].astype(int)
df['last_answerCode6'] = df['last_answerCode6'].astype(int)
df['last_answerCode7'] = df['last_answerCode7'].astype(int)
df['last_answerCode8'] = df['last_answerCode8'].astype(int)
df['last_answerCode9'] = df['last_answerCode9'].astype(int)
df['last_answerCode10'] = df['last_answerCode10'].astype(int)
df['solve_time'] = df['solve_time'].astype(int)


In [61]:
#지금 다 되어 있으니까 -1 부분만 떼서 validation 데이터 만들고 catboost로 학습시켜.
train_y = df.loc[df['answerCode']!=-1, 'answerCode'].reset_index()
train_x = df[df['answerCode']!=-1].drop(['answerCode'], axis=1).reset_index()
test_x = df[df['answerCode']==-1].drop(['answerCode'], axis=1).reset_index()

train_x.drop(['index', 'testId', 'assessmentItemID', 'Timestamp'], axis=1, inplace=True)
test_x.drop(['index', 'testId', 'assessmentItemID', 'Timestamp'], axis=1, inplace=True)
train_y.drop(['index'], axis=1, inplace=True)

Validation 셋 만들기

In [62]:
df[(df['answerCode']==-1)].index

Int64Index([   2989,    3660,   10860,   15278,   23531,   26895,   39887,
              47628,   51927,   57352,
            ...
            2525476, 2525493, 2525558, 2525712, 2525876, 2525938, 2526081,
            2526282, 2526297, 2526675],
           dtype='int64', length=744)

In [63]:
valid_df = df.drop(df[(df['answerCode']==-1)].index, axis=0)

In [64]:
valid_df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,past_correct_answer,...,last_answerCode1,last_answerCode2,last_answerCode3,last_answerCode4,last_answerCode5,last_answerCode6,last_answerCode7,last_answerCode8,last_answerCode9,last_answerCode10
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,6,1,1,0,...,1,1,1,1,1,1,1,1,1,1
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,6,1,2,0,...,1,1,1,1,1,1,1,1,1,1
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,6,1,3,0,...,1,1,1,1,1,1,1,1,1,1
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,6,1,4,0,...,1,1,1,1,1,1,1,1,1,1
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,6,1,5,0,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,3,71,5,0,...,0,1,0,0,0,0,0,0,1,1
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,4,165,1,0,...,0,0,1,0,0,0,0,0,1,1
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,4,165,2,0,...,1,0,0,1,0,0,0,0,1,1
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,4,165,3,0,...,1,1,0,0,1,0,0,0,1,1


In [65]:
validset = valid_df.groupby('userID').last().reset_index()
validset

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,exam_num,test_num,question_num,past_correct_answer,...,last_answerCode1,last_answerCode2,last_answerCode3,last_answerCode4,last_answerCode5,last_answerCode6,last_answerCode7,last_answerCode8,last_answerCode9,last_answerCode10
0,0,A080129006,A080000129,0,2020-12-23 03:40:19,2725,8,129,6,0,...,0,1,0,1,1,0,1,0,1,1
1,1,A090074006,A090000074,1,2020-11-13 02:47:20,2648,9,74,6,0,...,1,1,1,1,0,1,1,1,1,1
2,2,A050139007,A050000139,0,2020-10-20 11:32:26,428,5,139,7,0,...,0,0,0,0,0,0,1,1,1,1
3,3,A050133007,A050000133,0,2020-10-26 13:13:11,5289,5,133,7,0,...,1,1,1,1,1,1,0,1,1,0
4,4,A070146007,A070000146,1,2020-12-27 02:47:31,9080,7,146,7,0,...,1,1,1,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7437,7437,A060003007,A060000003,0,2020-05-22 01:53:49,7226,6,3,7,0,...,1,0,0,0,0,0,0,0,0,1
7438,7438,A030188005,A030000188,1,2020-10-19 10:28:29,1934,3,188,5,0,...,1,1,0,1,1,1,0,1,1,0
7439,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244,4,130,4,0,...,1,1,0,1,0,0,1,1,1,0
7440,7440,A030197005,A030000197,0,2020-10-21 08:33:20,1984,3,197,5,0,...,0,0,0,1,0,0,1,1,1,0


In [66]:
valid_x = validset.drop(['answerCode', 'testId', 'assessmentItemID', 'Timestamp'], axis=1)
valid_y = validset.loc[:, 'answerCode'].reset_index().drop(['index'],axis=1)

In [67]:
valid_x

Unnamed: 0,userID,KnowledgeTag,exam_num,test_num,question_num,past_correct_answer,past_correct_tag_answer,past_solve_problem,past_solve_tag,student_correct_answer_rate,...,last_answerCode1,last_answerCode2,last_answerCode3,last_answerCode4,last_answerCode5,last_answerCode6,last_answerCode7,last_answerCode8,last_answerCode9,last_answerCode10
0,0,2725,8,129,6,0,1,0,3,0.631720,...,0,1,0,1,1,0,1,0,1,1
1,1,2648,9,74,6,0,4,0,5,0.854077,...,1,1,1,1,0,1,1,1,1,1
2,2,428,5,139,7,0,0,0,6,0.614545,...,0,0,0,0,0,0,1,1,1,1
3,3,5289,5,133,7,0,9,0,10,0.691787,...,1,1,1,1,1,1,0,1,1,0
4,4,9080,7,146,7,0,1,0,2,0.692537,...,1,1,1,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7437,7437,7226,6,3,7,0,1,0,6,0.400000,...,1,0,0,0,0,0,0,0,0,1
7438,7438,1934,3,188,5,0,3,0,4,0.800000,...,1,1,0,1,1,1,0,1,1,0
7439,7439,8244,4,130,4,0,1,0,1,0.666667,...,1,1,0,1,0,0,1,1,1,0
7440,7440,1984,3,197,5,0,1,0,4,0.428571,...,0,0,0,1,0,0,1,1,1,0


In [68]:
valid_y

Unnamed: 0,answerCode
0,0
1,1
2,0
3,0
4,1
...,...
7437,0
7438,1
7439,1
7440,0


In [69]:
train_x

Unnamed: 0,userID,KnowledgeTag,exam_num,test_num,question_num,past_correct_answer,past_correct_tag_answer,past_solve_problem,past_solve_tag,student_correct_answer_rate,...,last_answerCode1,last_answerCode2,last_answerCode3,last_answerCode4,last_answerCode5,last_answerCode6,last_answerCode7,last_answerCode8,last_answerCode9,last_answerCode10
0,0,7224,6,1,1,0,0,0,0,0.63172,...,1,1,1,1,1,1,1,1,1,1
1,0,7225,6,1,2,0,0,0,0,0.63172,...,1,1,1,1,1,1,1,1,1,1
2,0,7225,6,1,3,0,1,0,1,0.63172,...,1,1,1,1,1,1,1,1,1,1
3,0,7225,6,1,4,0,2,0,2,0.63172,...,1,1,1,1,1,1,1,1,1,1
4,0,7225,6,1,5,0,3,0,3,0.63172,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525951,7441,438,3,71,5,0,1,0,4,0.62500,...,0,1,0,0,0,0,0,0,1,1
2525952,7441,8836,4,165,1,0,0,0,0,0.62500,...,0,0,1,0,0,0,0,0,1,1
2525953,7441,8836,4,165,2,0,1,0,1,0.62500,...,1,0,0,1,0,0,0,0,1,1
2525954,7441,8836,4,165,3,0,2,0,2,0.62500,...,1,1,0,0,1,0,0,0,1,1


In [70]:
train_x

Unnamed: 0,userID,KnowledgeTag,exam_num,test_num,question_num,past_correct_answer,past_correct_tag_answer,past_solve_problem,past_solve_tag,student_correct_answer_rate,...,last_answerCode1,last_answerCode2,last_answerCode3,last_answerCode4,last_answerCode5,last_answerCode6,last_answerCode7,last_answerCode8,last_answerCode9,last_answerCode10
0,0,7224,6,1,1,0,0,0,0,0.63172,...,1,1,1,1,1,1,1,1,1,1
1,0,7225,6,1,2,0,0,0,0,0.63172,...,1,1,1,1,1,1,1,1,1,1
2,0,7225,6,1,3,0,1,0,1,0.63172,...,1,1,1,1,1,1,1,1,1,1
3,0,7225,6,1,4,0,2,0,2,0.63172,...,1,1,1,1,1,1,1,1,1,1
4,0,7225,6,1,5,0,3,0,3,0.63172,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525951,7441,438,3,71,5,0,1,0,4,0.62500,...,0,1,0,0,0,0,0,0,1,1
2525952,7441,8836,4,165,1,0,0,0,0,0.62500,...,0,0,1,0,0,0,0,0,1,1
2525953,7441,8836,4,165,2,0,1,0,1,0.62500,...,1,0,0,1,0,0,0,0,1,1
2525954,7441,8836,4,165,3,0,2,0,2,0.62500,...,1,1,0,0,1,0,0,0,1,1


In [71]:
test_x

Unnamed: 0,userID,KnowledgeTag,exam_num,test_num,question_num,past_correct_answer,past_correct_tag_answer,past_solve_problem,past_solve_tag,student_correct_answer_rate,...,last_answerCode1,last_answerCode2,last_answerCode3,last_answerCode4,last_answerCode5,last_answerCode6,last_answerCode7,last_answerCode8,last_answerCode9,last_answerCode10
0,3,5289,5,133,8,0,9,0,11,0.691787,...,0,1,1,1,1,1,1,0,1,1
1,4,9080,7,146,8,0,2,0,3,0.692537,...,1,1,1,1,0,1,1,0,0,1
2,13,9660,7,111,8,0,2,0,6,0.694529,...,1,0,0,0,1,1,0,1,0,0
3,17,2611,9,64,6,0,5,0,5,0.818110,...,1,1,1,1,1,1,1,1,1,1
4,26,1422,6,135,7,0,4,0,6,0.756477,...,0,1,0,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,7395,10615,4,122,5,0,0,0,2,0.260870,...,0,0,1,0,0,0,0,0,0,0
740,7404,7636,3,111,5,0,2,0,4,0.428571,...,1,1,0,0,0,1,0,0,0,1
741,7416,10402,5,193,4,0,2,0,3,0.428571,...,0,1,1,0,0,0,1,1,0,0
742,7417,10402,5,193,4,0,2,0,3,0.071429,...,0,1,1,0,0,0,0,0,0,0


In [72]:
train_x.columns

Index(['userID', 'KnowledgeTag', 'exam_num', 'test_num', 'question_num',
       'past_correct_answer', 'past_correct_tag_answer', 'past_solve_problem',
       'past_solve_tag', 'student_correct_answer_rate',
       'problem_correct_answer_rate', 'tag_correct_answer_rate', 'solve_time',
       'student_average_time', 'problem_average_time', 'tag_average_time',
       'year', 'month', 'day', 'last_answerCode1', 'last_answerCode2',
       'last_answerCode3', 'last_answerCode4', 'last_answerCode5',
       'last_answerCode6', 'last_answerCode7', 'last_answerCode8',
       'last_answerCode9', 'last_answerCode10'],
      dtype='object')

In [73]:
train_y

Unnamed: 0,answerCode
0,1
1,1
2,1
3,1
4,1
...,...
2525951,0
2525952,1
2525953,1
2525954,1


In [74]:
cat_params={
    "task_type" : "GPU",
    "devices" : '0',
    "random_state": 42,
    'learning_rate': 0.01, 
    'bagging_temperature': 0.15696396388661144, 
    'n_estimators': 4000, 
    'max_depth': 15, 
    'random_strength': 13, 
    'l2_leaf_reg': 5.887526043950164e-06, 
    'min_child_samples': 9, 
    'max_bin': 297, 
    'od_type': 'Iter',
}

xgb_params={
    "learning_rate" : 0.02,
    "n_estimators" : 1000,
    "max_depth":9
}


lgb_params = {'learning_rate': 0.01,
          'num_iterations':1000,
          'max_depth': 16,
          'boosting': 'gbdt',
          'objective': 'binary',
          'is_training_metric': True,
          'num_leaves': 144,
          'feature_fraction': 0.9,
          'bagging_fraction': 0.7,
          'bagging_freq': 5,
          'seed':2020}


In [75]:
train_x.columns

Index(['userID', 'KnowledgeTag', 'exam_num', 'test_num', 'question_num',
       'past_correct_answer', 'past_correct_tag_answer', 'past_solve_problem',
       'past_solve_tag', 'student_correct_answer_rate',
       'problem_correct_answer_rate', 'tag_correct_answer_rate', 'solve_time',
       'student_average_time', 'problem_average_time', 'tag_average_time',
       'year', 'month', 'day', 'last_answerCode1', 'last_answerCode2',
       'last_answerCode3', 'last_answerCode4', 'last_answerCode5',
       'last_answerCode6', 'last_answerCode7', 'last_answerCode8',
       'last_answerCode9', 'last_answerCode10'],
      dtype='object')

In [76]:
#train_x.drop(['past_solve_tag', 'past_correct_tag_answer','tag_correct_answer_rate','tag_average_time'],axis=1,inplace=True)

In [81]:
train_x['student_correct_answer_rate'].nunique()

5204

In [77]:
train_x

Unnamed: 0,userID,KnowledgeTag,exam_num,test_num,question_num,past_correct_answer,past_correct_tag_answer,past_solve_problem,past_solve_tag,student_correct_answer_rate,...,last_answerCode1,last_answerCode2,last_answerCode3,last_answerCode4,last_answerCode5,last_answerCode6,last_answerCode7,last_answerCode8,last_answerCode9,last_answerCode10
0,0,7224,6,1,1,0,0,0,0,0.63172,...,1,1,1,1,1,1,1,1,1,1
1,0,7225,6,1,2,0,0,0,0,0.63172,...,1,1,1,1,1,1,1,1,1,1
2,0,7225,6,1,3,0,1,0,1,0.63172,...,1,1,1,1,1,1,1,1,1,1
3,0,7225,6,1,4,0,2,0,2,0.63172,...,1,1,1,1,1,1,1,1,1,1
4,0,7225,6,1,5,0,3,0,3,0.63172,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525951,7441,438,3,71,5,0,1,0,4,0.62500,...,0,1,0,0,0,0,0,0,1,1
2525952,7441,8836,4,165,1,0,0,0,0,0.62500,...,0,0,1,0,0,0,0,0,1,1
2525953,7441,8836,4,165,2,0,1,0,1,0.62500,...,1,0,0,1,0,0,0,0,1,1
2525954,7441,8836,4,165,3,0,2,0,2,0.62500,...,1,1,0,0,1,0,0,0,1,1


In [78]:
model = CatBoostClassifier(**cat_params)
#cat_feat = [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 16,17,18,19,20,21,22,23,24,25,26,27,28]
model.fit(train_x, train_y, eval_set = (valid_x, valid_y))

pre = model.predict(test_x)

submit = pd.read_csv('/opt/ml/input/data/sample_submission.csv')
submit['prediction'] = pre
submit.to_csv('CatBoost_v4_tuning.csv', index=False)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


0:	learn: 0.6878667	test: 0.6898120	best: 0.6898120 (0)	total: 74.4ms	remaining: 4m 57s
1:	learn: 0.6826173	test: 0.6868393	best: 0.6868393 (1)	total: 389ms	remaining: 12m 56s
2:	learn: 0.6776332	test: 0.6836956	best: 0.6836956 (2)	total: 702ms	remaining: 15m 34s
3:	learn: 0.6733159	test: 0.6809052	best: 0.6809052 (3)	total: 767ms	remaining: 12m 46s
4:	learn: 0.6683666	test: 0.6778710	best: 0.6778710 (4)	total: 1.08s	remaining: 14m 23s
5:	learn: 0.6637507	test: 0.6749574	best: 0.6749574 (5)	total: 1.19s	remaining: 13m 9s
6:	learn: 0.6592043	test: 0.6718550	best: 0.6718550 (6)	total: 1.5s	remaining: 14m 18s
7:	learn: 0.6546713	test: 0.6691497	best: 0.6691497 (7)	total: 1.82s	remaining: 15m 8s
8:	learn: 0.6502245	test: 0.6663870	best: 0.6663870 (8)	total: 2.14s	remaining: 15m 47s
9:	learn: 0.6461879	test: 0.6638999	best: 0.6638999 (9)	total: 2.16s	remaining: 14m 22s
10:	learn: 0.6421218	test: 0.6611140	best: 0.6611140 (10)	total: 2.48s	remaining: 14m 58s
11:	learn: 0.6384719	test: 0.6585

SystemError: <method '_train' of '_catboost._CatBoost' objects> returned a result with an exception set

In [None]:
train_y

Unnamed: 0,answerCode
0,1
1,1
2,1
3,1
4,1
...,...
2525951,0
2525952,1
2525953,1
2525954,1


In [None]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(train_x, train_y, eval_set = [(valid_x, valid_y)], eval_metric = 'auc',early_stopping_rounds=100)

pre = model.predict(test_x)

submit = pd.read_csv('/opt/ml/input/data/sample_submission.csv')
submit['prediction'] = pre
submit.to_csv('XGBoost_tuning.csv', index=False)




[0]	validation_0-auc:0.77605
[1]	validation_0-auc:0.78277
[2]	validation_0-auc:0.78740
[3]	validation_0-auc:0.79362
[4]	validation_0-auc:0.79419
[5]	validation_0-auc:0.79742
[6]	validation_0-auc:0.79876
[7]	validation_0-auc:0.80123
[8]	validation_0-auc:0.80206
[9]	validation_0-auc:0.80250
[10]	validation_0-auc:0.80387
[11]	validation_0-auc:0.80416
[12]	validation_0-auc:0.80513
[13]	validation_0-auc:0.80599
[14]	validation_0-auc:0.80627
[15]	validation_0-auc:0.80693
[16]	validation_0-auc:0.80746
[17]	validation_0-auc:0.80789
[18]	validation_0-auc:0.80819
[19]	validation_0-auc:0.80876
[20]	validation_0-auc:0.80917
[21]	validation_0-auc:0.80939
[22]	validation_0-auc:0.80983
[23]	validation_0-auc:0.80989
[24]	validation_0-auc:0.80993
[25]	validation_0-auc:0.81043
[26]	validation_0-auc:0.81062
[27]	validation_0-auc:0.81107
[28]	validation_0-auc:0.81130
[29]	validation_0-auc:0.81135
[30]	validation_0-auc:0.81156
[31]	validation_0-auc:0.81171
[32]	validation_0-auc:0.81192
[33]	validation_0-au

In [None]:
import lightgbm as lgb

lgb_train = lgb.Dataset(train_x, train_y)
lgb_test = lgb.Dataset(valid_x, valid_y)

model = lgb.train(lgb_params, lgb_train, valid_sets=[lgb_train, lgb_test], verbose_eval=100, num_boost_round=500, early_stopping_rounds=100)

pre = model.predict(test_x)

submit = pd.read_csv('/opt/ml/input/data/sample_submission.csv')
submit['prediction'] = pre
submit.to_csv('LGBM_tuning.csv', index=False)



[LightGBM] [Info] Number of positive: 1653588, number of negative: 872368
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2686
[LightGBM] [Info] Number of data points in the train set: 2525956, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654638 -> initscore=0.639491
[LightGBM] [Info] Start training from score 0.639491
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.49891	valid_1's binary_logloss: 0.588887
[200]	training's binary_logloss: 0.464938	valid_1's binary_logloss: 0.551566
[300]	training's binary_logloss: 0.454588	valid_1's binary_logloss: 0.539333
[400]	training's binary_logloss: 0.450308	valid_1's binary_logloss: 0.533754
[500]	training's binary_logloss: 0.448003	valid_1's binary_logloss: 0.53068
[600]	training's binary_logloss: 0.446317	valid_1's binary_logloss: 0.52856
[700]	training's binary_logloss: 0.444955	valid_1's binary_logloss: 0.526616
[800]	tra