In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
# The bundled seaborn styles were renamed to 'seaborn-v0_8*' in matplotlib 3.6
# and the old 'seaborn' alias was later removed, so pick whichever name this
# matplotlib actually ships to keep the notebook runnable on a fresh kernel.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
pd.set_option('display.min_rows', 500)
import warnings
warnings.filterwarnings('ignore')

def concat_and_export(train_fe, test_fe, data_dir='/opt/ml/input/data',
                      output='after_fe_train_test_cumsum.pkl'):
    """Tag the train/test frames, stack them, and pickle the result.

    Parameters
    ----------
    train_fe, test_fe : pandas.DataFrame
        Frames to combine. NOTE: both are modified in place -- a ``kind``
        column is added ('train' / 'test') so the rows can be told apart
        after concatenation.
    data_dir : str, optional
        Directory the pickle is written to. Defaults to the path that was
        previously hard-coded.
    output : str, optional
        File name of the pickle inside ``data_dir``. Defaults to the name
        that was previously hard-coded.

    Returns
    -------
    None
        The combined frame is only written to disk; the path is printed.
    """
    train_fe['kind'] = 'train'
    test_fe['kind'] = 'test'

    # Original index labels are kept, so labels may repeat across the two parts.
    df = pd.concat([train_fe, test_fe])

    write_path = f'{data_dir}/{output}'
    # DataFrame.to_pickle() has no `index` argument (that belongs to to_csv);
    # passing index=False raised TypeError before anything was written.
    df.to_pickle(write_path)
    print(f"Write: {write_path}")
    
def export(df, output='after_fe_train_test.pkl', data_dir='/opt/ml/input/data'):
    """Pickle ``df`` to ``{data_dir}/{output}`` and print the path written.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialize.
    output : str, optional
        File name of the pickle inside ``data_dir``.
    data_dir : str, optional
        Target directory. Defaults to the path that was previously
        hard-coded, so existing calls behave exactly as before.

    Returns
    -------
    None
    """
    write_path = f'{data_dir}/{output}'
    df.to_pickle(write_path)
    print(f"Write: {write_path}")
    
# Load the raw train/test interaction logs and the sample submission
# from the shared data directory (read in that order).
path = '../../data/'
train, test, sub = (
    pd.read_csv(f"{path}/{csv_name}")
    for csv_name in ("train_data.csv", "test_data.csv", "sample_submission.csv")
)

In [2]:
# Integer code assigned to each weekday name. The codes follow the listed
# order below, which is NOT calendar order.
day_dict = dict(zip(
    ['Tuesday', 'Thursday', 'Monday', 'Saturday', 'Friday', 'Wednesday', 'Sunday'],
    range(7),
))

def feature_engineering(df):
    """Derive per-interaction features from a raw interaction log.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below: ``userID``, ``Timestamp``,
        ``answerCode``, ``testId``, ``assessmentItemID`` and
        ``KnowledgeTag``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame ordered by (userID, Timestamp) with a fresh integer
        index (the merges below rebuild it) and these added columns:
        running per-user counts (``user_correct_answer``,
        ``user_total_answer``, ``user_acc``), calendar parts (``month``,
        ``day``, ``hour``, ``dayname``), ID-derived codes
        (``big_category``, ``problem_num``, ``mid_category``), per-test and
        per-tag answer statistics, and elapsed-time features
        (``Timestamp2``, ``solvetime``, ``solvesec``, ``solvesec_3600``,
        ``time_category``).

    Notes
    -----
    ``time_category`` is the bin number itself (0..9). Previously the code
    was the position of the bin label among the labels that happened to be
    present in ``df``, so a frame missing any bin got different codes for
    the same bins (train and test are processed in separate calls). When
    every bin is present the two encodings coincide.
    """
    # Order rows so each user's interactions form a chronological sequence.
    df2 = df.sort_values(by=['userID', 'Timestamp'])

    # Running per-user totals, each excluding the current row: number of
    # correct answers so far, number of answers so far, and their ratio.
    df2['user_correct_answer'] = df2.groupby('userID')['answerCode'].transform(
        lambda x: x.cumsum().shift(1))
    df2['user_total_answer'] = df2.groupby('userID')['answerCode'].cumcount()
    df2['user_acc'] = df2['user_correct_answer'] / df2['user_total_answer']

    # Parse the timestamps once for all calendar features.
    timestamps = pd.to_datetime(df2['Timestamp'])
    df2['month'] = timestamps.dt.month
    df2['day'] = timestamps.dt.day
    df2['hour'] = timestamps.dt.hour
    df2['dayname'] = timestamps.dt.day_name().map(day_dict)

    # Codes sliced out of the ID strings.
    df2['big_category'] = df2.testId.map(lambda x: x[2]).astype(int)
    df2['problem_num'] = df2.assessmentItemID.map(lambda x: int(x[-3:]))
    df2['mid_category'] = df2.testId.map(lambda x: int(x[-3:]))

    # Whole-frame answer statistics per testId and per KnowledgeTag,
    # computed in one pass and joined back onto every row.
    correct_t = df2.groupby(['testId'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_t.columns = ["test_mean", "test_std", 'test_sum']
    correct_k = df2.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_std', 'tag_sum']

    df2 = pd.merge(df2, correct_t, on=['testId'], how="left")
    df2 = pd.merge(df2, correct_k, on=['KnowledgeTag'], how="left")

    # Elapsed time since the same user's previous row (0 for a user's first
    # row). The merges rebuilt the index, so parse from the merged frame
    # instead of reusing `timestamps`.
    df2['Timestamp2'] = pd.to_datetime(df2['Timestamp'])
    df2['solvetime'] = df2.groupby('userID')['Timestamp2'].diff().fillna(pd.Timedelta(seconds=0))
    df2['solvesec'] = df2['solvetime'].dt.total_seconds()
    # Same value capped at one hour.
    df2['solvesec_3600'] = df2['solvesec'].clip(upper=3600)

    # Bucket the elapsed seconds: 0 -> [0, 0], k -> (tc[k-1], tc[k]],
    # len(tc) -> everything >= tc[-1].
    tc = [0, 5, 7, 10, 60, 600, 1200, 2400, 3600]
    sec = df2['solvesec']
    df2['time_category'] = 0
    for code, (lo, hi) in enumerate(zip(tc[:-1], tc[1:]), start=1):
        df2.loc[(sec > lo) & (sec <= hi), 'time_category'] = code
    # Applied last on purpose: exactly tc[-1] seconds lands in the top bucket,
    # as it did before.
    df2.loc[sec >= tc[-1], 'time_category'] = len(tc)

    return df2

In [3]:
# Build the feature frames for both splits (train first, then test),
# then show their shapes as a quick sanity check.
train_fe, test_fe = feature_engineering(train), feature_engineering(test)
(train_fe.shape, test_fe.shape)

((2266586, 27), (260114, 27))

In [4]:
# Mark which split each row came from (the frames are tagged in place),
# then stack train on top of test. Index labels are kept as they are.
for _frame, _kind in ((train_fe, 'train'), (test_fe, 'test')):
    _frame['kind'] = _kind
df = pd.concat([train_fe, test_fe])

In [5]:
# Peek at the first three rows of the combined frame.
df.head(3)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,month,...,test_sum,tag_mean,tag_std,tag_sum,Timestamp2,solvetime,solvesec,solvesec_3600,time_category,kind
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,3,...,1268,0.955022,0.20741,637,2020-03-24 00:17:11,0 days 00:00:00,0.0,0.0,0,train
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,3,...,1268,0.913187,0.281603,3040,2020-03-24 00:17:14,0 days 00:00:03,3.0,3.0,1,train
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,3,...,1268,0.913187,0.281603,3040,2020-03-24 00:17:22,0 days 00:00:08,8.0,8.0,3,train


In [37]:
# Running total of the capped solve time within each (user, test) group,
# reduced modulo 3601.
# NOTE(review): the modulo wraps the total back toward 0 rather than capping
# it at 3600 -- confirm that wrap-around is the intended behaviour.
_elapsed_in_test = df.groupby(['userID','testId'])['solvesec_3600'].cumsum()
df['solvesec_cumsum'] = _elapsed_in_test % 3601

In [53]:
# Variant of the running total above, reduced modulo 3600 instead of 3601.
_elapsed_in_test2 = df.groupby(['userID','testId'])['solvesec_3600'].cumsum()
df['solvesec_cumsum2'] = _elapsed_in_test2 % 3600

In [38]:
# Summary statistics of the modulo-3601 running total.
df['solvesec_cumsum'].describe()

count    2.526700e+06
mean     7.900849e+02
std      1.247025e+03
min      0.000000e+00
25%      4.900000e+01
50%      1.670000e+02
75%      7.030000e+02
max      3.600000e+03
Name: solvesec_cumsum, dtype: float64

In [41]:
# Bucket the running solve time into labelled bins:
#   "0 - [0,0]" for exactly zero, "k - (lo, hi]" for each consecutive pair of
#   edges in `tc`, and a final open-ended bin for values >= the last edge.
# The prints trace which edges were processed.
df['solvecumsum_category'] = ''
tc = [0,5,7,10,60,600,1200,2400,3600,7200]
_cumsec = df.solvesec_cumsum

df.loc[_cumsec == 0, 'solvecumsum_category'] = "0 - [0,0]"
print(0)
for i, (s, e) in enumerate(zip(tc[:-1], tc[1:])):
    _in_bin = (_cumsec > s) & (_cumsec <= e)
    df.loc[_in_bin, 'solvecumsum_category'] = f"{i+1} - ({s}, {e}]"
    print(s,e)
# Applied last, so a value equal to the final edge ends up in this bin.
df.loc[_cumsec >= tc[-1], 'solvecumsum_category'] = f"{i+2} - ({e}, )"
print(e)

0
0 5
5 7
7 10
10 60
60 600
600 1200
1200 2400
2400 3600
3600 7200
7200


In [42]:
# Mean answerCode and row count per cumulative-time bin, shown in bin order.
# The labels are strings, so the leading number is pulled out to sort by.
time_grp = df.groupby('solvecumsum_category')['answerCode'].agg(['mean','count'])
_bin_number = time_grp.index.str.extract(r'(\d+)')[0].astype(int)
time_grp['C'] = _bin_number.tolist()
time_grp.sort_values('C')[['mean','count']]

Unnamed: 0_level_0,mean,count
solvecumsum_category,Unnamed: 1_level_1,Unnamed: 2_level_1
"0 - [0,0]",0.479131,14735
"1 - (0, 5]",0.27578,90895
"2 - (5, 7]",0.395883,40224
"3 - (7, 10]",0.44235,59003
"4 - (10, 60]",0.646462,515006
"5 - (60, 600]",0.68474,1129444
"6 - (600, 1200]",0.648981,168475
"7 - (1200, 2400]",0.638723,108947
"8 - (2400, 3600]",0.733706,399971


In [44]:
# Rows whose per-question time bucket is 0.
df.loc[df['time_category'] == 0]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,month,...,time_category,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,kind,solvesec_cumsum,solvecumsum_category
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,3,...,0,0.947683,0.222749,1268,0.955022,0.207410,637,train,0.0,"0 - [0,0]"
123,0,A080016008,A080000016,0,2020-05-07 00:52:47,4672,88.0,123,0.715447,5,...,0,0.602778,0.489549,651,0.432649,0.495554,970,train,321.0,"5 - (60, 600]"
146,0,A080020006,A080000020,0,2020-05-14 01:35:29,4673,102.0,146,0.698630,5,...,0,0.612500,0.487414,637,0.439641,0.496483,783,train,519.0,"5 - (60, 600]"
263,0,A080039002,A080000039,0,2020-06-11 02:02:22,4688,170.0,263,0.646388,6,...,0,0.533046,0.499266,371,0.376241,0.484512,1289,train,18.0,"4 - (10, 60]"
469,0,A080078008,A080000078,0,2020-08-29 00:26:11,23,289.0,469,0.616205,8,...,0,0.691576,0.462157,509,0.584444,0.492954,1052,train,41.0,"4 - (10, 60]"
553,0,A080090002,A080000090,1,2020-09-18 20:13:07,1110,344.0,553,0.622061,9,...,0,0.739130,0.439907,204,0.570035,0.495114,3272,train,258.0,"5 - (60, 600]"
556,0,A080092002,A080000092,1,2020-09-22 18:57:09,1110,346.0,556,0.622302,9,...,0,0.755435,0.430414,278,0.570035,0.495114,3272,train,47.0,"4 - (10, 60]"
559,0,A080092008,A080000092,1,2020-09-22 19:02:06,1110,348.0,559,0.622540,9,...,0,0.755435,0.430414,278,0.570035,0.495114,3272,train,344.0,"5 - (60, 600]"
745,1,A040013001,A040000013,1,2020-01-06 08:40:43,2048,,0,,1,...,0,0.595472,0.490986,789,0.616900,0.486297,971,train,0.0,"0 - [0,0]"
1678,2,A030050001,A030000050,1,2020-01-10 11:02:53,407,,0,,1,...,0,0.771212,0.420212,1018,0.766520,0.423094,3306,train,0.0,"0 - [0,0]"


In [45]:
# Replace each bin label with the bin number it starts with ("4 - (10, 60]" -> 4).
# Ranking the labels with sorted() instead would tie the code to which labels
# happen to be present, and would place "10 - ..." before "2 - ..." because
# the labels are compared as strings.
# A label with no numeric prefix (e.g. '' for a row that matched no bin)
# raises ValueError here instead of silently shifting every other code.
# str() keeps the cell safe to re-run once the column already holds integers.
solvecumsum_category2idx = {
    label: int(str(label).split(' - ')[0])
    for label in df.solvecumsum_category.unique()
}
df['solvecumsum_category'] = df.solvecumsum_category.map(solvecumsum_category2idx)
df.solvecumsum_category.value_counts()

5    1129444
4     515006
8     399971
6     168475
7     108947
1      90895
3      59003
2      40224
0      14735
Name: solvecumsum_category, dtype: int64

In [46]:
# First three rows again, now with the encoded cumulative-time bin.
df.head(3)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,month,...,time_category,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,kind,solvesec_cumsum,solvecumsum_category
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,3,...,0,0.947683,0.222749,1268,0.955022,0.20741,637,train,0.0,0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,3,...,1,0.947683,0.222749,1268,0.913187,0.281603,3040,train,3.0,1
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,3,...,3,0.947683,0.222749,1268,0.913187,0.281603,3040,train,11.0,4


In [55]:
# Summary statistics of the modulo-3600 running total.
df['solvesec_cumsum2'].describe()

count    2.526700e+06
mean     2.901079e+02
std      5.412119e+02
min      0.000000e+00
25%      1.600000e+01
50%      9.000000e+01
75%      2.820000e+02
max      3.599000e+03
Name: solvesec_cumsum2, dtype: float64

In [54]:
# Snapshot the combined frame and pickle the snapshot.
df2 = df.copy()
export(df2, 'after_fe_train_test_cumsum.pkl')

Write: /opt/ml/input/data/after_fe_train_test_cumsum.pkl


In [56]:
# Spot-check the running per-user counters for a single user.
df.loc[
    df['userID'] == 7331,
    ['userID','assessmentItemID','testId','answerCode','user_correct_answer','user_total_answer','user_acc'],
]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,user_correct_answer,user_total_answer,user_acc
2264881,7331,A060130001,A060000130,1,,0,
2264882,7331,A060130002,A060000130,0,1.0,1,1.0
2264883,7331,A060130003,A060000130,1,1.0,2,0.5
2264884,7331,A060130004,A060000130,0,2.0,3,0.666667
2264885,7331,A060130005,A060000130,0,2.0,4,0.5
2264886,7331,A060130006,A060000130,1,2.0,5,0.4
2264887,7331,A060130007,A060000130,1,3.0,6,0.5
2264888,7331,A020118001,A020000118,1,4.0,7,0.571429
2264889,7331,A020118002,A020000118,1,5.0,8,0.625
2264890,7331,A020118003,A020000118,0,6.0,9,0.666667
