In [33]:
import random
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and the old names were removed afterwards, so use whichever name this
# installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.min_rows', 500)

# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

def concat_and_export(train_fe, test_fe, data_dir='/opt/ml/input/data'):
    """Tag both splits, stack them, and pickle the combined frame.

    Args:
        train_fe: Feature-engineered training DataFrame.
        test_fe: Feature-engineered test DataFrame.
        data_dir: Directory that receives ``after_fe_train_test.pkl``.
            Defaults to the previously hard-coded location.

    Returns:
        None.

    Side effects:
        Adds a ``kind`` column ('train' / 'test') to the two input frames
        in place, writes the pickle, and prints the path that was written.
    """
    train_fe['kind']='train'
    test_fe['kind']='test'

    # Row indices of the two inputs are kept as-is (not renumbered).
    df = pd.concat([train_fe,test_fe])
    write_path=f'{data_dir}/after_fe_train_test.pkl'
    df.to_pickle(write_path)
    print(f"Write: {write_path}")
    
def export(df, data_dir='/opt/ml/input/data'):
    """Pickle ``df`` as ``after_fe_train_test.pkl`` and print the path.

    Args:
        df: DataFrame to persist.
        data_dir: Directory that receives the pickle. Defaults to the
            previously hard-coded location.

    Returns:
        None.
    """
    write_path=f'{data_dir}/after_fe_train_test.pkl'
    df.to_pickle(write_path)
    print(f"Write: {write_path}")
    
# Directory holding the raw CSV files, relative to this notebook.
path='../../data/'

# Read the train table, the test table and the sample-submission table.
train, test, sub = (
    pd.read_csv(csv_file)
    for csv_file in (
        f"{path}/train_data.csv",
        f"{path}/test_data.csv",
        f"{path}/sample_submission.csv",
    )
)

# Integer codes for weekday names. The numbering is arbitrary rather than
# calendar order, so the resulting feature is a categorical label.
day_dict = {'Tuesday': 0,
 'Thursday': 1,
 'Monday': 2,
 'Saturday': 3,
 'Friday': 4,
 'Wednesday': 5,
 'Sunday': 6}

def feature_engineering(df):
    """Return a sorted copy of ``df`` with user, calendar and item features.

    The input must provide the columns ``userID``, ``Timestamp``,
    ``answerCode``, ``testId``, ``assessmentItemID`` and ``KnowledgeTag``.
    The input frame itself is not modified.

    Added columns:
        user_correct_answer, user_total_answer, user_acc:
            per-user running totals over the rows *before* the current one.
        month, day, hour, dayname:
            calendar parts of ``Timestamp`` (``dayname`` uses ``day_dict``).
        big_category, mid_category, problem_num:
            integers sliced out of ``testId`` / ``assessmentItemID``.
        test_mean, test_std, test_sum, tag_mean, tag_std, tag_sum:
            answerCode statistics per ``testId`` and per ``KnowledgeTag``.

    Returns:
        A new DataFrame ordered by (userID, Timestamp) with a fresh
        0..n-1 index (the merges renumber it).
    """
    # Order each user's interactions chronologically so the running
    # statistics below follow the sequence in which they happened.
    df2 = df.sort_values(by=['userID','Timestamp'])

    # Running number of correct answers, number of answers and accuracy per
    # user. The shift makes a row see only what came before it, so the first
    # row of every user is NaN / 0 / NaN.
    df2['user_correct_answer'] = df2.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df2['user_total_answer'] = df2.groupby('userID')['answerCode'].cumcount()
    df2['user_acc'] = df2['user_correct_answer']/df2['user_total_answer']

    # Parse the timestamp once and reuse it for every calendar feature.
    timestamps = pd.to_datetime(df2['Timestamp'])
    df2['month'] = timestamps.dt.month
    df2['day'] = timestamps.dt.day
    df2['hour'] = timestamps.dt.hour
    df2['dayname'] = timestamps.dt.day_name().map(day_dict)

    # Third character of testId, and the trailing three digits of the
    # item id / test id, as integers.
    df2['big_category'] = df2.testId.map(lambda x:x[2]).astype(int)
    df2['problem_num'] = df2.assessmentItemID.map(lambda x: int(x[-3:]))
    df2['mid_category'] = df2.testId.map(lambda x: int(x[-3:]))

    # Overall answer statistics per testId and per KnowledgeTag.
    # answerCode == -1 is a placeholder rather than a real 0/1 outcome, so
    # those rows are left out of the aggregates; otherwise they would drag
    # mean and sum down and inflate std.
    labeled = df2[df2['answerCode'] != -1]
    correct_t = labeled.groupby(['testId'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_t.columns = ["test_mean", "test_std", 'test_sum']
    correct_k = labeled.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_std', 'tag_sum']

    # Left joins keep every row, including the -1 placeholders; a key with
    # no labeled rows simply receives NaN statistics.
    df2 = pd.merge(df2, correct_t, on=['testId'], how="left")
    df2 = pd.merge(df2, correct_k, on=['KnowledgeTag'], how="left")

    return df2

# Build the engineered feature tables for both splits and show their shapes.
train_fe, test_fe = feature_engineering(train), feature_engineering(test)
train_fe.shape, test_fe.shape

# Tag every row with the split it came from, then stack the splits.
train_fe['kind'], test_fe['kind'] = 'train', 'test'
df = pd.concat(objs=[train_fe, test_fe])

((2266586, 22), (260114, 22))

In [34]:
# Put the combined frame in per-user chronological order and renumber the
# index, since concatenating the two splits left duplicate index labels.
df = df.sort_values(['userID','Timestamp']).reset_index(drop=True)

# Datetime version of the raw timestamp; treated as the moment an item starts.
df['Timestamp_start'] = pd.to_datetime(df['Timestamp'])

In [35]:
# The same user's next start time doubles as this row's finish time;
# each user's last row has no successor and stays NaT.
df['Timestamp_fin'] = df.groupby(by='userID')['Timestamp_start'].shift(periods=-1)

In [36]:
# Elapsed time between this row and the same user's next row.
df['solvetime'] = df['Timestamp_fin'] - df['Timestamp_start']

In [38]:
# Rows without a following interaction get a zero duration instead of NaT.
df['solvetime'] = df['solvetime'].fillna(pd.Timedelta(seconds=0))

In [39]:
# Duration in seconds as a float, computed with the vectorized timedelta
# accessor instead of a Python-level call per row.
df['solvesec'] = df['solvetime'].dt.total_seconds()

In [55]:
# Spot-check the derived timing columns for a single user.
# NOTE(review): the saved output of this cell shows a time_category column on
# df, but no visible cell adds that column to df (only to df2), and the cell's
# execution count is out of sequence. Re-run from a fresh kernel to confirm.
df.loc[df['userID'] == 7331]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,month,...,test_sum,tag_mean,tag_std,tag_sum,kind,Timestamp_start,Timestamp_fin,solvetime,solvesec,time_category
2524767,7331,A060130001,A060000130,1,2020-10-05 11:33:34,714,,0,,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:33:34,2020-10-05 11:35:06,0 days 00:01:32,92.0,"5 - (60, 600]"
2524768,7331,A060130002,A060000130,0,2020-10-05 11:35:06,714,1.0,1,1.0,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:35:06,2020-10-05 11:38:51,0 days 00:03:45,225.0,"5 - (60, 600]"
2524769,7331,A060130003,A060000130,1,2020-10-05 11:38:51,714,1.0,2,0.5,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:38:51,2020-10-05 11:42:05,0 days 00:03:14,194.0,"5 - (60, 600]"
2524770,7331,A060130004,A060000130,0,2020-10-05 11:42:05,714,2.0,3,0.666667,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:42:05,2020-10-05 11:53:15,0 days 00:11:10,670.0,"6 - (600, 1200]"
2524771,7331,A060130005,A060000130,0,2020-10-05 11:53:15,714,2.0,4,0.5,10,...,745,0.487421,0.499901,2073,train,2020-10-05 11:53:15,2020-10-05 12:02:13,0 days 00:08:58,538.0,"5 - (60, 600]"
2524772,7331,A060130006,A060000130,1,2020-10-05 12:02:13,714,2.0,5,0.4,10,...,745,0.487421,0.499901,2073,train,2020-10-05 12:02:13,2020-10-05 12:13:36,0 days 00:11:23,683.0,"6 - (600, 1200]"
2524773,7331,A060130007,A060000130,1,2020-10-05 12:13:36,714,3.0,6,0.5,10,...,745,0.487421,0.499901,2073,train,2020-10-05 12:13:36,2020-10-25 03:02:10,19 days 14:48:34,1694914.0,"9 - (3600, )"
2524774,7331,A020118001,A020000118,1,2020-10-25 03:02:10,8014,4.0,7,0.571429,10,...,1067,0.792901,0.405277,3239,train,2020-10-25 03:02:10,2020-10-25 03:02:18,0 days 00:00:08,8.0,"3 - (7, 10]"
2524775,7331,A020118002,A020000118,1,2020-10-25 03:02:18,8014,5.0,8,0.625,10,...,1067,0.792901,0.405277,3239,train,2020-10-25 03:02:18,2020-10-25 03:03:31,0 days 00:01:13,73.0,"5 - (60, 600]"
2524776,7331,A020118003,A020000118,0,2020-10-25 03:03:31,8014,6.0,9,0.666667,10,...,1067,0.792901,0.405277,3239,train,2020-10-25 03:03:31,2020-10-25 03:03:51,0 days 00:00:20,20.0,"4 - (10, 60]"


## 타임 카테고리

### answercode -1 제외

In [43]:
# Work on an independent copy so df itself keeps every row.
df2 = df.copy(deep=True)

In [44]:
# Keep only rows whose answerCode is not the -1 placeholder.
df2 = df2.loc[df2['answerCode'] != -1]

In [45]:
# Shapes before and after removing the -1 rows.
(df.shape, df2.shape)

((2526700, 28), (2525956, 28))

In [46]:
# Rows that match none of the buckets below keep the empty label.
df2['time_category'] = ''

# Bucket edges in seconds; consecutive pairs form right-closed intervals.
tc = [0,5,7,10,60,600,1200,2400,3600]

# A duration of exactly zero gets its own bucket. This includes the rows whose
# missing solve time was filled with 0 earlier in the notebook.
df2.loc[(df2.solvesec==0), 'time_category'] = "0 - [0,0]"
for i in range(len(tc)-1):
    s,e = tc[i],tc[i+1]
    df2.loc[(df2.solvesec>s) & (df2.solvesec<=e),'time_category']=f"{i+1} - ({s}, {e}]"

# Open-ended last bucket. The comparison is strict so that it matches its
# "(3600, )" label and does not overwrite the "(2400, 3600]" bucket for a
# duration of exactly 3600 s. The label is built from tc itself rather than
# from variables left over by the loop.
df2.loc[(df2.solvesec>tc[-1]),'time_category'] = f"{len(tc)} - ({tc[-1]}, )"

# Accuracy and row count per bucket, ordered by the numeric label prefix.
time_grp = df2.groupby('time_category')['answerCode'].agg(['mean','count'])
time_grp['C'] = time_grp.index.str.extract(r'(\d+)').astype(int)[0].tolist()
time_grp.sort_values('C')[['mean','count']]

0
0 5
5 7
7 10
10 60
60 600
600 1200
1200 2400
2400 3600
3600


Unnamed: 0_level_0,mean,count
time_category,Unnamed: 1_level_1,Unnamed: 2_level_1
"0 - [0,0]",0.460475,14978
"1 - (0, 5]",0.269916,364747
"2 - (5, 7]",0.529562,69176
"3 - (7, 10]",0.682586,102204
"4 - (10, 60]",0.77853,1008903
"5 - (60, 600]",0.761771,552226
"6 - (600, 1200]",0.696136,27384
"7 - (1200, 2400]",0.649585,16763
"8 - (2400, 3600]",0.62791,7560
"9 - (3600, )",0.555264,362015


In [48]:
# Start the capped solve-time feature from the raw number of seconds.
df2['solvesec_3600'] = df2['solvesec']

In [49]:
# Cap the feature at one hour; anything longer is stored as 3600.
df2['solvesec_3600'] = df2['solvesec_3600'].clip(upper=3600)

In [51]:
# Encode the bucket labels as consecutive integers. Labels are ordered by
# their numeric prefix, not alphabetically, so the order stays correct with
# ten or more numbered buckets ("10 - ..." must come after "2 - ...").
# str() keeps the cell re-runnable once the column already holds integers,
# and a label without a prefix (the empty default) sorts first.
timecat2idx = {
    label: idx
    for idx, label in enumerate(
        sorted(
            df2.time_category.unique(),
            key=lambda name: int(str(name).split(' - ', 1)[0] or -1),
        )
    )
}
df2['time_category'] = df2.time_category.map(timecat2idx)
df2.time_category.value_counts()

4    1008903
5     552226
1     364747
9     362015
3     102204
2      69176
6      27384
7      16763
0      14978
8       7560
Name: time_category, dtype: int64

In [52]:
# Preview the first rows instead of rendering the whole multi-million-row frame.
df2.head(10)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,month,...,tag_mean,tag_std,tag_sum,kind,Timestamp_start,Timestamp_fin,solvetime,solvesec,time_category,solvesec_3600
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,3,...,0.955022,0.207410,637,train,2020-03-24 00:17:11,2020-03-24 00:17:14,0 days 00:00:03,3.0,1,3.0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.000000,3,...,0.913187,0.281603,3040,train,2020-03-24 00:17:14,2020-03-24 00:17:22,0 days 00:00:08,8.0,3,8.0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.000000,3,...,0.913187,0.281603,3040,train,2020-03-24 00:17:22,2020-03-24 00:17:29,0 days 00:00:07,7.0,2,7.0
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.000000,3,...,0.913187,0.281603,3040,train,2020-03-24 00:17:29,2020-03-24 00:17:36,0 days 00:00:07,7.0,2,7.0
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.000000,3,...,0.913187,0.281603,3040,train,2020-03-24 00:17:36,2020-03-24 00:17:47,0 days 00:00:11,11.0,4,11.0
5,0,A060001007,A060000001,1,2020-03-24 00:17:47,7225,5.0,5,1.000000,3,...,0.913187,0.281603,3040,train,2020-03-24 00:17:47,2020-03-26 05:52:03,2 days 05:34:16,192856.0,9,3600.0
6,0,A060003001,A060000003,0,2020-03-26 05:52:03,7226,6.0,6,1.000000,3,...,0.799552,0.400380,3570,train,2020-03-26 05:52:03,2020-03-26 05:52:10,0 days 00:00:07,7.0,2,7.0
7,0,A060003002,A060000003,1,2020-03-26 05:52:10,7226,6.0,7,0.857143,3,...,0.799552,0.400380,3570,train,2020-03-26 05:52:10,2020-03-26 05:53:14,0 days 00:01:04,64.0,5,64.0
8,0,A060003003,A060000003,1,2020-03-26 05:53:14,7226,7.0,8,0.875000,3,...,0.799552,0.400380,3570,train,2020-03-26 05:53:14,2020-03-26 05:53:29,0 days 00:00:15,15.0,4,15.0
9,0,A060003004,A060000003,1,2020-03-26 05:53:29,7226,8.0,9,0.888889,3,...,0.799552,0.400380,3570,train,2020-03-26 05:53:29,2020-03-26 05:53:48,0 days 00:00:19,19.0,4,19.0


In [53]:
# Remove the intermediate datetime/timedelta columns before persisting;
# the numeric solvesec / solvesec_3600 columns stay.
# NOTE(review): df2 had its answerCode == -1 rows filtered out earlier, so the
# exported file does not contain them. Confirm that this is intended.
df3 = df2.drop(columns=['Timestamp_start','Timestamp_fin','solvetime'])
export(df3)

Write: /opt/ml/input/data/after_fe_train_test.pkl
