In [3]:
# Numerical / plotting stack used by the notebook.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
# Apply the 'seaborn' matplotlib style sheet to every figure.
# NOTE(review): recent matplotlib releases renamed this style sheet
# (e.g. 'seaborn-v0_8') — confirm against the installed version.
plt.style.use('seaborn')
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
# Set pandas' 'display.min_rows' option to 500.
pd.set_option('display.min_rows', 500)
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

def concat_and_export(train_fe, test_fe, data_dir='/opt/ml/input/data',
                      output='after_fe_train_test_cumsum.pkl'):
    """Tag the train/test feature frames, stack them and pickle the result.

    Parameters
    ----------
    train_fe, test_fe : pandas.DataFrame
        Feature frames. A ``kind`` column ('train' / 'test') is written onto
        both frames in place before they are concatenated.
    data_dir : str, optional
        Directory the pickle is written to.
    output : str, optional
        File name of the pickle inside ``data_dir``.

    Returns
    -------
    None
        The combined frame is written to ``{data_dir}/{output}`` and the
        path is printed.
    """
    train_fe['kind']='train'
    test_fe['kind']='test'

    df = pd.concat([train_fe,test_fe])
    write_path=f'{data_dir}/{output}'
    # DataFrame.to_pickle() has no `index` argument (unlike to_csv); passing
    # index=False raised TypeError before anything was written.
    df.to_pickle(write_path)
    print(f"Write: {write_path}")
    
def export(df, output='after_fe_train_test.pkl', data_dir='/opt/ml/input/data'):
    """Pickle ``df`` to ``{data_dir}/{output}`` and print the written path.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialize.
    output : str, optional
        File name of the pickle inside ``data_dir``.
    data_dir : str, optional
        Target directory. Defaults to the directory that was previously
        hard-coded, so existing calls behave exactly as before.

    Returns
    -------
    None
    """
    write_path=f'{data_dir}/{output}'
    df.to_pickle(write_path)
    print(f"Write: {write_path}")
    
# Directory (relative to the notebook) that holds the raw CSV files.
path = '../../data/'

# Read the train split, the test split and the sample submission, in that order.
train, test, sub = (
    pd.read_csv(f"{path}/train_data.csv"),
    pd.read_csv(f"{path}/test_data.csv"),
    pd.read_csv(f"{path}/sample_submission.csv"),
)

# Integer code for each weekday name, used for the `dayname` feature.
# The codes are arbitrary ids (not calendar order); keep them stable because
# exported feature files depend on them.
day_dict = {
    'Tuesday': 0,
    'Thursday': 1,
    'Monday': 2,
    'Saturday': 3,
    'Friday': 4,
    'Wednesday': 5,
    'Sunday': 6,
}

def feature_engineering(df):
    """Return a copy of ``df`` sorted per user with engineered feature columns.

    The input frame is not modified. It must provide the columns ``userID``,
    ``Timestamp``, ``answerCode``, ``testId``, ``assessmentItemID`` and
    ``KnowledgeTag``.

    Columns added, in this order:

    * ``Timestamp2`` - ``Timestamp`` parsed to datetime.
    * ``solvetime`` / ``solvesec`` - time since the same user's previous row
      (timedelta / seconds); 0 for a user's first row.
    * ``solvesec_3600`` - ``solvesec`` capped at 3600.
    * ``time_category`` - integer bin of ``solvesec``:
      0 = exactly 0, 1 = (0, 5], 2 = (5, 7], 3 = (7, 10], 4 = (10, 60],
      5 = (60, 600], 6 = (600, 1200], 7 = (1200, 2400], 8 = (2400, 3600],
      9 = above 3600.
    * ``user_correct_answer`` / ``user_total_answer`` / ``user_acc`` - per-user
      running count of correct answers, running count of answers and their
      ratio, all computed from earlier rows only (NaN on a user's first row
      for the correct count and the ratio).
    * ``month``, ``day``, ``hour``, ``dayname`` - calendar parts of the
      timestamp (``dayname`` encoded through ``day_dict``).
    * ``big_category``, ``mid_category``, ``problem_num`` - integers parsed
      from ``testId[2]``, ``testId[-3:]`` and ``assessmentItemID[-3:]``.
    * ``test_mean/std/sum`` and ``tag_mean/std/sum`` - mean, standard
      deviation and sum of ``answerCode`` per ``testId`` / ``KnowledgeTag``.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index (a side effect of the merges).
    """
    # Sort per user in Timestamp order so the per-user diffs and running
    # counts below follow the sequence of interactions. sort_values returns
    # a new frame, so the caller's data is left untouched.
    df2 = df.sort_values(by=['userID','Timestamp'])

    # Parse the timestamp once and reuse it for every time-derived feature.
    timestamp = pd.to_datetime(df2['Timestamp'])

    # Time spent since the same user's previous row.
    df2['Timestamp2'] = timestamp
    df2['solvetime'] = df2.groupby('userID')['Timestamp2'].diff().fillna(pd.Timedelta(seconds=0))
    df2['solvesec'] = df2['solvetime'].dt.total_seconds()
    df2['solvesec_3600'] = df2['solvesec'].clip(upper=3600)

    # Bin the solve time. The code is derived from the bin edges directly
    # rather than from the set of labels that happen to occur in `df`, so the
    # same duration always gets the same code (train and test stay aligned
    # even when one of them lacks a bin). All bins are right-closed, so a
    # value of exactly 3600 belongs to (2400, 3600] and only larger values
    # fall into the open-ended last bin.
    edges = [0, 5, 7, 10, 60, 600, 1200, 2400, 3600, float('inf')]
    bin_idx = pd.cut(df2['solvesec'], bins=edges, labels=False, right=True)
    # Values outside every positive bin (i.e. solvesec == 0) come back as NaN
    # from pd.cut and are mapped to code 0.
    # NOTE(review): a negative solvesec would also land in 0; this assumes the
    # Timestamp sort above is chronological — confirm the timestamp format.
    df2['time_category'] = (bin_idx + 1).fillna(0).astype(int)

    # Running per-user totals built from *previous* rows only: shift(1) drops
    # the current answer from the correct count and cumcount() starts at 0.
    answers_by_user = df2.groupby('userID')['answerCode']
    df2['user_correct_answer'] = answers_by_user.transform(lambda x: x.cumsum().shift(1))
    df2['user_total_answer'] = answers_by_user.cumcount()
    df2['user_acc'] = df2['user_correct_answer']/df2['user_total_answer']

    # Calendar features.
    df2['month'] = timestamp.dt.month
    df2['day'] = timestamp.dt.day
    df2['hour'] = timestamp.dt.hour
    df2['dayname'] = timestamp.dt.day_name().map(day_dict)

    # Numeric ids embedded in the string identifiers.
    df2['big_category'] = df2['testId'].str[2].astype(int)
    df2['problem_num'] = df2['assessmentItemID'].str[-3:].astype(int)
    df2['mid_category'] = df2['testId'].str[-3:].astype(int)

    # Overall answer statistics per test and per knowledge tag, computed in
    # one pass over the frame that was passed in (every row of `df`
    # contributes, whatever its answerCode value is).
    correct_t = df2.groupby(['testId'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_t.columns = ["test_mean", "test_std", 'test_sum']
    correct_k = df2.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_std', 'tag_sum']

    df2 = pd.merge(df2, correct_t, on=['testId'], how="left")
    df2 = pd.merge(df2, correct_k, on=['KnowledgeTag'], how="left")

    return df2

# Run the feature pipeline on both splits (train first, then test) and show
# the resulting shapes.
train_fe, test_fe = feature_engineering(train), feature_engineering(test)
train_fe.shape, test_fe.shape

((2266586, 27), (260114, 27))

In [4]:
# Mark each frame with its split of origin, then stack train on top of test.
train_fe['kind'], test_fe['kind'] = 'train', 'test'
df = pd.concat((train_fe, test_fe))

## 전체 데이터

### 문제푼시간 초단위 측정 및 내보내기

In [5]:
# Order the combined frame per user by Timestamp and rebuild a 0..n-1 index.
df = df.sort_values(['userID','Timestamp']).reset_index(drop=True)

# Time since the same user's previous row. Unlike in feature_engineering, the
# first row of each user is NOT filled with 0 here, so it stays NaT / NaN.
df['Timestamp2'] = pd.to_datetime(df['Timestamp'])
df['solvetime'] = df.groupby('userID')['Timestamp2'].diff()
df['solvesec'] = df['solvetime'].dt.total_seconds()

df['solvesec'].describe()

count    2.519258e+06
mean     5.279512e+04
std      4.059280e+05
min      0.000000e+00
25%      1.300000e+01
50%      3.700000e+01
75%      1.360000e+02
max      2.560230e+07
Name: solvesec, dtype: float64

In [31]:
# Cap the solve time at 3600 seconds; missing values stay missing.
df['solvesec_3600'] = df['solvesec'].clip(upper=3600)

In [8]:
# Export the per-second solve-time features; the datetime/timedelta helper
# columns are dropped from the exported copy.
df2 = df.drop(columns=['Timestamp2','solvetime'])
export(df2, output='after_fe_train_test_solvetime2.pkl')

Write: /opt/ml/input/data/after_fe_train_test_solvetime2.pkl


## 3600초, 3200, 2400, 1800, 1200, 800, 600 이상 변환

In [9]:
# Add one capped copy of `solvesec` per threshold (column `solvesec_<cap>_2`):
# values at or above the cap are replaced by the cap, smaller values and
# missing values are kept as they are. A single loop replaces seven
# copy-pasted cells; the columns are created in the same order as before.
for cap in (3600, 3200, 2400, 1800, 1200, 800, 600):
    df[f'solvesec_{cap}_2'] = df['solvesec'].clip(upper=cap)

In [17]:
# Export again, now including the capped solve-time columns; the
# datetime/timedelta helper columns are left out of the exported copy.
df2 = df.drop(columns=['Timestamp2', 'solvetime'])
export(df2, output='after_fe_train_test_solvetime2.pkl')

Write: /opt/ml/input/data/after_fe_train_test_solvetime2.pkl


### time category 나누기

In [18]:
# Label every row with a human-readable solve-time bin "<order> - <interval>".
# Rows whose solvesec matches no bin (e.g. NaN) keep the empty label ''.
df['time_category'] = ''

tc = [0,5,7,10,60,600,1200,2400]
df.loc[(df.solvesec==0), 'time_category'] = "0 - [0,0]"
print(0)
# Right-closed bins (s, e], numbered from 1.
for i, (s, e) in enumerate(zip(tc[:-1], tc[1:]), start=1):
    df.loc[(df.solvesec>s) & (df.solvesec<=e),'time_category']=f"{i} - ({s}, {e}]"
    print(s,e)
# Open-ended last bin. Strictly greater than the last edge: `>=` would
# overwrite the closed (1200, 2400] bin for rows at exactly 2400 and
# contradict the "(2400, )" label. Built from `tc` itself instead of the
# loop variables left over after the loop.
df.loc[(df.solvesec>tc[-1]),'time_category'] = f"{len(tc)} - ({tc[-1]}, )"
print(tc[-1])

0
0 5
5 7
7 10
10 60
60 600
600 1200
1200 2400
2400


In [20]:
# Peek at the first two rows of the combined frame.
df.head(2)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,Timestamp2,solvetime,solvesec,solvesec_3600,...,tag_std,tag_sum,kind,solvesec_3600_2,solvesec_3200_2,solvesec_2400_2,solvesec_1800_2,solvesec_1200_2,solvesec_800_2,solvesec_600_2
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,2020-03-24 00:17:11,NaT,,0.0,...,0.20741,637,train,,,,,,,
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,2020-03-24 00:17:14,0 days 00:00:03,3.0,3.0,...,0.281603,3040,train,3.0,3.0,3.0,3.0,3.0,3.0,3.0


In [28]:
# Rows that received no bin label (still '') get the sentinel label '9999'.
df['time_category'] = df['time_category'].mask(df['time_category'] == '', '9999')

In [29]:
# Mean answerCode and row count per time bin. Column `C` holds the leading
# number of each label so the table can be shown in bin order.
time_grp = (
    df.groupby('time_category')['answerCode']
    .agg(['mean','count'])
    .assign(C=lambda grp: grp.index.str.extract(r'(\d+)').astype(int)[0].tolist())
)
time_grp.sort_values('C')[['mean','count']]

Unnamed: 0_level_0,mean,count
time_category,Unnamed: 1_level_1,Unnamed: 2_level_1
"0 - [0,0]",0.449879,8280
"1 - (0, 5]",0.246009,364747
"2 - (5, 7]",0.491269,69176
"3 - (7, 10]",0.664348,102204
"4 - (10, 60]",0.751549,1008903
"5 - (60, 600]",0.702995,552226
"6 - (600, 1200]",0.682187,27384
"7 - (1200, 2400]",0.697362,16756
"8 - (2400, )",0.745031,369582
9999,0.717146,7442


In [31]:
# Replace each bin label by its rank among the sorted distinct labels, then
# show how many rows fall into each code.
timecat2idx = {
    label: code
    for code, label in enumerate(sorted(set(df['time_category'])))
}
df['time_category'] = df['time_category'].map(timecat2idx)
df['time_category'].value_counts()

4    1008903
5     552226
8     369582
1     364747
3     102204
2      69176
6      27384
7      16756
0       8280
9       7442
Name: time_category, dtype: int64

In [35]:
# First ten rows: user id next to the encoded time bin.
df[['userID','time_category']].head(10)

Unnamed: 0,userID,time_category
0,0,9
1,0,1
2,0,3
3,0,2
4,0,2
5,0,4
6,0,8
7,0,2
8,0,5
9,0,4


In [36]:
# Final export, now with the encoded time_category; the datetime/timedelta
# helper columns are removed from the exported copy.
df2 = df.drop(columns=['Timestamp2', 'solvetime'])
export(df2, output='after_fe_train_test_solvetime2.pkl')

Write: /opt/ml/input/data/after_fe_train_test_solvetime2.pkl
