In [2]:
import pandas as pd
import numpy as np
import os
import random

In [3]:
# Folder that holds the input CSV files.
data_dir = '/opt/ml/input/data/'

# Resolve both file locations first, then read them in the same order as before.
test_csv_path = os.path.join(data_dir, 'test_data.csv')
submission_csv_path = os.path.join(data_dir, 'sample_submission.csv')
test = pd.read_csv(test_csv_path)
sub = pd.read_csv(submission_csv_path)

In [4]:
def feature_engineering(df):
    """Add cumulative per-user statistics and per-test / per-tag accuracy columns.

    The caller's frame is not modified: all work happens on a sorted copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``userID``, ``Timestamp``, ``answerCode``,
        ``testId`` and ``KnowledgeTag``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` ordered by ``userID`` then ``Timestamp``, with the
        extra columns ``user_correct_answer``, ``user_total_answer``,
        ``user_acc``, ``test_mean``, ``test_sum``, ``tag_mean`` and ``tag_sum``.
        The merges at the end give the result a fresh 0..n-1 index.
    """
    # Sort so that each user's rows are contiguous and in time order.
    # sort_values without inplace returns a new frame, so neither the sort nor
    # the column assignments below leak back into the caller's DataFrame.
    df = df.sort_values(by=['userID', 'Timestamp'])

    # Running per-user counters, accumulated in time order:
    #   user_correct_answer - sum of answerCode over the user's EARLIER rows
    #                         (shift(1) excludes the current row; the first
    #                         row of every user is therefore NaN)
    #   user_total_answer   - number of earlier rows for the user (0-based)
    #   user_acc            - ratio of the two (NaN on a user's first row)
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer'] / df['user_total_answer']

    # Overall mean / sum of answerCode per testId and per KnowledgeTag,
    # computed in one pass each.
    # NOTE(review): these aggregates use every row of `df`, so rows whose
    # answerCode is a placeholder rather than a real label would skew them --
    # confirm for the dataset passed in.
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']

    # Left joins keep every interaction row and preserve the sorted order.
    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")

    return df
def percentile(s):
    """Return the sum of ``s`` divided by its length.

    Despite the name, no percentile is computed: the result is the total of
    the entries over the number of entries. For a 0/1 column this is the
    fraction of ones.
    """
    total = np.sum(s)
    return total / len(s)

In [5]:
# Feature engineering: rebind `test` to its sorted, feature-augmented version.
test = feature_engineering(test)

# After the sort, a row is a user's final interaction when the next row
# belongs to a different user (or when there is no next row at all).
is_last_interaction = test['userID'] != test['userID'].shift(-1)

# Keep only the last interaction of every user ...
test_df = test[is_last_interaction]

# ... and keep every other row as that user's history.
test = test[~is_last_interaction]

# Per-user aggregation over the history rows: number of interactions and
# the share of answerCode computed by `percentile`.
history_aggregations = {
    'assessmentItemID': 'count',
    'answerCode': percentile,
}
stu_test_groupby = test.groupby('userID').agg(history_aggregations)

# The answerCode column is removed from the last-interaction rows.
test_df = test_df.drop(columns=['answerCode'])
test_df

Unnamed: 0,userID,assessmentItemID,testId,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum
1035,3,A050133008,A050000133,2020-10-26 13:13:57,5289,717.0,1035,0.692754,0.661765,90,0.542662,159
1706,4,A070146008,A070000146,2020-12-27 02:47:54,9080,465.0,670,0.694030,0.740385,77,0.565693,155
3023,13,A070111008,A070000111,2020-12-27 04:35:09,9660,915.0,1316,0.695289,0.417857,117,0.446753,172
4283,17,A090064006,A090000064,2020-10-30 05:48:37,2611,1031.0,1259,0.818904,0.625000,30,0.514286,36
4670,26,A060135007,A060000135,2020-10-23 11:44:18,1422,293.0,386,0.759067,0.678571,133,0.602767,305
...,...,...,...,...,...,...,...,...,...,...,...,...
260052,7395,A040122005,A040000122,2020-09-08 02:05:20,10615,7.0,23,0.304348,0.753846,147,0.654902,167
260067,7404,A030111005,A030000111,2020-10-13 09:49:18,7636,7.0,14,0.500000,0.866667,156,0.834661,419
260082,7416,A050193004,A050000193,2020-10-04 02:44:41,10402,7.0,14,0.500000,0.750000,75,0.792517,233
260097,7417,A050193004,A050000193,2020-09-06 13:09:15,10402,2.0,14,0.142857,0.750000,75,0.792517,233


In [6]:
# MAKE PREDICTION
total_preds = stu_test_groupby.answerCode
total_preds.mean()

0.6273123980807259

In [36]:
# SAVE OUTPUT
output_dir = 'output/'
write_path = os.path.join(output_dir, "User_ave_ans.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : output/User_ave_ans.csv


In [12]:
# SAVE OUTPUT
output_dir = 'output/'
write_path = os.path.join(output_dir, "User_0.4999_ans.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,0.4999))

writing prediction : output/User_0.4999_ans.csv
