## 1. 라이브러리 세팅

In [28]:
import numpy as np
import pandas as pd
import random
import os
import warnings
warnings.filterwarnings("ignore")

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import pdb
import wandb
from tqdm import tqdm

In [None]:
# wandb.login()

In [47]:
%%time
# Explicit narrow integer dtypes for the ID / label columns read from the CSV.
dtype = dict(userID='int16', answerCode='int8', KnowledgeTag='int16')

# Adjust the data path to your environment.
# On Colab: click the folder icon on the left and drag & drop "train_data.csv"
# to upload it before running this cell.
DATA_PATH = '/opt/ml/input/data/'
df = pd.read_csv(
    f'{DATA_PATH}train_data.csv',
    dtype=dtype,
    parse_dates=['Timestamp'],
)

CPU times: user 2.49 s, sys: 172 ms, total: 2.66 s
Wall time: 2.66 s


## 2. 피쳐 엔지니어링(ELO)

In [48]:
def elo(df):
    """Add an Elo-style ``elo`` column to ``df``.

    One pass over ``df`` (in its current row order) keeps a running ability
    value (theta) per ``userID`` and a running difficulty value (beta) per
    ``assessmentItemID``.  Each answer moves both values by a step proportional
    to ``answerCode - predicted probability``; the step shrinks as more answers
    have been seen for that user / item.  After the pass, every row is scored
    with ``sigmoid(theta - beta)`` using the final estimates.

    Side effects: a constant ``left_asymptote`` column (0) is written onto the
    frame that was passed in, and progress information is printed.

    Args:
        df: frame with ``userID``, ``assessmentItemID`` and ``answerCode``
            columns.

    Returns:
        The same frame object with ``left_asymptote`` and ``elo`` added.
    """

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def theta_step(seen):
        # User step size: decays with the number of answers seen, floor 0.04.
        return max(0.3 / (1 + 0.01 * seen), 0.04)

    def beta_step(seen):
        # Item step size: decays with the number of answers seen, no floor.
        return 1 / (1 + 0.05 * seen)

    def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
        item_column = answers_df[granularity_feature_name]

        # Every user / item starts at 0 with no answers seen.
        difficulty = {key: 0 for key in np.unique(item_column)}
        item_seen = {key: 0 for key in difficulty}
        ability = {key: 0 for key in np.unique(answers_df.userID)}
        user_seen = {key: 0 for key in ability}

        print("Parameter estimation is starting...", flush=True)

        rows = zip(
            answers_df.userID.values,
            item_column.values,
            answers_df.left_asymptote.values,
            answers_df.answerCode.values,
        )
        for user, item, floor, outcome in tqdm(rows, total=len(answers_df)):
            theta = ability[user]
            beta = difficulty[item]

            # Both updates use the pre-update theta/beta, so the prediction
            # error is the same for the user and the item.
            expected = floor + (1 - floor) * sigmoid(theta - beta)
            surprise = outcome - expected

            difficulty[item] = beta - beta_step(item_seen[item]) * surprise
            ability[user] = theta + theta_step(user_seen[user]) * surprise

            item_seen[item] += 1
            user_seen[user] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return ability, difficulty

    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    ability, difficulty = estimate_parameters(df)

    # Score each row with the final (post-pass) user and item estimates.
    df["elo"] = [
        sigmoid(ability[user] - difficulty[item])
        for user, item in zip(df.userID.values, df.assessmentItemID.values)
    ]

    return df

In [61]:
# Integer code assigned to each weekday name for the `dayname` feature.
day_dict = {
    'Tuesday': 0,
    'Thursday': 1,
    'Monday': 2,
    'Saturday': 3,
    'Friday': 4,
    'Wednesday': 5,
    'Sunday': 6,
}


def feature_engineering(df):
    """Return a copy of ``df`` sorted by user/time with engineered features.

    Expects ``userID``, ``testId``, ``answerCode``, ``KnowledgeTag`` and a
    datetime ``Timestamp`` column.  The caller's frame is not modified.

    Added columns, in order:
        user_correct_answer, user_total_answer, user_acc: per-user running
            correct count, answer count and accuracy over *previous* rows.
        elapsed, elapsed_cumsum: seconds since the user's previous row (gaps of
            600 s or more become 0) and its per-user running total.
        month, day, hour, dayname: calendar parts of ``Timestamp``.
        bigclass: integer taken from the third character of ``testId``.
        bigclasstime, bigclass_acc, bigclass_count, bigclass_sum: per
            (user, bigclass) mean elapsed, accuracy, row count, correct count.
        test_mean/std/sum, tag_mean/std/sum: ``answerCode`` statistics per
            ``testId`` and per ``KnowledgeTag``.
        elapsed_med: per-user median of ``elapsed``.

    All aggregates are computed from the frame passed in.  Remaining NaNs are
    replaced by 0 and the result carries a fresh 0..n-1 index.
    """
    # Order each user's rows chronologically; sort_values returns a new frame.
    df = df.sort_values(by=['userID', 'Timestamp'])

    # Running per-user counts that only look at earlier rows (shifted by one).
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(
        lambda answers: answers.cumsum().shift(1)
    )
    df['user_total_answer'] = df.groupby('userID').cumcount()
    df['user_acc'] = df['user_correct_answer'].div(df['user_total_answer'])

    # Seconds since the same user's previous row; first row of a user is 0.
    gap = (
        df.groupby('userID')['Timestamp']
        .diff()
        .fillna(pd.Timedelta(seconds=0))
        .map(lambda delta: delta.total_seconds())
    )
    # A gap of 10 minutes or more is treated as starting a new test -> 0.
    df['elapsed'] = gap.mask(gap >= 600, 0)
    df['elapsed_cumsum'] = df.groupby('userID')['elapsed'].cumsum()
    elapsed_med = df.groupby('userID')['elapsed'].median().to_frame('elapsed_med')

    # Calendar parts and weekday code.
    when = pd.to_datetime(df['Timestamp']).dt
    df['month'] = when.month
    df['day'] = when.day
    df['hour'] = when.hour
    df['dayname'] = when.day_name().map(day_dict)

    # Broad category: third character of the test id.
    df['bigclass'] = df['testId'].str[2].astype(int)

    # Per (user, bigclass): mean elapsed, row count, correct count, accuracy.
    class_keys = ['userID', 'bigclass']
    per_class = (
        df.groupby(class_keys)
        .agg(
            bigclasstime=('elapsed', 'mean'),
            bigclass_count=('answerCode', 'count'),
            bigclass_sum=('answerCode', 'sum'),
        )
        .reset_index()
    )
    per_class['bigclass_acc'] = per_class['bigclass_sum'] / per_class['bigclass_count']
    per_class = per_class[
        class_keys + ['bigclasstime', 'bigclass_acc', 'bigclass_count', 'bigclass_sum']
    ]
    df = df.merge(per_class, on=class_keys, how='left')

    # answerCode statistics per test and per knowledge tag; both tables are
    # built before either is merged back.
    answer_stats = {}
    for key, prefix in (('testId', 'test'), ('KnowledgeTag', 'tag')):
        stats = df.groupby([key])['answerCode'].agg(['mean', 'std', 'sum'])
        stats.columns = [f'{prefix}_mean', f'{prefix}_std', f'{prefix}_sum']
        answer_stats[key] = stats
    for key, stats in answer_stats.items():
        df = df.merge(stats, on=[key], how='left')

    df = df.merge(elapsed_med, on=['userID'], how='left')
    return df.fillna(0)

In [62]:
# Build the engineered-feature frame from the raw training log and preview it.
df2 = feature_engineering(df)
df2.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,elapsed,...,bigclass_acc,bigclass_count,bigclass_sum,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,elapsed_med
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0.0,0,0.0,0.0,...,0.791908,346,274,0.947683,0.222749,1268,0.955022,0.20741,637,14.0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,3.0,...,0.791908,346,274,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,8.0,...,0.791908,346,274,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,7.0,...,0.791908,346,274,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,7.0,...,0.791908,346,274,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0


In [63]:
# Display the engineered frame (notebook rich output).
df2

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,elapsed,...,bigclass_acc,bigclass_count,bigclass_sum,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,elapsed_med
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0.0,0,0.000000,0.0,...,0.791908,346,274,0.947683,0.222749,1268,0.955022,0.207410,637,14.0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.000000,3.0,...,0.791908,346,274,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.000000,8.0,...,0.791908,346,274,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.000000,7.0,...,0.791908,346,274,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.000000,7.0,...,0.791908,346,274,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,1.0,4,0.250000,24.0,...,0.200000,5,1,0.662590,0.472996,921,0.689706,0.462671,2814,34.0
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,1.0,5,0.200000,0.0,...,1.000000,4,4,0.655109,0.475550,718,0.697874,0.459253,2199,34.0
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,2.0,6,0.333333,11.0,...,1.000000,4,4,0.655109,0.475550,718,0.697874,0.459253,2199,34.0
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,3.0,7,0.428571,46.0,...,1.000000,4,4,0.655109,0.475550,718,0.697874,0.459253,2199,34.0


In [64]:
# Append the `elo` feature; elo() also writes a constant `left_asymptote` column.
df2 = elo(df2)

Dataset of shape (2266586, 28)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 'user_acc', 'elapsed', 'elapsed_cumsum', 'month', 'day', 'hour', 'dayname', 'bigclass', 'bigclasstime', 'bigclass_acc', 'bigclass_count', 'bigclass_sum', 'test_mean', 'test_std', 'test_sum', 'tag_mean', 'tag_std', 'tag_sum', 'elapsed_med', 'left_asymptote']
Parameter estimation is starting...


100%|██████████| 2266586/2266586 [00:15<00:00, 149765.84it/s]


Theta & beta estimations on assessmentItemID are completed.


## 3. Train/Test 데이터 셋 분리

In [65]:
# Train and validation rows must be split per user, so no user is in both.
def custom_train_test_split(df, ratio=0.7, split=True):
    """Split ``df`` by user into a training frame and a validation frame.

    Users are shuffled with the module-level ``random`` generator (seed it
    beforehand for reproducibility) and added to the training side one by one
    until adding the next user would push the training row count above
    ``ratio * len(df)``.  All rows of the remaining users form the validation
    side, which is then reduced to a single row per user.

    Args:
        df: frame with a ``userID`` column.  Rows of one user are expected to
            be contiguous, since the "last row" is detected by comparing each
            row's ``userID`` with the next row's.
        ratio: upper bound on the fraction of rows placed in the training frame.
        split: currently unused.

    Returns:
        ``(train, test)`` where ``train`` holds every row of the selected users
        and ``test`` holds the last row of each remaining user.
    """
    rows_per_user = df['userID'].value_counts()
    users = list(zip(rows_per_user.index, rows_per_user))
    random.shuffle(users)

    row_budget = ratio * len(df)
    rows_taken = 0
    user_ids = []
    for user_id, n_rows in users:
        rows_taken += n_rows
        # Stop at the first user that would exceed the budget.
        if rows_taken > row_budget:
            break
        user_ids.append(user_id)

    in_train = df['userID'].isin(user_ids)
    train = df[in_train]
    test = df[~in_train]

    # Keep only the final interaction of each held-out user.
    is_last_of_user = test['userID'] != test['userID'].shift(-1)
    return train, test[is_last_of_user]

In [73]:
# Split by user; the fixed seed makes the shuffle inside the split repeatable.
random.seed(42)
train, test = custom_train_test_split(df2)

# Feature columns fed to the model.
FEATS = [
    'KnowledgeTag',
    'user_correct_answer',
    'user_total_answer',
    'user_acc',
    'test_mean',
    'test_sum',
    'tag_mean',
    'tag_sum',
    'elapsed',
    'elapsed_cumsum',
    'month',
    'day',
    'hour',
    'dayname',
    'elapsed_med',
    'bigclass',
    'bigclasstime',
    'bigclass_acc',
    'bigclass_sum',
    'bigclass_count',
    'elo',
]

# Separate the target (y) from the feature frames (X).
y_train, y_test = train['answerCode'], test['answerCode']
train = train.drop(columns=['answerCode'])
test = test.drop(columns=['answerCode'])

In [74]:
# Display the validation frame (the last interaction of each held-out user).
test

Unnamed: 0,userID,assessmentItemID,testId,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,elapsed,elapsed_cumsum,...,bigclass_sum,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,elapsed_med,left_asymptote,elo
744,0,A080129006,A080000129,2020-12-23 03:40:19,2725,470.0,744,0.631720,5.0,23563.0,...,170,0.360111,0.480143,780,0.301939,0.459258,436,14.0,0,0.104895
2786,5,A080138007,A080000138,2020-12-11 22:48:28,8431,662.0,832,0.795673,0.0,53131.0,...,501,0.504405,0.500118,916,0.491346,0.500166,511,34.0,0,0.646001
3707,6,A030145005,A030000145,2020-10-26 09:52:14,7817,408.0,920,0.443478,24.0,36391.0,...,161,0.628679,0.483340,833,0.617073,0.486160,2530,10.0,0,0.323708
4519,7,A090052006,A090000052,2020-11-02 02:02:59,2600,492.0,811,0.606658,4.0,44300.0,...,34,0.448326,0.497504,616,0.350352,0.477290,398,15.0,0,0.373063
5922,9,A070158008,A070000158,2020-12-28 21:08:31,9797,590.0,866,0.681293,36.0,46934.0,...,157,0.680398,0.466654,479,0.537484,0.498674,1649,24.0,0,0.312781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266199,7415,A020171006,A020000171,2020-10-26 05:17:20,8131,4.0,15,0.266667,15.0,508.0,...,1,0.721631,0.448328,1221,0.696643,0.459766,2781,13.0,0,0.399156
2266277,7421,A040161007,A040000161,2020-10-22 06:33:19,2118,12.0,17,0.705882,13.0,1207.0,...,5,0.600856,0.489853,1123,0.703704,0.456800,912,19.0,0,0.322048
2266381,7427,A040187005,A040000187,2020-10-15 02:53:08,2129,15.0,19,0.789474,13.0,701.0,...,9,0.955474,0.206335,1309,0.927544,0.259276,3546,26.5,0,0.966692
2266469,7432,A060177007,A060000177,2020-10-26 08:09:52,1578,8.0,18,0.444444,107.0,879.0,...,7,0.720437,0.448938,1054,0.660964,0.473751,425,12.0,0,0.171337


In [75]:
# Wrap the FEATS columns and their targets in LightGBM datasets
# (training users / held-out users' last interactions).
lgb_train = lgb.Dataset(train[FEATS], y_train)
lgb_test = lgb.Dataset(test[FEATS], y_test)

In [76]:
# wandb.init(project="LGBM",entity = 'recommy')

## 4. 훈련 및 검증

In [77]:
# Train a binary-objective LightGBM model, evaluating on both the training set
# and the held-out set.
# NOTE(review): `verbose_eval` and `early_stopping_rounds` are passed as keyword
# arguments to lgb.train; newer LightGBM releases reportedly expect
# callbacks (early stopping / evaluation logging) instead — confirm against the
# installed version before upgrading.
model = lgb.train(
    {'objective': 'binary'}, 
    lgb_train,
    valid_sets=[lgb_train,lgb_test],
    verbose_eval=100,
    early_stopping_rounds=100,
    num_boost_round=500,
    # callbacks=[wandb.lightgbm.wandb_callback()]
)
# wandb.lightgbm.log_summary(model, save_model_checkpoint=True)

# Score the held-out rows: accuracy uses a 0.5 threshold on the predicted
# probability, AUC uses the raw probabilities.
preds = model.predict(test[FEATS])
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

# wandb.log({"valid_accuracy": acc})
# wandb.log({"valid_roc_auc": auc})

[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4026
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.454642	valid_1's binary_logloss: 0.500875
[200]	training's binary_logloss: 0.45029	valid_1's binary_logloss: 0.497411
[300]	training's binary_logloss: 0.447125	valid_1's binary_logloss: 0.495545
[400]	training's binary_logloss: 0.44449	valid_1's binary_logloss: 0.494031
[500]	training's binary_logloss: 0.442018	valid_1's binary_logloss: 0.493423
Did not meet early stopping. Best iteration is:
[500]	training's binary_logloss: 0.442018	valid_1'

In [236]:
# wandb.finish()

0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_binary_logloss,█▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_1_binary_logloss,█▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_accuracy,▁
valid_roc_auc,▁

0,1
best_iteration,500.0
iteration,499.0
valid_accuracy,0.70902
valid_roc_auc,0.79371


[baseline]
- VALID AUC : 0.6868241736037339 ACC : 0.587942202291978
        
[문제풀이 시간 추가]
- VALID AUC : 0.7395128748367014 ACC : 0.6507224713502741

[누적 풀이시간 추가] 성능 하락 → 당시에 문제를 푸는 데 시간이 얼마나 걸렸느냐가 중요하지, 이때까지 몇 시간 동안 공부했느냐는 중요하지 않은 듯?

- VALID AUC : 0.7390277302636187 ACC : 0.6522172396611858

[시간 쪼개기] 미세한 성능 향상

- VALID AUC : 0.740232589311615 ACC : 0.6542102640757349

### Ver2 - valid_set 고정
[baseline]
- VALID AUC : 0.6916288131757744 ACC : 0.5795076513639388

[문제 풀이 시간 추가]
- VALID AUC : 0.747883430697164 ACC : 0.6544688400975827

[문제 풀이 시간 + 누적 시간]
- VALID AUC : 0.7455611673251767 ACC : 0.6551341760922599

[문제풀이시간 + 풀이시간_median]

- VALID AUC : 0.744333015051247 ACC : 0.6540252827677977

[문제풀이시간 + 날짜 쪼개기]

- VALID AUC : 0.7413914025989795 ACC : 0.6535817254380129

[전부 추가]

- VALID AUC : 0.7411653066743833 ACC : 0.6522510534486582

'''흐음'''

## 5. Inference

In [165]:
# LOAD TESTDATA
test_df = pd.read_csv(DATA_PATH+'test_data.csv' , dtype=dtype, parse_dates=['Timestamp'])
test_df = test_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)


# FEATURE ENGINEERING
test_df = feature_engineering(test_df)

# FEATS includes 'elo', which feature_engineering() does not produce; without
# this step `test_df[FEATS]` raises KeyError at prediction time.  It is computed
# on the full test log (before trimming) the same way it was for training.
# NOTE(review): elo() feeds every row's answerCode into its updates — if the
# rows to be predicted carry a placeholder label, confirm this is acceptable.
test_df = elo(test_df)

# LEAVE LAST INTERACTION ONLY
# (a row is the last of its user when the next row belongs to another user)
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# # DROP ANSWERCODE
# test_df = test_df.drop(['answerCode'], axis=1)

In [183]:
# MAKE PREDICTION
# One predicted value per remaining test row, using the FEATS columns.
total_preds = model.predict(test_df[FEATS])

In [153]:
# SAVE OUTPUT
# Writes the predictions as a two-column CSV: a 0-based row id and the
# predicted value, in the order of `total_preds`.
output_dir = '/opt/ml/input/output/LGBM'
write_path = os.path.join(output_dir, "submission.csv")
# exist_ok=True replaces the separate exists() check (no check-then-create gap).
os.makedirs(output_dir, exist_ok=True)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # `row_id` rather than `id`: a loop variable named `id` would shadow the
    # builtin for every later cell of the notebook session.
    w.writelines(f"{row_id},{p}\n" for row_id, p in enumerate(total_preds))

writing prediction : /opt/ml/input/output/LGBM/submission.csv
