# LGBM

In [1]:
import numpy as np
import pandas as pd
import os
import random

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np

## 1. 데이터 로딩

In [2]:
data_dir = '/opt/ml/input/data/' 
csv_file_path = os.path.join(data_dir, 'elo.csv') 
df = pd.read_csv(csv_file_path,  parse_dates=['Timestamp'])

# train, test concat

In [3]:
# df = pd.concat([train, test])
def feature_engineering_(df):

    # 유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=["userID", "Timestamp"], inplace=True)

    # 카테고리형 feature
    categories = ["assessmentItemID", "testId"] 

    for category in categories:
        df[category] = df[category].astype("category")

    return df


df = feature_engineering_(df)

## 2. Feature Engineering

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2526700 entries, 0 to 2526699
Data columns (total 8 columns):
 #   Column            Dtype         
---  ------            -----         
 0   userID            int64         
 1   assessmentItemID  category      
 2   testId            category      
 3   answerCode        int64         
 4   Timestamp         datetime64[ns]
 5   KnowledgeTag      int64         
 6   dataset           int64         
 7   elo               float64       
dtypes: category(2), datetime64[ns](1), float64(1), int64(4)
memory usage: 144.9 MB


In [5]:
def feature_engineering(df):
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    # df.sort_values(by=['userID','Timestamp'], inplace=True)
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer_cumsum'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer_cumcount'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer_cumsum']/df['user_total_answer_cumcount']

    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean",  'test_sum']
    
    df['Timestamp'] = df['Timestamp'].astype(str)
    df['month'] = df['Timestamp'].str[5:7].astype(int)
    df['category_2'] = df['assessmentItemID'].str[2].astype(int)
    df['test_paper'] = df['assessmentItemID'].str[4:7].astype(int)
    df['problem_id'] = df['assessmentItemID'].str[-3:].astype(int)

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    # df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    
    # 결측치 처리
    df = df.fillna(0)
    
    return df

# 해당 문제 푸는데 걸린 시간

In [6]:
df['lagtime'] = pd.to_datetime(df['Timestamp'],infer_datetime_format=True)
df['lagtime'] = df['lagtime'].apply(lambda x: x.timestamp())
diff = df.loc[:,['userID','lagtime']].groupby('userID').diff(periods=-1)
df['lagtime'] = diff['lagtime'].apply(abs)
df.loc[df['lagtime']>30000, 'lagtime'] = 30000

df['lagtime'].fillna(0, inplace=True)
df.lagtime=df.lagtime.astype('int32')


In [7]:
lagtime_agg = df.groupby('userID')['lagtime'].agg(['mean'])
df['lagtime_mean'] = df['userID'].map(lagtime_agg['mean'])
df.lagtime_mean=df.lagtime_mean.astype('int32')

# 시도 횟수

In [8]:
df["attempt_no"] = 1
df.attempt_no=df.attempt_no.astype('int8')
df["attempt_no"] = df[["userID","assessmentItemID",'attempt_no']].groupby(["userID","assessmentItemID"])["attempt_no"].cumsum()
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset,elo,lagtime,lagtime_mean,attempt_no
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,1,0.980768,3,3874,1
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1,0.973315,8,3874,1
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,1,0.947292,7,3874,1
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,1,0.974914,7,3874,1
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,1,0.961391,11,3874,1
...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,1,0.296188,30000,3372,1
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,1,0.616957,11,3372,1
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,1,0.601200,46,3372,1
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,1,0.726027,73,3372,1


In [9]:
user_agg = df.groupby('userID')['answerCode'].agg(['sum', 'count'])
assessment_agg = df.groupby('assessmentItemID')['answerCode'].agg(['sum', 'count','var'])
test_agg = df.groupby('testId')['answerCode'].agg(['sum', 'count','var'])
Tag_agg = df.groupby('KnowledgeTag')['answerCode'].agg(['sum', 'count','var'])
attempt_no_agg=df.groupby(["userID","assessmentItemID"])["attempt_no"].agg(['sum'])

user_agg=user_agg.astype('int16')
assessment_agg = assessment_agg.astype('float32')
test_agg = test_agg.astype('float32')
Tag_agg= Tag_agg.astype('float32')
attempt_no_agg = attempt_no_agg.astype('int8')

In [10]:
df['assessmentItemId_cnt'] = df['assessmentItemID'].map(assessment_agg['count']).astype('int32')
df['assessmentItemId_sum'] = df['assessmentItemID'].map(assessment_agg['sum']).astype('int32')
df['assessmentItemId_mean'] = df['assessmentItemID'].map(assessment_agg['sum'] / assessment_agg['count'])
df.assessmentItemId_mean=df.assessmentItemId_mean.astype('float16')

df['KnowledgeTag_sum'] = df['KnowledgeTag'].map(Tag_agg['sum']).astype('int32')
df['KnowledgeTag_std'] = df['KnowledgeTag'].map(Tag_agg['var']).astype('float16')
df['KnowledgeTag_mean'] = df['KnowledgeTag'].map(Tag_agg['sum'] / Tag_agg['count'])
df.KnowledgeTag_mean=df.KnowledgeTag_mean.astype('float16')

In [11]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['hour'] = df['Timestamp'].dt.hour
time_agg = df.groupby('hour')['answerCode'].agg(['mean', 'sum'])
time_agg['sum'] = time_agg['sum'] / time_agg['sum'].max()

df['hour_mean'] = df['hour'].map(time_agg['mean']).astype('float16')
df['hour_sum'] = df['hour'].map(time_agg['sum']).astype('float16')

In [12]:
df = feature_engineering(df)

In [13]:
import gc
gc.get_count()
gc.collect()
gc.get_count()

(22, 0, 0)

In [14]:
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset,elo,lagtime,lagtime_mean,...,hour_sum,user_correct_answer_cumsum,user_total_answer_cumcount,user_acc,month,category_2,test_paper,problem_id,test_mean,test_sum
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,1,0.980768,3,3874,...,0.617676,0.0,0,0.000000,3,6,1,1,0.952667,1429
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1,0.973315,8,3874,...,0.617676,1.0,1,1.000000,3,6,1,2,0.952667,1429
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,1,0.947292,7,3874,...,0.617676,2.0,2,1.000000,3,6,1,3,0.952667,1429
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,1,0.974914,7,3874,...,0.617676,3.0,3,1.000000,3,6,1,4,0.952667,1429
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,1,0.961391,11,3874,...,0.617676,4.0,4,1.000000,3,6,1,5,0.952667,1429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,1,0.296188,30000,3372,...,0.988281,1.0,4,0.250000,6,3,71,5,0.666000,999
2526696,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,1,0.616957,11,3372,...,0.833984,1.0,5,0.200000,8,4,165,1,0.652500,783
2526697,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,1,0.601200,46,3372,...,0.833984,2.0,6,0.333333,8,4,165,2,0.652500,783
2526698,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,1,0.726027,73,3372,...,0.833984,3.0,7,0.428571,8,4,165,3,0.652500,783


## 3. Train/Test 데이터 셋 분리

- 아래와 같이 split을 진행함
- custom split은 필수적임, 맨마지막 interaction을 가져와야 되니까
- 반복문을 이용해서 train 시키고, 나눠주면 될 듯

In [15]:
# Inference 위해 미리 나누어 놓음
train = df[df.dataset == 1]
test = test = df[df.dataset == 2]

In [16]:
def all_train_valid_split(df):
    train = df[df.dataset == 1]

    test = df[(df.dataset == 2) & (df.answerCode != -1)]  # -1 인 answerCode 제외
    # 마지막에서 두 번째를 validset
    test = test[test["userID"] != test["userID"].shift(-1)]

    return train, test

In [17]:
# dataset에 포함된 feature 목록
sorted(list(train.columns))

['KnowledgeTag',
 'KnowledgeTag_mean',
 'KnowledgeTag_std',
 'KnowledgeTag_sum',
 'Timestamp',
 'answerCode',
 'assessmentItemID',
 'assessmentItemId_cnt',
 'assessmentItemId_mean',
 'assessmentItemId_sum',
 'attempt_no',
 'category_2',
 'dataset',
 'elo',
 'hour',
 'hour_mean',
 'hour_sum',
 'lagtime',
 'lagtime_mean',
 'month',
 'problem_id',
 'testId',
 'test_mean',
 'test_paper',
 'test_sum',
 'userID',
 'user_acc',
 'user_correct_answer_cumsum',
 'user_total_answer_cumcount']

In [18]:
# train과 valid 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
def custom_train_valid_split(df, ratio=0.9, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    valid = df[df['userID'].isin(user_ids) == False]

    #valid데이터셋은 각 유저의 마지막 interaction만 추출
    valid = valid[valid['userID'] != valid['userID'].shift(-1)]
    
    return train, valid

In [19]:
# 유저별 분리
train, valid = all_train_valid_split(df)

# 사용할 Feature 설정
FEATS = ['KnowledgeTag',
 'KnowledgeTag_mean',
 'KnowledgeTag_std',
 'KnowledgeTag_sum',
#  'Timestamp',
#  'assessmentItemID',
 'assessmentItemId_cnt',
 'assessmentItemId_mean',
 'assessmentItemId_sum',
 'attempt_no',
 'category_2',
#  'dataset',
 'elo',
 'hour',
 'hour_mean',
 'hour_sum',
 'lagtime',
 'lagtime_mean',
 'month',
 'problem_id',
 'testId',
 'test_mean',
 'test_paper',
 'test_sum',
#  'userID',
 'user_acc',
 'user_correct_answer_cumsum',
 'user_total_answer_cumcount']
#  'elapsed', 'elapsed_sum''elapsed_mean', 'tag_level', 'test_level', 'meanSeconds'

y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_valid = valid['answerCode']
valid = valid.drop(['answerCode'], axis=1)

In [20]:
# dataset에 포함된 feature 목록
sorted(list(train.columns))

['KnowledgeTag',
 'KnowledgeTag_mean',
 'KnowledgeTag_std',
 'KnowledgeTag_sum',
 'Timestamp',
 'assessmentItemID',
 'assessmentItemId_cnt',
 'assessmentItemId_mean',
 'assessmentItemId_sum',
 'attempt_no',
 'category_2',
 'dataset',
 'elo',
 'hour',
 'hour_mean',
 'hour_sum',
 'lagtime',
 'lagtime_mean',
 'month',
 'problem_id',
 'testId',
 'test_mean',
 'test_paper',
 'test_sum',
 'userID',
 'user_acc',
 'user_correct_answer_cumsum',
 'user_total_answer_cumcount']

In [21]:
lgb_train = lgb.Dataset(train[FEATS], y_train)
lgb_test = lgb.Dataset(valid[FEATS], y_valid)

## 4. 훈련 및 검증

In [22]:
model = lgb.train(
    {'objective': 'binary',
    'max_depth':8,
    "min_data_in_leaf": 1000,
    'num_iterations':1000,
    "boosting": "gbdt",
    'num_leaves':80,
    'learning_rate':0.01,
    "seed": 42,
    # 'min_child_weight': 0.03454472573214212,
    "bagging_fraction": 0.75,
    'metric' : 'auc'}, 
    train_set = lgb_train,
    valid_sets=[lgb_train, lgb_test],
    verbose_eval=100,
    num_boost_round=1000,
    early_stopping_rounds=100
)

preds = model.predict(valid[FEATS])
acc = accuracy_score(y_valid, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_valid, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')



[LightGBM] [Info] Number of positive: 1483205, number of negative: 783381
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5341
[LightGBM] [Info] Number of data points in the train set: 2266586, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654378 -> initscore=0.638341
[LightGBM] [Info] Start training from score 0.638341




Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.826598	valid_1's auc: 0.834876
[200]	training's auc: 0.830753	valid_1's auc: 0.83414
Early stopping, best iteration is:
[103]	training's auc: 0.826727	valid_1's auc: 0.834993
VALID AUC : 0.8349926482959408 ACC : 0.7594086021505376



In [33]:
df_importances = pd.DataFrame(FEATS, columns=['feature_name'])
df_importances['gain'] = model.feature_importance('gain')
df_importances = df_importances.sort_values('gain', ascending=False).reset_index(drop=True)
df_importances.head(50)

Unnamed: 0,feature_name,gain
0,elo,20755260.0
1,lagtime,6182291.0
2,assessmentItemId_mean,1708108.0
3,user_acc,926769.6
4,category_2,551557.1
5,testId,483547.9
6,month,95109.67
7,user_correct_answer_cumsum,22883.38
8,assessmentItemId_sum,14944.85
9,test_mean,14411.1


In [24]:
# import optuna
# from optuna.samplers import TPESampler
# from optuna.pruners import SuccessiveHalvingPruner

# def objective(trial):
#     lgb_train = lgb.Dataset(train[FEATS], y_train)
#     lgb_test = lgb.Dataset(valid[FEATS], y_valid)

#     param = {
#         'objective': 'binary', # 회귀
#         'verbose': -1,
#         'metric': 'auc', 
#         "boosting_type": "gbdt",
#         'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True), 
#         'max_depth': trial.suggest_int('max_depth', 1, 10, step=1, log=False), 
#         'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True), 
#         'n_estimators': trial.suggest_int('n_estimators', 8, 1024, step=1, log=True), 
#         'min_child_samples': trial.suggest_int('min_child_samples', 10, 1050), 
#         'bagging_fraction': trial.suggest_float('bagging_fraction',0.58, 0.80),
#         # 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
#         # 'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
#         'seed': 42
#     }

#     model = lgb.LGBMClassifier(**param)
#     lgb_model = model.fit(train[FEATS], y_train)
#     preds = model.predict(valid[FEATS])
#     acc = accuracy_score(y_valid, np.where(preds >= 0.5, 1, 0))
#     auc = roc_auc_score(y_valid, preds)

#     print(f'VALID AUC : {auc} ACC : {acc}\n')
#     return auc
        
# study_lgb = optuna.create_study(direction='maximize', pruner=SuccessiveHalvingPruner())
# study_lgb.optimize(objective, n_trials=50)

## 5. Inference

In [25]:
test_df = df[df.dataset == 2]

# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df["userID"] != test_df["userID"].shift(-1)]

# DROP ANSWERCODE
test_df = test_df.drop(["answerCode"], axis=1)

# MAKE PREDICTION
total_preds = model.predict(test_df[FEATS])

In [39]:
test_df[FEATS]

Unnamed: 0,KnowledgeTag,KnowledgeTag_mean,KnowledgeTag_std,KnowledgeTag_sum,assessmentItemId_cnt,assessmentItemId_mean,assessmentItemId_sum,attempt_no,category_2,elo,...,lagtime_mean,month,problem_id,testId,test_mean,test_paper,test_sum,user_acc,user_correct_answer_cumsum,user_total_answer_cumcount
2989,5289,0.559082,0.247192,1817,250,0.527832,132,1,5,0.452318,...,3861,10,8,A050000133,0.654500,133,1309,0.692754,717.0,1035
3660,9080,0.541504,0.254639,1381,150,0.560059,84,1,7,0.484820,...,3528,12,8,A070000146,0.645000,146,774,0.694030,465.0,670
10860,9660,0.492432,0.252686,1477,250,0.364014,91,1,7,0.184150,...,3975,12,8,A070000111,0.456000,111,912,0.695289,915.0,1316
15278,2611,0.417725,0.245850,355,100,0.260010,26,1,9,0.372931,...,3745,10,6,A090000064,0.443333,64,266,0.818904,1031.0,1259
23531,1422,0.609375,0.238525,2894,250,0.308105,77,1,6,0.330752,...,4170,10,7,A060000135,0.639429,135,1119,0.759067,293.0,386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525938,10615,0.701172,0.210693,1472,300,0.443359,133,1,4,0.076693,...,3774,9,5,A040000122,0.790000,122,1185,0.304348,7.0,23
2526081,7636,0.824219,0.145386,3956,300,0.879883,264,1,3,0.624517,...,4029,10,5,A030000111,0.866000,111,1299,0.500000,7.0,14
2526282,10402,0.820801,0.150024,2750,250,0.844238,211,1,5,0.613140,...,4025,10,4,A050000193,0.746000,193,746,0.500000,7.0,14
2526297,10402,0.820801,0.150024,2750,250,0.844238,211,1,5,0.396708,...,4084,9,4,A050000193,0.746000,193,746,0.142857,2.0,14


In [36]:
test_df["userID"]

2989          3
3660          4
10860        13
15278        17
23531        26
           ... 
2525938    7395
2526081    7404
2526282    7416
2526297    7417
2526675    7439
Name: userID, Length: 744, dtype: int64

In [37]:
test_df["userID"].shift(-1)

2989          4.0
3660         13.0
10860        17.0
15278        26.0
23531        29.0
            ...  
2525938    7404.0
2526081    7416.0
2526282    7417.0
2526297    7439.0
2526675       NaN
Name: userID, Length: 744, dtype: float64

In [40]:
# SAVE OUTPUT
output_dir = 'output/'
write_path = os.path.join(output_dir, "LJH_LGBM_submission2.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : output/LJH_LGBM_submission2.csv


###**콘텐츠 라이선스**

<font color='red'><b>**WARNING**</b></font> : **본 교육 콘텐츠의 지식재산권은 재단법인 네이버커넥트에 귀속됩니다. 본 콘텐츠를 어떠한 경로로든 외부로 유출 및 수정하는 행위를 엄격히 금합니다.** 다만, 비영리적 교육 및 연구활동에 한정되어 사용할 수 있으나 재단의 허락을 받아야 합니다. 이를 위반하는 경우, 관련 법률에 따라 책임을 질 수 있습니다.

