## LGBM Baseline

In [2]:
import pandas as pd
import os
import random
import warnings
import lightgbm as lgb
from wandb.lightgbm import wandb_callback
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
import random
from matplotlib import pylab as plt
from lgbm_function import inference, set_params, custom_train_test_split
from feature_engineering import feature_engineering
from datetime import datetime
import wandb

%matplotlib inline
# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by any library are hidden too.
warnings.filterwarnings('ignore')

## 1. 데이터 로딩

In [3]:
# Directory holding the training data and the full path of the training CSV.
data_dir = '/opt/ml/input/data/train_dataset'
csv_file_path = os.path.join(data_dir, 'train_data.csv')

# Read the CSV, asking pandas to parse the Timestamp column as datetimes.
df = pd.read_csv(
    csv_file_path,
    parse_dates=['Timestamp'],
)

# Report (rows, columns), then display the first five rows (head() defaults to 5).
print(df.shape)
df.head()

(2266586, 6)


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225


## 2. Feature Engineering

In [None]:
%%time
# Replace df with the output of the project's feature_engineering helper.
# NOTE(review): df is overwritten, so re-running this cell passes the already-engineered
# frame back into the helper — reload df first unless the helper is known to tolerate that.
df = feature_engineering(df)
# Display the first two rows of the result as a quick sanity check.
df.head(2)

## 3. Cross Validation

In [None]:
# Cross-validated LightGBM training: for every fold, train a model, log it to its
# own Weights & Biases run, report validation AUC/ACC, plot feature importance and
# run the project's inference helper.

# Split by user (per the original note); the helper returns two parallel
# sequences that are zipped below into one (train, validation) pair per fold.
train_lst, test_lst = custom_train_test_split(df)

# Feature columns passed to the model.
FEATS = ["user_acc", "user_mean", "user_count", "user_correct_answer", "question_mean", "question_class_mean"]

# LightGBM parameters; also stored as the config of each W&B run.
params = set_params()

# Other feature names that are not part of FEATS above:
# "test_sum", "question_class_count", "tag_sum", "question_count", "tag_mean", "test_mean",

for fold_num, (train, test) in enumerate(zip(train_lst, test_lst)):
    print("@"*50)
    print(fold_num, "번째 fold")
    print("@"*50)

    # Separate the target column from the input columns.
    y_train = train["answerCode"]
    train = train.drop(["answerCode"], axis=1)

    y_test = test["answerCode"]
    test = test.drop(["answerCode"], axis=1)

    print("="*30)
    print("train, test shape")
    print(train.shape, test.shape)
    print("="*30)
    print()

    lgb_train = lgb.Dataset(train[FEATS], y_train)
    lgb_test = lgb.Dataset(test[FEATS], y_test)

    # One W&B run per fold. The name is passed to init() directly instead of being
    # patched onto wandb.run afterwards.
    now = datetime.now()
    run_name = "sun-lgbm-fold" + str(fold_num) + " time: " + " ".join(map(str, [now.month, now.day, now.hour, now.minute]))
    wandb.init(project='P4-DKT', config=params, entity="team-ikyo", name=run_name)

    try:
        # Train, evaluating on both the training and the validation set.
        model = lgb.train(params,
                          lgb_train,
                          valid_sets = [lgb_train, lgb_test],
                          verbose_eval = 100,
                          callbacks=[wandb_callback()])

        # Validation metrics: accuracy at a 0.5 threshold and ROC AUC on raw scores.
        preds = model.predict(test[FEATS])
        acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
        auc = roc_auc_score(y_test, preds)

        print(f'VALID AUC : {auc} ACC : {acc}\n')

        # Show feature importance; close the figure so figures do not pile up across folds.
        fig, ax = plt.subplots(figsize=(6,12))
        lgb.plot_importance(model, max_num_features=100, height=0.8, ax=ax)
        plt.show()
        plt.close(fig)

        # Run the project's inference helper with this fold's model and scores.
        inference(FEATS, model, auc, acc)
    finally:
        # Always close this fold's run, even on failure, so the next fold (or a
        # re-run of the cell) starts from a clean W&B state.
        wandb.finish()

### Result to csv

In [None]:
import os
from glob import glob
import pandas as pd

# Average the "prediction" column of every CSV found directly in the output
# directory and write the mean as a single submission file.
output_path = "/opt/ml/code/output/cross_validation/output.csv"
csv_file_path_list = glob("/opt/ml/code/output/*.csv")
print(csv_file_path_list)

# Fail with a clear message instead of an IndexError when nothing was produced.
if not csv_file_path_list:
    raise FileNotFoundError("no prediction CSV files found in /opt/ml/code/output")

# One column per file, concatenated once. Using a comprehension keeps the loop
# variable local, so the notebook-level csv_file_path (training data path) is not
# overwritten, and a single input file still yields a DataFrame.
fold_predictions = pd.concat(
    [pd.read_csv(fold_csv_path)["prediction"] for fold_csv_path in csv_file_path_list],
    axis=1,
)

# Row-wise mean across files; the row index becomes the "id" column.
result = (
    fold_predictions.mean(axis=1)
    .rename("prediction")
    .rename_axis("id")
    .reset_index()
)

# The target sub-directory may not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
result.to_csv(output_path, index=False)

Exception in thread Thread-7:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 180, in check_network_status
    status_response = self._interface.communicate_network_status()
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 747, in communicate_network_status
    resp = self._communicate(req, timeout=timeout, local=True)
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 537, in _communicate
    return self._communicate_async(rec, local=local).get(timeout=timeout)
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 542, in _communicate_async
    raise Exception("The wandb backend process has shutdown")
Exception: The 

#### Grid Search

In [485]:
# Full candidate feature list from which the grid-search feature subsets are drawn.
FEATS = [
    "user_correct_answer",
    "time_difference",
    "user_acc",
    "test_mean",
    "test_sum",
    "tag_mean",
    "tag_sum",
    "user_mean",
    "user_count",
    "question_mean",
    "question_count",
    "question_class_mean",
    "question_class_count",
]

In [488]:
# `combinations` is not imported anywhere else in the notebook, so import it here;
# without it this cell raises NameError on a fresh kernel.
from itertools import combinations

# Candidate feature sets for the grid search: the full FEATS list first, followed
# by every subset of size 6, 8, 10 and 12 (in the order itertools yields them).
grid_FEATS = [list(FEATS)]

for comb_num in range(6, 13, 2):
    grid_FEATS.extend(list(features) for features in combinations(FEATS, comb_num))

In [367]:
# for FEATS in grid_FEATS:
#     # Split by user
#     train, test = custom_train_test_split(df)

#     # X, y 값 분리
#     y_train = train['answerCode']
#     train = train.drop(['answerCode'], axis=1)

#     y_test = test['answerCode']
#     test = test.drop(['answerCode'], axis=1)
    
#     params = {}
#     params["boosting_type"] = "gbdt" # gbdt, dart, goss
#     params["learning_rate"] = 1e-1 # 1e-1, 5e-2, 1e-2, 5e-3, 1e-3
#     params["objective"] = "binary"
#     params["metric"] = "auc" # binary_logloss, rmse, huber, auc
#     params["num_iterations"] = 1000 # 100
#     params["max_depth"] = 5 # -1
#     params["num_leaves"] = 10 # 31 ideally, num_leaves should be less than or equal to 2 ^ (max_depth).
#     params["min_data_in_leaf"] = 10000 # 20 100 ~ 1000 usually set to a value in the hundreds or thousands
#     params["max_bin"] = 16 # 256
#     params["min_split_gain"] = 1e-2 # ?
#     params["scale_pos_weight"] = 1.1 # 1.1~1.5
#     params["tree_learner"] = "serial" # serial, feature, data, voting
#     params["early_stopping_rounds"] = 50
#     params["bagging_fraction"] = 0.8 # 1.0
#     params["lambda_l1"] = 1e-1 # 0.0
#     params["lambda_l2"] = 1e-1 # 0.0

#     print("="*30)
#     print("="*30)
#     print(FEATS)
#     print("|"*30)
#     print(params)
#     print("|"*30)
#     lgb_train = lgb.Dataset(train[FEATS], y_train)
#     lgb_test = lgb.Dataset(test[FEATS], y_test)

#     model = lgb.train(params,
#                       lgb_train,
#                       valid_sets = [lgb_train, lgb_test],
#                       verbose_eval = 500)

#     preds = model.predict(test[FEATS])
#     acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
#     auc = roc_auc_score(y_test, preds)

#     print(f'VALID AUC : {auc} ACC : {acc}\n')

#     # LOAD TESTDATA
#     test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
#     test_df = pd.read_csv(test_csv_file_path, parse_dates=['Timestamp'])

#     # FEATURE ENGINEERING
#     test_df = feature_engineering(test_df)

#     # LEAVE LAST INTERACTION ONLY
#     test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

#     # DROP ANSWERCODE
#     test_df = test_df.drop(['answerCode'], axis=1)

#     # MAKE PREDICTION
#     total_preds = model.predict(test_df[FEATS])

#     # SAVE OUTPUT
#     output_dir = 'output/'
#     write_path = os.path.join(output_dir, f"lgbm/output_VALID_AUC_{round(auc, 4)}_ACC_{round(acc, 4)}.csv")
#     if not os.path.exists(output_dir):
#         os.makedirs(output_dir)    
#     with open(write_path, 'w', encoding='utf8') as w:
#         print("writing prediction : {}".format(write_path))
#         w.write("id,prediction\n")
#         for id, p in enumerate(total_preds):
#             w.write('{},{}\n'.format(id,p))
#     print("="*30)
#     print("="*30)