In [25]:
import warnings 
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so library warnings that could point
# at real problems are hidden as well — consider narrowing it by category.
warnings.filterwarnings("ignore")

In [26]:
import pandas as pd
import os
import random
import numpy as np
import yaml
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
import torch
import pytz
from datetime import datetime, timezone, timedelta
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from wandb.lightgbm import wandb_callback, log_summary

from typing import List
from lightgbm.callback import CallbackEnv

In [27]:
# Modified copy of wandb_callback; the definition below replaces the one imported from wandb.lightgbm.
from typing import TYPE_CHECKING, Callable
import wandb
from wandb.sdk.lib import telemetry as wb_telemetry

# Metric names whose best value is the smallest one observed; wandb_callback
# registers these with summary="min".
MINIMIZE_METRICS = [
    "l1", "l2", "rmse",
    "mape", "huber", "fair",
    "poisson", "gamma", "binary_logloss",
]

# Metric names whose best value is the largest one observed; wandb_callback
# registers these with summary="max".
MAXIMIZE_METRICS = [
    "map",
    "auc",
    "average_precision",
]

def set_seeds(seed: int = 42):
    """Seed every random number generator used by this notebook.

    Args:
        seed: Value applied to the Python hash seed variable, the `random`
            module, NumPy's global generator and PyTorch (CPU and CUDA).
    """
    # PYTHONHASHSEED is read at interpreter start-up, so this only affects
    # processes launched after this call (e.g. worker subprocesses).
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not just the current one.
    torch.cuda.manual_seed_all(seed)
    # Request deterministic cuDNN kernels and disable the autotuner, which may
    # otherwise pick different algorithms from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
        
def wandb_callback(log_params=True, define_metric=True) -> Callable:
    """Build a LightGBM callback that streams evaluation metrics to Weights & Biases.

    Arguments:
        log_params: (boolean) if True (default) the params passed to
            `lightgbm.train` are written to `wandb.config` on the first call.
        define_metric: (boolean) if True (default) every reported metric is
            registered with `wandb.define_metric` on the first call so that
            `wandb.summary` keeps the best value instead of the last one.
            This no longer depends on `log_params`.

    Returns:
        A callable taking a LightGBM `CallbackEnv`. On each call it logs one
        `"<dataset>_<metric>"` entry per item of `env.evaluation_result_list`
        (using the first three fields of each item as dataset name, metric
        name and value) and then commits the step together with
        `env.iteration`.

    Example:
        ```python
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=10,
                        valid_sets=lgb_eval,
                        callbacks=[wandb_callback()])
        ```
    """
    # One-shot setup must happen on the first callback invocation because the
    # params and metric names are only available through `env`.
    needs_init = True

    def _define_metric(data: str, metric_name: str) -> None:
        """Register `<data>_<metric_name>` so the W&B summary keeps its best value."""
        metric_key = metric_name.lower()
        full_name = f"{data}_{metric_name}"
        if "loss" in metric_key or metric_key in MINIMIZE_METRICS:
            wandb.define_metric(full_name, summary="min")
        elif metric_key in MAXIMIZE_METRICS:
            wandb.define_metric(full_name, summary="max")
        # Metrics of unknown direction are left with W&B's default summary.

    def _init(env: "CallbackEnv") -> None:
        """Run the one-time setup: telemetry flag, config upload, metric definitions."""
        nonlocal needs_init
        with wb_telemetry.context() as tel:
            tel.feature.lightgbm_wandb_callback = True

        if log_params:
            wandb.config.update(env.params)

        if define_metric:
            for entry in env.evaluation_result_list:
                _define_metric(entry[0], entry[1])

        needs_init = False

    def _callback(env: "CallbackEnv") -> None:
        if needs_init:
            _init(env)

        # Log straight from the result list: grouping by dataset name in a dict
        # would drop all but the last metric when several are evaluated.
        for entry in env.evaluation_result_list:
            wandb.log({f"{entry[0]}_{entry[1]}": entry[2]}, commit=False)

        # Previous log statements use commit=False. This commits them.
        wandb.log({"iteration": env.iteration}, commit=True)

    return _callback

## Training

In [28]:
# Name of the notebook, exposed to W&B through its environment variable.
os.environ["WANDB_NOTEBOOK_NAME"] = "LGBM_Train.ipynb"

# Load the sweep definition from YAML.
sweep_config_path = "/data/ephemeral/home/level2-dkt-recsys-06/code/boost/lgbmsweepconfig.yaml"
with open(sweep_config_path) as file:
    sweep_config = yaml.safe_load(file)

# Register the W&B sweep and keep its id for the agent.
sweep_id = wandb.sweep(project="lightgbm-sweep", sweep=sweep_config)

# Fix the random seeds.
set_seeds()


Create sweep with ID: o44ljf2p
Sweep URL: https://wandb.ai/yechance7/lightgbm-sweep/sweeps/o44ljf2p


In [29]:
# Read the feature-engineered table and order the rows by user, timestamp and
# item, with a fresh 0..n-1 index.
X = (
    pd.read_csv("/data/ephemeral/home/level2-dkt-recsys-06/data/FE_v9.csv")
    .sort_values(by=["userID", "Timestamp", "assessmentItemID"])
    .reset_index(drop=True)
)

# test = test[test["userID"] != test["userID"].shift(-1)]
# test = test.drop(["answerCode"], axis=1)



In [30]:
# Rows with answerCode == -1 form the prediction set; all other rows stay in X.
# Explicit copies are taken because a later cell assigns columns into both
# frames, which would otherwise be chained assignment on a slice.
# NOTE(review): re-running this cell after X has already been filtered leaves
# `test` empty — reload X first.
is_test_row = X["answerCode"] == -1
test = X[is_test_row].copy()
X = X[~is_test_row].copy()

In [31]:
# Numeric feature columns.
Feature = [
    "Itemseq",
    "SolvingTime",
    "CumulativeTime",
    "UserAvgSolvingTime",
    "RelativeUserAvgSolvingTime",
    "CumulativeItemCount",
    "Item_last7days",
    "Item_last30days",
    "CumulativeUserItemAcc",
    "PastItemCount",
    "UserItemElapsed",
    "UserRecentItemSolvingTime",
    "ItemAcc",
    "AverageItemSolvingTime_Correct",
    "AverageItemSolvingTime_Incorrect",
    "AverageItemSolvingTime",
    "RelativeItemSolvingTime",
    "SolvingTimeClosenessDegree",
    "UserTagAvgSolvingTime",
    "TagAcc",
    "CumulativeUserTagAverageAcc",
    "CumulativeUserTagExponentialAverage",
    "UserTagCount",
    "UserTagElapsed",
    "TestAcc",
]

# Columns cast to pandas `category` dtype below.
Categorical_Feature = [
    "userID",
    "assessmentItemID",
    "testId",
    "KnowledgeTag",
    "Month",
    "DayOfWeek",
    "TimeOfDay",
    "WeekOfYear",
    "UserRecentTagAnswer",
    "UserRecentItemAnswer",
    "categorize_solvingTime",
    "categorize_ItemAcc",
    "categorize_TagAcc",
    "categorize_TestAcc",
    "categorize_CumulativeUserItemAcc",
    "categorize_CumulativeUserTagAverageAcc",
    "categorize_CumulativeUserTagExponentialAverage",
    "CategorizedDegree",
]

# Full feature list: numeric columns followed by the categorical ones.
Feature = [*Feature, *Categorical_Feature]

# astype('category'): integer-valued columns can be treated as categorical too.
for frame in (test, X):
    for feature in Categorical_Feature:
        frame[feature] = frame[feature].astype("category")

In [32]:
# print(X.shape)

# # 원-핫 인코딩 적용할 컬럼 선택
# columns_to_encode = [
#     "UserRecentTagAnswer",
#     "PreviousItemAnswer",
#     # 추가적으로 원-핫 인코딩을 적용할 다른 컬럼들을 여기에 추가
# ]
# for column in columns_to_encode:
#     if column in X.columns:
#         X = pd.get_dummies(X, columns=[column])
#     if column in test.columns:
#         test = pd.get_dummies(test, columns=[column])

#         # 다른 필드들에 대해서도 동일하게 적용
# print(X.shape)

# 라벨로 이동 


In [33]:
# Every column currently present in the training frame.
feat = X.columns.tolist()

# Columns that must not be fed to the model (target, raw timestamp and the
# columns listed here as excluded).
exclude_columns = [
    "Timestamp",
    "answerCode",
    "DayOfWeek",
    "WeekOfYear",
    "categorize_solvingTime",
    "categorize_ItemAcc",
    "categorize_TagAcc",
    "categorize_TestAcc",
    "categorize_CumulativeUserItemAcc",
    "categorize_CumulativeUserTagAverageAcc",
    "categorize_CumulativeUserTagExponentialAverage",
    "CategorizedDegree",
]

# Keep the original column order, dropping the excluded names.
filtered_feat = [name for name in feat if name not in exclude_columns]

# Shapes before and after filtering, then the full column list.
print(X[feat].shape)
print(X[filtered_feat].shape)
print(feat)

(2525956, 45)
(2525956, 33)
['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'Itemseq', 'SolvingTime', 'CumulativeTime', 'Month', 'DayOfWeek', 'TimeOfDay', 'WeekOfYear', 'UserAvgSolvingTime', 'RelativeUserAvgSolvingTime', 'CumulativeItemCount', 'Item_last7days', 'Item_last30days', 'CumulativeUserItemAcc', 'PastItemCount', 'UserItemElapsed', 'UserRecentItemSolvingTime', 'ItemAcc', 'AverageItemSolvingTime_Correct', 'AverageItemSolvingTime_Incorrect', 'AverageItemSolvingTime', 'RelativeItemSolvingTime', 'SolvingTimeClosenessDegree', 'UserTagAvgSolvingTime', 'TagAcc', 'CumulativeUserTagAverageAcc', 'CumulativeUserTagExponentialAverage', 'UserTagCount', 'UserTagElapsed', 'UserRecentTagAnswer', 'UserRecentItemAnswer', 'TestAcc', 'categorize_solvingTime', 'categorize_ItemAcc', 'categorize_TagAcc', 'categorize_TestAcc', 'categorize_CumulativeUserItemAcc', 'categorize_CumulativeUserTagAverageAcc', 'categorize_CumulativeUserTagExponentialAverage', 'CategorizedD

In [34]:
# Fallback hyperparameters handed to wandb.init.
# The original notes mark each value as a "minimum value" — presumably the lower
# bound of the sweep's search range; confirm against the sweep YAML.
default_config = dict(
    num_leaves=10,          # minimum value 10
    learning_rate=0.0001,   # minimum value 0.0001
    max_depth=-1,           # -1 (no depth limit)
    min_data_in_leaf=20,    # minimum value 20
    feature_fraction=0.6,   # minimum value 0.6
    bagging_fraction=0.6,   # minimum value 0.6
    bagging_freq=0,         # minimum value 0
    lambda_l1=0.0,          # minimum value 0.0
    lambda_l2=0.0,          # minimum value 0.0
    cat_smooth=10,          # minimum value 10
)



In [35]:

def train():
    """Run one sweep trial: train LightGBM, evaluate it and write test predictions.

    Reads the notebook-level `X`, `test`, `filtered_feat` and `default_config`.
    Hyperparameters are taken from `wandb.config`. Side effects: creates a W&B
    run, prints the validation AUC/accuracy and the feature-importance table,
    and writes a CSV of test predictions under `output/`.
    """
    # Using the run as a context manager closes it even when training raises,
    # so a failed trial does not leave an open run behind.
    # NOTE(review): the sweep is created without an explicit entity while the
    # run requests "boostcamp6-recsys6" — confirm both resolve to the same one.
    with wandb.init(
        project="lightgbm-sweep",
        config=default_config,
        entity="boostcamp6-recsys6",
    ) as run:
        # `ratio` has no entry in default_config, so it must be supplied by the
        # sweep configuration.
        ratio = wandb.config.ratio

        # Hold out a `ratio` fraction of each user's rows for validation.
        sampled_indices = X.groupby('userID').sample(frac=ratio).index

        # Training set: everything that was not sampled.
        X_train = X.drop(sampled_indices)
        y_train = X_train["answerCode"]

        # Validation set: the sampled rows.
        X_valid = X.loc[sampled_indices]
        y_valid = X_valid["answerCode"]

        lgb_train = lgb.Dataset(X_train[filtered_feat], y_train)
        lgb_valid = lgb.Dataset(X_valid[filtered_feat], y_valid)

        # W&B run name, stamped with the current time in Seoul.
        korea = pytz.timezone("Asia/Seoul")
        current_time = datetime.now(korea).strftime("%m-%d %H:%M")
        run.name = f"yechan {current_time}"

        current_params = {
            "objective": "binary",
            "metric": ["auc"],
            "device": "cpu",
            "num_leaves": wandb.config.num_leaves,
            "learning_rate": wandb.config.learning_rate,
            "max_depth": wandb.config.max_depth,
            "min_data_in_leaf": wandb.config.min_data_in_leaf,
            "feature_fraction": wandb.config.feature_fraction,
            "bagging_fraction": wandb.config.bagging_fraction,
            "bagging_freq": wandb.config.bagging_freq,
            "lambda_l1": wandb.config.lambda_l1,
            "lambda_l2": wandb.config.lambda_l2,
            "cat_smooth": wandb.config.cat_smooth,
        }
        model = lgb.train(
            current_params,
            lgb_train,
            valid_sets=[lgb_train, lgb_valid],
            num_boost_round=500,
            callbacks=[
                wandb_callback(log_params=True, define_metric=True),
                lgb.early_stopping(30),
            ],
            categorical_feature=[
                "userID",
                "assessmentItemID",
                "testId",
                "KnowledgeTag",
                "Month"
            ],
        )

        # Validation metrics; accuracy uses a 0.5 decision threshold.
        preds = model.predict(X_valid[filtered_feat])
        acc = accuracy_score(y_valid, np.where(preds >= 0.5, 1, 0))
        auc = roc_auc_score(y_valid, preds)
        test_preds = model.predict(test[filtered_feat])
        print(f"VALID AUC : {auc} ACC : {acc}\n")
        wandb.log({"auc": auc, "accuracy": acc})

    # Write the submission file, named after the validation scores.
    output_dir = "output/"
    write_path = os.path.join(output_dir, f"auc:{auc} acc:{acc} lgbm.csv")
    os.makedirs(output_dir, exist_ok=True)
    with open(write_path, "w", encoding="utf8") as w:
        print("writing prediction : {}".format(write_path))
        w.write("id,prediction\n")
        for row_id, p in enumerate(test_preds):
            w.write("{},{}\n".format(row_id, p))

    # Feature importances, most important first.
    importance_df = pd.DataFrame(
        {'Feature': model.feature_name(), 'Importance': model.feature_importance()}
    ).sort_values(by='Importance', ascending=False)

    print(importance_df)

In [36]:

# Start a sweep agent for the sweep registered above, using `train` as the
# function it runs for each trial.
# NOTE(review): no `count` is passed, so the number of trials is not limited
# here — confirm that is intended.
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: rx8zs4hi with config:
[34m[1mwandb[0m: 	bagging_fraction: 0.8733244182585951
[34m[1mwandb[0m: 	bagging_freq: 16
[34m[1mwandb[0m: 	feature_fraction: 0.7740314900820466
[34m[1mwandb[0m: 	lambda_l1: 4.876360369590984
[34m[1mwandb[0m: 	lambda_l2: 14.02083653708583
[34m[1mwandb[0m: 	learning_rate: 0.3558208520008016
[34m[1mwandb[0m: 	max_depth: 41
[34m[1mwandb[0m: 	min_data_in_leaf: 65
[34m[1mwandb[0m: 	num_leaves: 41
[34m[1mwandb[0m: 	ratio: 0.22653397682506593
[34m[1mwandb[0m: Currently logged in as: [33myechance7[0m ([33mboostcamp6-recsys6[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: [32m[41mERROR[0m Error while calling W&B API: could not find agent vunkyg0b during agentHeartbeat (<Response [404]>)
Exception in thread Thread-43 (_heartbeat):
Traceback (most recent call last):
  File "/opt/ml/miniconda3/envs/dkt/lib/python3.10/site-packages/wandb/sdk/lib/retry.py", line 131, in __call__
    result = self._call_fn(*args, **kwargs)
  File "/opt/ml/miniconda3/envs/dkt/lib/python3.10/site-packages/wandb/sdk/internal/internal_api.py", line 369, in execute
    return self.client.execute(*args, **kwargs)  # type: ignore
  File "/opt/ml/miniconda3/envs/dkt/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 52, in execute
    result = self._get_result(document, *args, **kwargs)
  File "/opt/ml/miniconda3/envs/dkt/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 60, in _get_result
    return self.transport.execute(document, *args, **kwargs)
  File "/opt/ml/miniconda3/envs/dkt/lib/python3.10/site-