In [11]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
import os
import random

import yaml
from datetime import datetime
import pytz

import lightgbm as LGBM
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedGroupKFold

import wandb
from wandb.lightgbm import wandb_callback, log_summary
from typing import TYPE_CHECKING, Callable
from wandb.sdk.lib import telemetry as wb_telemetry

In [12]:
### 중요 ###
file_name="FE_v8.csv"
Feature = ['userID', 'assessmentItemID', 'testId', 
       'KnowledgeTag', 'Itemseq', 'SolvingTime', 'CumulativeTime', 'Month',
       'DayOfWeek', 'TimeOfDay', 'WeekOfYear', 'UserAvgSolvingTime',
       'Difference_SolvingTime_UserAvgSolvingTime', 'CumulativeItemCount',
       'Item_last7days', 'Item_last30days', 'CumulativeUserItemAcc',
       'PastItemCount', 'UserItemElapsed', 'ItemAcc',
       'AverageItemSolvingTime_Correct', 'AverageItemSolvingTime_Incorrect',
       'AverageItemSolvingTime', 'Difference_SolvingTime_AvgItemSolvingTime',
       'UserTagAvgSolvingTime', 'TagAcc', 'CumulativeUserTagAverageAcc',
       'CumulativeUserTagExponentialAverage', 'UserTagCount', 'UserTagElapsed',
       'PastTagSolvingTime', 'UserRecentTagAnswer', 'PreviousItemAnswer',
       'TestAcc', 'categorize_solvingTime', 'categorize_ItemAcc',
       'categorize_TagAcc', 'categorize_TestAcc',
       'categorize_CumulativeUserItemAcc',
       'categorize_CumulativeUserTagAverageAcc',
       'categorize_CumulativeUserTagExponentialAverage'
]

Categorical_Feature = ['userID', 'assessmentItemID', 'testId', 'KnowledgeTag', 
       'Month','DayOfWeek', 'TimeOfDay', 'WeekOfYear',
        'UserRecentTagAnswer', 'PreviousItemAnswer',
       'categorize_solvingTime', 'categorize_ItemAcc',
       'categorize_TagAcc', 'categorize_TestAcc',
       'categorize_CumulativeUserItemAcc',
       'categorize_CumulativeUserTagAverageAcc',
       'categorize_CumulativeUserTagExponentialAverage'
]

n_fold =5
seed = 42
data_dir ="../../data/"
output_dir ="submit/"
sweep_config_path = '/data/ephemeral/home/level2-dkt-recsys-06/code/tabular/LGBMsweepconfig.yaml'

# wandb_callback

In [13]:
MINIMIZE_METRICS = [
    "l1",
    "l2",
    "rmse",
    "mape",
    "huber",
    "fair",
    "poisson",
    "gamma",
    "binary_logloss",
]

MAXIMIZE_METRICS = ["map", "auc", "average_precision"]

def set_seeds(seed):
    # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


def wandb_callback(log_params=True, define_metric=True) -> Callable:
    """Automatically integrates LightGBM with wandb.

    Arguments:
        log_params: (boolean) if True (default) logs params passed to lightgbm.train as W&B config
        define_metric: (boolean) if True (default) capture model performance at the best step, instead of the last step, of training in your `wandb.summary`

    Passing `wandb_callback` to LightGBM will:
      - log params passed to lightgbm.train as W&B config (default).
      - log evaluation metrics collected by LightGBM, such as rmse, accuracy etc to Weights & Biases
      - Capture the best metric in `wandb.summary` when `define_metric=True` (default).

    Use `log_summary` as an extension of this callback.

    Example:
        ```python
        params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            .
        }
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=10,
                        valid_sets=lgb_eval,
                        valid_names=('validation'),
                        callbacks=[wandb_callback()])
        ```
    """
    def _define_metric(data: str, metric_name: str) -> None:
    
        """Capture model performance at the best step.
        instead of the last step, of training in your `wandb.summary`
        """
        if "loss" in str.lower(metric_name):
            wandb.define_metric(f"{data}_{metric_name}", summary="min")
        elif str.lower(metric_name) in MINIMIZE_METRICS:
            wandb.define_metric(f"{data}_{metric_name}", summary="min")
        elif str.lower(metric_name) in MAXIMIZE_METRICS:
            wandb.define_metric(f"{data}_{metric_name}", summary="max")
            
    log_params_list: "List[bool]" = [log_params]
    define_metric_list: "List[bool]" = [define_metric]

    def _init(env: "CallbackEnv") -> None:
        with wb_telemetry.context() as tel:
            tel.feature.lightgbm_wandb_callback = True

        wandb.config.update(env.params)
        log_params_list[0] = False

        if define_metric_list[0]:
            for i in range(len(env.evaluation_result_list)):
                data_type = env.evaluation_result_list[i][0]
                metric_name = env.evaluation_result_list[i][1]
                _define_metric(data_type, metric_name)

    def _callback(env: "CallbackEnv") -> None:
        if log_params_list[0]:
            _init(env)
        # eval_results: "Dict[str, Dict[str, List[Any]]]" = {}
        # recorder = lightgbm.record_evaluation(eval_results)
        # recorder(env)
        eval_results = {x[0]:{x[1:][0]:x[1:][1:]} for x in env.evaluation_result_list}

        for validation_key in eval_results.keys():
            for key in eval_results[validation_key].keys():
                 wandb.log(
                     {validation_key + "_" + key: eval_results[validation_key][key][0]},
                     commit=False,
                 )
        for item in eval_results:
            if len(item) == 4:
                wandb.log({f"{item[0]}_{item[1]}": item[2]}, commit=False)

        # Previous log statements use commit=False. This commits them.
        wandb.log({"iteration": env.iteration}, commit=True)

    return _callback

# Time
korea_timezone = pytz.timezone('Asia/Seoul')
now_korea = datetime.now(korea_timezone)
now_date = now_korea.strftime('%Y%m%d')
now_hour = now_korea.strftime('%H%M%S')
save_time = f"{now_date}_{now_hour}"

# 노트북의 이름 설정
os.environ['WANDB_NOTEBOOK_NAME'] = 'LGBM_Train.ipynb'

# YAML 파일 로드
with open(sweep_config_path, 'r') as file:
    sweep_config = yaml.safe_load(file)

# W&B 스위프트 설정
sweep_id = wandb.sweep(sweep=sweep_config, project="lightgbm-sweep")

# 시드 고정
set_seeds(seed)



Create sweep with ID: n77jl4mh
Sweep URL: https://wandb.ai/yechance7/lightgbm-sweep/sweeps/n77jl4mh


# data loader

In [14]:
class Dataset:
    def __init__(self, train: pd.DataFrame,Feature):
        self.train = train
        self.feature = Feature

    def restruct_data(self) -> dict:
        # train과 test 분할
        data = {}
        df = self.train
        train = df[df["answerCode"] >= 0]
        test = df[df["answerCode"] == -1]
        data["train"], data["test"] = train, test
        return data
    
    
    def split_data(self) -> dict:
        """
        data의 구성
        data['train'] : 전체 user_id에 대한 데이터(Test에 있는 User에 대해서는 이미 마지막으로 푼 문제 정보가 없음)
        data['train_split'] : 전체 user_id별 마지막으로 푼 문제를 제외한 데이터
        data['valid'] : 전체 user_id별 마지막으로 푼 문제에 대한 데이터
        """
        data = self.restruct_data()
        
        train = data['train'].copy()
        train["is_valid"] = [False] * train.shape[0]
        idx_last = train.drop_duplicates(subset="userID", keep="last").index
        train.loc[idx_last, "is_valid"] = True

        train, valid = train[train["is_valid"] == False], train[train["is_valid"] == True]
        data['train'] = train.drop("is_valid", axis=1)
        data['valid'] = valid.drop("is_valid", axis=1)

        print(f'{data[f"train"].shape[0]} train data')
        print(f'{data[f"valid"].shape[0]} valid data')

        data["train_x"] = data["train"].drop("answerCode", axis=1)
        data["train_y"] = data["train"]["answerCode"]

        data["valid_x"] = data["valid"].drop("answerCode", axis=1)
        data["valid_y"] = data["valid"]["answerCode"]

        data["test"] = data["test"].drop("answerCode", axis=1)

        return data["train_x"][self.feature], data["train_y"], data["valid_x"][self.feature], data["valid_y"], data["test"][self.feature]

######################## DATA LOAD
print("### DATA LOAD ###")
FE = pd.read_csv(data_dir + file_name)

######################## DATA PREPROCESSING
print("### DATA PREPROCESSING ###")
data = Dataset(FE,Feature)
X_train, y_train, X_valid, y_valid, test = data.split_data()

### DATA LOAD ###


### DATA PREPROCESSING ###
2518514 train data
7442 valid data


# trainer

In [15]:
default_config = {
    "num_leaves": 10,  # 최소값 10
    "learning_rate": 0.0001,  # 최소값 0.0001
    "max_depth": -1,  # -1 (깊이 제한 없음)
    "min_data_in_leaf": 20,  # 최소값 20
    "feature_fraction": 0.6,  # 최소값 0.6
    "bagging_fraction": 0.6,  # 최소값 0.6
    "bagging_freq": 0,  # 최소값 0
    "lambda_l1": 0.0,  # 최소값 0.0
    "lambda_l2": 0.0,  # 최소값 0.0
    "cat_smooth": 10,  # 최소값 10
}

def train(X_train, y_train, X_valid, y_valid, test,Categorical_Feature):
    
    test_preds = np.zeros(len(test))
    
    wandb.init(project=f"lightgbm-sweep", config=default_config)
    wandb.run.name = f"{save_time} yechan"

    current_params = {
        "objective": "binary",
        "metric": ["auc"],
        "device": "cpu",
        "num_leaves": wandb.config.num_leaves,
        "learning_rate": wandb.config.learning_rate,
        "max_depth": wandb.config.max_depth,
        "min_data_in_leaf": wandb.config.min_data_in_leaf,
        "feature_fraction": wandb.config.feature_fraction,
        "bagging_fraction": wandb.config.bagging_fraction,
        "bagging_freq": wandb.config.bagging_freq,
        "lambda_l1": wandb.config.lambda_l1,
        "lambda_l2": wandb.config.lambda_l2,
        "cat_smooth": wandb.config.cat_smooth,
    }
    lgb_train = LGBM.Dataset(X_train, y_train)
    lgb_valid = LGBM.Dataset(X_valid, y_valid)

    
    model = LGBM.train(
        current_params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        num_boost_round=500,
        callbacks=[
            wandb_callback(log_params=True, define_metric=True),
            LGBM.early_stopping(30),
        ],
         categorical_feature= Categorical_Feature,
    )
    

    
    # Prediction
    y_pred_proba = model.predict_proba(X_valid)[:, 1]
    y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred_proba]

    # Calculate Accuracy and AUC
    acc = accuracy_score(y_valid, y_pred_binary)
    auc = roc_auc_score(y_valid, y_pred_proba)
    print(f"VALID AUC : {auc} ACC : {acc}\n")

    test_preds += model.predict_proba(test)[:, 1]
    wandb.log({"auc": auc, "accuracy": acc})
            
    write_path = os.path.join(
        output_dir,
        f"auc:{auc} acc:{acc}" + "LGBM_{save_time}.csv",
    )
    

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)  
    with open(write_path, "w", encoding="utf8") as w:
        print("writing prediction : {}".format(write_path))
        w.write("id,prediction\n")
        for id, p in enumerate(test_preds):
            w.write("{},{}\n".format(id, p))

    get_feature_importance(model) 
    print( "LGBM" + "_" + save_time + " Submission file has been made" )


def get_feature_importance(model, feature_names, model_type):
        importance = model.feature_importances_

        feature_importance = np.array(importance)
        feature_names = np.array(feature_names)
    
        # DataFrame 생성하고 정렬
        fi_df = pd.DataFrame({'Feature Names': feature_names, 'Feature Importance': feature_importance})
        fi_df = fi_df.sort_values(by='Feature Importance', ascending=False)

        # Print the results
        print(f"{model_type} Feature Importance:")
        print(fi_df)

######################## TRAIN
print("### TRAIN ###")
wandb.agent(sweep_id, train(X_train, y_train, X_valid, y_valid, test,Categorical_Feature))

### TRAIN ###


VBox(children=(Label(value='0.006 MB of 0.016 MB uploaded\r'), FloatProgress(value=0.3975544470130186, max=1.0…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111262604697711, max=1.0)…

TypeError: train() got an unexpected keyword argument 'X'