In [1]:
import os
import time
import random

import numpy as np
import pandas as pd
import torch
import wandb
from datetime import datetime
import pytz

import warnings
warnings.filterwarnings("ignore")
import argparse

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier, CatBoostRegressor
import xgboost as XG
from lightgbm import LGBMClassifier
import optuna
import joblib
import json
from sklearn.model_selection import StratifiedGroupKFold
import wandb

In [2]:
# Whether the validation split is used while training ("Y"); set to "N" to skip it.
fe = "Y"
# Number of Optuna trials, i.e. how many hyperparameter combinations are tried.
trials = 10


## Update when the feature-engineering (EDA) output changes ##
file_name = "FE_v7.csv"

# Columns cast to the pandas `category` dtype.
# Commented-out names are kept as switches; they are currently NOT cast.
cat_feats = [
    'userID',
    'assessmentItemID',
    'testId',
    'KnowledgeTag',
    #'Itemseq', 'SolvingTime', 'CumulativeTime',
    'Month',
    'DayOfWeek',
    'TimeOfDay',
    'WeekOfYear',
    #'UserAvgSolvingTime',
    #'CumulativeItemCount', 'Item_last7days', 'Item_last30days',
    #'PastItemCount', 'CumulativeUserItemAnswerRate', 'ItemAnswerRate',
    #'AverageItemSolvingTime_Correct', 'AverageItemSolvingTime_Incorrect',
    #'AverageItemSolvingTime', 'Difference_SolvingTime_AvgItemSolvingTime',
    #'UserTagAvgSolvingTime', 'TagAnswerRate',
    #'CumulativeUserTagAverageAnswerRate',
    #'CumulativeUserTagExponentialAverage', 'UserCumulativeTagCount',
    'UserRecentTagAnswer',
    'PreviousItemAnswer',
    #'TestAnswerRate',
    #'categorize_solvingTime',
    'categorize_ItemAnswerRate',
    'categorize_TagAnswerRate',
    'categorize_TestAnswerRate',
    'categorize_CumulativeUserItemAnswerRate',
    #'categorize_CumulativeUserTagAverageAnswerRate',
    'categorize_CumulativeUserTagExponentialAverage',
]

## General settings ##
n_fold = 5
seed = 42
data_dir = "../../data/"
model_dir = "model/"
model_name = "best_model.pt"
output_dir = "submit/"
test_file_name = "test_data.csv"

def set_seeds(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Setting the seeds lets repeated runs of the code produce the same results.

    Args:
        seed: Value handed to every seeding call below.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same calls, same order as before: stdlib, NumPy, torch (CPU), torch (CUDA).
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

# data loader

In [3]:
class Dataset:
    """Splits one combined frame into labelled, validation and unlabelled parts.

    Rows with ``answerCode >= 0`` are treated as labelled; rows with
    ``answerCode == -1`` are the rows to predict.
    """

    def __init__(self, train: pd.DataFrame):
        # Combined frame holding both labelled and unlabelled rows.
        self.train = train

    def restruct_data(self) -> dict:
        """Separate labelled rows from the rows to predict.

        Returns:
            dict with ``"train"`` (``answerCode >= 0``) and ``"test"``
            (``answerCode == -1``). Both are independent copies, so later
            column assignments never write into a slice of ``self.train``.
        """
        df = self.train
        return {
            "train": df[df["answerCode"] >= 0].copy(),
            "test": df[df["answerCode"] == -1].copy(),
        }

    def split_data(self) -> tuple:
        """Hold out each user's last labelled row as validation data.

        Returns:
            ``(data, FE_train)`` where

            * ``data["train"]``: labelled rows except each user's last one,
            * ``data["valid"]``: the last labelled row of every ``userID``,
            * ``data["test"]``: rows with ``answerCode == -1``,
            * ``FE_train``: all labelled rows (train + valid) after
              ``type_conversion`` has been applied.
        """
        data = self.restruct_data()
        # type_conversion is applied before splitting so both splits inherit its dtypes.
        FE_train = type_conversion(data["train"])

        # Index labels of the last row seen for every user.
        last_idx = FE_train.drop_duplicates(subset="userID", keep="last").index
        is_valid = FE_train.index.isin(last_idx)

        data["train"] = FE_train[~is_valid]
        data["valid"] = FE_train[is_valid]

        print(f'{data["train"].shape[0]} train data')
        print(f'{data["valid"].shape[0]} valid data')

        return data, FE_train

def type_conversion(df):
    """Cast every column named in the module-level ``cat_feats`` to ``category``.

    Integer-valued columns can be treated as categorical this way.
    The frame is modified in place and the same object is returned.
    """
    for column_name in cat_feats:
        as_category = df[column_name].astype('category')
        df[column_name] = as_category
    return df

class Preprocess:
    """Separates features from the ``answerCode`` target for each split."""

    def __init__(self, data: dict):
        # Expected to hold "train", "valid" and "test" frames.
        self.data = data

    def preprocess(self,cat_feats) -> dict:
        """Add feature/target entries to ``self.data`` and cast categoricals.

        Args:
            cat_feats: Column names to cast to the ``category`` dtype
                (integer columns can be treated as categorical too).

        Returns:
            The same ``self.data`` dict, now also holding ``train_x``,
            ``train_y``, ``valid_x``, ``valid_y``, and with ``test``
            replaced by a frame without ``answerCode``.
        """
        bundle = self.data
        target = "answerCode"

        # (source split, feature key, target key)
        labelled_splits = (
            ("train", "train_x", "train_y"),
            ("valid", "valid_x", "valid_y"),
        )
        for source_key, x_key, y_key in labelled_splits:
            bundle[x_key] = bundle[source_key].drop(target, axis=1)
            bundle[y_key] = bundle[source_key][target]

        bundle["test"] = bundle["test"].drop(target, axis=1)

        for split_key in ("train_x", "valid_x", "test"):
            frame = bundle[split_key]
            for column_name in cat_feats:
                frame[column_name] = frame[column_name].astype('category')

        return bundle

# trainer

In [4]:
#logger = get_logger(logger_conf=logging_conf)

# optuna
def objective(trial, FEATURE,data):
    """Optuna objective: fit one LightGBM candidate and return its validation AUC.

    Args:
        trial: Optuna trial used to sample ``boosting_type`` and the number
            of boosting rounds (registered under the name ``num_round``).
        FEATURE: Column names used as model inputs.
        data: dict providing ``train_x``, ``train_y``, ``valid_x``, ``valid_y``.

    Returns:
        AUC on the validation split; the study maximizes this value.
    """
    search_space = dict(
        random_state=seed,
        #'objective': 'binary',
        metric='auc',  # AUC is the evaluation metric
        boosting_type=trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        n_estimators=trial.suggest_int('num_round', 1000, 5000),
        #'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.3),
    )

    candidate = LGBMClassifier(**search_space, force_row_wise=True)
    candidate.fit(
        X=data['train_x'][FEATURE],
        y=data['train_y'],
        eval_set=[(data['valid_x'][FEATURE], data['valid_y'])],
    )

    # Probability of the positive class on the validation split.
    valid_proba = candidate.predict_proba(data['valid_x'][FEATURE])[:, 1]
    valid_labels = [1 if proba > 0.5 else 0 for proba in valid_proba]

    accuracy = accuracy_score(data['valid_y'], valid_labels)
    auc = roc_auc_score(data['valid_y'], valid_proba)

    print('Accuracy: {:.4f}'.format(accuracy))
    print('AUC: {:.4f}'.format(auc))

    # Only one score exists per trial, so its average is the score itself.
    result = auc
    print('Average AUC: {:.4f}'.format(result))

    return result


class boosting_model:
    """LightGBM classifier whose hyperparameters are chosen by an Optuna study.

    Relies on the module-level settings ``trials``, ``seed``, ``fe`` and
    ``cat_feats`` and on the module-level ``objective`` function.
    """

    def __init__(self, FEATURE, data):
        """Run the Optuna search and build the final (still unfitted) model.

        Args:
            FEATURE: Column names used as model inputs.
            data: dict with ``train_x``/``train_y``/``valid_x``/``valid_y``
                (passed to ``objective``) and ``test``.
        """
        self.feature = FEATURE
        self.data = data

        # Seed the sampler as well; otherwise the search differs on every run
        # even though set_seeds() was called.
        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.TPESampler(seed=seed),
        )
        study.optimize(
            lambda trial: objective(trial, self.feature, self.data),
            n_trials=trials,
        )

        print('Hyperparameters: {}'.format(study.best_params))

        # best_params carries 'boosting_type' and 'num_round'; LightGBM documents
        # 'num_round' as an alias of its number-of-iterations parameter.
        # random_state / force_row_wise mirror what the trials in objective() used.
        self.model = LGBMClassifier(
            **study.best_params,
            objective='binary',
            metric='auc',
            random_state=seed,
            force_row_wise=True,
        )

    def training(self, data, FEATURE,FE_train):
        """Fit the tuned model and print its feature importances.

        Args:
            data: dict with ``train_x``, ``train_y``, ``valid_x``, ``valid_y``.
            FEATURE: Column names used as model inputs.
            FE_train: All labelled rows (including ``answerCode``); used only
                when the module-level ``fe`` is ``"N"``.
        """
        # Defined before branching: both branches report feature importances.
        model_type = 'LGBM'

        if fe == "N":
            # No validation split: fit on every labelled row.
            # Only pass categorical names that are actually model inputs.
            self.model.fit(
                FE_train[FEATURE],
                FE_train["answerCode"],
                categorical_feature=[name for name in cat_feats if name in FEATURE],
            )
        else:
            print("Valid Data is used while training")
            valid_x = data["valid_x"][FEATURE]
            valid_y = data["valid_y"]

            # force_row_wise is a model parameter (set in __init__), not a fit() argument.
            self.model.fit(
                data["train_x"][FEATURE],
                data["train_y"],
                eval_set=[(valid_x, valid_y)],
            )

            # AUC needs scores, so use the positive-class probability
            # rather than the hard labels returned by predict().
            y_pred_proba = self.model.predict_proba(valid_x)[:, 1]
            y_pred_binary = (y_pred_proba > 0.5).astype(int)

            accuracy = accuracy_score(valid_y, y_pred_binary)
            auc = roc_auc_score(valid_y, y_pred_proba)
            print('Accuracy: {:.4f}'.format(accuracy))
            print('AUC: {:.4f}'.format(auc))
            # A single fit yields a single score, so the average equals it.
            print('Average AUC: {:.4f}'.format(auc))

        get_feature_importance(self.model, FEATURE, model_type)

    def inference(self, data,save_time,model_type,output_dir):
        """Predict the test rows and write the submission CSV.

        Writes ``<output_dir>/<model_type>_<save_time>.csv`` with the columns
        ``id`` (0-based row position) and ``prediction`` (positive-class
        probability). Also stores the predictions in ``data["test"]["prediction"]``.
        """
        test_pred = self.model.predict_proba(data["test"][self.feature])[:, 1]

        data["test"]["prediction"] = test_pred
        # First reset drops the original index; second one turns the new
        # 0..n-1 index into a column that becomes "id".
        submission = data["test"]["prediction"].reset_index(drop=True).reset_index()
        submission = submission.rename(columns={"index": "id"})
        submission_filename = f"{model_type}_{save_time}.csv"
        submission.to_csv(
            os.path.join(output_dir, submission_filename), index=False
        )


def get_feature_importance(model, feature_names, model_type):
    """Print the model's feature importances, largest first.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Names matching the importances position by position.
        model_type: Label shown in the printed header.
    """
    ranking = pd.DataFrame(
        {
            'Feature Names': np.array(feature_names),
            'Feature Importance': np.array(model.feature_importances_),
        }
    ).sort_values(by='Feature Importance', ascending=False)

    print(f"{model_type} Feature Importance:")
    print(ranking)
        

# main

In [5]:
# python main.py --model    # CAT, XG, LGBM   default="CAT", 

# Boosting 계열, 수정할수 있는 파라미터
# 1. FEATURE 선택
# 2. train시 valid set 쓸건지 안쓸건지, default: Y
# 3. optuna 시도 횟수, default: n_trials=10, 보통 100번이상이면 수렴됨
# 4. optuna params

#logger = get_logger(logger_conf=logging_conf)


def main():
    """Run the pipeline: load, split, preprocess, tune, train, write submission.

    Reads ``data_dir``/``file_name``, uses the module-level ``seed``,
    ``cat_feats`` and ``output_dir`` settings, and writes
    ``<output_dir>/LGBM_<timestamp>.csv``.
    """
    ######################## SELECT FEATURE
    FEATURE = ['userID', 'assessmentItemID', 'testId',
       'KnowledgeTag', 'Itemseq', 'SolvingTime', 'CumulativeTime', 'Month',
       'DayOfWeek', 'TimeOfDay', 'WeekOfYear', 'UserAvgSolvingTime',
       'CumulativeItemCount', 'Item_last7days', 'Item_last30days',
       'PastItemCount',
       #'CumulativeUserItemAnswerRate', 'ItemAnswerRate',
       'AverageItemSolvingTime_Correct', 'AverageItemSolvingTime_Incorrect',
       'AverageItemSolvingTime', 'Difference_SolvingTime_AvgItemSolvingTime',
       'UserTagAvgSolvingTime',
       #'TagAnswerRate',
       #'CumulativeUserTagAverageAnswerRate',
       #'CumulativeUserTagExponentialAverage',
       'UserCumulativeTagCount',
       'UserRecentTagAnswer', 'PreviousItemAnswer',
       #'TestAnswerRate',
       #'categorize_solvingTime',
       'categorize_ItemAnswerRate',
       'categorize_TagAnswerRate', 'categorize_TestAnswerRate',
       'categorize_CumulativeUserItemAnswerRate',
       #'categorize_CumulativeUserTagAverageAnswerRate',
       'categorize_CumulativeUserTagExponentialAverage'
    ]
    #wandb.login()

    set_seeds(seed)

    # Timestamp in Korean time; it becomes part of the submission file name.
    now_korea = datetime.now(pytz.timezone('Asia/Seoul'))
    save_time = now_korea.strftime('%Y%m%d_%H%M%S')

    ######################## DATA LOAD
    print("### DATA LOAD ###")
    # Use the configured file name so changing `file_name` is enough
    # when the feature-engineering output changes.
    raw_frame = pd.read_csv(os.path.join(data_dir, file_name))

    data, FE_train = Dataset(raw_frame).split_data()

    ######################## DATA PREPROCESSING
    print("### DATA PREPROCESSING ###")
    data = Preprocess(data).preprocess(cat_feats)

    ######################## HYPER PARAMETER TUNING - USING OPTUNA
    print("### HYPER PARAMETER TUNING - USING OPTUNA ###")
    print("number of selected features:", len(FEATURE))
    #wandb.init(project="level2-dkt", config=vars(), entity="boostcamp6-recsys6")
    #wandb.run.name = "yechance" + current_time
    #wandb.run.save()

    # Constructing the model runs the Optuna search.
    model = boosting_model(FEATURE, data)

    ######################## TRAIN
    print("### TRAIN ###")
    model.training(data, FEATURE, FE_train)

    ######################## INFERENCE
    print("### INFERENCE ###")
    os.makedirs(output_dir, exist_ok=True)
    model_type = 'LGBM'
    model.inference(data, save_time, model_type, output_dir)

    print( model_type + "_" + save_time + " submission file has been made" )
    #wandb.finish()


# Entry point: runs the whole pipeline when this code is executed directly.
if __name__ == "__main__":
    # Make sure the model directory exists; exist_ok keeps re-runs from failing.
    os.makedirs(model_dir, exist_ok=True)
    main()


### DATA LOAD ###


2518514 train data
7442 valid data
### DATA PREPROCESSING ###


[I 2024-01-23 07:02:38,617] A new study created in memory with name: no-name-37c39a8d-3625-4953-a8f7-280bd84df356


### HYPER PARAMETER TUNING - USING OPTUNA ###
number of selected features: 29
[LightGBM] [Info] Number of positive: 1649969, number of negative: 868545
[LightGBM] [Info] Total Bins 20308
[LightGBM] [Info] Number of data points in the train set: 2518514, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655136 -> initscore=0.641692
[LightGBM] [Info] Start training from score 0.641692


[W 2024-01-23 07:16:41,130] Trial 0 failed with parameters: {'boosting_type': 'gbdt', 'num_round': 4885, 'learning_rate': 0.007762430289977601} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/ml/miniconda3/envs/dkt/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_1410537/2023225261.py", line 48, in <lambda>
    study.optimize(lambda trial: objective(trial,self.feature, self.data), n_trials=trials)
  File "/tmp/ipykernel_1410537/2023225261.py", line 18, in objective
    bst.fit(X=data['train_x'][FEATURE], y=data['train_y'], eval_set=[(data['valid_x'][FEATURE], data['valid_y'])])
  File "/opt/ml/miniconda3/envs/dkt/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1187, in fit
    super().fit(
  File "/opt/ml/miniconda3/envs/dkt/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/ml/mini

KeyboardInterrupt: 