### 최민수 CatBoost XGBoost LGBM Code

In [41]:
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from typing_extensions import Concatenate

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from optuna.integration import SkoptSampler

from scipy.stats import randint
from sklearn.utils.fixes import loguniform

from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb
import skopt

from category_encoders import CatBoostEncoder

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
import random

import wandb
import matplotlib.pyplot as plt
import os

### DataLoader

In [42]:
class DataProcessor:
    def __init__(self, file_path, test_path) :
        self.df = pd.read_csv(file_path)
        self.test = pd.read_csv(test_path)
        self.Data = {}
        
        # self.X = self.df.drop(['answerCode', 'Timestamp'], axis=1)
        # self.y = self.df[['answerCode']].astype(int)
        self.df[['userID', 'assessmentItemID', 'testId']] = self.df[['userID', 'assessmentItemID', 'testId']].astype('category')
        
        self.df['elapsed'] = np.log1p(self.df['elapsed'])
        
        
        self.X_test = self.test.drop(['answerCode', 'Timestamp', 'problem_difficulty', 'continuous_tag', 'test_sum', 'tag_sum', 'Time', 'userTime'], axis = 1)
        self.y_test = self.test['answerCode'].astype(int)
        
        self.X_test['elapsed'] = np.log1p(self.X_test['elapsed'])
        
        self.df = self.df.drop(['Timestamp', 'problem_difficulty', 'continuous_tag', 'test_sum', 'tag_sum', 'Time', 'userTime'], axis = 1)
                
        # self.Data['train_x'], self.Data['train_y'] = self.df.drop(['answerCode'], axis = 1), self.df['answerCode'].astype(int)
        self.Data['X_test'], self.Data['y_test'] = self.X_test, self.y_test
        
        self.Data['Train'], self.Data['Label'] = self.df.drop(['answerCode'], axis = 1), self.df['answerCode']
        
        # encoder = CatBoostEncoder(cols=['userID', 'assessmentItemID', 'testId'])  # CatBoostEncoder를 생성하고 카테고리컬 변수 지정
        # self.Data['train_x'] = encoder.fit_transform(self.Data['train_x'], self.Data['train_y'])
        # self.Data['X_test'] = encoder.transform(self.Data['X_test'])
           
    
    # def split_train_valid(self, test_size=0.1, random_state=20):
    #     X_train, X_valid, y_train, y_valid = train_test_split(self.X, self.y, stratify=self.X['userID'], test_size=test_size, random_state=random_state)
    #     self.Data['X_train'], self.Data['X_valid'], self.Data['y_train'], self.Data['y_valid'] = X_train, X_valid, y_train, y_valid
        
        
    def custom_train_valid_split(self, ratio=0.8, split=True):
        
        random.seed(42)

        users = list(zip(self.df["userID"].value_counts().index, self.df["userID"].value_counts()))
        random.shuffle(users)

        max_train_data_len = ratio * len(self.df)
        sum_of_train_data = 0
        user_ids = []

        for user_id, count in users:
            sum_of_train_data += count
            if max_train_data_len < sum_of_train_data:
                break
            user_ids.append(user_id)

        train = self.df[self.df["userID"].isin(user_ids)]
        valid = self.df[self.df["userID"].isin(user_ids) == False]

        valid = valid[valid["userID"] != valid["userID"].shift(-1)]

        X_train, y_train = train.drop(['answerCode'], axis = 1), train['answerCode'].astype(int)
        X_valid, y_valid = valid.drop(['answerCode'], axis = 1), valid['answerCode'].astype(int)
        
        self.Data['X_train'], self.Data['X_valid'], self.Data['y_train'], self.Data['y_valid'] = X_train, X_valid, y_train, y_valid
        
    
    def preprocess_data(self) :
        
        self.Data['X_train'][['userID', 'assessmentItemID', 'testId']] = self.Data['X_train'][['userID', 'assessmentItemID', 'testId']].astype('category')
        self.Data['X_valid'][['userID', 'assessmentItemID', 'testId']] = self.Data['X_valid'][['userID', 'assessmentItemID', 'testId']].astype('category')
        self.Data['X_test'][['userID', 'assessmentItemID', 'testId']] = self.Data['X_test'][['userID', 'assessmentItemID', 'testId']].astype('category')
        
        
#         self.Data['X_train'][['problem_cluster', 'user_cluster']] = self.Data['X_train'][['problem_cluster', 'user_cluster']].astype('category')
#         self.Data['X_valid'][['problem_cluster', 'user_cluster']] = self.Data['X_valid'][['problem_cluster', 'user_cluster']].astype('category')
#         self.Data['X_test'][['problem_cluster', 'user_cluster']] = self.Data['X_test'][['problem_cluster', 'user_cluster']].astype('category')
        
                
        last_sequence_indices = self.test.groupby('userID').apply(lambda x: x.index[-1] if x.iloc[-1]['answerCode'] == -1 else None).dropna()  
        
        self.Data['Idx'] = last_sequence_indices
    
#         label_encoder = LabelEncoder()
#         columns_to_encode = ['userID', 'assessmentItemID', 'testId']
        
#         for column in columns_to_encode:
#             self.Data['X_train'][column] = label_encoder.fit_transform(self.Data['X_train'][column])
#             self.Data['X_valid'][column] = label_encoder.fit_transform(self.Data['X_valid'][column])
#             self.Data['X_test'][column] = label_encoder.fit_transform(self.Data['X_test'][column])

In [43]:
data_processor = DataProcessor("/opt/ml/input/code/CatBoost/data/train_5_23_v2.csv" , "./data/test_5_23_v2.csv")
data_processor.custom_train_valid_split()
data_processor.preprocess_data()

In [44]:
data_processor.Data['X_train'].info()

<class 'pandas.core.frame.DataFrame'>
Index: 2020035 entries, 0 to 2525955
Data columns (total 22 columns):
 #   Column                Dtype   
---  ------                -----   
 0   userID                category
 1   assessmentItemID      category
 2   testId                category
 3   KnowledgeTag          int64   
 4   user_correct_answer   float64 
 5   user_total_answer     int64   
 6   user_acc              float64 
 7   elapsed               float64 
 8   user_tag_cum_acc      float64 
 9   test_mean             float64 
 10  tag_mean              float64 
 11  similar_tag_accuracy  float64 
 12  problem_std           float64 
 13  problem_std_by_tag    float64 
 14  tag_try               int64   
 15  tag_correct           int64   
 16  tag_answer_rate       float64 
 17  tag_difficult         float64 
 18  id_try                int64   
 19  id_correct            int64   
 20  id_answer_rate        float64 
 21  id_difficult          float64 
dtypes: category(3), float64

In [45]:
data_processor.Data['X_test'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260114 entries, 0 to 260113
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   userID                260114 non-null  category
 1   assessmentItemID      260114 non-null  category
 2   testId                260114 non-null  category
 3   KnowledgeTag          260114 non-null  int64   
 4   user_correct_answer   260114 non-null  float64 
 5   user_total_answer     260114 non-null  int64   
 6   user_acc              260114 non-null  float64 
 7   elapsed               260114 non-null  float64 
 8   user_tag_cum_acc      260114 non-null  float64 
 9   test_mean             260114 non-null  float64 
 10  tag_mean              260114 non-null  float64 
 11  similar_tag_accuracy  260114 non-null  float64 
 12  problem_std           260114 non-null  float64 
 13  problem_std_by_tag    260114 non-null  float64 
 14  tag_try               260114 non-nul

### CatBoost

In [7]:
class CatBoost:

    def __init__(self, data, n_splits=5, random_state=20):
        self.data = data
        self.cat_features = [0, 1, 2, 3, 9, 26, 27]
        self.n_splits = n_splits
        self.random_state = random_state

        self.X_train = self.data["X_train"].values
        self.X_valid = self.data["X_valid"].values
        self.y_train = self.data["y_train"].values
        self.y_valid = self.data["y_valid"].values


        self.cv = KFold(n_splits=n_splits, random_state=random_state, shuffle = True)

    def objective(self, trial):
        param = {
                    'random_state': 25,
                    'eval_metric': 'AUC',
                    'cat_features': self.cat_features,
                    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log = True),
                    'n_estimators': trial.suggest_int("n_estimators", 500, 1500),
                    'max_depth': trial.suggest_int("max_depth", 6, 12),
                    # 'depth' : trial.suggest_int('depth', 4, 12),
                    'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 0.1, 5, log = True),
                    # 'bagging_temperature': trial.suggest_uniform('bagging_temperature', 0.1, 1.0),
                    'random_strength': trial.suggest_int('random_strength', 0, 2),
                    # 'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
                    # 'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.5, 1.0),
                    # 'bootstrap_type': 'Bernoulli'
                }

        auc_scores = []

        for train_idx, valid_idx in self.cv.split(self.X_train, self.y_train):
            X_train_fold, X_valid_fold = self.X_train[train_idx], self.X_train[valid_idx]
            y_train_fold, y_valid_fold = self.y_train[train_idx], self.y_train[valid_idx]

            model = CatBoostClassifier(**param, task_type='GPU', devices='0')
            model.fit(X_train_fold, y_train_fold,
                      cat_features=self.cat_features,
                      eval_set=(X_valid_fold, y_valid_fold),
                      early_stopping_rounds=10,
                      verbose=500)

            y_pred_fold = model.predict_proba(X_valid_fold)[:, 1]
            auc_fold = roc_auc_score(y_valid_fold, y_pred_fold)
            auc_scores.append(auc_fold)

        return 1.0 - np.mean(auc_scores)


    def hyperparameter_search(self, n_trials):
        study = optuna.create_study(direction='minimize')
        study.optimize(self.objective, n_trials=n_trials)

        best_params = study.best_params
        best_accuracy = 1.0 - study.best_value

        print(f"Best AUC: {best_accuracy}")
        print(f"Best hyperparameters: {best_params}")

        return best_params

    def train(self, params):
        
        wandb.init(project='dkt',  entity = 'new-recs', config=params)
        
        wandb.run.name = 'Catboost_1_MS' 
        
        param = {
            'random_state': 25,
            'eval_metric': 'AUC',
            'cat_features': self.cat_features,
            'learning_rate': params['learning_rate'],
            'n_estimators': params['n_estimators'],
            'max_depth': params['max_depth'],
            # 'depth' : params['depth'],
            'l2_leaf_reg': params['l2_leaf_reg'],
            # 'bagging_temperature': params['bagging_temperature'],
            'random_strength': params['random_strength'], 
            # 'subsample': params['subsample'],
            # 'colsample_bylevel': params['colsample_bylevel'],
            # 'bootstrap_type': params['bootstrap_type']
        }

        self.model = CatBoostClassifier(**param, task_type='GPU', devices='0')
        self.model.fit(self.X_train, self.y_train,
                       cat_features=self.cat_features,
                       eval_set=(self.X_valid, self.y_valid),
                       early_stopping_rounds=10,
                       verbose=50, 
                       plot=True)
        
        y_train_pred = self.model.predict_proba(self.X_train)[:, 1]
        train_auc = roc_auc_score(self.y_train, y_train_pred)
        
        print("Train AUC:", train_auc)
        
        wandb.log({'Validation AUC': self.model.best_score_['validation']['AUC']})
        
        wandb.finish()

    def predict(self, temp):
        y_pred = self.model.predict_proba(temp)[:, 1]
        return y_pred

    def feature(self):
        result = self.model.get_feature_importance()
        return result            

### XGBoost V1

In [46]:
from sklearn.preprocessing import LabelEncoder

class XGBoost:

    def __init__(self, data, n_splits=5, random_state=17):
        self.data = data
        self.n_splits = n_splits
        self.random_state = random_state
        
        self.Train = self.data['Train']
        self.Label = self.data['Label']

        self.X_train = self.data["X_train"]
        self.X_valid = self.data["X_valid"]
        self.y_train = self.data["y_train"]
        self.y_valid = self.data["y_valid"]

        self.cv = KFold(n_splits=n_splits, random_state=random_state, shuffle = True)

    def objective(self, trial):
        param = {
                    'random_state': 17,
                    'booster': 'gbtree',
                    'eval_metric': 'auc',
                    'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0.6, 0.9),
                    'colsample_bytree': trial.suggest_float("colsample_bytree", 0.6, 0.9),
                    'gamma': trial.suggest_float("gamma", 0, 1),
                    'max_depth': trial.suggest_int("max_depth", 4, 10),
                    'min_child_weight': trial.suggest_float("min_child_weight", 1, 10),
                    'n_estimators': trial.suggest_int("n_estimators", 200, 1500),
                    'nthread': -1,
                    'objective': 'binary:logistic',
                    'verbosity': 1,
                    'subsample': trial.suggest_float("subsample", 0.5, 1.0),
                    'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.2),
                    'reg_alpha': trial.suggest_float("reg_alpha", 0.0, 0.5),
                    'reg_lambda': trial.suggest_float("reg_lambda", 0.0, 0.5),
                    'scale_pos_weight' : 0.52,
                    'enable_categorical': True
            
                }
        
        auc_scores = []

        for train_idx, valid_idx in self.cv.split(self.X_train, self.y_train):  
            
            X_train_fold, X_valid_fold = self.X_train.iloc[train_idx], self.X_train.iloc[valid_idx]
            y_train_fold, y_valid_fold = self.y_train.iloc[train_idx], self.y_train.iloc[valid_idx]

            model = xgb.XGBClassifier(**param, tree_method = 'gpu_hist', gpu_id = '0')
            model.set_params(early_stopping_rounds=200)
            model.fit(X_train_fold, y_train_fold,
                      eval_set=[(X_valid_fold, y_valid_fold)],
                      verbose=100)

            y_pred_fold = model.predict_proba(X_valid_fold)[:, 1]
            auc_fold = roc_auc_score(y_valid_fold, y_pred_fold)
            auc_scores.append(auc_fold)

        return 1.0 - np.mean(auc_scores)


    def hyperparameter_search(self, n_trials):
        
        
        sampler = optuna.samplers.TPESampler(seed=42)
        
        study = optuna.create_study(direction='minimize', sampler = sampler)
        study.optimize(self.objective, n_trials=n_trials)

        best_params = study.best_params
        best_accuracy = 1.0 - study.best_value

        print(f"Best AUC: {best_accuracy}")
        print(f"Best hyperparameters: {best_params}")

        return best_params

    def train(self, params) :
            
        param = {
                    'random_state': 17,
                    'booster': 'gbtree',
                    'eval_metric': 'auc',
                    'colsample_bylevel': params['colsample_bylevel'],
                    'colsample_bytree': params['colsample_bytree'],
                    'gamma': params['gamma'],
                    'max_depth': params['max_depth'],
                    'min_child_weight': params['min_child_weight'],
                    'n_estimators': params['n_estimators'],
                    'nthread': -1,
                    'objective': 'binary:logistic',
                    'verbosity': 1,
                    'subsample': params['subsample'],
                    'learning_rate': params['learning_rate'],
                    'reg_alpha': params['reg_alpha'],
                    'reg_lambda': params['reg_lambda'],
                    'scale_pos_weight' : 0.52,
                    'enable_categorical': True
                }
        
        wandb.init(project='dkt',  entity = 'new-recs', config = params)
        wandb.run.name = 'XGBoost_3_MS'
        
        self.model = xgb.XGBClassifier(**param , tree_method = 'gpu_hist', gpu_id = '0')
        self.model.set_params(early_stopping_rounds=200)
        self.model.fit(self.X_train, self.y_train,
                       eval_set=[(self.X_valid, self.y_valid)],
                       callbacks=[wandb.xgboost.WandbCallback(log_model=True)],
                       verbose=50)
        
        wandb.finish()

    def predict(self, temp):
        y_pred = self.model.predict_proba(temp)[:, 1]
        return y_pred

    
    def feature(self):
        result = self.model.feature_importances_
        return result

### XGBOOST V2

In [9]:
from sklearn.model_selection import GridSearchCV


class XGBoost_v2 :

    def __init__(self, data, n_splits=5, random_state=20):
        self.data = data
        self.n_splits = n_splits
        self.random_state = random_state

        self.X_train = self.data["X_train"]
        self.X_valid = self.data["X_valid"]
        self.y_train = self.data["y_train"]
        self.y_valid = self.data["y_valid"]
        
        encoder = CatBoostEncoder(cols=['userID', 'assessmentItemID', 'testId', 'KnowledgeTag','continuous_tag', 'problem_cluster', 'user_cluster'])  # CatBoostEncoder를 생성하고 카테고리컬 변수 지정
        self.X_train_encoded = encoder.fit_transform(self.X_train, self.y_train)
        self.X_valid_encoded = encoder.transform(self.X_valid)

        self.cv = KFold(n_splits=n_splits, random_state=random_state, shuffle = True)

    def hyperparameter_search(self, param_grid):
        
        
        model = xgb.XGBClassifier(random_state=25, booster='gbtree', eval_metric='auc', objective='binary:logistic',
                                 tree_method='gpu_hist', gpu_id='0')
        
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=self.cv, scoring='roc_auc', n_jobs=-1, verbose = 3)
        grid_search.fit(self.X_train_encoded, self.y_train)
    
    
        results = grid_search.cv_results_ 
        
        for mean_score, params in zip(results['mean_test_score'], results['params']):
            print(f"AUC: {mean_score:.4f}, Hyperparameters: {params}")
        
        best_params = grid_search.best_params_
        best_accuracy = grid_search.best_score_

        print(f"Best AUC: {best_accuracy}")
        print(f"Best hyperparameters: {best_params}")

        return best_params

    def train(self, params) :
            
        param = {
                    'random_state': 25,
                    'booster': 'gbtree',
                    'eval_metric': 'auc',
                    'colsample_bylevel': params['colsample_bylevel'],
                    'colsample_bytree': params['colsample_bytree'],
                    'gamma': params['gamma'],
                    'max_depth': params['max_depth'],
                    'min_child_weight': params['min_child_weight'],
                    'n_estimators': params['n_estimators'],
                    'nthread': -1,
                    'objective': 'binary:logistic',
                    'verbosity': 1,
                    # 'subsample': params['subsample'],
                    # 'learning_rate': params['learning_rate'],
                    # 'reg_alpha': params['reg_alpha'],
                    # 'reg_lambda': params['reg_lambda'],
                    'scale_pos_weight' : 0.5
                }
        
        wandb.init(project='dkt',  entity = 'new-recs', config = params)
        wandb.run.name = 'XGBoost_3_MS'
        
        self.model = xgb.XGBClassifier(**param , tree_method = 'gpu_hist', gpu_id = '0')
        self.model.set_params(early_stopping_rounds=20)
        self.model.fit(self.X_train_encoded, self.y_train,
                       eval_set=[(self.X_train_encoded, self.y_train), (self.X_valid_encoded, self.y_valid)],
                       callbacks=[wandb.xgboost.WandbCallback(log_model=True)],
                       verbose=50)
        
        wandb.finish()

    def predict(self, temp):
        y_pred = self.model.predict_proba(temp)[:, 1]
        return y_pred

    
    def feature(self):
        result = self.model.feature_importances_
        return result

### LGBM

In [10]:
class LGBM:

    def __init__(self, data, n_splits=5, random_state=20):
        self.data = data
        self.n_splits = n_splits
        self.random_state = random_state

        self.X_train = self.data["X_train"]
        self.X_valid = self.data["X_valid"]
        self.y_train = self.data["y_train"]
        self.y_valid = self.data["y_valid"]
        
        encoder = CatBoostEncoder(cols=['KnowledgeTag', 'continuous_tag'])
        self.X_train_encoded = encoder.fit_transform(self.X_train, self.y_train)
        self.X_valid_encoded = encoder.transform(self.X_valid)

        self.cv = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    def objective(self, trial):
        param = {
            'random_state': 25,
            'boosting_type': 'gbdt',
            'metric': 'auc',
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0.6, 0.9),
            'num_leaves': trial.suggest_int("num_leaves", 30, 100),
            'max_depth': trial.suggest_int("max_depth", 5, 15),
            'min_child_samples': trial.suggest_int("min_child_samples", 5, 50),
            'n_estimators': trial.suggest_int("n_estimators", 1000, 2500),
            'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.1),
            'subsample': trial.suggest_float("subsample", 0.6, 0.9),
            'reg_alpha': trial.suggest_float("reg_alpha", 0.0, 1.0),
            'reg_lambda': trial.suggest_float("reg_lambda", 0.0, 1.0)
            # 'device_type': 'gpu',
            # 'gpu_platform_id': 0,
            # 'gpu_device_id': 0
        }
        

        auc_scores = []

        for train_idx, valid_idx in self.cv.split(self.X_train, self.y_train):
            X_train_fold, X_valid_fold = self.X_train_encoded.iloc[train_idx], self.X_train_encoded.iloc[valid_idx]
            y_train_fold, y_valid_fold = np.ravel(self.y_train.iloc[train_idx]), np.ravel(self.y_train.iloc[valid_idx])

            model = lgb.LGBMClassifier(**param)
            model.fit(X_train_fold, y_train_fold,
                      eval_set=[(X_valid_fold, y_valid_fold)],
                      callbacks=[lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation(period=50)])

            y_pred_fold = model.predict_proba(self.X_valid_encoded)[:, 1]
            auc_fold = roc_auc_score(self.y_valid, y_pred_fold)
            auc_scores.append(auc_fold)

        return 1.0 - np.mean(auc_scores)
    
    def hyperparameter_search(self, n_trials):
        study = optuna.create_study(direction='minimize')
        study.optimize(self.objective, n_trials=n_trials)

        best_params = study.best_params
        best_accuracy = 1.0 - study.best_value

        print(f"Best AUC: {best_accuracy}")
        print(f"Best hyperparameters: {best_params}")

        return best_params
    
    def train(self, params):
        
        param = {
            'random_state': 25,
            'boosting_type': 'gbdt',
            'metric': 'auc',
            'colsample_bytree': params['colsample_bytree'],
            'num_leaves': params['num_leaves'],
            'max_depth': params['max_depth'],
            'min_child_samples': params['min_child_samples'],
            'n_estimators': params['n_estimators'],
            'learning_rate': params['learning_rate'],
            'subsample': params['subsample'],
            'reg_alpha': params['reg_alpha'],
            'reg_lambda': params['reg_lambda']
            # 'device_type': 'gpu',
            # 'gpu_platform_id': 0,
            # 'gpu_device_id': 0
        }
        
        wandb.init(project='dkt',  entity = 'new-recs', config = params)
        wandb.run.name = 'LGBM_1_MS'

        self.model = lgb.LGBMClassifier(**param)
        self.model.fit(self.X_train_encoded, np.ravel(self.y_train),
                       eval_set=[(self.X_valid_encoded, np.ravel(self.y_valid))],
                       callbacks=[lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation(period=50), wandb.lightgbm.wandb_callback()])

        wandb.finish()
        
    def predict(self, temp):
        y_pred = self.model.predict_proba(temp)[:, 1]
        return y_pred

    def feature(self):
        result = self.model.feature_importances_
        return result

### 학습 및 예측

In [47]:
model = XGBoost(data_processor.Data)

In [None]:
best_parameter = model.hyperparameter_search(200)

best_parameter

In [50]:
model.train(best_parameter)



[0]	validation_0-auc:0.77868
[50]	validation_0-auc:0.81189
[100]	validation_0-auc:0.81659
[150]	validation_0-auc:0.81804
[200]	validation_0-auc:0.81878
[250]	validation_0-auc:0.81949
[300]	validation_0-auc:0.82010
[350]	validation_0-auc:0.82008
[400]	validation_0-auc:0.82016
[450]	validation_0-auc:0.82097
[500]	validation_0-auc:0.82075
[550]	validation_0-auc:0.82089
[600]	validation_0-auc:0.82065
[650]	validation_0-auc:0.82071
[663]	validation_0-auc:0.82072


0,1
best_iteration,▁
best_score,▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
validation_0-auc,▁▃▄▅▆▆▆▇▇▇▇▇▇▇▇█████████████████████████

0,1
best_iteration,464.0
best_score,0.82111
epoch,663.0


In [None]:
feature = model.feature()

plt.figure(figsize=(10, 6))
plt.barh(data_processor.Data["X_train"].columns, feature)
plt.xticks(rotation=90)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.show()

### 제출

In [98]:
data_processor.Data['X_test'].loc[data_processor.Data['Idx']]

Unnamed: 0,userID,assessmentItemID,testId,user_correct_answer,user_total_answer,user_acc,elapsed,user_tag_cum_acc,test_mean,tag_mean,similar_tag_accuracy,problem_difficulty,problem_std,problem_std_by_tag,tag_try,tag_correct,tag_answer_rate,id_try,id_correct,id_answer_rate
1035,3,A050133008,A050000133,717.0,1035,0.692754,46.0,0.818182,0.661765,0.542662,0.470588,30.147449,0.605625,0.504981,293,160,0.546075,249,133,0.534137
1706,4,A070146008,A070000146,465.0,670,0.694030,23.0,0.666667,0.740385,0.565693,0.153846,14.883059,0.948371,0.551433,274,163,0.594891,145,89,0.613793
3023,13,A070111008,A070000111,915.0,1316,0.695289,8.0,0.333333,0.417857,0.446753,0.285714,10.871422,0.511101,0.517633,385,176,0.457143,249,92,0.369478
4283,17,A090064006,A090000064,1031.0,1259,0.818904,75.0,1.000000,0.625000,0.514286,0.250000,5.867069,0.661438,0.527605,70,37,0.528571,99,27,0.272727
4670,26,A060135007,A060000135,293.0,386,0.759067,17.0,0.666667,0.678571,0.602767,0.321429,15.093339,0.538090,0.493347,506,306,0.604743,249,78,0.313253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260052,7395,A040122005,A040000122,7.0,23,0.304348,2.0,0.000000,0.753846,0.654902,0.000000,11.912108,0.530466,0.483579,255,168,0.658824,299,134,0.448161
260067,7404,A030111005,A030000111,7.0,14,0.500000,107.0,0.500000,0.866667,0.834661,0.861111,9.359426,0.418514,0.376810,502,420,0.836653,299,265,0.886288
260082,7416,A050193004,A050000193,7.0,14,0.500000,24.0,0.666667,0.750000,0.792517,0.760000,8.348514,0.585150,0.445474,294,238,0.809524,248,213,0.858871
260097,7417,A050193004,A050000193,2.0,14,0.142857,21.0,0.666667,0.750000,0.792517,0.760000,8.348514,0.585150,0.445474,294,238,0.809524,248,213,0.858871


In [52]:
testing = model.predict(data_processor.Data['X_test'].loc[data_processor.Data['Idx'] - 1])
roc_auc_score(data_processor.Data['y_test'].loc[data_processor.Data['Idx'] - 1], testing)

0.8537193770436639

In [53]:
submission = pd.read_csv("/opt/ml/input/data/sample_submission.csv")

In [None]:
submission['prediction'] = model.predict(data_processor.Data['X_test'].loc[data_processor.Data['Idx'] ])
submission.to_csv("./output/XGBoost_5_25_diff1.csv", index = False)

submission