In [6]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import json, os
import gc

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
from catboost import CatBoostClassifier, Pool


from utils import seed_everything, get_config, print_score
from preprocess import feature_engineering

# Root directory of the project.
PROJECT_PATH = "/opt/ml/online_retail_prediction"

# Directory holding the input data; it lives directly under the project root.
BASE_DATA_PATH = f"{PROJECT_PATH}/input"


class CFG:
    # Static configuration holder for the notebook. It is passed to
    # get_config(CFG) in a later cell (see config_path below); what that helper
    # overrides is not visible here.
    #
    # NOTE(review): the first group of attributes looks like gradient-boosting
    # (LightGBM-style) hyperparameters. None of them are referenced by the
    # TabNet cells visible in this notebook -- confirm whether they are still
    # needed.
    objective = "binary"
    boosting_type = "gbdt"
    metric = "auc"
    feature_fraction = 0.8
    bagging_fraction = 0.8
    bagging_freq = 1
    n_estimators = 10000
    early_stopping_rounds = 100
    verbose = -1
    n_jobs = -1

    predict_year_month = '2011-12'
    seed = 42 # random seed
    total_thres = 300 # total threshold
    folds = 10 # number of k-fold splits
    # NOTE(review): the notebook trains TabNet, not LightGBM -- confirm this label.
    model = 'LightGBM' # model
    description = 'Modeling' # description

    train_data_path = os.path.join(BASE_DATA_PATH, 'train.csv') # train csv file
    sample_submission_path = os.path.join(BASE_DATA_PATH, 'sample_submission.csv') # sample submission csv file
    config_path = './config/config.json'
    docs_path = os.path.join(PROJECT_PATH, 'docs') # where results / visualizations are saved
    model_path = os.path.join(PROJECT_PATH, 'models') # where trained models are saved


def get_data():
    """Read the raw training CSV at ``CFG.train_data_path``.

    The ``order_date`` column is parsed as datetime while reading.

    Returns:
        pandas.DataFrame: the loaded training data.
    """
    return pd.read_csv(CFG.train_data_path, parse_dates=['order_date'])


def get_from_dataset():
    """Load the pre-built train/test feature tables from ``<docs_path>/dataset``.

    Returns:
        tuple: ``(train, test, y, features)`` where ``train`` and ``test`` are
        the loaded DataFrames, ``y`` is the ``label`` column of ``train`` and
        ``features`` is the column index of ``train`` without the
        ``customer_id``, ``label`` and ``year_month`` columns.
    """
    dataset_dir = os.path.join(CFG.docs_path, 'dataset')
    train = pd.read_csv(os.path.join(dataset_dir, 'train_399_quantile.csv'))
    test = pd.read_csv(os.path.join(dataset_dir, 'test_399_quantile.csv'))

    # Everything that is not an identifier, the target or the period key is a feature.
    non_feature_columns = ['customer_id', 'label', 'year_month']
    features = train.drop(columns=non_feature_columns).columns
    y = train['label']

    return train, test, y, features


def inference(test_preds, filename='output_TabNet.csv'):
    """Write test-set predictions into a submission CSV.

    Reads the sample submission file at ``CFG.sample_submission_path``, stores
    ``test_preds`` in its ``probability`` column and saves the result to
    ``<CFG.docs_path>/result/<filename>`` without the index.

    Args:
        test_preds: predicted probabilities, one per row of the sample
            submission file (pandas raises if the length does not match).
        filename: name of the output CSV inside the ``result`` directory.
            Defaults to the name this notebook has always used.
    """
    # Load the submission template.
    sub = pd.read_csv(CFG.sample_submission_path)

    # Store the test predictions.
    sub['probability'] = test_preds

    # Make sure the output directory exists; to_csv does not create it.
    result_dir = os.path.join(CFG.docs_path, 'result')
    os.makedirs(result_dir, exist_ok=True)

    # Write the submission file.
    sub.to_csv(os.path.join(result_dir, filename), index=False)

In [2]:
# config.json parsing
# (runs before seeding, so seed_everything below reads CFG.seed after get_config)
get_config(CFG)

# fix seed for reproducible model
seed_everything(CFG.seed)

# Load the raw train.csv data
# NOTE(review): `data` is not referenced by any later cell visible in this
# notebook -- confirm whether this load is still needed.
data = get_data()

# Load the pre-built feature tables, the target and the feature column list
train, test, y, features = get_from_dataset()

In [3]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.pretraining import TabNetPretrainer
import torch

In [4]:
# Feature matrices: training features stay a DataFrame (sliced per fold later),
# test features are converted to a NumPy array once.
x_train = train[features]
x_test = test[features].to_numpy()

# Accumulators: fold-averaged test predictions and out-of-fold predictions.
test_preds = np.zeros(len(x_test))
y_oof = np.zeros(len(x_train))
score = 0

# Stratified K-fold splitter, shuffled with the configured seed.
# NOTE(review): `folds` duplicates CFG.folds -- consider using the config value.
folds = 10
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=CFG.seed)

In [10]:
# Baseline: TabNetClassifier trained from scratch with stratified K-fold CV.
#
# Reset every accumulator inside this cell. `test_preds` and `y_oof` are updated
# with `+=` / index assignment below; if they were only initialised in an
# earlier cell, re-running this cell (or running another training cell first)
# would add this run's predictions on top of the previous ones.
score = 0
y_oof = np.zeros(x_train.shape[0])
test_preds = np.zeros(x_test.shape[0])
n_splits = skf.get_n_splits()  # divisor for the fold averages

for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
    # skf.split yields positional indices, so select rows by position.
    x_tr = x_train.iloc[tr_idx].to_numpy()
    x_val = x_train.iloc[val_idx].to_numpy()
    y_tr = y.iloc[tr_idx].to_numpy()
    y_val = y.iloc[val_idx].to_numpy()

    clf = TabNetClassifier(
        seed=CFG.seed,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=7e-3),
        scheduler_params={"step_size":10, "gamma":0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='sparsemax' # This will be overwritten if using pretrain model
    )
    clf.fit(
        X_train=x_tr, y_train=y_tr,
        eval_set=[(x_val, y_val)],
        eval_name=['valid'],
        eval_metric=['auc'],
        batch_size=64,
    )

    # Predict the validation fold (probability of the positive class).
    val_preds = np.array(clf.predict_proba(x_val))[:,1]

    # Store the predictions at the validation positions (out-of-fold).
    y_oof[val_idx] = val_preds

    # Report the validation score of this fold.
    print(f"Fold {fold + 1}")
    print_score(y_val, val_preds)
    print('-'*80)

    # Accumulate the mean validation AUC across folds.
    score += roc_auc_score(y_val, val_preds) / n_splits

    # Predict the test set and accumulate the fold average.
    test_preds += np.array(clf.predict_proba(x_test))[:,1] / n_splits

    # Free the per-fold arrays before the next iteration.
    del x_tr, x_val, y_tr, y_val
    gc.collect()

print(f"\nMean AUC = {score}") # mean of the per-fold validation scores
print(f"OOF AUC = {roc_auc_score(y, y_oof)}") # out-of-fold validation score


Device used : cuda
epoch 0  | loss: 0.58287 | valid_auc: 0.70311 |  0:00:03s
epoch 1  | loss: 0.48198 | valid_auc: 0.73977 |  0:00:06s
epoch 2  | loss: 0.4608  | valid_auc: 0.732   |  0:00:09s
epoch 3  | loss: 0.43957 | valid_auc: 0.75878 |  0:00:11s
epoch 4  | loss: 0.42693 | valid_auc: 0.79232 |  0:00:14s
epoch 5  | loss: 0.41453 | valid_auc: 0.77511 |  0:00:17s
epoch 6  | loss: 0.42012 | valid_auc: 0.78773 |  0:00:20s
epoch 7  | loss: 0.4112  | valid_auc: 0.79223 |  0:00:22s
epoch 8  | loss: 0.40779 | valid_auc: 0.79695 |  0:00:25s
epoch 9  | loss: 0.40417 | valid_auc: 0.79663 |  0:00:28s
epoch 10 | loss: 0.40672 | valid_auc: 0.7939  |  0:00:30s
epoch 11 | loss: 0.40353 | valid_auc: 0.79384 |  0:00:33s
epoch 12 | loss: 0.40013 | valid_auc: 0.78353 |  0:00:35s
epoch 13 | loss: 0.39823 | valid_auc: 0.803   |  0:00:38s
epoch 14 | loss: 0.40178 | valid_auc: 0.79478 |  0:00:41s
epoch 15 | loss: 0.39764 | valid_auc: 0.7978  |  0:00:43s
epoch 16 | loss: 0.40208 | valid_auc: 0.79607 |  0:00

In [5]:
# TabNet with self-supervised pretraining: for each fold a TabNetPretrainer is
# fitted on the training features, then a TabNetClassifier is fine-tuned from it.
#
# Reset every accumulator inside this cell. `test_preds` and `y_oof` are updated
# with `+=` / index assignment below; if they were only initialised in an
# earlier cell, running this cell after another training cell (or re-running
# it) would add this run's predictions on top of the previous ones and the
# submitted probabilities would no longer be a fold average.
score = 0
y_oof = np.zeros(x_train.shape[0])
test_preds = np.zeros(x_test.shape[0])
n_splits = skf.get_n_splits()  # divisor for the fold averages

for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
    # skf.split yields positional indices, so select rows by position.
    x_tr = x_train.iloc[tr_idx].to_numpy()
    x_val = x_train.iloc[val_idx].to_numpy()
    y_tr = y.iloc[tr_idx].to_numpy()
    y_val = y.iloc[val_idx].to_numpy()

    # Unsupervised pretraining on the training features of this fold.
    unsupervised_model = TabNetPretrainer(
        seed=CFG.seed,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=0.030841738109062655),
        mask_type='entmax' # "sparsemax"
    )
    unsupervised_model.fit(
        X_train=x_tr,
        eval_set=[x_val],
        pretraining_ratio=0.7000476182098592,
    )

    # Supervised classifier initialised from the pretrained model.
    # NOTE(review): when `from_unsupervised` is given, pytorch-tabnet appears to
    # copy architecture parameters (mask_type, n_d, n_a, n_steps, ...) from the
    # pretrainer, so the n_d/n_a/n_steps set here may be overridden by the
    # pretrainer's values -- confirm against the installed version.
    clf = TabNetClassifier(
        seed=CFG.seed,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=0.03330921005446336),
        scheduler_params={"step_size":20, "gamma":0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax', # This will be overwritten if using pretrain model
        n_d=24,
        n_a=24,
        n_steps=5
    )
    clf.fit(
        X_train=x_tr, y_train=y_tr,
        eval_set=[(x_val, y_val)],
        eval_name=['valid'],
        eval_metric=['auc'],
        batch_size=256,
        from_unsupervised=unsupervised_model,
        num_workers=4,
        virtual_batch_size=32,
    )

    # Predict the validation fold (probability of the positive class).
    val_preds = np.array(clf.predict_proba(x_val))[:,1]

    # Store the predictions at the validation positions (out-of-fold).
    y_oof[val_idx] = val_preds

    # Report the validation score of this fold.
    print(f"Fold {fold + 1}")
    print_score(y_val, val_preds)
    print('-'*80)

    # Accumulate the mean validation AUC across folds.
    score += roc_auc_score(y_val, val_preds) / n_splits

    # Predict the test set and accumulate the fold average.
    test_preds += np.array(clf.predict_proba(x_test))[:,1] / n_splits

    # Free the per-fold arrays before the next iteration.
    del x_tr, x_val, y_tr, y_val
    gc.collect()

print(f"\nMean AUC = {score}") # mean of the per-fold validation scores
print(f"OOF AUC = {roc_auc_score(y, y_oof)}") # out-of-fold validation score

Device used : cuda
epoch 0  | loss: 1339.13584| val_0_unsup_loss: 3.7157  |  0:00:01s
epoch 1  | loss: 489.83062| val_0_unsup_loss: 3.49106 |  0:00:01s
epoch 2  | loss: 440.86175| val_0_unsup_loss: 1.78415 |  0:00:02s
epoch 3  | loss: 1.65966 | val_0_unsup_loss: 1.36603 |  0:00:02s
epoch 4  | loss: 119.80102| val_0_unsup_loss: 0.95396 |  0:00:03s
epoch 5  | loss: 69.43921| val_0_unsup_loss: 0.98436 |  0:00:04s
epoch 6  | loss: 38.4307 | val_0_unsup_loss: 0.84894 |  0:00:04s
epoch 7  | loss: 10.22032| val_0_unsup_loss: 0.79176 |  0:00:05s
epoch 8  | loss: 7.57911 | val_0_unsup_loss: 0.77059 |  0:00:05s
epoch 9  | loss: 35.9161 | val_0_unsup_loss: 0.75971 |  0:00:06s
epoch 10 | loss: 2.48475 | val_0_unsup_loss: 0.74548 |  0:00:06s
epoch 11 | loss: 24.01548| val_0_unsup_loss: 0.72033 |  0:00:07s
epoch 12 | loss: 6.86071 | val_0_unsup_loss: 0.71823 |  0:00:08s
epoch 13 | loss: 6.307   | val_0_unsup_loss: 0.68887 |  0:00:08s
epoch 14 | loss: 31.68229| val_0_unsup_loss: 0.68511 |  0:00:09s
e

In [7]:
# Write the accumulated test predictions to the submission CSV.
# `test_preds` holds whatever the most recently executed training cell(s)
# accumulated, so run exactly one training cell before this one.
inference(test_preds)