In [1]:
!pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
Collecting networkx>=2.2
  Downloading networkx-3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cloudpickle
  Downloading cloudpickle-2.2.1-py3-none-any.whl (25 kB)
Collecting future
  Downloading future-0.18.3.tar.gz (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.9/840.9 kB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: future
  Building wheel for future (setup.

In [6]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from hyperopt import hp, fmin, tpe, Trials
import pickle

In [3]:
space = {
    'iterations': hp.quniform('iterations', 5000, 150000, 5000),
    'learning_rate': hp.uniform('learning_rate', 0.0001, 0.1),
    'depth': hp.quniform('depth', 1, 10, 1),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', np.log(1e-9), np.log(10)),
    'bagging_temperature': hp.loguniform('bagging_temperature', np.log(0.1), np.log(20.0)),
    'random_strength': hp.uniform('random_strength', 0, 10),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 300, 1),
    "use_best_model": True,
    "task_type": "GPU"
}

In [11]:
import os
import time
import random
import numpy as np
import torch
import torch.nn as nn
import logging
import json
seed = 42
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [12]:
test_size = 0.2
def context_data_split(data):
    """
    Parameters
    ----------
    Args:
        test_size : float
            Train/Valid split 비율을 입력합니다.
        seed : int
            랜덤 seed 값
    ----------
    """

    X_train, X_valid, y_train, y_valid = train_test_split(
                                                        data['train'].drop(['rating'], axis=1),
                                                        data['train']['rating'],
                                                        test_size=test_size,
                                                        random_state=seed,
                                                        shuffle=True
                                                        )
    data['X_train'], data['X_valid'], data['y_train'], data['y_valid'] = X_train, X_valid, y_train, y_valid
    
    # train 데이터와 validation 데이터의 인덱스 분할
    train_idx, valid_idx = train_test_split(data['train'].index, test_size=test_size, random_state=seed, shuffle=True)

    # train 데이터와 validation 데이터의 인덱스를 각각 CSV 파일로 저장
    os.makedirs('./data_index', exist_ok=True)
    data['train'].loc[train_idx].to_csv(f'./data_index/context_data_train_index.csv', index=False)
    data['train'].loc[valid_idx].to_csv(f'./data_index/context_data_valid_index.csv', index=False)
    return data

In [None]:
with open('/opt/ml/data/20230419_015908_catboost_data.pt',"rb") as f:
    data = pickle.load(f)
data = context_data_split(data)

def rmse(real: list, predict: list) -> float:
    pred = np.array(predict)
    return np.sqrt(np.mean((real-pred) ** 2))

# final EDA
cat_features = ['user_id', 'isbn', 'category', 'category_high', 'publisher', 'language', 'book_author','age_map', 'location_city', 'location_state', 'location_country', 'year_of_publication_map']

def objective(params):
    model = CatBoostRegressor(
        learning_rate=params['learning_rate'],
        depth=int(params['depth']),
        l2_leaf_reg=params['l2_leaf_reg'],
        bagging_temperature=params['bagging_temperature'],
        random_strength=params['random_strength'],
        verbose=False,
        task_type="GPU",
        eval_metric='RMSE'
    )

    model.fit(data['X_train'], data['y_train'], eval_set=(data['X_valid'],data['y_valid']), cat_features=cat_features, early_stopping_rounds=300)
    y_pred = model.predict(data['X_valid'])

    rmse = np.sqrt(mean_squared_error(data['y_valid'], y_pred))
    return {'loss': rmse, 'status': 'ok'}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=1000,
            trials=trials)

print('Best hyperparameters:', best)

 84%|████████▍ | 839/1000 [6:57:27<2:19:20, 51.93s/trial, best loss: 1.4716599290996835]



 84%|████████▍ | 840/1000 [6:58:27<2:25:22, 54.52s/trial, best loss: 1.4716599290996835]



 84%|████████▍ | 841/1000 [6:59:09<2:14:19, 50.69s/trial, best loss: 1.4716599290996835]



 84%|████████▍ | 842/1000 [7:00:04<2:16:56, 52.00s/trial, best loss: 1.4716599290996835]



 84%|████████▍ | 843/1000 [7:01:06<2:23:35, 54.88s/trial, best loss: 1.4716599290996835]



 84%|████████▍ | 844/1000 [7:02:09<2:29:00, 57.31s/trial, best loss: 1.4716599290996835]



 84%|████████▍ | 845/1000 [7:02:52<2:17:29, 53.22s/trial, best loss: 1.4716599290996835]



 85%|████████▍ | 846/1000 [7:03:53<2:22:39, 55.58s/trial, best loss: 1.4716599290996835]



 85%|████████▍ | 847/1000 [7:04:55<2:26:25, 57.42s/trial, best loss: 1.4716599290996835]



 85%|████████▍ | 848/1000 [7:05:25<2:04:44, 49.24s/trial, best loss: 1.4716599290996835]



 86%|████████▌ | 856/1000 [7:12:22<2:02:48, 51.17s/trial, best loss: 1.4716599290996835]



 86%|████████▌ | 857/1000 [7:13:23<2:09:02, 54.14s/trial, best loss: 1.4716599290996835]



 86%|████████▌ | 858/1000 [7:14:04<1:58:43, 50.17s/trial, best loss: 1.4716599290996835]



 86%|████████▌ | 859/1000 [7:14:59<2:01:02, 51.51s/trial, best loss: 1.4716599290996835]



 86%|████████▌ | 860/1000 [7:15:55<2:03:00, 52.72s/trial, best loss: 1.4716599290996835]



 86%|████████▌ | 861/1000 [7:16:36<1:54:12, 49.30s/trial, best loss: 1.4716599290996835]



 87%|████████▋ | 874/1000 [7:26:39<1:41:22, 48.27s/trial, best loss: 1.4716599290996835]

In [14]:
best

{'bagging_temperature': 0.11524314031940042,
 'depth': 6.0,
 'iterations': 145000.0,
 'l2_leaf_reg': 0.07724627592519046,
 'learning_rate': 0.09845218780938297,
 'min_data_in_leaf': 6.0,
 'random_strength': 0.44099650574603644}