In [1]:
!pip install hyperopt

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from hyperopt import hp, fmin, tpe, Trials
import pickle

In [3]:
space = {
    'iterations': hp.quniform('iterations', 5000, 150000, 5000),
    'learning_rate': hp.uniform('learning_rate', 0.0001, 0.1),
    'depth': hp.quniform('depth', 1, 10, 1),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', np.log(1e-9), np.log(10)),
    'bagging_temperature': hp.loguniform('bagging_temperature', np.log(0.1), np.log(20.0)),
    'random_strength': hp.uniform('random_strength', 0, 10),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 300, 1),
    "use_best_model": True,
    "task_type": "GPU"
}

In [4]:
import os
import time
import random
import numpy as np
import torch
import torch.nn as nn
import logging
import json
seed = 42
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [5]:
test_size = 0.2
def context_data_split(data):
    """
    Parameters
    ----------
    Args:
        test_size : float
            Train/Valid split 비율을 입력합니다.
        seed : int
            랜덤 seed 값
    ----------
    """

    X_train, X_valid, y_train, y_valid = train_test_split(
                                                        data['train'].drop(['rating'], axis=1),
                                                        data['train']['rating'],
                                                        test_size=test_size,
                                                        random_state=seed,
                                                        shuffle=True
                                                        )
    data['X_train'], data['X_valid'], data['y_train'], data['y_valid'] = X_train, X_valid, y_train, y_valid
    
    # train 데이터와 validation 데이터의 인덱스 분할
    train_idx, valid_idx = train_test_split(data['train'].index, test_size=test_size, random_state=seed, shuffle=True)

    # train 데이터와 validation 데이터의 인덱스를 각각 CSV 파일로 저장
    os.makedirs('./data_index', exist_ok=True)
    data['train'].loc[train_idx].to_csv(f'./data_index/context_data_train_index.csv', index=False)
    data['train'].loc[valid_idx].to_csv(f'./data_index/context_data_valid_index.csv', index=False)
    return data

In [None]:
with open('/opt/ml/data/20230419_015908_catboost_data.pt',"rb") as f:
    data = pickle.load(f)
del data['train']['mean_user']
del data['train']['median_user']
del data['train']['var_user']
del data['train']['std_user']
del data['train']['category_mean']
del data['train']['category_high_mean']
del data['train']['category_median']
del data['train']['category_high_median']
del data['train']['category_std']
del data['train']['category_high_std']
del data['train']['mean_user_category']
del data['train']['median_user_category']
del data['train']['var_user_category']
del data['train']['std_user_category']
del data['train']['mean_user_category_high']
del data['train']['median_user_category_high']
del data['train']['var_user_category_high']
del data['train']['user_rating_avg']
del data['train']['user_rating_var']
del data['train']['user_rating_median']
del data['train']['std_user_category_high']

del data['test']['median_user']
del data['test']['var_user']
del data['test']['std_user']
del data['test']['category_mean']
del data['test']['category_high_mean']
del data['test']['category_median']
del data['test']['category_high_median']
del data['test']['category_std']
del data['test']['category_high_std']
del data['test']['mean_user_category']
del data['test']['median_user_category']
del data['test']['var_user_category']
del data['test']['std_user_category']
del data['test']['mean_user_category_high']
del data['test']['median_user_category_high']
del data['test']['var_user_category_high']
del data['test']['user_rating_avg']
del data['test']['user_rating_var']
del data['test']['user_rating_median']
del data['test']['std_user_category_high']

data = context_data_split(data)

def rmse(real: list, predict: list) -> float:
    pred = np.array(predict)
    return np.sqrt(np.mean((real-pred) ** 2))

cat_features = ['user_id', 'isbn', 'category', 'category_high', 'publisher', 'language', 'book_author','age_map', 'location_city', 'location_state', 'location_country', 'year_of_publication_map']

def objective(params):
    model = CatBoostRegressor(
        learning_rate=params['learning_rate'],
        depth=int(params['depth']),
        l2_leaf_reg=params['l2_leaf_reg'],
        bagging_temperature=params['bagging_temperature'],
        random_strength=params['random_strength'],
        verbose=False,
        task_type="GPU",
        eval_metric='RMSE'
    )

    model.fit(data['X_train'], data['y_train'], eval_set=(data['X_valid'],data['y_valid']), cat_features=cat_features, early_stopping_rounds=300)
    y_pred = model.predict(data['X_valid'])

    rmse = np.sqrt(mean_squared_error(data['y_valid'], y_pred))
    return {'loss': rmse, 'status': 'ok'}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=1000,
            trials=trials)

print('Best hyperparameters:', best)

  0%|          | 0/1000 [00:00<?, ?trial/s, best loss=?]



  0%|          | 1/1000 [00:55<15:18:55, 55.19s/trial, best loss: 2.2325302571811414]



  0%|          | 2/1000 [01:05<11:35:33, 41.82s/trial, best loss: 2.1752828766156838]



  0%|          | 3/1000 [01:16<9:01:27, 32.59s/trial, best loss: 2.1752828766156838] 



  0%|          | 4/1000 [02:28<12:16:19, 44.36s/trial, best loss: 2.1378368049737975]



  0%|          | 5/1000 [03:26<13:23:47, 48.47s/trial, best loss: 2.1378368049737975]



  1%|          | 6/1000 [03:46<11:01:22, 39.92s/trial, best loss: 2.1378368049737975]



  1%|          | 7/1000 [04:22<10:40:40, 38.71s/trial, best loss: 2.1378368049737975]



  1%|          | 8/1000 [05:00<10:33:53, 38.34s/trial, best loss: 2.1378368049737975]



  1%|          | 9/1000 [06:00<12:20:33, 44.84s/trial, best loss: 2.1378368049737975]



  1%|          | 10/1000 [07:03<13:54:12, 50.56s/trial, best loss: 2.1378368049737975]



  1%|          | 11/1000 [08:28<16:42:17, 60.81s/trial, best loss: 2.1378368049737975]



  1%|          | 12/1000 [09:52<18:34:52, 67.71s/trial, best loss: 2.1378368049737975]



  1%|▏         | 13/1000 [10:56<18:13:45, 66.49s/trial, best loss: 2.1378368049737975]



  1%|▏         | 14/1000 [12:08<18:41:10, 68.23s/trial, best loss: 2.1378368049737975]



  2%|▏         | 15/1000 [13:29<19:43:42, 72.10s/trial, best loss: 2.1378368049737975]



  2%|▏         | 16/1000 [14:22<18:10:00, 66.46s/trial, best loss: 2.1378368049737975]



  2%|▏         | 17/1000 [15:48<19:43:23, 72.23s/trial, best loss: 2.1378368049737975]



  2%|▏         | 18/1000 [16:56<19:20:00, 70.88s/trial, best loss: 2.1378368049737975]



  2%|▏         | 19/1000 [17:19<15:22:39, 56.43s/trial, best loss: 2.1378368049737975]



  2%|▏         | 20/1000 [18:30<16:36:26, 61.01s/trial, best loss: 2.1378368049737975]



  2%|▏         | 21/1000 [19:48<17:55:34, 65.92s/trial, best loss: 2.1378368049737975]



  2%|▏         | 22/1000 [20:58<18:18:00, 67.36s/trial, best loss: 2.1378368049737975]



  2%|▏         | 23/1000 [22:12<18:46:28, 69.18s/trial, best loss: 2.1378368049737975]



  2%|▏         | 24/1000 [23:00<17:02:37, 62.87s/trial, best loss: 2.1378368049737975]



  2%|▎         | 25/1000 [24:07<17:20:46, 64.05s/trial, best loss: 2.1378368049737975]



  3%|▎         | 26/1000 [25:18<17:54:44, 66.21s/trial, best loss: 2.1378368049737975]



  3%|▎         | 27/1000 [25:58<15:46:06, 58.34s/trial, best loss: 2.1378368049737975]



  3%|▎         | 28/1000 [27:05<16:27:19, 60.95s/trial, best loss: 2.1372968120964697]



  3%|▎         | 29/1000 [28:15<17:10:19, 63.67s/trial, best loss: 2.1372968120964697]



  3%|▎         | 30/1000 [29:09<16:23:59, 60.87s/trial, best loss: 2.1372968120964697]



  3%|▎         | 31/1000 [30:09<16:19:00, 60.62s/trial, best loss: 2.1372968120964697]



  3%|▎         | 32/1000 [30:53<14:56:04, 55.54s/trial, best loss: 2.1372968120964697]



  3%|▎         | 33/1000 [32:06<16:18:29, 60.71s/trial, best loss: 2.1372968120964697]



  3%|▎         | 34/1000 [33:20<17:20:25, 64.62s/trial, best loss: 2.1372968120964697]



  4%|▎         | 35/1000 [34:16<16:38:15, 62.07s/trial, best loss: 2.1372968120964697]



  4%|▎         | 36/1000 [35:44<18:41:44, 69.82s/trial, best loss: 2.1372968120964697]



  4%|▎         | 37/1000 [36:17<15:44:21, 58.84s/trial, best loss: 2.1372968120964697]



  4%|▍         | 38/1000 [37:11<15:22:18, 57.52s/trial, best loss: 2.1372968120964697]



  4%|▍         | 39/1000 [38:14<15:48:57, 59.25s/trial, best loss: 2.1372968120964697]



  4%|▍         | 40/1000 [39:32<17:14:18, 64.64s/trial, best loss: 2.1372968120964697]



  6%|▌         | 57/1000 [58:31<20:54:58, 79.85s/trial, best loss: 2.1372968120964697]



  6%|▌         | 58/1000 [59:54<21:10:47, 80.94s/trial, best loss: 2.1372968120964697]



  6%|▌         | 59/1000 [1:01:01<20:01:55, 76.64s/trial, best loss: 2.1372968120964697]



  6%|▌         | 60/1000 [1:01:24<15:47:40, 60.49s/trial, best loss: 2.1372968120964697]



  6%|▌         | 61/1000 [1:03:01<18:40:25, 71.59s/trial, best loss: 2.136947281689055] 



  6%|▌         | 62/1000 [1:03:29<15:13:06, 58.41s/trial, best loss: 2.136947281689055]



  6%|▋         | 63/1000 [1:03:41<11:35:41, 44.55s/trial, best loss: 2.136947281689055]



  6%|▋         | 64/1000 [1:05:12<15:11:02, 58.40s/trial, best loss: 2.136947281689055]



  6%|▋         | 65/1000 [1:05:56<14:03:58, 54.16s/trial, best loss: 2.136947281689055]



  7%|▋         | 66/1000 [1:07:37<17:40:10, 68.11s/trial, best loss: 2.136947281689055]



  7%|▋         | 74/1000 [1:18:41<21:39:49, 84.22s/trial, best loss: 2.136947281689055]



  8%|▊         | 75/1000 [1:19:40<19:42:26, 76.70s/trial, best loss: 2.136947281689055]



  8%|▊         | 76/1000 [1:20:57<19:45:41, 76.99s/trial, best loss: 2.136947281689055]



  8%|▊         | 77/1000 [1:22:31<21:01:14, 81.99s/trial, best loss: 2.136947281689055]



  8%|▊         | 78/1000 [1:23:53<21:01:54, 82.12s/trial, best loss: 2.136947281689055]



  8%|▊         | 79/1000 [1:25:05<20:13:48, 79.08s/trial, best loss: 2.136947281689055]



  8%|▊         | 80/1000 [1:26:11<19:09:04, 74.94s/trial, best loss: 2.136947281689055]



  8%|▊         | 81/1000 [1:27:10<17:54:43, 70.17s/trial, best loss: 2.136947281689055]



  8%|▊         | 82/1000 [1:28:06<16:48:39, 65.93s/trial, best loss: 2.136947281689055]



  8%|▊         | 83/1000 [1:29:28<18:00:31, 70.70s/trial, best loss: 2.136947281689055]



  8%|▊         | 84/1000 [1:30:36<17:50:33, 70.12s/trial, best loss: 2.136947281689055]



  8%|▊         | 85/1000 [1:32:14<19:56:27, 78.46s/trial, best loss: 2.136947281689055]



  9%|▊         | 86/1000 [1:33:30<19:44:27, 77.75s/trial, best loss: 2.136947281689055]



  9%|▉         | 88/1000 [1:36:07<19:49:39, 78.27s/trial, best loss: 2.136947281689055]



  9%|▉         | 89/1000 [1:37:12<18:49:27, 74.39s/trial, best loss: 2.136947281689055]



  9%|▉         | 90/1000 [1:38:15<17:56:01, 70.95s/trial, best loss: 2.136947281689055]



 10%|▉         | 98/1000 [1:47:18<16:05:58, 64.26s/trial, best loss: 2.136947281689055]



 10%|▉         | 99/1000 [1:48:43<17:37:11, 70.40s/trial, best loss: 2.136947281689055]



 10%|█         | 100/1000 [1:50:08<18:41:45, 74.78s/trial, best loss: 2.136947281689055]



 10%|█         | 101/1000 [1:51:19<18:23:41, 73.66s/trial, best loss: 2.136947281689055]



 10%|█         | 102/1000 [1:51:44<14:45:25, 59.16s/trial, best loss: 2.136947281689055]



 10%|█         | 103/1000 [1:52:34<14:01:49, 56.31s/trial, best loss: 2.136947281689055]



 10%|█         | 104/1000 [1:54:03<16:28:24, 66.19s/trial, best loss: 2.136947281689055]



 10%|█         | 105/1000 [1:55:18<17:07:17, 68.87s/trial, best loss: 2.136947281689055]



 11%|█         | 106/1000 [1:55:53<14:35:21, 58.75s/trial, best loss: 2.136947281689055]



 11%|█▏        | 114/1000 [2:05:52<18:33:24, 75.40s/trial, best loss: 2.136947281689055]



 12%|█▏        | 115/1000 [2:06:50<17:13:59, 70.10s/trial, best loss: 2.136947281689055]



 12%|█▏        | 116/1000 [2:07:33<15:12:49, 61.96s/trial, best loss: 2.136947281689055]



 12%|█▏        | 117/1000 [2:08:50<16:16:55, 66.38s/trial, best loss: 2.136947281689055]



 12%|█▏        | 118/1000 [2:09:54<16:07:25, 65.81s/trial, best loss: 2.136947281689055]



 12%|█▏        | 119/1000 [2:10:54<15:40:04, 64.02s/trial, best loss: 2.136947281689055]



 12%|█▏        | 120/1000 [2:12:20<17:14:29, 70.53s/trial, best loss: 2.136947281689055]



 12%|█▏        | 121/1000 [2:13:09<15:41:26, 64.26s/trial, best loss: 2.136947281689055]



 12%|█▏        | 122/1000 [2:14:28<16:41:13, 68.42s/trial, best loss: 2.136947281689055]



 12%|█▏        | 123/1000 [2:14:43<12:49:50, 52.67s/trial, best loss: 2.136947281689055]



 12%|█▏        | 124/1000 [2:16:04<14:49:08, 60.90s/trial, best loss: 2.136947281689055]



 12%|█▎        | 125/1000 [2:17:30<16:41:06, 68.65s/trial, best loss: 2.136947281689055]



 13%|█▎        | 126/1000 [2:18:33<16:13:13, 66.81s/trial, best loss: 2.136947281689055]



 13%|█▎        | 127/1000 [2:19:32<15:37:57, 64.46s/trial, best loss: 2.136947281689055]



 13%|█▎        | 128/1000 [2:20:15<14:05:06, 58.15s/trial, best loss: 2.136947281689055]



 13%|█▎        | 129/1000 [2:21:46<16:24:15, 67.80s/trial, best loss: 2.136947281689055]



 13%|█▎        | 130/1000 [2:23:26<18:46:21, 77.68s/trial, best loss: 2.136947281689055]



 13%|█▎        | 131/1000 [2:24:31<17:48:20, 73.76s/trial, best loss: 2.136947281689055]



 13%|█▎        | 132/1000 [2:25:46<17:54:30, 74.27s/trial, best loss: 2.136947281689055]



 13%|█▎        | 133/1000 [2:26:46<16:49:43, 69.88s/trial, best loss: 2.136947281689055]



 13%|█▎        | 134/1000 [2:28:03<17:20:08, 72.07s/trial, best loss: 2.136947281689055]



 14%|█▎        | 135/1000 [2:29:19<17:34:01, 73.11s/trial, best loss: 2.136947281689055]



 14%|█▎        | 136/1000 [2:30:31<17:29:00, 72.85s/trial, best loss: 2.136947281689055]



 14%|█▎        | 137/1000 [2:31:33<16:42:13, 69.68s/trial, best loss: 2.136947281689055]



 14%|█▍        | 138/1000 [2:32:07<14:07:05, 58.96s/trial, best loss: 2.136947281689055]



 14%|█▍        | 139/1000 [2:32:54<13:12:19, 55.21s/trial, best loss: 2.136947281689055]



 14%|█▍        | 140/1000 [2:34:18<15:18:17, 64.07s/trial, best loss: 2.136947281689055]



 14%|█▍        | 141/1000 [2:35:35<16:09:16, 67.70s/trial, best loss: 2.136947281689055]



 14%|█▍        | 142/1000 [2:37:19<18:46:53, 78.80s/trial, best loss: 2.136947281689055]



 14%|█▍        | 143/1000 [2:38:46<19:19:34, 81.18s/trial, best loss: 2.136947281689055]



 14%|█▍        | 144/1000 [2:39:53<18:16:32, 76.86s/trial, best loss: 2.136947281689055]



 14%|█▍        | 145/1000 [2:41:28<19:32:57, 82.31s/trial, best loss: 2.136947281689055]



 15%|█▍        | 146/1000 [2:42:48<19:23:29, 81.74s/trial, best loss: 2.136947281689055]



 15%|█▍        | 147/1000 [2:44:00<18:41:16, 78.87s/trial, best loss: 2.136947281689055]



 15%|█▍        | 148/1000 [2:45:12<18:10:26, 76.79s/trial, best loss: 2.136947281689055]



 46%|████▋     | 463/1000 [5:41:40<5:04:09, 33.98s/trial, best loss: 2.1366520454653113]

In [13]:
print('Best hyperparameters:', best)

Best hyperparameters: {'bagging_temperature': 0.10001217419400599, 'depth': 8.0, 'iterations': 75000.0, 'l2_leaf_reg': 0.0007822886554960759, 'learning_rate': 0.07161638867514175, 'min_data_in_leaf': 95.0, 'random_strength': 9.169090950443135}
