### ランダムサーチでハイパーパラメータを探索し、CatBoostでトレーニングしました。

Best params: {'iterations': 1000, 'depth': 10, 'learning_rate': 0.08, 'l2_leaf_reg': 10, 'border_count': 128, 'bagging_temperature': 1, 'random_strength': 5, 'task_type': 'CPU', 'devices': '0', 'eval_metric': 'Logloss', 'use_best_model': True, 'random_seed': 42, 'verbose': 0}

Mean Logloss across folds: 0.20729743235865605


In [1]:
from google.colab import drive

# Mount Google Drive so files under /content/drive are reachable from this notebook
drive.mount('/content/drive')

# Confirmation message printed after the mount call returns
print("Google Driveがマウントされました！")

Mounted at /content/drive
Google Driveがマウントされました！


In [None]:
import pandas as pd


# Load the training CSV from the mounted Drive
file_path = '/content/drive/My Drive/signate/train/train_0105.csv'
train = pd.read_csv(file_path, low_memory=False)

# Quick sanity check of the loaded frame (columns, dtypes, non-null counts)
train.info()

In [2]:
# Install CatBoost into the running kernel's environment (%pip rather than !pip),
# pinned to the version this notebook was executed with so re-runs are reproducible.
%pip install catboost==1.2.7

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


### ランダムサーチで最適なパラメータを探す

In [12]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import random

# Read the training data from the mounted Drive
file_path = '/content/drive/My Drive/signate/train/train_0105.csv'
train = pd.read_csv(file_path, low_memory=False)

# Add an integer-rounded copy of C2_C3_C5_mean before splitting,
# so it can be declared as a categorical feature below
train['C2_C3_C5_mean_cat'] = train['C2_C3_C5_mean'].round().astype(int)

# Split into label and predictors; 'id' is dropped together with the label column
target_column = 'click'
y = train[target_column]
X = train.drop([target_column, 'id'], axis=1)

# Columns handed to CatBoost as categorical features
categorical_features = [
    'C1',
    'C4',
    'C6',
    'C2_freq_group',
    'C3_freq_group',
    'C5_freq_group',
    'C2_C3_C5_mean_cat',
]

# 5-fold stratified cross-validation, shuffled with a fixed seed
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

# ハイパーパラメータのランダムサーチ
def random_search(n_trials=5, seed=42):
    """Randomly search CatBoost hyperparameters with stratified K-fold CV.

    Distinct parameter combinations are drawn *without replacement* from the
    full grid using a seeded, local random generator, so no expensive
    cross-validation run is repeated on an identical configuration and the
    set of trials is the same on every execution.

    Reads the cell-level names ``X``, ``y``, ``skf`` and
    ``categorical_features``.

    Args:
        n_trials: Number of parameter combinations to evaluate. Capped at the
            number of distinct combinations in the grid.
        seed: Seed for the sampler that picks the combinations.

    Returns:
        A tuple ``(best_params, best_logloss)``: the parameter dict with the
        lowest mean validation logloss across folds, and that mean logloss.
        ``best_params`` is ``None`` if no trial produced a finite score.
    """
    import itertools

    param_grid = {
        'iterations': [500],
        'depth': [8, 10],
        'learning_rate': [0.08, 0.1],
        'l2_leaf_reg': [10],
        'border_count': [64, 128],
        'bagging_temperature': [1, 5],
        'random_strength': [5, 10]
    }

    # Settings shared by every trial.
    fixed_params = {
        'task_type': 'CPU',  # train on CPU
        'devices': '0',      # device ID; NOTE(review): presumably only relevant for GPU training — confirm
        'eval_metric': 'Logloss',
        'use_best_model': True,
        'random_seed': 42,
        'verbose': 0         # fit() below passes verbose=100, which is what the logged output reflects
    }

    # Enumerate the whole grid once, then sample distinct combinations from it.
    param_names = list(param_grid)
    candidates = list(itertools.product(*(param_grid[name] for name in param_names)))
    n_trials = max(0, min(n_trials, len(candidates)))
    sampler = random.Random(seed)  # local RNG: leaves the global `random` state untouched
    trials = sampler.sample(candidates, n_trials)

    best_logloss = float('inf')
    best_params = None

    for combo in trials:
        params = dict(zip(param_names, combo))
        params.update(fixed_params)

        logloss_scores = []

        for train_idx, val_idx in skf.split(X, y):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # Build CatBoost pools for this fold
            train_pool = Pool(X_train, y_train, cat_features=categorical_features)
            val_pool = Pool(X_val, y_val, cat_features=categorical_features)

            # Train a fresh model on this fold, early-stopping on the validation pool
            model = CatBoostClassifier(**params)
            model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=100, verbose=100)

            # Score on the already-built validation pool
            y_pred_prob = model.predict_proba(val_pool)[:, 1]
            logloss_scores.append(log_loss(y_val, y_pred_prob))

        mean_logloss = np.mean(logloss_scores)

        if mean_logloss < best_logloss:
            best_logloss = mean_logloss
            best_params = params

    return best_params, best_logloss

# Run the random search
best_params, best_logloss = random_search()

# Report the best parameter set and its mean cross-validated logloss
print("Best params:", best_params)
print("Best logloss:", best_logloss)


0:	learn: 0.6072831	test: 0.6072627	best: 0.6072627 (0)	total: 1.41s	remaining: 11m 44s
100:	learn: 0.2146086	test: 0.2128089	best: 0.2128089 (100)	total: 4m 27s	remaining: 17m 37s
200:	learn: 0.2108727	test: 0.2096251	best: 0.2096251 (200)	total: 9m 6s	remaining: 13m 32s
300:	learn: 0.2054892	test: 0.2070194	best: 0.2070194 (300)	total: 15m 42s	remaining: 10m 23s
400:	learn: 0.2023501	test: 0.2063717	best: 0.2063717 (400)	total: 22m 48s	remaining: 5m 37s
499:	learn: 0.2001247	test: 0.2060788	best: 0.2060776 (496)	total: 29m 38s	remaining: 0us

bestTest = 0.2060776275
bestIteration = 496

Shrink model to first 497 iterations.
0:	learn: 0.6069337	test: 0.6070176	best: 0.6070176 (0)	total: 1.32s	remaining: 10m 59s
100:	learn: 0.2139144	test: 0.2143793	best: 0.2143793 (100)	total: 4m 35s	remaining: 18m 10s
200:	learn: 0.2098166	test: 0.2113185	best: 0.2113185 (200)	total: 9m 22s	remaining: 13m 56s
300:	learn: 0.2046720	test: 0.2092272	best: 0.2092272 (300)	total: 16m 4s	remaining: 10m 37s

KeyboardInterrupt: 

### ランダムサーチで得られたパラメータを使用し、再度CatBoostでトレーニングする

In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

# Read the training data from the mounted Drive
file_path = '/content/drive/My Drive/signate/train/train_0105.csv'
train = pd.read_csv(file_path, low_memory=False)

# Add an integer-rounded copy of C2_C3_C5_mean before splitting,
# so it can be declared as a categorical feature below
train['C2_C3_C5_mean_cat'] = train['C2_C3_C5_mean'].round().astype(int)

# Split into label and predictors; 'id' is dropped together with the label column
target_column = 'click'
y = train[target_column]
X = train.drop([target_column, 'id'], axis=1)

# Columns handed to CatBoost as categorical features
categorical_features = [
    'C1',
    'C4',
    'C6',
    'C2_freq_group',
    'C3_freq_group',
    'C5_freq_group',
    'C2_C3_C5_mean_cat',
]

# 5-fold stratified cross-validation, shuffled with a fixed seed
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

# Hyperparameters taken from the random search, used for the final cross-validated run.
# NOTE(review): 'iterations' is 1000 here, while the random-search grid in this
# notebook only tried 500 — confirm the larger value was chosen deliberately.
best_params = {
    'iterations': 1000,
    'depth': 10,
    'learning_rate': 0.08,
    'l2_leaf_reg': 10,
    'border_count': 128,
    'bagging_temperature': 1,
    'random_strength': 5,
    'task_type': 'CPU',
    'devices': '0',
    'eval_metric': 'Logloss',
    'use_best_model': True,
    'random_seed': 42,
    'verbose': 100
}

# Cross-validate with the chosen parameters.
# A fresh CatBoostClassifier is built for every fold, and the fold model with the
# lowest validation logloss is kept as `final_model`. Previously a single shared
# estimator was re-fit in each fold, so the saved model was simply whichever fold
# happened to run last.
final_model = None
best_fold_logloss = float('inf')

logloss_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Build CatBoost pools for this fold
    train_pool = Pool(X_train, y_train, cat_features=categorical_features)
    val_pool = Pool(X_val, y_val, cat_features=categorical_features)

    # Train this fold's model, early-stopping on the validation pool
    fold_model = CatBoostClassifier(**best_params)
    fold_model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=100, verbose=100)

    # Score on the already-built validation pool
    y_pred_prob = fold_model.predict_proba(val_pool)[:, 1]
    logloss = log_loss(y_val, y_pred_prob)
    logloss_scores.append(logloss)
    print(f"Fold {fold} logloss: {logloss}")

    # Keep the best fold's model; the `is None` guard ensures something is
    # always retained even if a score is not comparable (e.g. NaN)
    if final_model is None or logloss < best_fold_logloss:
        best_fold_logloss = logloss
        final_model = fold_model

mean_logloss = np.mean(logloss_scores)
print(f"Mean Logloss across folds: {mean_logloss}")

# Save the selected fold model
model_file_path = '/content/drive/My Drive/signate/submission/catboost_model_0105_1.cbm'
final_model.save_model(model_file_path)
print(f"Model saved to: {model_file_path}")


0:	learn: 0.6067169	test: 0.6066682	best: 0.6066682 (0)	total: 419ms	remaining: 6m 58s
100:	learn: 0.2125433	test: 0.2110890	best: 0.2110890 (100)	total: 50.6s	remaining: 7m 30s
200:	learn: 0.2091636	test: 0.2085953	best: 0.2085953 (200)	total: 1m 36s	remaining: 6m 25s
300:	learn: 0.2050680	test: 0.2070545	best: 0.2070545 (300)	total: 2m 44s	remaining: 6m 21s
400:	learn: 0.2026116	test: 0.2064977	best: 0.2064977 (400)	total: 3m 52s	remaining: 5m 47s
500:	learn: 0.2005539	test: 0.2061732	best: 0.2061732 (500)	total: 5m 2s	remaining: 5m 1s
600:	learn: 0.1986029	test: 0.2059431	best: 0.2059379 (595)	total: 6m 12s	remaining: 4m 7s
700:	learn: 0.1969613	test: 0.2058440	best: 0.2058440 (700)	total: 7m 22s	remaining: 3m 8s
800:	learn: 0.1954291	test: 0.2057627	best: 0.2057588 (797)	total: 8m 31s	remaining: 2m 7s
900:	learn: 0.1939337	test: 0.2057802	best: 0.2057384 (854)	total: 9m 40s	remaining: 1m 3s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2057384048
bestIteratio