### ランダムサーチでXGBoostの最適なパラメータを探す

In [None]:
import pandas as pd
import numpy as np
from xgboost import DMatrix, train
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import random

# Load the training data from a tab-separated file.
file_path = '/content/drive/My Drive/signate/train/train_0101.tsv'
train_data = pd.read_csv(file_path, low_memory=False, sep='\t')

# Create new features: the product of C2 and I11, and the sum of I5, I12 and I6.
train_data['C2_I11_interaction'] = train_data['C2'] * train_data['I11']
train_data['I5_I12_I6_sum'] = train_data['I5'] + train_data['I12'] + train_data['I6']

# Separate the target column from the features; 'id' is dropped from the features as well.
target_column = 'click'
X = train_data.drop(columns=[target_column, 'id'])
y = train_data[target_column]

# Columns to one-hot encode before handing the data to XGBoost
# (the model is given the encoded columns instead of the raw categorical ones).
categorical_features = ['C1', 'C4', 'C6', 'C2_freq_group', 'C3_freq_group', 'C5_freq_group']

# One-hot encode, then keep the resulting column list (the features used for training).
# The list is only stored in a variable here; it is not used further in this cell.
X = pd.get_dummies(X, columns=categorical_features)
saved_feature_columns = X.columns.tolist()

# Stratified 5-fold cross-validation splitter (shuffled, fixed random_state).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Random search over XGBoost hyperparameters
def _sample_params(param_grid, rng):
    """Draw one candidate per hyperparameter and append the fixed training settings."""
    params = {name: rng.choice(candidates) for name, candidates in param_grid.items()}
    params.update({
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'seed': 42,
    })
    return params


def _fold_logloss(params, train_idx, val_idx, num_boost_round, early_stopping_rounds):
    """Train on one CV fold of the module-level X / y and return the validation log loss."""
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Convert to XGBoost's DMatrix format
    dtrain = DMatrix(X_train, label=y_train)
    dval = DMatrix(X_val, label=y_val)

    # The last entry of `evals` is the one monitored for early stopping
    evals = [(dtrain, 'train'), (dval, 'eval')]

    model = train(params, dtrain, num_boost_round=num_boost_round, evals=evals,
                  early_stopping_rounds=early_stopping_rounds, verbose_eval=50)

    # Score with the best iteration explicitly. Per the XGBoost docs, releases
    # before 2.0 return the booster from the *last* round and predict with all
    # trees, which would score rounds trained after the validation loss had
    # already stopped improving.
    best_iteration = getattr(model, 'best_iteration', None)
    if best_iteration is None:
        y_pred_prob = model.predict(dval)
    else:
        y_pred_prob = model.predict(dval, iteration_range=(0, best_iteration + 1))

    return log_loss(y_val, y_pred_prob)


def random_search(n_trials=5, num_boost_round=500, early_stopping_rounds=50, seed=None):
    """Randomly sample hyperparameter sets and keep the one with the lowest CV log loss.

    Reads the module-level ``X``, ``y`` and ``skf`` defined earlier in this cell.

    Args:
        n_trials: Number of random parameter sets to evaluate.
        num_boost_round: Maximum number of boosting rounds per fold.
        early_stopping_rounds: Rounds without improvement on the validation
            fold before training stops.
        seed: Seed for the parameter sampling. ``None`` draws from the global
            ``random`` module (the previous behaviour); an integer makes the
            sampled parameter sets reproducible.

    Returns:
        ``(best_params, best_logloss)``. If no trial is run (``n_trials`` is 0)
        this is ``(None, float('inf'))``.
    """
    # Note: the number of trees is controlled by `num_boost_round` in the native
    # `xgboost.train` API, so there is no `n_estimators` entry in the grid.
    param_grid = {
        'max_depth': [8, 10],
        'learning_rate': [0.08, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 1],
        'reg_alpha': [0, 1],
        'reg_lambda': [1, 10],
        'min_child_weight': [1, 5, 10],
    }

    # Keep using the global generator when no seed is given so that callers who
    # seed `random` themselves see the same draws as before.
    rng = random if seed is None else random.Random(seed)

    best_logloss = float('inf')
    best_params = None

    for _ in range(n_trials):
        params = _sample_params(param_grid, rng)

        logloss_scores = [
            _fold_logloss(params, train_idx, val_idx, num_boost_round, early_stopping_rounds)
            for train_idx, val_idx in skf.split(X, y)
        ]
        mean_logloss = np.mean(logloss_scores)

        if mean_logloss < best_logloss:
            best_logloss = mean_logloss
            best_params = params

    return best_params, best_logloss

# Run the random search
best_params, best_logloss = random_search()

# Show the best parameter set found and its mean cross-validated log loss
print("Best params:", best_params)
print("Best logloss:", best_logloss)



### ランダムサーチで探したパラメータを適用しxgboostをトレーニングする

In [None]:
import pandas as pd
import numpy as np
from xgboost import DMatrix, train
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import pickle

# Load the training data from a tab-separated file.
file_path = '/content/drive/My Drive/signate/train/train_0101.tsv'
train_data = pd.read_csv(file_path, low_memory=False, sep='\t')

# Create new features: the product of C2 and I11, and the sum of I5, I12 and I6.
train_data['C2_I11_interaction'] = train_data['C2'] * train_data['I11']
train_data['I5_I12_I6_sum'] = train_data['I5'] + train_data['I12'] + train_data['I6']

# Separate the target column from the features; 'id' is dropped from the features as well.
target_column = 'click'
X = train_data.drop(columns=[target_column, 'id'])
y = train_data[target_column]

# One-hot encode the categorical columns before handing the data to XGBoost.
categorical_features = ['C1', 'C4', 'C6', 'C2_freq_group', 'C3_freq_group', 'C5_freq_group']

X = pd.get_dummies(X, columns=categorical_features)

# Persist the post-encoding column names alongside the model.
# Presumably used to align the inference-time features with these columns - confirm.
saved_feature_columns = X.columns.tolist()
with open('/content/drive/My Drive/signate/submission/saved_feature_columns.pkl', 'wb') as f:
    pickle.dump(saved_feature_columns, f)

# Stratified 5-fold cross-validation splitter (shuffled, fixed random_state).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Fixed "best" parameters.
# NOTE(review): max_depth=5 is not among the values the random-search cell
# samples ([8, 10]) - confirm where this setting came from.
best_params = {
    'max_depth': 5,
    'learning_rate': 0.08,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'gamma': 1,
    'reg_alpha': 0,
    'reg_lambda': 10,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'seed': 42
}

logloss_scores = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Convert to XGBoost's DMatrix format
    dtrain = DMatrix(X_train, label=y_train)
    dval = DMatrix(X_val, label=y_val)

    # The last entry of `evals` is the one monitored for early stopping
    evals = [(dtrain, 'train'), (dval, 'eval')]

    # Train the model for this fold
    model = train(best_params, dtrain, num_boost_round=1000, evals=evals,
                  early_stopping_rounds=50, verbose_eval=50)

    # Score with the best iteration explicitly. Per the XGBoost docs, releases
    # before 2.0 return the booster from the *last* round and predict with all
    # trees, which would score rounds trained after the validation loss had
    # already stopped improving.
    best_iteration = getattr(model, 'best_iteration', None)
    if best_iteration is None:
        y_pred_prob = model.predict(dval)
    else:
        y_pred_prob = model.predict(dval, iteration_range=(0, best_iteration + 1))
    logloss = log_loss(y_val, y_pred_prob)
    logloss_scores.append(logloss)

# Mean log loss across the cross-validation folds
mean_logloss = np.mean(logloss_scores)
print("Mean Logloss across folds:", mean_logloss)

# Save the model.
# NOTE(review): `model` is whatever the loop bound last, i.e. the booster of the
# final fold only (fitted on that fold's training split, not on all rows).
# Confirm this is the intended artifact rather than a refit on the full data.
model_file_path = '/content/drive/My Drive/signate/submission/xgboost_model_0103_1.xgb'
model.save_model(model_file_path)
print(f"Model saved to: {model_file_path}")


### Mean Logloss across folds: 0.20845315925409133