In [1]:
from google.colab import drive

# Google Driveをマウント
drive.mount('/content/drive')

# マウント完了後のメッセージ
print("Google Driveがマウントされました！")

Mounted at /content/drive
Google Driveがマウントされました！


### lightGBMとcatboostでアンサンブルモデルを作成する

In [None]:
!pip install scipy



In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from scipy.optimize import minimize

# モデルファイルパス
lgb_model_file_path = '/content/drive/My Drive/signate/submission/lgbm_model_0105_1.txt'
cat_model_file_path = '/content/drive/My Drive/signate/submission/catboost_model_0105_1.cbm'

# csvファイルを読み込む
file_path = '/content/drive/My Drive/signate/train/train_0105.csv'
train = pd.read_csv(file_path, low_memory=False)

# 目的変数と特徴量を分ける前にカテゴリカル変数を追加
train['C2_C3_C5_mean_cat'] = train['C2_C3_C5_mean'].round().astype(int)

# 目的変数（ターゲット列）と特徴量を分ける
target_column = 'click'
X = train.drop(columns=[target_column, 'id'])
y = train[target_column]

# カテゴリカル変数リストの更新
categorical_features = ['C1', 'C4', 'C6', 'C2_freq_group', 'C3_freq_group', 'C5_freq_group', 'C2_C3_C5_mean_cat']

# LightGBM用にカテゴリカル変数を明示的に指定
for col in categorical_features:
    if col in X.columns:  # 特徴量リストに存在する場合のみ変換
        X[col] = X[col].astype('category')

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 保存したモデルの読み込み
lgb_model = lgb.Booster(model_file=lgb_model_file_path)
cat_model = CatBoostClassifier()


# アンサンブルのloglossを計算
logloss_scores = []

# 最適な重みを計算する関数
def optimize_weights(y_val, lgb_pred_prob, cat_pred_prob):
    def objective(weights):
        w1, w2 = weights
        ensemble_pred_prob = w1 * lgb_pred_prob + w2 * cat_pred_prob
        return log_loss(y_val, ensemble_pred_prob)

    result = minimize(objective, [0.5, 0.5], bounds=[(0, 1), (0, 1)])
    return result.x

for train_idx, val_idx in skf.split(X, y):
    X_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]

    # LightGBMの予測
    lgb_pred_prob = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)

    # CatBoostの予測
    val_pool = Pool(X_val, cat_features=categorical_features)
    cat_pred_prob = cat_model.predict_proba(val_pool)[:, 1]

    # 最適な重みを計算
    optimal_weights = optimize_weights(y_val, lgb_pred_prob, cat_pred_prob)
    print("Optimal weights:", optimal_weights)

    # 最適な重みでアンサンブル
    w1, w2 = optimal_weights
    ensemble_pred_prob = w1 * lgb_pred_prob + w2 * cat_pred_prob

    # loglossを計算
    logloss = log_loss(y_val, ensemble_pred_prob)
    logloss_scores.append(logloss)

# クロスバリデーションでの平均logloss
mean_logloss = np.mean(logloss_scores)
print(f"Mean Logloss across folds: {mean_logloss}")


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


###LightGBMとCatBoostで同じカテゴリ変数を使用し、前処理の方法を統一

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from scipy.optimize import minimize

# モデルファイルパス
lgb_model_file_path = '/content/drive/My Drive/signate/submission/lgbm_model_0105_1.txt'
cat_model_file_path = '/content/drive/My Drive/signate/submission/catboost_model_0105_1.cbm'

# CSVファイルを読み込む
file_path = '/content/drive/My Drive/signate/train/train_0105.csv'
train = pd.read_csv(file_path, low_memory=False)

# **カテゴリ変数の前処理（両モデルで共通化）**
train['C2_C3_C5_mean_cat'] = train['C2_C3_C5_mean'].round().astype(int)
categorical_features = ['C1', 'C4', 'C6', 'C2_freq_group', 'C3_freq_group', 'C5_freq_group', 'C2_C3_C5_mean_cat']

# LightGBM用の前処理（カテゴリ型へ変換）
X_lgb = train.copy()
for col in categorical_features:
    X_lgb[col] = X_lgb[col].astype('category')

# CatBoost用の前処理（文字列化）
X_cat = train.copy()
for col in categorical_features:
    X_cat[col] = X_cat[col].astype(str)

# 目的変数と特徴量の分離
target_column = 'click'
y = train[target_column]
X_lgb = X_lgb.drop(columns=[target_column, 'id'])
X_cat = X_cat.drop(columns=[target_column, 'id'])

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 保存したモデルの読み込み
lgb_model = lgb.Booster(model_file=lgb_model_file_path)
cat_model = CatBoostClassifier()
cat_model.load_model(cat_model_file_path)

# 最適な重みを計算する関数
def optimize_weights(y_val, lgb_pred_prob, cat_pred_prob):
    def objective(weights):
        w1, w2 = weights
        ensemble_pred_prob = w1 * lgb_pred_prob + w2 * cat_pred_prob
        return log_loss(y_val, ensemble_pred_prob)

    constraints = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
    result = minimize(objective, [0.5, 0.5], bounds=[(0, 1), (0, 1)], constraints=constraints)
    return result.x

# アンサンブルのloglossを計算
logloss_scores = []

for train_idx, val_idx in skf.split(X_lgb, y):
    X_lgb_train, X_lgb_val = X_lgb.iloc[train_idx], X_lgb.iloc[val_idx]
    X_cat_train, X_cat_val = X_cat.iloc[train_idx], X_cat.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # **LightGBMのトレーニング**
    lgb_train = lgb.Dataset(X_lgb_train, label=y_train, categorical_feature=categorical_features)
    lgb_model = lgb.train({'objective': 'binary', 'metric': 'logloss'}, lgb_train, num_boost_round=100)
    lgb_pred_prob = lgb_model.predict(X_lgb_val)

    # **CatBoostのトレーニング**
    train_pool = Pool(X_cat_train, y_train, cat_features=categorical_features)
    val_pool = Pool(X_cat_val, y_val, cat_features=categorical_features)
    cat_model.fit(train_pool, eval_set=val_pool, verbose=100, early_stopping_rounds=50)
    cat_pred_prob = cat_model.predict_proba(val_pool)[:, 1]

    # **最適な重みの計算**
    optimal_weights = optimize_weights(y_val, lgb_pred_prob, cat_pred_prob)
    print("Optimal weights:", optimal_weights)

    # **最適な重みでアンサンブル**
    w1, w2 = optimal_weights
    ensemble_pred_prob = w1 * lgb_pred_prob + w2 * cat_pred_prob

    # **loglossの計算**
    logloss = log_loss(y_val, ensemble_pred_prob)
    logloss_scores.append(logloss)

# クロスバリデーションでの平均logloss
mean_logloss = np.mean(logloss_scores)
print(f"Mean Logloss across folds: {mean_logloss}")


[LightGBM] [Info] Number of positive: 70197, number of negative: 865036
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.135441 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6228
[LightGBM] [Info] Number of data points in the train set: 935233, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.075058 -> initscore=-2.511466
[LightGBM] [Info] Start training from score -2.511466
0:	learn: 0.6067169	test: 0.6066682	best: 0.6066682 (0)	total: 2.01s	remaining: 33m 27s
100:	learn: 0.2123562	test: 0.2108049	best: 0.2108049 (100)	total: 5m 27s	remaining: 48m 31s
200:	learn: 0.2091564	test: 0.2084170	best: 0.2084170 (200)	total: 11m 55s	remaining: 47m 24s
300:	learn: 0.2049445	test: 0.2068072	best: 0.2068072 (300)	total: 18m 36s	remaining: 43m 13s
400:	learn: 0.2025913	test: 0.2063041	best: 0.2063041 (400)	total: 25m 

KeyboardInterrupt: 

## lightGBM、catboost、XGBoostでアンサンブルモデルを作成する

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
from xgboost import DMatrix, Booster
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from scipy.optimize import minimize
from scipy.optimize import LinearConstraint
import pickle

# モデルファイルパス
lgb_model_file_path = '/content/drive/My Drive/signate/submission/lgbm_model_0101_1.txt'
cat_model_file_path = '/content/drive/My Drive/signate/submission/catboost_model_0101_1.cbm'
xgb_model_file_path = '/content/drive/My Drive/signate/submission/xgboost_model_0103_1.xgb'
saved_feature_columns_file = '/content/drive/My Drive/signate/submission/saved_feature_columns.pkl'

# 保存した特徴量をloadする
with open(saved_feature_columns_file, 'rb') as f:
    saved_feature_columns = pickle.load(f)

# TSVファイルを読み込む
file_path = '/content/drive/My Drive/signate/train/train_0101.tsv'
train = pd.read_csv(file_path, low_memory=False, sep='\t')

# 新しい特徴量を作成
train['C2_I11_interaction'] = train['C2'] * train['I11']
train['I5_I12_I6_sum'] = train['I5'] + train['I12'] + train['I6']

# 目的変数（ターゲット列）と特徴量を分ける
target_column = 'click'
X = train.drop(columns=[target_column, 'id'])
y = train[target_column]

# 明示的にカテゴリカル変数として指定したい特徴量をリスト化
categorical_features = ['C1', 'C4', 'C6', 'C2_freq_group', 'C3_freq_group', 'C5_freq_group']

# LightGBM用にカテゴリカル変数を明示的に指定
for col in categorical_features:
    if col in X.columns:
        X[col] = X[col].astype('category')

# スタッキング前に使用したエンコーディングを適用
X_xgb = pd.get_dummies(X, columns=categorical_features)

# 訓練時の特徴量にない列を追加
for col in saved_feature_columns:
    if col not in X_xgb.columns:
        X_xgb[col] = 0

# 余分な列を削除（列順も統一）
X_xgb = X_xgb[saved_feature_columns]

# 保存したモデルの読み込み
lgb_model = lgb.Booster(model_file=lgb_model_file_path)
cat_model = CatBoostClassifier()
cat_model.load_model(cat_model_file_path)
xgb_model = Booster()
xgb_model.load_model(xgb_model_file_path)

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 最適な重みを計算する関数
def optimize_weights(y_val, lgb_pred_prob, cat_pred_prob, xgb_pred_prob):
    def objective(weights):
        w1, w2, w3 = weights
        ensemble_pred_prob = w1 * lgb_pred_prob + w2 * cat_pred_prob + w3 * xgb_pred_prob
        return log_loss(y_val, ensemble_pred_prob)

    constraint = LinearConstraint([1, 1, 1], [1], [1])
    result = minimize(objective, [0.33, 0.33, 0.33], bounds=[(0, 1), (0, 1), (0, 1)], constraints=constraint)
    return result.x

logloss_scores = []

for train_idx, val_idx in skf.split(X, y):
    X_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]

    # LightGBMの予測
    lgb_pred_prob = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)

    # CatBoostの予測
    val_pool = Pool(X_val, cat_features=categorical_features)
    cat_pred_prob = cat_model.predict_proba(val_pool)[:, 1]

    # XGBoostの予測
    X_val_xgb = pd.get_dummies(X_val, columns=categorical_features)
    for col in saved_feature_columns:
        if col not in X_val_xgb.columns:
            X_val_xgb[col] = 0
    X_val_xgb = X_val_xgb[saved_feature_columns]
    dval_xgb = DMatrix(X_val_xgb)
    xgb_pred_prob = xgb_model.predict(dval_xgb)

    # 最適な重みを計算
    optimal_weights = optimize_weights(y_val, lgb_pred_prob, cat_pred_prob, xgb_pred_prob)
    print("Optimal weights:", optimal_weights)

    w1, w2, w3 = optimal_weights
    ensemble_pred_prob = w1 * lgb_pred_prob + w2 * cat_pred_prob + w3 * xgb_pred_prob
    logloss = log_loss(y_val, ensemble_pred_prob)
    logloss_scores.append(logloss)

mean_logloss = np.mean(logloss_scores)
print(f"Mean Logloss across folds: {mean_logloss}")


### 平均化された重みを計算し最終的なloglossを計算する

In [None]:
# 平均化された重み
weights_lgb = 0.33354156
weights_cat = 0.33149742
weights_xgb = 0.33496102

# 最終的なアンサンブル予測
ensemble_pred_prob = weights_lgb * lgb_pred_prob + weights_cat * cat_pred_prob + weights_xgb * xgb_pred_prob

# loglossを計算
final_logloss = log_loss(y_val, ensemble_pred_prob)
print(f"Final Logloss with averaged weights: {final_logloss}")

lightGBMとニューラルネットワークでアンサンブルモデルを作成する

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from tensorflow.keras.models import load_model
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from scipy.optimize import minimize
from scipy.optimize import LinearConstraint
import joblib

# モデルファイルパス
lgb_model_file_path = '/content/drive/My Drive/signate/submission/lgbm_model_0101_1.txt'
nn_model_file_path = '/content/drive/My Drive/signate/submission/nn_model.keras'
scaler_path = '/content/drive/My Drive/signate/submission/scaler_0103.pkl'

# モデルとスケーラーの読み込み
scaler = joblib.load(scaler_path)
lgb_model = lgb.Booster(model_file=lgb_model_file_path)
nn_model = load_model(nn_model_file_path)

# データの読み込み
file_path = '/content/drive/My Drive/signate/train/train_0101.tsv'
train = pd.read_csv(file_path, low_memory=False, sep='\t')

# 新しい特徴量の作成
train['C2_I11_interaction'] = train['C2'] * train['I11']
train['I5_I12_I6_sum'] = train['I5'] + train['I12'] + train['I6']

# 目的変数と特徴量の分離
target_column = 'click'
X = train.drop(columns=[target_column, 'id'])
y = train[target_column]

# カテゴリカル変数の指定
categorical_features = ['C1', 'C4', 'C6', 'C2_freq_group', 'C3_freq_group', 'C5_freq_group']
for col in categorical_features:
    if col in X.columns:
        X[col] = X[col].astype('category')

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 最適な重みを計算する関数 (2つのモデル)
def optimize_weights(y_val, lgb_pred_prob, nn_pred_prob):
    def objective(weights):
        w1, w2 = weights
        ensemble_pred_prob = w1 * lgb_pred_prob + w2 * nn_pred_prob
        return log_loss(y_val, ensemble_pred_prob)

    # 合計が1の制約を課す
    constraint = LinearConstraint([[1, 1]], [1], [1])
    result = minimize(objective, [0.5, 0.5], bounds=[(0, 1)] * 2, constraints=constraint)
    return result.x

# クロスバリデーションとアンサンブル
logloss_scores = []

for train_idx, val_idx in skf.split(X, y):
    X_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]

    # LightGBMの予測
    lgb_pred_prob = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)

    # NNの予測（スケーリング適用）
    X_val_scaled = scaler.transform(X_val)
    nn_pred_prob = nn_model.predict(X_val_scaled).flatten()

    # 最適な重みの計算
    optimal_weights = optimize_weights(y_val, lgb_pred_prob, nn_pred_prob)
    print("Optimal weights:", optimal_weights)

    # アンサンブル予測の計算
    w1, w2 = optimal_weights
    ensemble_pred_prob = w1 * lgb_pred_prob + w2 * nn_pred_prob

    # ログロスの計算
    logloss = log_loss(y_val, ensemble_pred_prob)
    logloss_scores.append(logloss)

# クロスバリデーションの平均ロス
mean_logloss = np.mean(logloss_scores)
print(f"Mean Logloss across folds: {mean_logloss}")


[1m7307/7307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step
Optimal weights: [1. 0.]
[1m7307/7307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step
Optimal weights: [1. 0.]
[1m7307/7307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step
Optimal weights: [1.00000000e+00 1.66533454e-16]
[1m7307/7307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step
Optimal weights: [1. 0.]
[1m7307/7307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step
Optimal weights: [1.00000000e+00 2.22044605e-16]
Mean Logloss across folds: 0.19284991064349905


平均化された重みを計算し最終的なloglossを計算する

In [None]:
import numpy as np
from sklearn.metrics import log_loss

# Optunaの各foldから得られた重み
weights = np.array([
    [0.51202528, 0.48797472, 0, 0],
    [0.523177199, 0.476822801, 2.15105711e-16, 2.24646690e-16],
    [0.528569302, 0.471430698, 6.93889390e-18, 1.04083409e-17],
    [0.530358099, 0.469641901, 8.32667268e-17, 2.66713734e-16],
    [1.0, 0.0, 0.0, 0.0]
])

# 平均化された重みの計算
average_weights = np.mean(weights, axis=0)
print("平均化された重み:", average_weights)

# 各モデルの予測確率 (NumPy配列として事前に用意されていると仮定)
# lgb_pred_prob, cat_pred_prob, xgb_pred_prob, nn_pred_prob が各サンプルの予測確率を保持している

# 平均化された重みを用いたアンサンブル予測
ensemble_pred_prob = (
    average_weights[0] * lgb_pred_prob +
    average_weights[1] * cat_pred_prob +
    average_weights[2] * xgb_pred_prob +
    average_weights[3] * nn_pred_prob
)

# loglossの計算
final_logloss = log_loss(y_val, ensemble_pred_prob)
print(f"Final Logloss with averaged weights: {final_logloss}")


平均化された重み: [6.18825976e-01 3.81174024e-01 6.10622663e-17 1.00353753e-16]
Final Logloss with averaged weights: 0.19729921208720516


### モデル間の相関を確認する

In [None]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostClassifier
from xgboost import Booster, DMatrix
import lightgbm as lgb
import pickle
import joblib
from tensorflow.keras.models import load_model

# モデルファイルパス
lgb_model_file_path = '/content/drive/My Drive/signate/submission/lgbm_model_0101_1.txt'
cat_model_file_path = '/content/drive/My Drive/signate/submission/catboost_model_0101_1.cbm'
xgb_model_file_path = '/content/drive/My Drive/signate/submission/xgboost_model_0103_1.xgb'
nn_model_file_path = '/content/drive/My Drive/signate/submission/nn_model.keras'

saved_feature_columns_file = '/content/drive/My Drive/signate/submission/saved_feature_columns.pkl'

# 特徴量リストをロード
with open(saved_feature_columns_file, 'rb') as f:
    saved_feature_columns = pickle.load(f)

scaler_path = '/content/drive/My Drive/signate/submission/scaler_0103.pkl'
scaler = joblib.load(scaler_path)

# データの読み込み
file_path = '/content/drive/My Drive/signate/train/train_0101.tsv'
train = pd.read_csv(file_path, low_memory=False, sep='\t')

# 新しい特徴量を作成
train['C2_I11_interaction'] = train['C2'] * train['I11']
train['I5_I12_I6_sum'] = train['I5'] + train['I12'] + train['I6']

# 目的変数と特徴量を分ける
target_column = 'click'
X = train.drop(columns=[target_column, 'id'])
y = train[target_column]

# 明示的にカテゴリカル変数として指定
categorical_features = ['C1', 'C4', 'C6', 'C2_freq_group', 'C3_freq_group', 'C5_freq_group']
for col in categorical_features:
    if col in X.columns:
        X[col] = X[col].astype('category')

# XGBoost用データのエンコーディング
X_xgb = pd.get_dummies(X, columns=categorical_features)
for col in saved_feature_columns:
    if col not in X_xgb.columns:
        X_xgb[col] = 0
X_xgb = X_xgb[saved_feature_columns]

# モデルの読み込み
lgb_model = lgb.Booster(model_file=lgb_model_file_path)
cat_model = CatBoostClassifier()
cat_model.load_model(cat_model_file_path)
xgb_model = Booster()
xgb_model.load_model(xgb_model_file_path)
nn_model = load_model(nn_model_file_path)

# モデルの予測
lgb_pred_prob = lgb_model.predict(X)

cat_pool = Pool(X, cat_features=[X.columns.get_loc(col) for col in categorical_features if col in X.columns])
cat_pred_prob = cat_model.predict_proba(cat_pool)[:, 1]

dtrain_xgb = DMatrix(X_xgb)
xgb_pred_prob = xgb_model.predict(dtrain_xgb)

# ニューラルネットワークの予測 (スケーリングが必要な場合)
X_nn = scaler.transform(X)
nn_pred_prob = nn_model.predict(X_nn).flatten()

# 相関係数を計算
predictions = pd.DataFrame({
    'LightGBM': lgb_pred_prob,
    'CatBoost': cat_pred_prob,
    'XGBoost': xgb_pred_prob,
    'NeuralNetwork': nn_pred_prob
})
correlation_matrix = predictions.corr()

# 結果表示
print("Correlation Matrix:")
print(correlation_matrix)


[1m36533/36533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 2ms/step
Correlation Matrix:
               LightGBM  CatBoost   XGBoost  NeuralNetwork
LightGBM       1.000000  0.960318  0.962484       0.931595
CatBoost       0.960318  1.000000  0.963571       0.939616
XGBoost        0.962484  0.963571  1.000000       0.958222
NeuralNetwork  0.931595  0.939616  0.958222       1.000000


### アンサンブルモデルで実際のtestデータを予測する

In [None]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from catboost import CatBoostClassifier, Pool
from xgboost import DMatrix, Booster
import pickle

# テストデータの読み込み
file_path = '/content/drive/My Drive/signate/test/test_0101.tsv'
test = pd.read_csv(file_path, low_memory=False, sep='\t')

# 新しい特徴量を作成
test['C2_I11_interaction'] = test['C2'] * test['I11']
test['I5_I12_I6_sum'] = test['I5'] + test['I12'] + test['I6']
test['C2_C3_C5_mean'] = (test['C2'] + test['C3'] + test['C5']) / 3
test['I_mean'] = (test['I11'] + test['I10'] + test['I5']) / 4

# ID列を保持
test_ids = test['id']

# 特徴量部分を取得
X_test = test.drop(columns=['id'])

# **カテゴリ変数の前処理（両モデルで共通化）**
test['C2_C3_C5_mean_cat'] = test['C2_C3_C5_mean'].round().astype(int)
categorical_features = ['C1', 'C4', 'C6', 'C2_freq_group', 'C3_freq_group', 'C5_freq_group', 'C2_C3_C5_mean_cat']

# CatBoost用の前処理（文字列化）
X_cat = test.copy()
for col in categorical_features:
    X_cat[col] = X_cat[col].astype(str)

# LightGBM用にカテゴリカル変数を明示的に指定
for col in categorical_features:
    if col in X_test.columns:  # 特徴量リストに存在する場合のみ変換
        X_test[col] = X_test[col].astype('category')

# 保存したモデルの読み込み
lgb_model = lgb.Booster(model_file='/content/drive/My Drive/signate/submission/lgbm_model_0101_1.txt')
cat_model = CatBoostClassifier()
cat_model.load_model('/content/drive/My Drive/signate/submission/catboost_model_0101_1.cbm')

# LightGBMの予測
lgb_pred_prob = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

# CatBoostの予測
test_pool = Pool(X_test, cat_features=categorical_features)
cat_pred_prob = cat_model.predict_proba(test_pool)[:, 1]

# アンサンブル予測（重み付き平均）
ensemble_pred_prob = (
    6.18825976e-01 * lgb_pred_prob +
    3.81174024e-01 * cat_pred_prob
    )

# 結果をDataFrameにまとめる
submission = pd.DataFrame({
    'id': test_ids,
    'click': ensemble_pred_prob
})

# ヘッダーなしで保存
output_path = '/content/drive/My Drive/signate/submission/submission_0105_1.csv'
submission.to_csv(output_path, index=False, header=False)
print(f"Predictions saved to: {output_path}")


In [None]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from catboost import CatBoostClassifier, Pool
from xgboost import DMatrix, Booster
import pickle
import joblib
from tensorflow.keras.models import load_model

# テストデータの読み込み
file_path = '/content/drive/My Drive/signate/test/test_0101.tsv'
test = pd.read_csv(file_path, low_memory=False, sep='\t')

# 新しい特徴量を作成
test['C2_I11_interaction'] = test['C2'] * test['I11']
test['I5_I12_I6_sum'] = test['I5'] + test['I12'] + test['I6']

# ID列を保持
test_ids = test['id']

# 特徴量部分を取得
X_test = test.drop(columns=['id'])

# 明示的にカテゴリカル変数として指定したい特徴量
categorical_features = ['C1', 'C4', 'C6', 'C2_freq_group', 'C3_freq_group', 'C5_freq_group']

# LightGBM用にカテゴリカル変数を指定
for col in categorical_features:
    if col in X_test.columns:
        X_test[col] = X_test[col].astype('category')

# 保存済みの特徴量リストをロード
saved_feature_columns_file = '/content/drive/My Drive/signate/submission/saved_feature_columns.pkl'
with open(saved_feature_columns_file, 'rb') as f:
    saved_feature_columns = pickle.load(f)

# スケーラーのロード
scaler_path = '/content/drive/My Drive/signate/submission/scaler_0103.pkl'
scaler = joblib.load(scaler_path)

# XGBoost用データのエンコーディング（One-Hot Encoding）
X_xgb = pd.get_dummies(X_test, columns=categorical_features)

# 訓練時の特徴量にない列を追加（列合わせ）
for col in saved_feature_columns:
    if col not in X_xgb.columns:
        X_xgb[col] = 0

# 訓練時の列構造に合わせる
X_xgb = X_xgb[saved_feature_columns]

# 保存したモデルのロード
lgb_model = lgb.Booster(model_file='/content/drive/My Drive/signate/submission/lgbm_model_0101_1.txt')

cat_model = CatBoostClassifier()
cat_model.load_model('/content/drive/My Drive/signate/submission/catboost_model_0101_1.cbm')

xgb_model = Booster()
xgb_model.load_model('/content/drive/My Drive/signate/submission/xgboost_model_0102_1.xgb')

nn_model = load_model('/content/drive/My Drive/signate/submission/nn_model.keras')

# LightGBMの予測
lgb_pred_prob = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

# CatBoostの予測
test_pool = Pool(X_test, cat_features=categorical_features)
cat_pred_prob = cat_model.predict_proba(test_pool)[:, 1]

# XGBoostの予測
dtest_xgb = DMatrix(X_xgb)
xgb_pred_prob = xgb_model.predict(dtest_xgb)

# ニューラルネットワークの予測（スケーリング処理が必要な場合）
X_nn = scaler.transform(X_test)
nn_pred_prob = nn_model.predict(X_nn).flatten()

# アンサンブル予測（重み付き平均）
ensemble_pred_prob = (
    6.18825976e-01 * lgb_pred_prob +
    3.81174024e-01 * cat_pred_prob +
    6.10622663e-17 * xgb_pred_prob +
    1.00353753e-16 * nn_pred_prob
)

# 結果をDataFrameにまとめる
submission = pd.DataFrame({
    'id': test_ids,
    'click': ensemble_pred_prob
})

# ヘッダーなしで保存
output_path = '/content/drive/My Drive/signate/submission/submission_0104_1.csv'
submission.to_csv(output_path, index=False, header=False)
print(f"Predictions saved to: {output_path}")


[1m40472/40472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 2ms/step
Predictions saved to: /content/drive/My Drive/signate/submission/submission_0104_1.csv


In [None]:
submission.head()

Unnamed: 0,id,click
0,1169042,0.169686
1,1169043,0.032711
2,1169044,0.037033
3,1169045,0.014781
4,1169046,0.069477


In [None]:
submission.describe()

Unnamed: 0,id,click
count,1295086.0,1295086.0
mean,1816584.0,0.06175716
std,373859.3,0.07967757
min,1169042.0,0.0003387935
25%,1492813.0,0.01628162
50%,1816584.0,0.03652811
75%,2140356.0,0.08008931
max,2464127.0,0.9770248


In [None]:
# loglossが0.1999のデータの読み込み
file_path = '/content/drive/My Drive/signate/submission/submission_0102_3.csv'
test = pd.read_csv(file_path, low_memory=False)

test.describe()

Unnamed: 0,1169042,0.1829882657989573
count,1295085.0,1295085.0
mean,1816585.0,0.06216714
std,373859.0,0.08125776
min,1169043.0,0.0003099714
25%,1492814.0,0.01560428
50%,1816585.0,0.03592416
75%,2140356.0,0.08091434
max,2464127.0,0.9770468


In [None]:
# データの読み込み
file_path = '/content/drive/My Drive/signate/submission/submission_0102_2.csv'
train = pd.read_csv(file_path, low_memory=False)

train.describe()

Unnamed: 0,1169042,0.16421219898885858
count,1295085.0,1295085.0
mean,1816585.0,0.06184043
std,373859.0,0.07880005
min,1169043.0,0.0004077504
25%,1492814.0,0.0169043
50%,1816585.0,0.03718336
75%,2140356.0,0.07991774
max,2464127.0,0.9770476
