In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so later cells can read the
# data and model files stored under /content/drive.
drive.mount(mountpoint='/content/drive')

# Tell the user that the mount call has returned.
print("Google Driveがマウントされました！")

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from catboost import Pool, CatBoostClassifier
from xgboost import Booster, DMatrix
import lightgbm as lgb
import pickle
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# Locations of the previously saved base models and of the pickled feature list.
lgb_model_file = '/content/drive/My Drive/signate/submission/lgbm_model_0101_1.txt'
cat_model_file = '/content/drive/My Drive/signate/submission/catboost_model_0101_1.cbm'
xgb_model_file = '/content/drive/My Drive/signate/submission/xgboost_model_0102_1.xgb'
saved_feature_columns_file = '/content/drive/My Drive/signate/submission/saved_feature_columns.pkl'

# Restore the saved list of feature column names.
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# produced yourself.
with open(saved_feature_columns_file, 'rb') as columns_fh:
    saved_feature_columns = pickle.load(columns_fh)

# Read the tab-separated training table.
file_path = '/content/drive/My Drive/signate/train/train_0101.tsv'
train = pd.read_csv(file_path, sep='\t', low_memory=False)

# Derived features: the product of C2 and I11, and the sum of I5, I12 and I6.
train = train.assign(
    C2_I11_interaction=train['C2'] * train['I11'],
    I5_I12_I6_sum=train['I5'] + train['I12'] + train['I6'],
)

# Separate the target from the predictors; the 'id' column is not a feature.
target_column = 'click'
y = train[target_column]
X = train.drop(columns=['id', target_column])

# Columns that should be treated as categorical. Only those that actually
# exist in X are converted, so a missing column is skipped rather than raising.
categorical_features = ['C1', 'C4', 'C6', 'C2_freq_group', 'C3_freq_group', 'C5_freq_group']
present_categoricals = [col for col in categorical_features if col in X.columns]
for col in present_categoricals:
    X[col] = X[col].astype('category')

# One-hot encode for XGBoost. The same "present" filter is used here because
# pd.get_dummies raises KeyError for a column name that is not in the frame.
X_xgb = pd.get_dummies(X, columns=present_categoricals)

# Align to the saved feature list in one step: columns missing from this frame
# are created and filled with 0, extra columns are dropped, and the order
# follows saved_feature_columns.
X_xgb = X_xgb.reindex(columns=saved_feature_columns, fill_value=0)

# Restore the three trained base models from their files.
lgb_model = lgb.Booster(model_file=lgb_model_file)

cat_model = CatBoostClassifier()
cat_model.load_model(cat_model_file)

xgb_model = Booster()
xgb_model.load_model(xgb_model_file)

# Base-model predictions on the training rows.
# NOTE(review): if these models were fitted on this same training file, the
# predictions below are in-sample and the meta-learner sees optimistic
# inputs. Confirm whether out-of-fold predictions are available.
lgb_pred = lgb_model.predict(X)

# CatBoost takes the positional indices of the categorical columns.
cat_feature_positions = [
    X.columns.get_loc(col) for col in categorical_features if col in X.columns
]
cat_pool = Pool(X, cat_features=cat_feature_positions)
cat_pred = cat_model.predict_proba(cat_pool)[:, 1]

# XGBoost predicts from the one-hot encoded frame.
dtrain_xgb = DMatrix(X_xgb)
xgb_pred = xgb_model.predict(dtrain_xgb)

# Meta-features: one column per base model, in the order LightGBM, CatBoost, XGBoost.
meta_features = np.column_stack([lgb_pred, cat_pred, xgb_pred])

# Stratified 5-fold splitter with a fixed seed for the meta-model evaluation.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Meta-learner: a small fully connected network built with PyTorch.
class MetaNN(nn.Module):
    """Feed-forward network that maps stacked base-model predictions to one probability.

    Architecture: Linear(in_features, 16) -> ReLU -> Linear(16, 8) -> ReLU
    -> Linear(8, 1) -> Sigmoid.

    Args:
        in_features: Number of input columns (one per base model). Defaults
            to 3, so ``MetaNN()`` builds the same network as before and
            previously saved state dicts still load.
    """

    def __init__(self, in_features=3):
        super().__init__()
        # Attribute names are part of the state_dict keys; do not rename them.
        self.fc1 = nn.Linear(in_features, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs of shape (N, 1) for an input of shape (N, in_features)."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Device selection: use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Loss function. MetaNN already ends in a sigmoid, so BCELoss on probabilities is used.
criterion = nn.BCELoss()

# Fix the torch RNG so weight initialisation (and therefore the scores) is repeatable.
torch.manual_seed(42)


def _fit_meta_model(features, labels, n_epochs=100, lr=0.01):
    """Train a freshly initialised MetaNN with full-batch Adam updates.

    Args:
        features: 2-D array of stacked base-model predictions.
        labels: 1-D array-like of binary targets aligned with ``features``.
        n_epochs: Number of full-batch gradient steps.
        lr: Adam learning rate.

    Returns:
        Tuple ``(model, optimizer)``; the model is left in training mode.
    """
    model = MetaNN().to(device)
    fold_optimizer = optim.Adam(model.parameters(), lr=lr)
    features_tensor = torch.tensor(features, dtype=torch.float32).to(device)
    labels_tensor = torch.tensor(np.asarray(labels), dtype=torch.float32).unsqueeze(1).to(device)

    model.train()
    for _ in range(n_epochs):
        fold_optimizer.zero_grad()
        loss = criterion(model(features_tensor), labels_tensor)
        loss.backward()
        fold_optimizer.step()
    return model, fold_optimizer


# Per-fold validation loglosses.
logloss_scores = []

# Cross-validation loop. A new model and optimizer are created for every
# fold. Reusing one model across folds would let each fold start from weights
# that were already trained on its own validation rows, which makes the
# reported logloss optimistic.
for train_idx, val_idx in skf.split(meta_features, y):
    fold_model, _ = _fit_meta_model(meta_features[train_idx], y.iloc[train_idx])

    # Predict on the held-out rows.
    fold_model.eval()
    with torch.no_grad():
        X_val_tensor = torch.tensor(meta_features[val_idx], dtype=torch.float32).to(device)
        val_pred = fold_model(X_val_tensor).cpu().numpy().flatten()
    logloss_scores.append(log_loss(y.iloc[val_idx], val_pred))

# Mean logloss across folds.
final_logloss = np.mean(logloss_scores)
print(f'Final Logloss (stacking with Neural Network): {final_logloss}')

# Final model for saving and inference: one fit on all rows with the same
# settings that were just cross-validated. It is left in eval mode.
meta_model, optimizer = _fit_meta_model(meta_features, y)
meta_model.eval()


### ニューラルネットワークの保存

In [None]:
import torch

# Destination file for the meta-model weights.
meta_model_save_path = '/content/drive/My Drive/signate/submission/meta_model_nn_0102_1.pth'

# Persist only the parameters (state_dict) of the trained network.
torch.save(obj=meta_model.state_dict(), f=meta_model_save_path)

print(f"Meta model (Neural Network) saved to: {meta_model_save_path}")


### 実際のテストデータで予測を行う

In [None]:
import pandas as pd
import numpy as np
import torch
from catboost import Pool
from xgboost import DMatrix
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler

# Read the tab-separated test table.
file_path = '/content/drive/My Drive/signate/test/test_0101.tsv'
test = pd.read_csv(file_path, sep='\t', low_memory=False)

# Same derived features as for the training data: C2 * I11 and I5 + I12 + I6.
test = test.assign(
    C2_I11_interaction=test['C2'] * test['I11'],
    I5_I12_I6_sum=test['I5'] + test['I12'] + test['I6'],
)

# Keep the ids for the submission file and drop them from the predictors.
test_ids = test['id']
X_test = test.drop(columns=['id'])

# Columns that should be treated as categorical. Only those that actually
# exist in X_test are converted, so a missing column is skipped rather than raising.
categorical_features = ['C1', 'C4', 'C6', 'C2_freq_group', 'C3_freq_group', 'C5_freq_group']
present_test_categoricals = [col for col in categorical_features if col in X_test.columns]
for col in present_test_categoricals:
    X_test[col] = X_test[col].astype('category')

# One-hot encode for XGBoost. The same "present" filter is used here because
# pd.get_dummies raises KeyError for a column name that is not in the frame.
X_test_xgb = pd.get_dummies(X_test, columns=present_test_categoricals)

# Align to the saved feature list in one step: dummy columns that do not occur
# in the test data are created and filled with 0, extra columns are dropped,
# and the order follows saved_feature_columns.
X_test_xgb = X_test_xgb.reindex(columns=saved_feature_columns, fill_value=0)

# Reload the three base models from disk.
# NOTE(review): CatBoostClassifier and Booster are not imported in this cell;
# they are available only because an earlier cell imported them.
lgb_model = lgb.Booster(model_file=lgb_model_file)

cat_model = CatBoostClassifier()
cat_model.load_model(cat_model_file)

xgb_model = Booster()
xgb_model.load_model(xgb_model_file)

# LightGBM prediction.
lgb_pred = lgb_model.predict(X_test)

# CatBoost prediction; the pool takes positional indices of the categorical columns.
test_cat_positions = [
    X_test.columns.get_loc(col) for col in categorical_features if col in X_test.columns
]
test_pool = Pool(X_test, cat_features=test_cat_positions)
cat_pred = cat_model.predict_proba(test_pool)[:, 1]

# XGBoost prediction from the one-hot encoded frame.
dtest_xgb = DMatrix(X_test_xgb)
xgb_pred = xgb_model.predict(dtest_xgb)

# Meta-features: one column per base model, in the order LightGBM, CatBoost, XGBoost.
meta_features_test = np.column_stack((lgb_pred, cat_pred, xgb_pred))

# Deliberately no standardisation here. The training cell feeds the raw
# np.column_stack output straight into the network, so the test inputs must
# stay on the same scale. Fitting a StandardScaler on the test predictions
# would shift and rescale them into a range the network never saw.

# Load the trained meta network.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
meta_model = MetaNN()  # rebuild the same architecture before loading weights

# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved from a GPU session still loads on a CPU-only runtime.
meta_state_dict = torch.load(
    '/content/drive/My Drive/signate/submission/meta_model_nn_0102_1.pth',
    map_location=device,
)
meta_model.load_state_dict(meta_state_dict)
meta_model.to(device)
meta_model.eval()

# Predict on the test meta-features without tracking gradients.
X_meta_test = torch.tensor(meta_features_test, dtype=torch.float32).to(device)
with torch.no_grad():
    meta_pred_test = meta_model(X_meta_test).cpu().numpy().flatten()

# Assemble the submission table: one row per test id with its predicted click probability.
submission = pd.DataFrame({'id': test_ids, 'click': meta_pred_test})

# Write it as CSV with neither the index nor a header row.
output_path = '/content/drive/My Drive/signate/submission/submission_meta_nn0103_1.csv'
submission.to_csv(output_path, header=False, index=False)
print(f"Submission saved to: {output_path}")


In [None]:
# Preview the first five rows of the submission table.
submission.head(n=5)