In [1]:
# Mount Google Drive; the data, model and submission paths used in this
# notebook all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.7/98.7 MB 7.4 MB/s eta 0:00:00
Installing collected packages: catboost
Successfully installed catboost-1.2.7


### 加重アンサンブルを作成する

In [6]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Purpose: score a LightGBM + CatBoost weighted-average ensemble on the training
# file, first at a fixed 4:6 blend and then over a grid of blend ratios.
#
# NOTE(review): every prediction below is made on `train` itself. If this is the
# data the saved models were fitted on, the RMSE values and the selected ratio
# are optimistic; prefer held-out / out-of-fold predictions — TODO confirm.

# Load the training data.
train_file_path = '/content/drive/My Drive/hotel_service/data/train_0110_6.csv'
train = pd.read_csv(train_file_path, low_memory=False)

# Split into feature matrix and target.
X = train.drop(columns=['y']).copy()
y = train['y'].copy()

# Columns read as object dtype are treated as categorical features.
categorical_features = list(X.select_dtypes(include='object').columns)

# LightGBM input: categorical columns are cast to pandas 'category' dtype.
X_lgb = X.copy()
for col in categorical_features:
    X_lgb[col] = X_lgb[col].astype('category')
# Numeric NaNs are intentionally left untouched (per the original author's note,
# to match what was done at training time).

# CatBoost input: categorical NaNs become the literal string "missing";
# numeric NaNs are replaced by the column median.
X_cat = X.copy()
for col in categorical_features:
    X_cat[col] = X_cat[col].fillna("missing").astype(str)
for col in X_cat.select_dtypes(include=[np.number]).columns:
    X_cat[col] = X_cat[col].fillna(X_cat[col].median())

# Load the previously saved models.
lgb_model_path = '/content/drive/My Drive/models/lgbm_final_model_0110_1.txt'
cat_model_path = '/content/drive/My Drive/hotel_service/model/catboost_0111_1.cbm'

lgb_model = lgb.Booster(model_file=lgb_model_path)
cat_model = CatBoostRegressor()
cat_model.load_model(cat_model_path)

# Per-model predictions.
lgb_pred = lgb_model.predict(X_lgb)
cat_pred = cat_model.predict(X_cat)

# Fixed weighted average (LightGBM 40%, CatBoost 60%).
LGB_WEIGHT = 0.4
CAT_WEIGHT = 0.6
ensemble_pred = (LGB_WEIGHT * lgb_pred) + (CAT_WEIGHT * cat_pred)

# RMSE of the fixed blend.
rmse = np.sqrt(mean_squared_error(y, ensemble_pred))
print(f"\nWeighted Ensemble RMSE (4:6): {rmse:.4f}")

# Grid search over the LightGBM share of the blend (0.0, 0.1, ..., 1.0).
best_rmse = float('inf')
best_ratio = None

# linspace yields exactly 11 grid points; a float-step arange with an
# overshooting stop (1.1) depends on rounding to include/exclude the endpoint.
for ratio in np.linspace(0.0, 1.0, 11):
    # Loop-local names keep `ensemble_pred` / `rmse` bound to the 4:6 blend
    # reported above instead of silently ending up as the last candidate.
    candidate_pred = (ratio * lgb_pred) + ((1 - ratio) * cat_pred)
    candidate_rmse = np.sqrt(mean_squared_error(y, candidate_pred))
    if candidate_rmse < best_rmse:
        best_rmse = candidate_rmse
        best_ratio = ratio

print(f"\nOptimal Ratio (LightGBM:CatBoost) = {best_ratio:.2f}:{1 - best_ratio:.2f}")
print(f"Best RMSE: {best_rmse:.4f}")



Weighted Ensemble RMSE (4:6): 85.7186

Optimal Ratio (LightGBM:CatBoost) = 0.00:1.00
Best RMSE: 84.5024


In [10]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
import lightgbm as lgb
import os

# Purpose: predict the test set with the saved LightGBM and CatBoost models,
# blend the predictions 40/60 and write a header-less submission CSV.

# Load the test data.
test_file_path = '/content/drive/My Drive/hotel_service/data/test_0110_6.csv'
test = pd.read_csv(test_file_path, low_memory=False)

# Imputation statistics come from the TRAINING file, not from the test set, so
# inference-time preprocessing matches the training-side preprocessing and a
# row's prediction does not depend on which other rows are in the batch.
# NOTE(review): assumes the CatBoost model was fitted on train-median-imputed
# features, as the training-side evaluation cell does — TODO confirm.
train_file_path = '/content/drive/My Drive/hotel_service/data/train_0110_6.csv'
train_medians = (
    pd.read_csv(train_file_path, low_memory=False)
    .drop(columns=['y'])
    .median(numeric_only=True)
)

# 'id' is a row identifier, not a feature: remove it before any preprocessing.
test_features = test.drop(columns=['id'])

# Columns read as object dtype are treated as categorical features.
# NOTE(review): this is inferred from the test file's dtypes; a column that is
# entirely NaN in test would not be detected as object — verify against train.
categorical_features = list(test_features.select_dtypes(include='object').columns)

# LightGBM input: categorical columns as 'category' dtype, NaNs left untouched.
X_lgb = test_features.copy()
for col in categorical_features:
    X_lgb[col] = X_lgb[col].astype('category')

# CatBoost input: categorical NaNs become "missing"; numeric NaNs are filled
# with the training median (falling back to the test median only when the
# training file provides no usable value for that column).
X_cat = test_features.copy()
for col in categorical_features:
    X_cat[col] = X_cat[col].fillna("missing").astype(str)
for col in X_cat.select_dtypes(include=[np.number]).columns:
    fill_value = train_medians.get(col, np.nan)
    if pd.isna(fill_value):
        fill_value = X_cat[col].median()
    X_cat[col] = X_cat[col].fillna(fill_value)

# Load the previously saved models.
lgb_model_path = '/content/drive/My Drive/models/lgbm_final_model_0110_1.txt'
cat_model_path = '/content/drive/My Drive/hotel_service/model/catboost_0111_1.cbm'

lgb_model = lgb.Booster(model_file=lgb_model_path)
cat_model = CatBoostRegressor()
cat_model.load_model(cat_model_path)

# Per-model predictions.
lgb_pred = lgb_model.predict(X_lgb)
cat_pred = cat_model.predict(X_cat)

# Weighted ensemble (LightGBM 40%, CatBoost 60%).
LGB_WEIGHT = 0.4
CAT_WEIGHT = 0.6
ensemble_pred = (LGB_WEIGHT * lgb_pred) + (CAT_WEIGHT * cat_pred)

# Every test row must receive exactly one prediction before ids are attached.
assert len(ensemble_pred) == len(test), "prediction count does not match test rows"

# Build the submission table: one (id, y) pair per test row.
submission = pd.DataFrame({
    'id': test['id'],
    'y': ensemble_pred
})

# Write the submission file without index and without a header row.
submission_file_path = '/content/drive/My Drive/hotel_service/submission/submission_0111_2.csv'
os.makedirs(os.path.dirname(submission_file_path), exist_ok=True)
submission.to_csv(submission_file_path, index=False, header=False)

print(f"✅ 提出ファイルが保存されました！場所: {submission_file_path}")


✅ 提出ファイルが保存されました！場所: /content/drive/My Drive/hotel_service/submission/submission_0111_2.csv


In [11]:
# Preview the first five rows of the submission table as a sanity check.
submission.head(n=5)

Unnamed: 0,id,y
0,0,219.033707
1,1,130.727626
2,2,102.443891
3,3,132.856878
4,4,145.742739
