In [2]:
from google.colab import drive

# Mount Google Drive; every data/model path below lives under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [8]:
! pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

# optunaで最適なパラメータを探す

In [10]:
import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostRegressor, Pool, cv

# Load the preprocessed training data; the two code columns are read as strings.
train_path = '/content/drive/My Drive/signate/train/train_proceed.csv'
train_data = pd.read_csv(
    train_path,
    low_memory=True,
    dtype={'area_JIS_code': str, 'zip_code': str},
)

# Drop the columns that are not used as features.
drop_columns = [
    'land_toshi', 'target_quarter', 'madori_number_all', 'normalized_full_address',
    'snapshot_window_angle', 'land_kenpei', 'land_youseki', 'year_built_quarter',
]
train_data = train_data.drop(columns=drop_columns)

# Outlier removal: keep only rows whose money_room is below 1e6.
train_data = train_data[train_data["money_room"] < 1e+06]

# Separate the target from the feature matrix.
target = 'money_room'
y = train_data[target]
X = train_data.drop(columns=[target])

# Columns CatBoost should treat as categorical.
categorical_features = [
    'building_structure', 'land_youto', 'madori_kind_all',
    'normalized_addr2_name', 'normalized_eki_name1', 'area_JIS_code', 'zip_code'
]

# Cast every categorical column that is present in X to str.
X = X.astype({col: str for col in categorical_features if col in X.columns})

# CatBoost dataset built from the full training data.
data_pool = Pool(data=X, label=y, cat_features=categorical_features)

# Optuna objective function
def objective(trial):
    """Score one sampled hyper-parameter set with 3-fold CatBoost cross-validation.

    Args:
        trial: Optuna trial used to sample iterations, learning_rate, depth,
            l2_leaf_reg and bagging_temperature.

    Returns:
        The minimum of the ``test-RMSE-mean`` column returned by ``cv``,
        i.e. the best fold-averaged test RMSE over the boosting iterations.
    """
    params = {
        'loss_function': 'RMSE',
        'iterations': trial.suggest_int('iterations', 300, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 6, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 5),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.5, 1.0),
        'cat_features': categorical_features,
        'random_seed': 42
    }

    # Cross-validation. `cv` is catboost.cv and must be imported explicitly;
    # relying on a leftover kernel variable breaks Restart & Run All.
    cv_results = cv(
        pool=data_pool,
        params=params,
        fold_count=3,  # number of CV folds
        early_stopping_rounds=20,  # early stopping
        verbose=False
    )

    # Return the smallest fold-averaged test RMSE.
    return np.min(cv_results['test-RMSE-mean'])

# Run the Optuna search. TPE is Optuna's default sampler; seeding it makes the
# sampled hyper-parameters reproducible across runs, matching the fixed
# random_seed=42 already used for CatBoost.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)  # run 10 trials

# Best hyper-parameters found by the search
best_params = study.best_params
print(f"Best Parameters: {best_params}")

# Train the final model on the full data_pool with the tuned hyper-parameters.
tuned_keys = ('iterations', 'learning_rate', 'depth', 'l2_leaf_reg', 'bagging_temperature')
final_params = {
    'loss_function': 'RMSE',
    **{key: best_params[key] for key in tuned_keys},
    'cat_features': categorical_features,
    'random_seed': 42,
    'verbose': 100,
}
best_model = CatBoostRegressor(**final_params)
best_model.fit(data_pool)

# Save the trained model.
best_model.save_model('/content/drive/My Drive/signate/models/catboost_best_model.cbm')


[I 2024-12-14 05:41:09,714] A new study created in memory with name: no-name-e3b26f73-6c6d-4c8c-845c-477d87984a06


Training on fold [0/3]

bestTest = 9309.344302
bestIteration = 665

Training on fold [1/3]

bestTest = 9598.846077
bestIteration = 665

Training on fold [2/3]


[I 2024-12-14 05:51:14,446] Trial 0 finished with value: 9392.964168600953 and parameters: {'iterations': 666, 'learning_rate': 0.0490543109298396, 'depth': 10, 'l2_leaf_reg': 4.568227313528698, 'bagging_temperature': 0.9336037457980186}. Best is trial 0 with value: 9392.964168600953.



bestTest = 9270.702127
bestIteration = 665

Training on fold [0/3]

bestTest = 9410.448445
bestIteration = 610

Training on fold [1/3]

bestTest = 9721.709103
bestIteration = 609

Training on fold [2/3]


[I 2024-12-14 05:57:34,180] Trial 1 finished with value: 9481.958015526861 and parameters: {'iterations': 611, 'learning_rate': 0.07096514291983749, 'depth': 8, 'l2_leaf_reg': 4.154032829085025, 'bagging_temperature': 0.5595464936084327}. Best is trial 0 with value: 9392.964168600953.



bestTest = 9313.275081
bestIteration = 607

Training on fold [0/3]

bestTest = 9960.423555
bestIteration = 603

Training on fold [1/3]

bestTest = 10340.70749
bestIteration = 603

Training on fold [2/3]


[I 2024-12-14 06:01:52,492] Trial 2 finished with value: 10022.744304501628 and parameters: {'iterations': 604, 'learning_rate': 0.043503995909520816, 'depth': 6, 'l2_leaf_reg': 3.452297010845197, 'bagging_temperature': 0.8958124059153995}. Best is trial 0 with value: 9392.964168600953.



bestTest = 9767.101869
bestIteration = 603

Training on fold [0/3]

bestTest = 9442.485388
bestIteration = 735

Training on fold [1/3]

bestTest = 9794.071616
bestIteration = 736

Training on fold [2/3]


[I 2024-12-14 06:07:16,720] Trial 3 finished with value: 9555.619235308703 and parameters: {'iterations': 737, 'learning_rate': 0.09260913544380198, 'depth': 6, 'l2_leaf_reg': 3.969861649943286, 'bagging_temperature': 0.615884512768521}. Best is trial 0 with value: 9392.964168600953.



bestTest = 9430.245292
bestIteration = 736

Training on fold [0/3]

bestTest = 9638.104428
bestIteration = 353

Training on fold [1/3]

bestTest = 9983.316745
bestIteration = 353

Training on fold [2/3]


[I 2024-12-14 06:12:30,997] Trial 4 finished with value: 9737.329859368174 and parameters: {'iterations': 354, 'learning_rate': 0.045563332017603944, 'depth': 10, 'l2_leaf_reg': 2.1634868383540855, 'bagging_temperature': 0.6606409894182322}. Best is trial 0 with value: 9392.964168600953.



bestTest = 9590.568405
bestIteration = 353

Training on fold [0/3]

bestTest = 9830.304465
bestIteration = 492

Training on fold [1/3]

bestTest = 10157.24191
bestIteration = 492

Training on fold [2/3]


[I 2024-12-14 06:16:44,716] Trial 5 finished with value: 9895.315762198048 and parameters: {'iterations': 493, 'learning_rate': 0.04855578199316196, 'depth': 7, 'l2_leaf_reg': 2.8843002873566603, 'bagging_temperature': 0.737384811335957}. Best is trial 0 with value: 9392.964168600953.



bestTest = 9698.400908
bestIteration = 492

Training on fold [0/3]

bestTest = 9507.733638
bestIteration = 348

Training on fold [1/3]

bestTest = 9805.668652
bestIteration = 348

Training on fold [2/3]


[I 2024-12-14 06:21:55,527] Trial 6 finished with value: 9589.51790950961 and parameters: {'iterations': 349, 'learning_rate': 0.06335433337336883, 'depth': 10, 'l2_leaf_reg': 3.2182528530023835, 'bagging_temperature': 0.6545941461589766}. Best is trial 0 with value: 9392.964168600953.



bestTest = 9455.151438
bestIteration = 348

Training on fold [0/3]

bestTest = 9376.237363
bestIteration = 854

Training on fold [1/3]

bestTest = 9757.2656
bestIteration = 853

Training on fold [2/3]


[I 2024-12-14 06:30:44,992] Trial 7 finished with value: 9488.31322473296 and parameters: {'iterations': 855, 'learning_rate': 0.04209299185004348, 'depth': 8, 'l2_leaf_reg': 1.365333813472354, 'bagging_temperature': 0.7285898237081493}. Best is trial 0 with value: 9392.964168600953.



bestTest = 9331.186458
bestIteration = 854

Training on fold [0/3]

bestTest = 11027.1721
bestIteration = 379

Training on fold [1/3]

bestTest = 11446.77868
bestIteration = 379

Training on fold [2/3]


[I 2024-12-14 06:33:53,186] Trial 8 finished with value: 11075.995672259014 and parameters: {'iterations': 380, 'learning_rate': 0.018112994171935612, 'depth': 7, 'l2_leaf_reg': 2.7667481388788397, 'bagging_temperature': 0.647377558928831}. Best is trial 0 with value: 9392.964168600953.



bestTest = 10754.03623
bestIteration = 379

Training on fold [0/3]

bestTest = 9727.055457
bestIteration = 543

Training on fold [1/3]

bestTest = 10066.16352
bestIteration = 543

Training on fold [2/3]


[I 2024-12-14 06:37:49,020] Trial 9 finished with value: 9803.731518239445 and parameters: {'iterations': 544, 'learning_rate': 0.07267318771220012, 'depth': 6, 'l2_leaf_reg': 1.88159089425206, 'bagging_temperature': 0.598154804105359}. Best is trial 0 with value: 9392.964168600953.



bestTest = 9617.975578
bestIteration = 543

Best Parameters: {'iterations': 666, 'learning_rate': 0.0490543109298396, 'depth': 10, 'l2_leaf_reg': 4.568227313528698, 'bagging_temperature': 0.9336037457980186}
0:	learn: 28819.5453913	total: 323ms	remaining: 3m 34s
100:	learn: 10562.3589483	total: 35.6s	remaining: 3m 19s
200:	learn: 9590.8099727	total: 1m 13s	remaining: 2m 49s
300:	learn: 9075.4140713	total: 1m 49s	remaining: 2m 12s
400:	learn: 8727.3023294	total: 2m 27s	remaining: 1m 37s
500:	learn: 8478.3848082	total: 3m 3s	remaining: 1m
600:	learn: 8274.0921553	total: 3m 41s	remaining: 23.9s
665:	learn: 8155.9934162	total: 4m 6s	remaining: 0us


# 検証データでrmseを計算する

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Split the training data 80% / 20% into a training part and a validation part.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the CatBoost pools.
train_pool = Pool(X_train, y_train, cat_features=categorical_features)
valid_pool = Pool(X_valid, y_valid, cat_features=categorical_features)

# Create a model with the tuned hyper-parameters.
final_model = CatBoostRegressor(**final_params)

# Train, using the validation pool for early stopping.
final_model.fit(
    train_pool,
    eval_set=valid_pool,
    early_stopping_rounds=20,  # stop when the validation metric stops improving
    verbose=100
)

# Predict on the validation data.
y_pred_valid = final_model.predict(X_valid)

# Validation RMSE, computed as sqrt(MSE). The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on every version.
valid_rmse = float(np.sqrt(mean_squared_error(y_valid, y_pred_valid)))
print(f"Validation RMSE: {valid_rmse}")


0:	learn: 28628.7606742	test: 29679.4327894	best: 29679.4327894 (0)	total: 286ms	remaining: 3m 9s
100:	learn: 10476.2232703	test: 11311.7520638	best: 11311.7520638 (100)	total: 32.1s	remaining: 2m 59s
200:	learn: 9496.4744017	test: 10603.2475966	best: 10603.2475966 (200)	total: 1m 3s	remaining: 2m 27s
300:	learn: 8962.0398730	test: 10294.8013036	best: 10294.8013036 (300)	total: 1m 35s	remaining: 1m 56s
400:	learn: 8619.1820824	test: 10106.0571694	best: 10106.0571694 (400)	total: 2m 8s	remaining: 1m 25s
500:	learn: 8343.3858644	test: 9973.3014859	best: 9973.3014859 (500)	total: 2m 42s	remaining: 53.4s
600:	learn: 8143.1078652	test: 9886.5829957	best: 9886.5829957 (600)	total: 3m 15s	remaining: 21.2s
665:	learn: 8028.5522754	test: 9834.2226457	best: 9834.2226457 (665)	total: 3m 37s	remaining: 0us

bestTest = 9834.222646
bestIteration = 665

Validation RMSE: 9834.222645724632




# 提出用ファイルを作成する

In [12]:
from catboost import CatBoostRegressor
import pandas as pd

# Path of the saved model
model_save_path = '/content/drive/My Drive/signate/models/catboost_best_model.cbm'

# Load the trained model.
model = CatBoostRegressor()
model.load_model(model_save_path)

# Load the test data; the two code columns are read as strings, as in training.
test_path = '/content/drive/My Drive/signate/test/test_proceed.csv'
test_data = pd.read_csv(test_path, low_memory=False, encoding="utf-8", dtype={'area_JIS_code': str, 'zip_code': str})

# Drop unused columns (errors='ignore' tolerates columns absent from the test file).
test_data = test_data.drop(columns=[
    'land_toshi', 'target_quarter', 'madori_number_all',
    'normalized_full_address', 'snapshot_window_angle',
    'land_kenpei', 'land_youseki', 'year_built_quarter', 'index'
], errors='ignore')

# Categorical columns, cast to str exactly as was done for training.
categorical_features = [
    'building_structure', 'land_youto', 'madori_kind_all',
    'normalized_addr2_name', 'normalized_eki_name1', 'area_JIS_code', 'zip_code'
]

for col in categorical_features:
    if col in test_data.columns:
        test_data[col] = test_data[col].astype(str)

# Align the test columns to the feature names stored in the model instead of
# relying on the CSV happening to have the same columns in the same order.
missing_features = [name for name in model.feature_names_ if name not in test_data.columns]
if missing_features:
    raise ValueError(f"Test data is missing features the model was trained on: {missing_features}")
test_data = test_data[model.feature_names_]

# Predict.
test_predictions = model.predict(test_data)

# Round to the nearest integer.
test_predictions_rounded = test_predictions.round().astype(int)

# Save the submission file (no index column, no header row).
submission_path = '/content/drive/My Drive/signate/submissions/submission.csv'
pd.DataFrame(test_predictions_rounded).to_csv(submission_path, index=False, header=False)

print(f"サブミッションファイルが保存されました: {submission_path}")


サブミッションファイルが保存されました: /content/drive/My Drive/signate/submissions/submission.csv


In [13]:
# Location of the submission file written by the previous cell.
submission_path = '/content/drive/My Drive/signate/submissions/submission.csv'

# Read the file back; it was written without a header row.
saved_submission = pd.read_csv(
    submission_path,
    header=None,
)

# Show the first 5 rows as a sanity check.
print(saved_submission.head(5))


       0
0  63077
1  50486
2  63605
3  46218
4  80793
