LightGBMでモデルをトレーニングする

In [None]:
!pip install optuna




In [None]:
import optuna
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error as mae
import pandas as pd

# **Load the preprocessed training data**
train_file_path = "/content/drive/My Drive/apartment_price_2024_fall/train/train_proceeded_1121_no_code.parquet"
df = pd.read_parquet(train_file_path)

# **Columns that LightGBM should treat as categorical**
categorical_features = [
    "都道府県名", "地区名", "最寄駅：名称",
    "間取り", "建物の構造", "用途", "今後の利用目的",
    "都市計画", "改装", "取引の事情等",
]

# **Time-based split: rows with 取引時点 before 2023.75 are used for training,
# rows at or after 2023.75 for validation**
df_train = df[df["取引時点"] < 2023.75]
df_val = df[df["取引時点"] >= 2023.75]

# Separate the target column from the feature columns in each split.
col = "取引価格（総額）_log"
train_x, train_y = df_train.drop(columns=[col]), df_train[col]
val_x, val_y = df_val.drop(columns=[col]), df_val[col]

# **Cast every listed categorical column that is present to the pandas "category" dtype**
train_x = train_x.astype(
    {name: "category" for name in categorical_features if name in train_x.columns}
)
val_x = val_x.astype(
    {name: "category" for name in categorical_features if name in val_x.columns}
)

# **Optuna objective function**
def objective(trial):
    """Train one LightGBM model with trial-sampled hyperparameters and score it.

    Reads the module-level ``train_x``, ``train_y``, ``val_x``, ``val_y`` and
    ``categorical_features``.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The mean absolute error of the trained model on the validation split
        (lower is better).
    """
    boosting_type = trial.suggest_categorical("boosting_type", ["gbdt", "goss"])

    # Base parameters shared by both boosting types.
    params = {
        "objective": "regression",
        "metric": "mae",
        "boosting_type": boosting_type,
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.7, 0.9),
        "max_depth": trial.suggest_int("max_depth", 5, 15),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 50),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 1e-2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 1e-2, log=True),
        "verbose": -1,
    }

    # Bagging parameters are only sampled for GBDT.
    if boosting_type == "gbdt":
        params["bagging_fraction"] = trial.suggest_float("bagging_fraction", 0.7, 0.9)
        params["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 10)

    # Declare only the categorical columns that really exist in the features.
    # The dtype conversion done before this function skips absent columns, but
    # LightGBM raises on an unknown name in ``categorical_feature``.
    present_categoricals = [
        name for name in categorical_features if name in train_x.columns
    ]

    # Build the LightGBM datasets for this trial.
    trains = lgb.Dataset(train_x, train_y, categorical_feature=present_categoricals)
    valids = lgb.Dataset(val_x, val_y, categorical_feature=present_categoricals)

    # Train with early stopping monitored on the validation set.
    model = lgb.train(
        params,
        trains,
        valid_sets=[valids],
        num_boost_round=5000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(100),
        ],
    )

    # Score the trained model on the validation split.
    val_pred = model.predict(val_x)
    return mae(val_y, val_pred)

# **Run the Optuna search**
# The sampler is seeded (42, the same seed the model-training cells use) so the
# sequence of suggested hyperparameters is repeatable from run to run.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# **Report the best trial**
print("最適なパラメータ: ", study.best_params)
print("最適なMAE: ", study.best_value)


[I 2024-11-21 08:01:30,928] A new study created in memory with name: no-name-3d5bb620-deaa-4316-b6ba-24d723d4d4b6


Training until validation scores don't improve for 50 rounds
[100]	valid_0's l1: 0.243595
[200]	valid_0's l1: 0.210415
[300]	valid_0's l1: 0.187858
[400]	valid_0's l1: 0.171083
[500]	valid_0's l1: 0.159087
[600]	valid_0's l1: 0.149482
[700]	valid_0's l1: 0.141951
[800]	valid_0's l1: 0.135244
[900]	valid_0's l1: 0.129589
[1000]	valid_0's l1: 0.124979
[1100]	valid_0's l1: 0.121212
[1200]	valid_0's l1: 0.11806
[1300]	valid_0's l1: 0.115495
[1400]	valid_0's l1: 0.113191
[1500]	valid_0's l1: 0.111404
[1600]	valid_0's l1: 0.109792
[1700]	valid_0's l1: 0.108484
[1800]	valid_0's l1: 0.107253
[1900]	valid_0's l1: 0.106124
[2000]	valid_0's l1: 0.105255
[2100]	valid_0's l1: 0.104476
[2200]	valid_0's l1: 0.103693
[2300]	valid_0's l1: 0.102973
[2400]	valid_0's l1: 0.102228
[2500]	valid_0's l1: 0.101645
[2600]	valid_0's l1: 0.101066
[2700]	valid_0's l1: 0.100458
[2800]	valid_0's l1: 0.0998576
[2900]	valid_0's l1: 0.0993878
[3000]	valid_0's l1: 0.0989356
[3100]	valid_0's l1: 0.0984533
[3200]	valid_0'

[I 2024-11-21 08:12:14,941] Trial 0 finished with value: 0.0918510506069184 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 35, 'learning_rate': 0.003978620327319041, 'feature_fraction': 0.795335421082564, 'max_depth': 5, 'min_data_in_leaf': 47, 'lambda_l1': 0.0029458496988450395, 'lambda_l2': 0.00013698462388915227, 'bagging_fraction': 0.8742379077354974, 'bagging_freq': 8}. Best is trial 0 with value: 0.0918510506069184.


Training until validation scores don't improve for 50 rounds
[100]	valid_0's l1: 0.275452
[200]	valid_0's l1: 0.258909
[300]	valid_0's l1: 0.244221
[400]	valid_0's l1: 0.231252
[500]	valid_0's l1: 0.219949
[600]	valid_0's l1: 0.210088
[700]	valid_0's l1: 0.201342
[800]	valid_0's l1: 0.193459
[900]	valid_0's l1: 0.186301
[1000]	valid_0's l1: 0.17976
[1100]	valid_0's l1: 0.173947
[1200]	valid_0's l1: 0.168594
[1300]	valid_0's l1: 0.16374
[1400]	valid_0's l1: 0.159403
[1500]	valid_0's l1: 0.1554
[1600]	valid_0's l1: 0.151779
[1700]	valid_0's l1: 0.148518
[1800]	valid_0's l1: 0.145265
[1900]	valid_0's l1: 0.142353
[2000]	valid_0's l1: 0.139504
[2100]	valid_0's l1: 0.136938
[2200]	valid_0's l1: 0.134649
[2300]	valid_0's l1: 0.132435
[2400]	valid_0's l1: 0.13028
[2500]	valid_0's l1: 0.128198
[2600]	valid_0's l1: 0.126198
[2700]	valid_0's l1: 0.124322


In [None]:
import lightgbm as lgb
import os
from sklearn.metrics import mean_absolute_error as mae

# **Columns that LightGBM should treat as categorical**
categorical_features = ["都道府県名", "地区名", "最寄駅：名称",
                        "間取り", "建物の構造", "用途", "今後の利用目的",
                        "都市計画", "改装", "取引の事情等"]

# **Time-based split: rows with 取引時点 before 2023.75 train, the rest validate**
df_train = df[df["取引時点"] < 2023.75]
df_val = df[df["取引時点"] >= 2023.75]

col = "取引価格（総額）_log"
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)

val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

# **Cast the categorical columns to the pandas "category" dtype**
# Only columns that exist in the data are converted and declared to LightGBM:
# LightGBM raises on an unknown name in ``categorical_feature``, so the same
# filtered list is used for the cast and for the Dataset declaration.
# train_x and val_x come from the same frame, so they share their columns.
present_categoricals = [
    name for name in categorical_features if name in train_x.columns
]
train_x = train_x.astype({name: "category" for name in present_categoricals})
val_x = val_x.astype({name: "category" for name in present_categoricals})

# **Build the LightGBM datasets (declaring the categorical features)**
trains = lgb.Dataset(train_x, train_y, categorical_feature=present_categoricals)
valids = lgb.Dataset(val_x, val_y, categorical_feature=present_categoricals)

# Hyperparameters (meant to hold the values found by Optuna; the ones below are set by hand)
params = {
    "objective": "regression",
    "metric": "mae",
    # gbdt, not dart: LightGBM's early-stopping callback is not available in
    # dart mode, so with dart the run below would always execute all 5000
    # rounds. gbdt and goss are also the boosting types the Optuna search covers.
    "boosting_type": "gbdt",
    "num_leaves": 31,  # tune in roughly 31-100, depending on the amount of data
    "learning_rate": 0.05,  # 0.1 or 0.01 are also worth trying
    "feature_fraction": 0.9,  # tune in roughly 0.7-0.9
    "bagging_fraction": 0.9,  # tune in roughly 0.7-0.9
    "bagging_freq": 10,  # tune in roughly 1-10
    "max_depth": -1,  # tree depth limit (-1: unlimited)
    "min_data_in_leaf": 20,  # minimum number of samples in a leaf
    "lambda_l1": 0.0,  # L1 regularization
    "lambda_l2": 0.0,  # L2 regularization
    "seed": 42,
    "verbose": -1
}

# **Train the model**
# Up to 5000 rounds; stops once the validation metric has not improved for
# 50 rounds, logging the metric every 100 rounds.
model = lgb.train(
    params,
    trains,
    valid_sets=[valids],
    num_boost_round=5000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(100)
    ]
)

# **Save the model**
# exist_ok=True creates the directory when it is missing without a separate
# (race-prone) existence check.
save_dir = '/content/drive/My Drive/apartment_price_2024_fall/models'
os.makedirs(save_dir, exist_ok=True)

model_path = os.path.join(save_dir, 'lightgbm_model_1121.txt')
model.save_model(model_path)

print(f"モデルを保存しました: {model_path}")

# **Load the preprocessed test data**
df_test = pd.read_parquet("/content/drive/My Drive/apartment_price_2024_fall/test/test_proceeded_1112_9.parquet")

# **The ID column is required: it becomes the index and is written to the submission**
if 'ID' not in df_test.columns:
    raise ValueError("テストデータにID列が必要です。")

# **Use ID as the index so it is not passed to the model as a feature**
df_test = df_test.set_index('ID')

# **Keep exactly the training features, in training order**
# The test file comes from a separate preprocessing run, so fail loudly when a
# training feature is missing instead of predicting on misaligned columns.
missing_features = [name for name in train_x.columns if name not in df_test.columns]
if missing_features:
    raise ValueError(f"テストデータに学習時の特徴量がありません: {missing_features}")
df_test = df_test[train_x.columns].copy()

# **Cast the test set's categorical columns to the "category" dtype as well**
for cat_feature in categorical_features:
    if cat_feature in df_test.columns:
        df_test[cat_feature] = df_test[cat_feature].astype("category")

# **Predict on the test data**
test_pred = model.predict(df_test)

# **Build the submission frame (ID plus prediction)**
submission = pd.DataFrame({
    "ID": df_test.index,
    "取引価格（総額）_log": test_pred
})

# **Write the submission CSV**
submission_path = "/content/drive/My Drive/apartment_price_2024_fall/test/submit_submission_1121_1_lightgbm.csv"
submission.to_csv(submission_path, index=False)

print(f"予測結果を{submission_path}に保存しました。")




[100]	valid_0's l1: 1.37854
[200]	valid_0's l1: 1.34164


KeyboardInterrupt: 

# CatBoostでトレーニングする

!pip install catboost

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error

import os

# **Categorical feature columns (the same list is applied to training and test data)**
categorical_features = ["市区町村コード", "都道府県名", "地区名",
                        "最寄駅：名称", "間取り", "建物の構造", "用途",
                        "今後の利用目的", "都市計画", "改装", "取引の事情等"]


def _to_catboost_category(series):
    """Return *series* as a string-valued categorical with missing values labelled "NaN".

    The missing-value mask is taken from the original series. Casting to str
    first and calling ``fillna`` afterwards never fills anything, because the
    missing entries have already become text such as "nan" or "None".
    """
    return series.astype(str).where(series.notna(), "NaN").astype("category")


# **Convert the categorical columns to strings, labelling missing values "NaN"**
for cat_col in categorical_features:
    df[cat_col] = _to_catboost_category(df[cat_col])

# **Make sure 取引時点 is numeric (unparseable values become NaN)**
df["取引時点"] = pd.to_numeric(df["取引時点"], errors="coerce")

# **Time-based split: rows with 取引時点 before 2023.75 train, the rest validate**
df_train = df[df["取引時点"] < 2023.75]
df_val = df[df["取引時点"] >= 2023.75]

# Separate the target from the features.
col = "取引価格（総額）_log"
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)

val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

# **Build the CatBoost pools (declaring the categorical features)**
train_pool = Pool(train_x, label=train_y, cat_features=categorical_features)
val_pool = Pool(val_x, label=val_y, cat_features=categorical_features)

# **Create the output directory before training**
# so that a long training run cannot fail at the very end because the
# directory is missing.
model_dir = "/content/drive/My Drive/apartment_price_2024_fall/models"
os.makedirs(model_dir, exist_ok=True)

# **Define and train the model**
model = CatBoostRegressor(
    iterations=3150,
    learning_rate=0.08,
    depth=13,
    eval_metric='MAE',
    random_seed=42,
    early_stopping_rounds=100,
    verbose=100,
    l2_leaf_reg=25
)

model.fit(train_pool, eval_set=val_pool)

# **Save the model**
model.save_model(os.path.join(model_dir, "catboost_model_1115_1.cbm"))

# **Evaluate MAE on the validation data**
val_pred = model.predict(val_pool)
mae_score = mean_absolute_error(val_y, val_pred)
print(f"バリデーションデータのMAE: {mae_score}")

# **Load the preprocessed test data**
df_test = pd.read_parquet("/content/drive/My Drive/apartment_price_2024_fall/test/test_proceeded_1114_1.parquet")

# **The ID column is required: it becomes the index and is written to the submission**
if 'ID' not in df_test.columns:
    raise ValueError("テストデータにID列が必要です。")

# **Use ID as the index**
df_test = df_test.set_index('ID')

# **Encode the test set's categorical columns exactly like the training data**
for cat_col in categorical_features:
    df_test[cat_col] = _to_catboost_category(df_test[cat_col])

# **Predict on the test data**
test_pool = Pool(df_test, cat_features=categorical_features)
test_pred = model.predict(test_pool)

# **Build the submission frame (ID plus prediction)**
submission = pd.DataFrame({
    "ID": df_test.index,
    "取引価格（総額）_log": test_pred
})

# **Write the submission CSV**
submission_path = "/content/drive/My Drive/apartment_price_2024_fall/test/submit_submission_1115_1_catboost.csv"
submission.to_csv(submission_path, index=False)

print(f"予測結果を{submission_path}に保存しました。")


0:	learn: 0.2575023	test: 0.2787647	best: 0.2787647 (0)	total: 6.61s	remaining: 5h 46m 45s
100:	learn: 0.0901906	test: 0.0945548	best: 0.0945548 (100)	total: 12m 52s	remaining: 6h 28m 41s
200:	learn: 0.0842159	test: 0.0876505	best: 0.0876505 (200)	total: 25m 3s	remaining: 6h 7m 37s
300:	learn: 0.0815976	test: 0.0850019	best: 0.0850019 (300)	total: 36m 56s	remaining: 5h 49m 41s


In [None]:
import pandas as pd

# Load the test CSV; its first column is used as the DataFrame index.
test_file_path = "/content/drive/My Drive/apartment_price_2024_fall/test/test.csv"
df = pd.read_csv(
    test_file_path,
    encoding="utf-8",
    index_col=0,
    low_memory=False,
)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded test data.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19390 entries, 1000101 to 47017289
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   種類          19390 non-null  object 
 1   地域          0 non-null      float64
 2   市区町村コード     19390 non-null  int64  
 3   都道府県名       19390 non-null  object 
 4   市区町村名       19390 non-null  object 
 5   地区名         19390 non-null  object 
 6   最寄駅：名称      19370 non-null  object 
 7   最寄駅：距離（分）   19370 non-null  object 
 8   間取り         18184 non-null  object 
 9   面積（㎡）       19390 non-null  int64  
 10  土地の形状       0 non-null      float64
 11  間口          0 non-null      float64
 12  延床面積（㎡）     0 non-null      float64
 13  建築年         18854 non-null  object 
 14  建物の構造       18499 non-null  object 
 15  用途          15162 non-null  object 
 16  今後の利用目的     18595 non-null  object 
 17  前面道路：方位     0 non-null      float64
 18  前面道路：種類     0 non-null      float64
 19  前面道路：幅員（ｍ）  0 non-nul

In [None]:
# Display the first rows of the loaded test data.
df.head()

Unnamed: 0_level_0,種類,地域,市区町村コード,都道府県名,市区町村名,地区名,最寄駅：名称,最寄駅：距離（分）,間取り,面積（㎡）,...,今後の利用目的,前面道路：方位,前面道路：種類,前面道路：幅員（ｍ）,都市計画,建ぺい率（％）,容積率（％）,取引時点,改装,取引の事情等
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000101,中古マンション等,,1101,北海道,札幌市中央区,大通西,西１８丁目,1,２ＤＫ,55,...,住宅,,,,商業,80.0,400.0,2024年第1四半期,未改装,
1000102,中古マンション等,,1101,北海道,札幌市中央区,大通西,西１８丁目,1,１ＬＤＫ,55,...,住宅,,,,商業,80.0,400.0,2024年第1四半期,改装済み,
1000103,中古マンション等,,1101,北海道,札幌市中央区,大通西,西１８丁目,0,２ＤＫ,35,...,住宅,,,,商業,80.0,400.0,2024年第1四半期,未改装,
1000105,中古マンション等,,1101,北海道,札幌市中央区,大通西,西１８丁目,5,２ＬＤＫ,50,...,事務所,,,,商業,80.0,400.0,2024年第1四半期,未改装,
1000106,中古マンション等,,1101,北海道,札幌市中央区,大通西,西１８丁目,5,１Ｋ,30,...,事務所,,,,商業,80.0,400.0,2024年第1四半期,未改装,


# testデータの取引価格を予測する

In [None]:
import pandas as pd
import lightgbm as lgb

# **Categorical feature columns (the same columns that were specified for training)**
categorical_features = [
    "市区町村コード", "都道府県名", "市区町村名", "地区名",
    "最寄駅：名称", "間取り", "建物の構造", "用途",
    "今後の利用目的", "都市計画", "改装", "取引の事情等",
]

# **Load the trained booster from its model file**
model = lgb.Booster(model_file='/content/drive/My Drive/apartment_price_2024_fall/models/lightgbm_model_1109_1.txt')

# **Read the preprocessed test data; it must carry an ID column, which becomes the index**
df_loaded = pd.read_parquet("/content/drive/My Drive/apartment_price_2024_fall/test/test_proceeded_1109_1.parquet")
if 'ID' not in df_loaded.columns:
    raise ValueError("テストデータにID列が必要です。")
df_loaded = df_loaded.set_index('ID')

# **Re-apply the category dtype to the categorical columns (as a precaution)**
for col in categorical_features:
    if col not in df_loaded.columns:
        continue
    df_loaded[col] = df_loaded[col].astype('category')

# **Run the prediction**
predict = model.predict(df_loaded, num_iteration=model.best_iteration)

# **Build the submission frame (ID plus prediction)**
submission = pd.DataFrame({"ID": df_loaded.index, "取引価格（総額）_log": predict})

# **Sanity-check the distribution of the predictions**
print(f"予測値の平均: {predict.mean()}, 標準偏差: {predict.std()}")

# **Write the submission CSV**
submission.to_csv(
    "/content/drive/My Drive/apartment_price_2024_fall/test/submission_1109_lightgbm.csv",
    index=False,
)

print("予測結果をsubmission_1109_lightgbm.csvに保存しました。")


予測値の平均: 6.975355164141524, 標準偏差: 0.33528623758488024
予測結果をsubmission_1109_lightgbm.csvに保存しました。
