In [1]:
import os

import lightgbm as lgbm
import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import (KFold, cross_validate, RandomizedSearchCV)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, MinMaxScaler)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locate the raw training CSV: <parent of the working directory>/data/raw/train.csv
root_path = os.path.split(os.getcwd())[0]
data_path = os.path.join(root_path, "data/raw")
train_path = os.path.join(data_path, "train.csv")

# Read the file, then keep only its first 50,000 rows for the experiments below.
train_df = pd.read_csv(train_path)
train_df = train_df.head(50_000)

# Preview the first rows as the cell output.
train_df.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,...,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,...,7,2,0,3,5,3,3,4,3,0.45
2,2,6,5,6,7,3,7,1,5,4,...,7,3,7,5,6,8,2,3,3,0.53
3,3,3,4,6,5,4,8,4,7,6,...,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415


In [3]:
# Regression target: the flood probability column.
y = train_df["FloodProbability"]
# Feature matrix: every remaining column except the row identifier.
X = train_df.drop(columns=["id", "FloodProbability"])

def create_features(s):
    """Return a deep copy of ``s`` with two engineered columns appended.

    Parameters
    ----------
    s : pandas.DataFrame
        Input feature frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``s`` with these extra columns:

        * ``fsum``     - row-wise sum over all columns of ``s``.
        * ``special1`` - boolean flag, True where ``fsum`` is one of
          72, 73, 74 or 75.
    """
    # Row totals are computed on the untouched input so the new columns
    # never feed back into the sum.
    row_totals = s.sum(axis=1)
    in_band = row_totals.isin(np.arange(72, 76))

    enriched = s.copy(deep=True)
    enriched["fsum"] = row_totals
    enriched["special1"] = in_band
    return enriched
    
# Step 1: append the engineered columns produced by create_features.
feature_engineer = FunctionTransformer(create_features)
# Step 2: rescale all columns (original and engineered) with MinMaxScaler.
preprocess = MinMaxScaler()
# Step 3: univariate feature selection. FloodProbability is a continuous
# target, so features are scored with f_regression; SelectKBest's default
# score function (f_classif) is an ANOVA test intended for class labels.
# The number of kept features is tuned through "feature_selection__k".
feature_selection = SelectKBest(score_func=f_regression)
# Step 4: LightGBM regressor; its hyperparameters are tuned through "model__*".
model = LGBMRegressor()
# Create the pipeline
pipeline = Pipeline(steps=[
    ("feature_engineer", feature_engineer),
    ("preprocess", preprocess),
    ("feature_selection", feature_selection),
    ("model", model)
])
# Cross validation: shuffled K-fold (default number of splits) with a fixed
# seed so every candidate is evaluated on the same folds.
cv = KFold(shuffle=True, random_state=123)

In [8]:
# Search space for RandomizedSearchCV. Keys use the pipeline's
# "<step>__<param>" naming; single-element lists pin a value for every draw.
param_grid = {
    "feature_selection__k": [5, 10, 15, 20, "all"],
    "model__n_estimators": [500],
    "model__num_leaves": np.arange(2, 512, step=2),
    "model__max_depth": np.arange(3, 16, step=1),
    "model__learning_rate": np.logspace(-4, 0, num=50),  # From 0.0001 to 1
    "model__min_child_samples": np.arange(1, 200, step=1),
    "model__subsample": np.linspace(0.6, 1.0, num=10),
    # LightGBM only applies row subsampling when subsample_freq > 0 (its
    # default is 0), so without this entry the sampled "model__subsample"
    # values would have no effect on training.
    "model__subsample_freq": [1],
    "model__colsample_bytree": np.linspace(0.6, 1.0, num=10),
    "model__reg_alpha": np.logspace(-9, 1, num=50),  # From 1e-9 to 10
    "model__reg_lambda": np.logspace(-9, 1, num=50),  # From 1e-9 to 10
    # NOTE(review): thread count is hard-coded for the original machine;
    # adjust to the number of cores available on the host.
    "model__n_jobs": [24]
}

# Early-stopping callback: halts boosting once the validation metric has not
# improved for 50 consecutive rounds. It is only usable in fits that also
# supply validation data (an `eval_set`), so it is kept here for such fits
# but is not attached to the cross-validated search.
early_stopping_callback = lgbm.early_stopping(stopping_rounds=50,
                                              verbose=False)

# Extra keyword arguments forwarded to `Pipeline.fit` by the search.
# The callback is deliberately NOT passed as "model__callbacks": LightGBM's
# early stopping requires at least one validation set, and no `eval_set` is
# given to the model. One cannot simply be added here, because the eval data
# would first have to go through the pipeline's own fitted feature
# engineering, scaling and selection steps. With the callback attached and no
# validation data, each CV fit fails instead of training.
fit_params = {}

In [None]:
# Randomized hyperparameter search over the whole pipeline:
# 100 sampled candidates, each scored by R^2 on the shared CV folds.
search_cv = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=100,
    scoring="r2",
    cv=cv,
    random_state=123,
)

# Extra fit keyword arguments are routed to the pipeline steps by name.
search_cv.fit(X, y, **fit_params)

# Report the winning configuration and its mean cross-validated score.
best_params = search_cv.best_params_
best_score = search_cv.best_score_
print("Best parameters found: ", best_params)
print("Best R2 score: ", best_score)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002510 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 203
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 10
[LightGBM] [Info] Start training from score 0.504403
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002256 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 203
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 10
[LightGBM] [Info] Start training from score 0.504554
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002777 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 10
[LightGBM] [Info] Start traini

Exception ignored on calling ctypes callback function: <function _log_callback at 0x7fae84cbe980>
Traceback (most recent call last):
  File "/home/user/kaggle-ps-s04e05/.venv/lib/python3.11/site-packages/lightgbm/basic.py", line 224, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002477 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 119
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 5
[LightGBM] [Info] Start training from score 0.504355
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002342 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 5
[LightGBM] [Info] Start training from score 0.504520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002601 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 40000, number