In [1]:
from sklearn.ensemble import ExtraTreesRegressor
import optuna
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
import numpy as np


In [2]:
# Load the raw training table from the CSV file next to the notebook.
train_csv_path = "train.csv"
data = pd.read_csv(train_csv_path)

In [3]:
def month_to_season(month):
    """Map a month number to its Korean season label.

    Args:
        month: Month value; it is compared numerically against the
            inclusive ranges listed below.

    Returns:
        '봄' for 3-5, '여름' for 6-8, '가을' for 9-11, and '겨울' for every
        other value (this includes values outside 1-12).
    """
    # (first month, last month, label); both bounds are inclusive.
    season_ranges = (
        (3, 5, '봄'),
        (6, 8, '여름'),
        (9, 11, '가을'),
    )
    for first, last, label in season_ranges:
        if first <= month <= last:
            return label
    # Anything not matched above falls through to winter.
    return '겨울'
# Derive a season label for every row from its month column and store it as a new column.
season_labels = data['월'].apply(month_to_season)
data['계절'] = season_labels


In [4]:
# Columns removed from the table before modelling. An earlier variant of this list
# (previously kept here as commented-out code) excluded absolute temperature, dew point,
# saturation and actual vapour pressure, vapour-pressure deficit, water-vapour content,
# air density and ID instead.
# NOTE(review): the Celsius-temperature key (4th entry) appears to contain an invisible
# character between the degree sign and 'C' -- keep it byte-identical and confirm it
# against the CSV header.
ignore_features = [
    '포화 증기압(mbar)',
    '이슬점 온도(°C)',
    '수증기 함량 (g/kg)',
    '섭씨 온도(°⁣C)',
    'ID',
    '월',
]

# Columns that are one-hot encoded with pd.get_dummies in the following cell.
categorical_features = [
    '계절',
    '일',
    '측정 시간대',
]


In [5]:
# Remove the excluded columns and one-hot encode the categorical ones in a single
# expression. `data` is rebound only once, after both steps succeed, instead of being
# mutated in place first: if get_dummies raises, `data` is left untouched and the cell
# can simply be re-run.
data = pd.get_dummies(
    data.drop(columns=ignore_features),
    columns=categorical_features,
)

In [6]:
# Split the encoded table into the feature matrix and the regression target
# (the wind-speed column), both as NumPy arrays.
target_column = "풍속 (m/s)"
target = data[target_column].values
datas = data.drop(columns=[target_column]).values

In [7]:
# Hold out 20% of the rows; the fixed seed keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    datas,
    target,
    test_size=0.2,
    random_state=42,
)


In [8]:
from sklearn.model_selection import KFold, cross_val_score

def objective(trial):
    """Optuna objective: cross-validated MAE of an ExtraTreesRegressor.

    Reads `X_train` and `y_train` from the notebook's global scope.

    Args:
        trial: Optuna trial used to sample the four tree hyperparameters.

    Returns:
        Mean absolute error averaged over the 5 cross-validation folds
        (lower is better).
    """
    # Sample the search space; dict entries are evaluated top to bottom, so the
    # suggest_* calls happen in the order listed here.
    sampled_params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 100, log=True),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 100),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100),
    }
    regressor = ExtraTreesRegressor(random_state=42, **sampled_params)

    # Shuffled 5-fold CV with a fixed seed, so every trial is scored on the same folds.
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    neg_mae_per_fold = cross_val_score(
        regressor,
        X_train,
        y_train,
        cv=folds,
        scoring='neg_mean_absolute_error',
    )

    # The scorer reports negated MAE; flip the sign to return a positive error.
    return -1 * np.mean(neg_mae_per_fold)


In [9]:
# Create an in-memory study that minimises the value returned by `objective`.
study = optuna.create_study(direction="minimize")

# `timeout` is a parameter of Study.optimize, not of create_study (which rejects it
# with a TypeError). The search stops after 1000 trials or 360 seconds, whichever
# comes first; n_jobs=-1 runs trials in parallel.
study.optimize(objective, n_trials=1000, timeout=360, n_jobs=-1)

[I 2023-07-19 23:37:19,970] A new study created in memory with name: no-name-e675b3d0-f131-4e48-a249-522ae3231ec6
[I 2023-07-19 23:37:39,213] Trial 4 finished with value: 0.6824678342011313 and parameters: {'n_estimators': 39, 'max_depth': 13, 'min_samples_split': 98, 'min_samples_leaf': 7}. Best is trial 4 with value: 0.6824678342011313.
[I 2023-07-19 23:38:07,386] Trial 7 finished with value: 1.0502104178634462 and parameters: {'n_estimators': 279, 'max_depth': 2, 'min_samples_split': 63, 'min_samples_leaf': 35}. Best is trial 4 with value: 0.6824678342011313.
[I 2023-07-19 23:38:17,185] Trial 9 finished with value: 0.9521455283578757 and parameters: {'n_estimators': 235, 'max_depth': 4, 'min_samples_split': 65, 'min_samples_leaf': 41}. Best is trial 4 with value: 0.6824678342011313.
[I 2023-07-19 23:38:23,969] Trial 13 finished with value: 0.8360054237005088 and parameters: {'n_estimators': 44, 'max_depth': 8, 'min_samples_split': 18, 'min_samples_leaf': 59}. Best is trial 4 with va

In [None]:
# Report the best trial of the study: its objective value followed by one line per
# sampled hyperparameter.
print("Best trial:")
trial = study.best_trial
print(f"MAE: {trial.value}")
print("Params: ")
for param_name, param_value in trial.params.items():
    print(f"{param_name}: {param_value}")

In [None]:
# Build the final model from the best hyperparameters found by the study. The seed is
# fixed to the same value the objective used for every model it scored, so the fitted
# (and later pickled) model is reproducible and matches the tuned configuration.
model_et = ExtraTreesRegressor(**study.best_params, random_state=42)

In [None]:
# Fit the final model on the training split only.
model_et.fit(X=X_train, y=y_train)


In [None]:
import pickle
# Serialise the fitted model to disk; the context manager closes the file handle.
model_path = "model_et.pkl"
with open(model_path, "wb") as pickle_out:
    pickle.dump(model_et, pickle_out)