In [1]:
from sklearn.ensemble import RandomForestRegressor
import optuna
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
import numpy as np
from sklearn.preprocessing import MinMaxScaler


In [2]:
data = pd.read_csv("train.csv")

In [3]:
def month_to_season(month):
    if 3 <= month <= 5:
        return '봄'
    elif 6 <= month <= 8:
        return '여름'
    elif 9 <= month <= 11:
        return '가을'
    else:
        return '겨울'
data['계절'] = data['월'].apply(month_to_season)


In [4]:
# ignore_features=['절대 온도(K)', '이슬점 온도(°C)', '포화 증기압(mbar)', '실제 증기압(mbar)',
#        '증기압 부족량(mbar)', '수증기 함량 (g/kg)', '공기 밀도 (g/m**3)','ID']
ignore_features=['포화 증기압(mbar)', '이슬점 온도(°C)','수증기 함량 (g/kg)','섭씨 온도(°⁣C)',
       'ID','월','일']
categorical_features = ['계절','측정 시간대']


In [5]:
data.drop(ignore_features,axis=1,inplace=True)
data = pd.get_dummies(data,columns=categorical_features)

In [6]:
import random
random_alpha = []
for i in range(10):
    random_alpha.append(float(str(random.random())[:4]))
random_alpha

[0.63, 0.96, 0.4, 0.87, 0.94, 0.61, 0.29, 0.19, 0.45, 0.39]

In [7]:
data_copy = data.copy()
for i in random_alpha:
    temp_df = data.copy()
    temp_df[["절대 온도(K)","상대 습도 (%)","대기압(mbar)","실제 증기압(mbar)","증기압 부족량(mbar)","공기 밀도 (g/m**3)","풍향 (deg)","풍속 (m/s)"]] = temp_df[["절대 온도(K)","상대 습도 (%)","대기압(mbar)","실제 증기압(mbar)","증기압 부족량(mbar)","공기 밀도 (g/m**3)","풍향 (deg)","풍속 (m/s)"]]*i
    data_copy = pd.concat([data_copy, temp_df])


In [8]:
data_copy

Unnamed: 0,절대 온도(K),상대 습도 (%),대기압(mbar),실제 증기압(mbar),증기압 부족량(mbar),공기 밀도 (g/m**3),풍향 (deg),풍속 (m/s),계절_가을,계절_겨울,계절_봄,계절_여름,측정 시간대_새벽,측정 시간대_오전,측정 시간대_오후,측정 시간대_저녁
0,287.7800,76.1000,992.0800,12.1600,3.8200,1198.0600,155.6000,1.6100,0,0,0,1,0,0,0,1
1,290.8500,73.3000,991.0700,14.1700,5.1600,1183.6700,177.0000,1.6800,0,0,0,1,0,1,0,0
2,283.8400,74.2000,988.7100,8.9800,3.1200,1213.2200,146.2000,0.7300,1,0,0,0,0,0,0,1
3,277.3000,83.5000,1014.2500,7.4300,1.4700,1265.4800,264.5000,2.7100,0,1,0,0,0,1,0,0
4,290.8600,74.0000,995.7700,14.6800,5.1600,1187.4000,19.3400,1.0000,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36576,112.2381,21.1185,387.4767,3.4047,2.8821,468.3315,114.2310,0.3003,0,0,0,1,0,0,0,1
36577,108.3849,27.5730,385.3317,2.2113,0.9165,483.6234,82.0950,3.4983,0,1,0,0,0,1,0,0
36578,115.0890,31.5900,381.3810,7.4256,1.7433,449.7636,7.4958,1.0062,0,0,1,0,0,0,0,1
36579,116.9259,20.5959,385.4721,6.7899,6.0684,446.3979,87.9060,0.1404,1,0,0,0,0,0,0,1


In [9]:
datas = data_copy.drop("풍속 (m/s)",axis=1).values
target = data_copy["풍속 (m/s)"].values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(datas, target, test_size=0.2, random_state=42)


In [11]:
# mm = MinMaxScaler()
# X_train = mm.fit_transform(X_train)
# X_test = mm.transform(X_test)

In [11]:
def objective(trial, X_train, y_train, X_test, y_test):
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)
    max_depth = trial.suggest_int("max_depth", 2, 50)


    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  random_state=42,
                                  n_jobs=-1)
    model.fit(X_train, y_train)

    # Use k-fold cross-validation for more robust evaluation
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_test, y_test, cv=kfold, scoring='neg_mean_absolute_error')
    mae = -1 * np.mean(cv_scores)

    return mae


In [12]:
study = optuna.create_study(direction="minimize")
study.optimize(lambda trial: objective(trial, X_train, y_train, X_test, y_test),
               n_trials=1000,
               timeout=7200)


[I 2023-07-20 11:20:35,641] A new study created in memory with name: no-name-21625888-7c11-4b52-a1c6-cc63f15b8810
[I 2023-07-20 11:30:06,926] Trial 0 finished with value: 0.3067140975747299 and parameters: {'n_estimators': 591, 'max_depth': 41}. Best is trial 0 with value: 0.3067140975747299.
[I 2023-07-20 11:42:20,084] Trial 1 finished with value: 0.3066960435169384 and parameters: {'n_estimators': 665, 'max_depth': 36}. Best is trial 1 with value: 0.3066960435169384.
[I 2023-07-20 11:44:09,049] Trial 2 finished with value: 0.5074093916094524 and parameters: {'n_estimators': 428, 'max_depth': 3}. Best is trial 1 with value: 0.3066960435169384.
[I 2023-07-20 11:50:49,546] Trial 3 finished with value: 0.3068954376176388 and parameters: {'n_estimators': 347, 'max_depth': 29}. Best is trial 1 with value: 0.3066960435169384.
[I 2023-07-20 11:56:21,225] Trial 4 finished with value: 0.3069259579515138 and parameters: {'n_estimators': 274, 'max_depth': 50}. Best is trial 1 with value: 0.30669

In [13]:
print("Best trial:")
trial = study.best_trial
print("MAE: {}".format(trial.value))
print("Params: ")
for key, value in trial.params.items():
    print("{}: {}".format(key, value))

Best trial:
MAE: 0.3066897674219784
Params: 
n_estimators: 642
max_depth: 34


In [16]:
from optuna.visualization import plot_optimization_history

plot = plot_optimization_history(study)
plot.show()

In [17]:
model_rf = RandomForestRegressor(**study.best_params,n_jobs=-1)

In [11]:
model_rf = RandomForestRegressor(max_depth=34, n_estimators=642, n_jobs=-1)


In [12]:
model_rf.fit(X_train,y_train)


In [20]:
import pickle
with open("model_rf.pkl", "wb") as model_file:
    pickle.dump(model_rf, model_file)