In [2]:
#pip install optuna

Collecting optuna
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
     ------------------------------------ 390.6/390.6 kB 459.4 kB/s eta 0:00:00
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting sqlalchemy>=1.3.0
  Downloading SQLAlchemy-2.0.19-cp37-cp37m-win_amd64.whl (2.0 MB)
     ---------------------------------------- 2.0/2.0 MB 433.0 kB/s eta 0:00:00
Collecting cmaes>=0.9.1
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
     ------------------------------------ 224.5/224.5 kB 457.9 kB/s eta 0:00:00
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
     -------------------------------------- 78.7/78.7 kB 547.5 kB/s eta 0:00:00
Collecting greenlet!=0.4.17
  Downloading greenlet-2.0.2-cp37-cp37m-win_amd64.whl (192 kB)
     ------------------------------------ 192.4/192.4 kB 448.7 kB/s eta 0:00:00
Installing collected packages: greenlet, c

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
import optuna
import xgboost as xgb
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_score

# train.csv 파일 불러오기
train_data = pd.read_csv('train.csv')

# 결측치를 평균값으로 대체
train_data = train_data.fillna(train_data.mean())

# '측정 시간대'를 원핫 인코딩하여 숫자 형태로 변환
encoder = OneHotEncoder(sparse=False)
time_encoded = encoder.fit_transform(train_data[['측정 시간대']])
time_encoded_df = pd.DataFrame(time_encoded, columns=encoder.get_feature_names(['측정 시간대']))
train_data = pd.concat([train_data, time_encoded_df], axis=1).drop(['측정 시간대'], axis=1)

# 풍속을 예측할 특성(입력 변수)과 풍속(출력 변수)을 분리합니다.
X_train = train_data.drop(['ID', '풍속 (m/s)'], axis=1)  # 입력 변수들
y_train = train_data['풍속 (m/s)']  # 출력 변수 (풍속)

# optuna를 활용한 하이퍼파라미터 최적화
def objective(trial):
    # RandomForestRegressor의 하이퍼파라미터 설정
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_float('max_features', 0.1, 1.0)

    # RandomForestRegressor 모델 생성
    model = RandomForestRegressor(
        random_state=42,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features
    )

    # Cross Validation을 활용하여 평균 RMSE 계산
    scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    rmse_scores = np.sqrt(-scores)
    
    return np.mean(rmse_scores)

# optuna를 활용한 하이퍼파라미터 최적화
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))
print('Best trial:')
trial = study.best_trial
print('  Value: ', trial.value)
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

# 최적의 하이퍼파라미터로 RandomForestRegressor 모델 생성
best_rf_model = RandomForestRegressor(
    random_state=42,
    n_estimators=study.best_params['n_estimators'],
    max_depth=study.best_params['max_depth'],
    min_samples_split=study.best_params['min_samples_split'],
    min_samples_leaf=study.best_params['min_samples_leaf'],
    max_features=study.best_params['max_features']
)

# XGBoostRegressor 모델 생성
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', max_depth=5)

# 보팅 앙상블 모델 생성
ensemble_model = VotingRegressor(estimators=[('rf', best_rf_model), ('xg', xg_reg)], n_jobs=-1)




  from .autonotebook import tqdm as notebook_tqdm
  
[I 2023-07-24 01:02:22,819] A new study created in memory with name: no-name-02281863-f39f-42ed-916d-0d45f28065ce
[I 2023-07-24 01:04:35,150] Trial 0 finished with value: 0.9743262429497547 and parameters: {'n_estimators': 597, 'max_depth': 8, 'min_samples_split': 20, 'min_samples_leaf': 10, 'max_features': 0.3789064127770013}. Best is trial 0 with value: 0.9743262429497547.
[I 2023-07-24 01:07:02,436] Trial 1 finished with value: 0.7847726470468487 and parameters: {'n_estimators': 315, 'max_depth': 11, 'min_samples_split': 17, 'min_samples_leaf': 2, 'max_features': 0.6070280160111738}. Best is trial 1 with value: 0.7847726470468487.
[I 2023-07-24 01:07:55,060] Trial 2 finished with value: 1.2856255947822235 and parameters: {'n_estimators': 282, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': 0.5369869073725746}. Best is trial 1 with value: 0.7847726470468487.
[I 2023-07-24 01:19:04,013] Trial 3 finishe

[I 2023-07-24 02:49:17,456] Trial 28 finished with value: 0.7104220160937482 and parameters: {'n_estimators': 383, 'max_depth': 13, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 0.3876586482311468}. Best is trial 11 with value: 0.6554326793046454.
[I 2023-07-24 02:51:33,505] Trial 29 finished with value: 0.921195528518353 and parameters: {'n_estimators': 498, 'max_depth': 9, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': 0.2978823022342323}. Best is trial 11 with value: 0.6554326793046454.
[I 2023-07-24 02:56:38,760] Trial 30 finished with value: 0.6595756676677922 and parameters: {'n_estimators': 650, 'max_depth': 15, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 0.384298545275087}. Best is trial 11 with value: 0.6554326793046454.
[I 2023-07-24 03:01:23,147] Trial 31 finished with value: 0.6595010527918257 and parameters: {'n_estimators': 604, 'max_depth': 15, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 0.3650383283

[I 2023-07-24 04:47:02,297] Trial 59 finished with value: 0.6409839440108437 and parameters: {'n_estimators': 222, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 0.5606087249373316}. Best is trial 59 with value: 0.6409839440108437.
[I 2023-07-24 04:49:05,153] Trial 60 finished with value: 0.8249793414608677 and parameters: {'n_estimators': 241, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 0.5603748849799177}. Best is trial 59 with value: 0.6409839440108437.
[I 2023-07-24 04:50:50,281] Trial 61 finished with value: 0.7168430674322374 and parameters: {'n_estimators': 168, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 9, 'max_features': 0.5405458812523934}. Best is trial 59 with value: 0.6409839440108437.
[I 2023-07-24 04:55:01,726] Trial 62 finished with value: 0.6613338906224921 and parameters: {'n_estimators': 313, 'max_depth': 14, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 0.66272665

[I 2023-07-24 06:00:03,491] Trial 90 finished with value: 0.6877674819877225 and parameters: {'n_estimators': 102, 'max_depth': 13, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.7762585158869081}. Best is trial 81 with value: 0.634087398303833.
[I 2023-07-24 06:02:29,036] Trial 91 finished with value: 0.6348622897407237 and parameters: {'n_estimators': 170, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 0.6904910614606413}. Best is trial 81 with value: 0.634087398303833.
[I 2023-07-24 06:04:22,058] Trial 92 finished with value: 0.6354701652927998 and parameters: {'n_estimators': 132, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 0.684017222559297}. Best is trial 81 with value: 0.634087398303833.
[I 2023-07-24 06:06:19,833] Trial 93 finished with value: 0.6577091672584222 and parameters: {'n_estimators': 146, 'max_depth': 14, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 0.690743357974

Number of finished trials: 100
Best trial:
  Value:  0.634087398303833
  Params: 
    n_estimators: 209
    max_depth: 15
    min_samples_split: 2
    min_samples_leaf: 1
    max_features: 0.6852407823023655


TypeError: __init__() got an unexpected keyword argument 'regressors'

In [4]:
# 스태킹을 위한 모델 생성
stacking_model = StackingRegressor(
    estimators=[('rf', best_rf_model), ('xg', xg_reg)],
    final_estimator=best_rf_model
)

# 앙상블 모델 학습
ensemble_model.fit(X_train, y_train)

# 스태킹 모델 학습
stacking_model.fit(X_train, y_train)

# test.csv 파일 불러오기
test_data = pd.read_csv('test.csv')

# 결측치를 평균값으로 대체
test_data = test_data.fillna(test_data.mean())

# '측정 시간대'를 원핫 인코딩하여 숫자 형태로 변환
time_encoded = encoder.transform(test_data[['측정 시간대']])
time_encoded_df = pd.DataFrame(time_encoded, columns=encoder.get_feature_names(['측정 시간대']))
test_data = pd.concat([test_data, time_encoded_df], axis=1).drop(['측정 시간대'], axis=1)

# 테스트 데이터로 예측을 수행합니다.
X_test = test_data.drop('ID', axis=1)  # 테스트 입력 변수들

# 앙상블 모델 예측
ensemble_pred = ensemble_model.predict(X_test)

# 스태킹 모델 예측
stacking_pred = stacking_model.predict(X_test)

# 앙상블 결과와 스태킹 결과를 평균하여 최종 예측값 도출
final_pred = (ensemble_pred + stacking_pred) / 2

# Submit / 제출
submission = pd.read_csv('./sample_submission.csv')
submission['풍속 (m/s)'] = final_pred

# 예측 결과를 submission.csv 양식에 맞게 저장합니다.
submission.to_csv('submission.csv', index=False)

print("풍속 예측이 완료되었습니다. 결과가 submission.csv에 저장되었습니다.")

  app.launch_new_instance()


풍속 예측이 완료되었습니다. 결과가 submission.csv에 저장되었습니다.


In [5]:
# Cross Validation을 활용하여 평균 RMSE 계산
scores = cross_val_score(ensemble_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-scores)

print("평균 RMSE:", np.mean(rmse_scores))

평균 RMSE: 0.7286177749296877


In [6]:
# Cross Validation을 활용하여 평균 RMSE 계산
scores = cross_val_score(stacking_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-scores)

print("평균 RMSE:", np.mean(rmse_scores))

평균 RMSE: 0.6317175162823699
