In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
import optuna
import xgboost as xgb
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_score

# Load the training data.
train_data = pd.read_csv('train.csv')

# Impute missing values with the per-column mean.
# numeric_only=True is required: the frame still contains the column that is
# one-hot encoded below, and on pandas >= 2.0 DataFrame.mean() raises a
# TypeError on non-numeric columns instead of silently skipping them.
train_data = train_data.fillna(train_data.mean(numeric_only=True))

# One-hot encode '측정 시간대' (measurement time slot) into numeric columns.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the new name first and fall back.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
time_encoded = encoder.fit_transform(train_data[['측정 시간대']])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) yields the same '<column>_<category>' names.
# Reusing train_data's index keeps the row-wise concat below aligned even if
# the frame does not carry a default RangeIndex.
time_encoded_df = pd.DataFrame(
    time_encoded,
    columns=encoder.get_feature_names_out(['측정 시간대']),
    index=train_data.index,
)
train_data = pd.concat([train_data, time_encoded_df], axis=1).drop(['측정 시간대'], axis=1)

# Split into model inputs and the prediction target (wind speed).
X_train = train_data.drop(['ID', '풍속 (m/s)'], axis=1)  # input features
y_train = train_data['풍속 (m/s)']  # target: wind speed (m/s)

# optuna를 활용한 하이퍼파라미터 최적화 (RandomForestRegressor 대신 XGBoost로 변경)
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of an XGBRegressor.

    Samples n_estimators, max_depth, learning_rate, subsample and
    colsample_bytree from ``trial``, builds an ``xgb.XGBRegressor`` with them
    and scores it on the module-level ``X_train`` / ``y_train``.

    Returns the RMSE averaged over the five folds (lower is better).
    """
    sampled_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    candidate = xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        **sampled_params,
    )

    # The scorer reports negated MSE per fold; flip the sign and take the
    # square root to get one RMSE per fold before averaging.
    neg_mse_per_fold = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return np.mean(rmse_per_fold)

# Hyperparameter search with optuna: 100 trials minimising the value returned
# by objective() (mean cross-validated RMSE).
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))
print('Best trial:')
trial = study.best_trial
print('  Value: ', trial.value)
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

# Build the XGBRegressor with the best hyperparameters found by the study.
best_xg_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=study.best_params['n_estimators'],
    max_depth=study.best_params['max_depth'],
    learning_rate=study.best_params['learning_rate'],
    subsample=study.best_params['subsample'],
    colsample_bytree=study.best_params['colsample_bytree'],
    random_state=42
)

# best_rf_model is used by both ensembles below but is never defined in this
# cell. Reuse it when an earlier notebook cell already created it; otherwise
# fall back to a default random forest so the cell does not fail with a
# NameError.
# NOTE(review): if a tuned random forest was intended, define it before this
# point and this fallback becomes a no-op.
if 'best_rf_model' not in globals():
    best_rf_model = RandomForestRegressor(random_state=42)

# Voting ensemble: averages the predictions of the XGBoost and random-forest models.
ensemble_model = VotingRegressor(estimators=[('xg', best_xg_model), ('rf', best_rf_model)], n_jobs=-1)

# Stacking ensemble: the same two base models, with an XGBRegressor configured
# like best_xg_model as the final (meta) estimator.
stacking_model = StackingRegressor(
    estimators=[('xg', best_xg_model), ('rf', best_rf_model)],
    final_estimator=best_xg_model
)

# Fit the voting ensemble.
ensemble_model.fit(X_train, y_train)

# Fit the stacking ensemble.
stacking_model.fit(X_train, y_train)

# Load the test data.
test_data = pd.read_csv('test.csv')

# Impute missing test values with the training-set column means so train and
# test are preprocessed with the same statistics. (Mean imputation leaves a
# column's mean unchanged, so X_train.mean() equals the values used to fill
# the training data.) numeric_only=True avoids the TypeError that
# DataFrame.mean() raises on non-numeric columns in pandas >= 2.0.
test_data = test_data.fillna(X_train.mean(numeric_only=True))

# One-hot encode '측정 시간대' with the encoder fitted on the training data.
time_encoded = encoder.transform(test_data[['측정 시간대']])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) yields the same '<column>_<category>' names.
time_encoded_df = pd.DataFrame(
    time_encoded,
    columns=encoder.get_feature_names_out(['측정 시간대']),
    index=test_data.index,
)
test_data = pd.concat([test_data, time_encoded_df], axis=1).drop(['측정 시간대'], axis=1)

# Build the test feature matrix, selecting the columns in exactly the order
# the models were trained on.
X_test = test_data.drop('ID', axis=1)[X_train.columns]

# Predict with the voting ensemble.
ensemble_pred = ensemble_model.predict(X_test)

# Predict with the stacking ensemble.
stacking_pred = stacking_model.predict(X_test)

# Final prediction: simple average of the two ensembles' outputs.
final_pred = (ensemble_pred + stacking_pred) / 2

# Fill the sample submission with the predictions.
submission = pd.read_csv('./sample_submission.csv')
submission['풍속 (m/s)'] = final_pred

# Save the result in the submission.csv format.
submission.to_csv('submission.csv', index=False)

print("풍속 예측이 완료되었습니다. 결과가 submission.csv에 저장되었습니다.")


  from .autonotebook import tqdm as notebook_tqdm
  
[I 2023-07-24 12:38:43,642] A new study created in memory with name: no-name-69df8848-adb7-484d-bec6-228de44de43f
[I 2023-07-24 12:39:21,554] Trial 0 finished with value: 1.0307959374072886 and parameters: {'n_estimators': 243, 'max_depth': 3, 'learning_rate': 0.042990257767997093, 'subsample': 0.788210115449548, 'colsample_bytree': 0.7733429689103928}. Best is trial 0 with value: 1.0307959374072886.
[I 2023-07-24 12:41:11,770] Trial 1 finished with value: 0.6516804565408565 and parameters: {'n_estimators': 172, 'max_depth': 11, 'learning_rate': 0.01966469162451124, 'subsample': 0.8516739396334921, 'colsample_bytree': 0.9087671790346366}. Best is trial 1 with value: 0.6516804565408565.
[I 2023-07-24 12:43:34,093] Trial 2 finished with value: 0.9069186595143041 and parameters: {'n_estimators': 842, 'max_depth': 3, 'learning_rate': 0.043455940133852296, 'subsample': 0.6547525003943131, 'colsample_bytree': 0.8401426167578367}. Best is t

In [None]:
# 5-fold cross-validation of the voting ensemble, reported as mean RMSE.
# The scorer returns negated MSE per fold, hence the sign flip before sqrt.
scores = cross_val_score(
    estimator=ensemble_model,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
rmse_scores = np.sqrt(np.negative(scores))

print("평균 RMSE:", rmse_scores.mean())

In [None]:
# 5-fold cross-validation of the stacking ensemble, reported as mean RMSE.
# The scorer returns negated MSE per fold, hence the sign flip before sqrt.
scores = cross_val_score(
    estimator=stacking_model,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
rmse_scores = np.sqrt(np.negative(scores))

print("평균 RMSE:", rmse_scores.mean())