In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import optuna
 

# train.csv 파일 불러오기
train_data = pd.read_csv('train.csv')

# 결측치를 평균값으로 대체
train_data = train_data.fillna(train_data.mean())

# '측정 시간대'를 원핫 인코딩하여 숫자 형태로 변환
encoder = OneHotEncoder(sparse=False)
time_encoded = encoder.fit_transform(train_data[['측정 시간대']])
time_encoded_df = pd.DataFrame(time_encoded, columns=encoder.get_feature_names(['측정 시간대']))
train_data = pd.concat([train_data, time_encoded_df], axis=1).drop(['측정 시간대'], axis=1)

# 풍속을 예측할 특성(입력 변수)과 풍속(출력 변수)을 분리합니다.
X_train = train_data.drop(['ID', '풍속 (m/s)'], axis=1)  # 입력 변수들
y_train = train_data['풍속 (m/s)']  # 출력 변수 (풍속)

# Optuna Objective 함수 정의
def objective(trial):
    # RandomForestRegressor의 파라미터 범위 설정
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 5, 15)
    min_samples_split = trial.suggest_float('min_samples_split', 0.1, 1.0)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.1, 0.5)

    # RandomForestRegressor 모델 생성
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    # 모델 학습
    model.fit(X_train, y_train)

    # 검증 데이터에 대한 예측 결과
    y_pred = model.predict(X_train)

    # 평균제곱오차를 최소화하는 것이 목표
    mse = mean_squared_error(y_train, y_pred)

    return mse

# Optuna로 파라미터 최적화
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# 최적 파라미터 출력
print("최적 파라미터: ", study.best_params)
print("최적 Objective 값: ", study.best_value)

# 최적 파라미터로 모델 재학습
best_params = study.best_params
best_model = RandomForestRegressor(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# test.csv 파일 불러오기
test_data = pd.read_csv('test.csv')

# 결측치를 평균값으로 대체
test_data = test_data.fillna(test_data.mean())

# '측정 시간대'를 원핫 인코딩하여 숫자 형태로 변환
time_encoded = encoder.transform(test_data[['측정 시간대']])
time_encoded_df = pd.DataFrame(time_encoded, columns=encoder.get_feature_names(['측정 시간대']))
test_data = pd.concat([test_data, time_encoded_df], axis=1).drop(['측정 시간대'], axis=1)

# 테스트 데이터로 풍속 예측을 수행합니다.
X_test = test_data.drop('ID', axis=1)  # 테스트 입력 변수들
y_pred = best_model.predict(X_test)  # 테스트 데이터로 풍속 예측

# Submit / 제출
submission = pd.read_csv('./sample_submission.csv')
submission['풍속 (m/s)'] = y_pred

# 예측 결과를 submission.csv 양식에 맞게 저장합니다.
submission.to_csv('submission.csv', index=False)

print("풍속 예측이 완료되었습니다. 결과가 submission.csv에 저장되었습니다.")


  # This is added back by InteractiveShellApp.init_path()
[I 2023-07-25 00:31:30,886] A new study created in memory with name: no-name-27907b95-bca8-4bb0-b2e7-27791a8a83ab
[I 2023-07-25 00:31:31,238] Trial 0 finished with value: 2.3843233819693546 and parameters: {'n_estimators': 106, 'max_depth': 6, 'min_samples_split': 0.2979986454882762, 'min_samples_leaf': 0.43296234010544354}. Best is trial 0 with value: 2.3843233819693546.
[I 2023-07-25 00:31:31,471] Trial 1 finished with value: 2.3843242282888055 and parameters: {'n_estimators': 73, 'max_depth': 7, 'min_samples_split': 0.7956375153307056, 'min_samples_leaf': 0.19785947526505887}. Best is trial 0 with value: 2.3843233819693546.
[I 2023-07-25 00:31:31,843] Trial 2 finished with value: 2.3843230937261763 and parameters: {'n_estimators': 125, 'max_depth': 12, 'min_samples_split': 0.8813964376145657, 'min_samples_leaf': 0.446114239739243}. Best is trial 2 with value: 2.3843230937261763.
[I 2023-07-25 00:31:32,007] Trial 3 finished wi

[I 2023-07-25 00:33:38,215] Trial 29 finished with value: 2.1604687938901885 and parameters: {'n_estimators': 177, 'max_depth': 14, 'min_samples_split': 0.26264293548417394, 'min_samples_leaf': 0.2156231940792694}. Best is trial 27 with value: 2.0458750597536093.
[I 2023-07-25 00:33:47,909] Trial 30 finished with value: 2.092690898056708 and parameters: {'n_estimators': 189, 'max_depth': 14, 'min_samples_split': 0.2899643024423757, 'min_samples_leaf': 0.12906362411917074}. Best is trial 27 with value: 2.0458750597536093.
[I 2023-07-25 00:33:58,214] Trial 31 finished with value: 2.056423945678192 and parameters: {'n_estimators': 187, 'max_depth': 14, 'min_samples_split': 0.2789184666250899, 'min_samples_leaf': 0.12795792578750526}. Best is trial 27 with value: 2.0458750597536093.
[I 2023-07-25 00:34:04,209] Trial 32 finished with value: 2.1553713105249472 and parameters: {'n_estimators': 176, 'max_depth': 15, 'min_samples_split': 0.34073182935523844, 'min_samples_leaf': 0.17803595917795

[I 2023-07-25 00:39:27,483] Trial 61 finished with value: 2.0105767452005745 and parameters: {'n_estimators': 119, 'max_depth': 15, 'min_samples_split': 0.13609924810748883, 'min_samples_leaf': 0.10059678592707166}. Best is trial 59 with value: 2.005962064891046.
[I 2023-07-25 00:39:35,410] Trial 62 finished with value: 2.12704965823752 and parameters: {'n_estimators': 112, 'max_depth': 15, 'min_samples_split': 0.12857461295588252, 'min_samples_leaf': 0.15088996521796016}. Best is trial 59 with value: 2.005962064891046.
[I 2023-07-25 00:39:42,707] Trial 63 finished with value: 2.1252869886957826 and parameters: {'n_estimators': 103, 'max_depth': 15, 'min_samples_split': 0.17189455277459117, 'min_samples_leaf': 0.14769505169935243}. Best is trial 59 with value: 2.005962064891046.
[I 2023-07-25 00:39:54,402] Trial 64 finished with value: 2.0194606823445467 and parameters: {'n_estimators': 130, 'max_depth': 15, 'min_samples_split': 0.21169338959664255, 'min_samples_leaf': 0.11569846651676

[I 2023-07-25 00:45:09,486] Trial 93 finished with value: 2.010840222388253 and parameters: {'n_estimators': 142, 'max_depth': 15, 'min_samples_split': 0.1153545991085938, 'min_samples_leaf': 0.10793281716453494}. Best is trial 91 with value: 2.004508047512158.
[I 2023-07-25 00:45:20,894] Trial 94 finished with value: 2.0193488181678476 and parameters: {'n_estimators': 132, 'max_depth': 15, 'min_samples_split': 0.24347682095244552, 'min_samples_leaf': 0.12078067756591651}. Best is trial 91 with value: 2.004508047512158.
[I 2023-07-25 00:45:36,746] Trial 95 finished with value: 2.0088733196593935 and parameters: {'n_estimators': 147, 'max_depth': 14, 'min_samples_split': 0.1633319288782141, 'min_samples_leaf': 0.1003395606683985}. Best is trial 91 with value: 2.004508047512158.
[I 2023-07-25 00:45:48,067] Trial 96 finished with value: 2.0511363365281565 and parameters: {'n_estimators': 126, 'max_depth': 13, 'min_samples_split': 0.19462305921335324, 'min_samples_leaf': 0.1283971084288497

최적 파라미터:  {'n_estimators': 121, 'max_depth': 15, 'min_samples_split': 0.16896050258998352, 'min_samples_leaf': 0.10244725326986642}
최적 Objective 값:  2.004508047512158




풍속 예측이 완료되었습니다. 결과가 submission.csv에 저장되었습니다.


In [5]:
from sklearn.model_selection import cross_val_score
import numpy as np
# Cross Validation을 활용하여 평균 RMSE 계산
scores = cross_val_score(best_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-scores)

print("평균 RMSE:", np.mean(rmse_scores))

평균 RMSE: 1.4170601386529476
