In [1]:
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
import numpy as np
import optuna

# Load the training data
train_data = pd.read_csv('train.csv')

# Fill missing values with the mean
train_data = train_data.fillna(train_data.mean())

# One-hot encode '측정 시간대' to numerical values
encoder = OneHotEncoder(sparse=False)
time_encoded = encoder.fit_transform(train_data[['측정 시간대']])
time_encoded_df = pd.DataFrame(time_encoded, columns=encoder.get_feature_names(['측정 시간대']))
train_data = pd.concat([train_data, time_encoded_df], axis=1).drop(['측정 시간대'], axis=1)

# Split the data into features (X) and target (y)
X_train = train_data.drop(['ID', '풍속 (m/s)'], axis=1)  # Input features
y_train = train_data['풍속 (m/s)']  # Target variable (풍속)

# Define the objective function for Optuna
def objective(trial):
    # Define the search space for hyperparameter optimization
    n_estimators = trial.suggest_int('n_estimators', 100, 300)
    max_depth = trial.suggest_categorical('max_depth', [None, 10, 20, 30])
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    
    # Create the ExtraTreesRegressor model with the suggested hyperparameters
    model = ExtraTreesRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        random_state=42
    )
    
    # Perform cross-validation and return the mean MAE score
    mae_scores = -cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
    return np.mean(mae_scores)

# Create a Optuna study and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best hyperparameters from the study
best_params = study.best_params

# Create the best ExtraTreesRegressor model with the found hyperparameters
best_model = ExtraTreesRegressor(random_state=42, **best_params)

# Fit the best model to the training data
best_model.fit(X_train, y_train)

# Load the test data
test_data = pd.read_csv('test.csv')

# Fill missing values with the mean
test_data = test_data.fillna(test_data.mean())

# One-hot encode '측정 시간대' to numerical values
time_encoded = encoder.transform(test_data[['측정 시간대']])
time_encoded_df = pd.DataFrame(time_encoded, columns=encoder.get_feature_names(['측정 시간대']))
test_data = pd.concat([test_data, time_encoded_df], axis=1).drop(['측정 시간대'], axis=1)

# Predict the wind speeds using the best model
X_test = test_data.drop('ID', axis=1)  # Test input features
y_pred = best_model.predict(X_test)  # Predict wind speeds using the test data

# Submit / 제출
submission = pd.read_csv('./sample_submission.csv')
submission['풍속 (m/s)'] = y_pred

# Save the predictions to submission.csv in the appropriate format
submission.to_csv('submission.csv', index=False)

print("풍속 예측이 완료되었습니다. 결과가 submission.csv에 저장되었습니다.")

# Print the best hyperparameters and the best MAE score
print("최적의 하이퍼파라미터:", best_params)
print("최적의 평균 MAE:", study.best_value)

  from .autonotebook import tqdm as notebook_tqdm
  if sys.path[0] == "":
[I 2023-07-26 21:58:33,869] A new study created in memory with name: no-name-979dc5df-5c84-42a2-b74d-7db457d9f953
[I 2023-07-26 21:59:20,990] Trial 0 finished with value: 0.4578321313488242 and parameters: {'n_estimators': 190, 'max_depth': 30, 'min_samples_split': 10, 'min_samples_leaf': 1, 'bootstrap': True}. Best is trial 0 with value: 0.4578321313488242.
[I 2023-07-26 22:00:11,116] Trial 1 finished with value: 0.4817303481759415 and parameters: {'n_estimators': 213, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 4, 'bootstrap': True}. Best is trial 0 with value: 0.4578321313488242.
[I 2023-07-26 22:00:40,857] Trial 2 finished with value: 0.6825353822628325 and parameters: {'n_estimators': 226, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 3, 'bootstrap': True}. Best is trial 0 with value: 0.4578321313488242.
[I 2023-07-26 22:01:43,294] Trial 3 finished with value: 0.42295432193427

[I 2023-07-26 22:37:56,145] Trial 31 finished with value: 0.39925852053761746 and parameters: {'n_estimators': 167, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 27 with value: 0.39589205566205415.
[I 2023-07-26 22:39:34,800] Trial 32 finished with value: 0.39914902300832217 and parameters: {'n_estimators': 183, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 27 with value: 0.39589205566205415.
[I 2023-07-26 22:41:01,819] Trial 33 finished with value: 0.43767938942409473 and parameters: {'n_estimators': 208, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 4, 'bootstrap': False}. Best is trial 27 with value: 0.39589205566205415.
[I 2023-07-26 22:42:05,283] Trial 34 finished with value: 0.4319913249554535 and parameters: {'n_estimators': 184, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 27 with value: 0.39589205566205415

[I 2023-07-26 23:42:23,624] Trial 64 finished with value: 0.38843136161737435 and parameters: {'n_estimators': 260, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 57 with value: 0.3883782526308096.
[I 2023-07-26 23:45:09,148] Trial 65 finished with value: 0.3884071884284316 and parameters: {'n_estimators': 261, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 57 with value: 0.3883782526308096.
[I 2023-07-26 23:48:05,223] Trial 66 finished with value: 0.3885230069797293 and parameters: {'n_estimators': 275, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 57 with value: 0.3883782526308096.
[I 2023-07-26 23:51:04,588] Trial 67 finished with value: 0.388357585940862 and parameters: {'n_estimators': 289, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 67 with value: 0.388357585940862.
[I 

[I 2023-07-27 01:10:25,565] Trial 97 finished with value: 0.39494975184442793 and parameters: {'n_estimators': 299, 'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 79 with value: 0.38825477079960924.
[I 2023-07-27 01:13:24,848] Trial 98 finished with value: 0.388357585940862 and parameters: {'n_estimators': 289, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 79 with value: 0.38825477079960924.
[I 2023-07-27 01:14:19,478] Trial 99 finished with value: 0.6737574520430841 and parameters: {'n_estimators': 294, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 79 with value: 0.38825477079960924.


풍속 예측이 완료되었습니다. 결과가 submission.csv에 저장되었습니다.
최적의 하이퍼파라미터: {'n_estimators': 295, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}
최적의 평균 MAE: 0.38825477079960924


In [2]:
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
import numpy as np
import optuna

# Load the training data
train_data = pd.read_csv('train.csv')

# Fill missing values with the mean
train_data = train_data.fillna(train_data.mean())

# One-hot encode '측정 시간대' to numerical values
encoder = OneHotEncoder(sparse=False)
time_encoded = encoder.fit_transform(train_data[['측정 시간대']])
time_encoded_df = pd.DataFrame(time_encoded, columns=encoder.get_feature_names(['측정 시간대']))
train_data = pd.concat([train_data, time_encoded_df], axis=1).drop(['측정 시간대'], axis=1)

# Split the data into features (X) and target (y)
X_train = train_data.drop(['ID', '풍속 (m/s)'], axis=1)  # Input features
y_train = train_data['풍속 (m/s)']  # Target variable (풍속)

# Define the objective function for Optuna
def objective(trial):
    # Define the search space for hyperparameter optimization
    n_estimators = trial.suggest_int('n_estimators', 100, 300)
    max_depth = trial.suggest_categorical('max_depth', [None, 10, 20, 30])
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    
    # Create the ExtraTreesRegressor model with the suggested hyperparameters
    model = ExtraTreesRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        random_state=42
    )
    
    # Perform cross-validation and return the mean MAE score
    mae_scores = -cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
    return np.mean(mae_scores)

# Create a Optuna study with a sampler and optimize the objective function
sampler = optuna.samplers.TPESampler(seed=42)  # Tree-structured Parzen Estimator Sampler
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=200)  # Increase n_trials for more iterations

# Get the best hyperparameters from the study
best_params = study.best_params

# Create the best ExtraTreesRegressor model with the found hyperparameters
best_model = ExtraTreesRegressor(random_state=42, **best_params)

# Fit the best model to the training data
best_model.fit(X_train, y_train)

# Load the test data
test_data = pd.read_csv('test.csv')

# Fill missing values with the mean
test_data = test_data.fillna(test_data.mean())

# One-hot encode '측정 시간대' to numerical values
time_encoded = encoder.transform(test_data[['측정 시간대']])
time_encoded_df = pd.DataFrame(time_encoded, columns=encoder.get_feature_names(['측정 시간대']))
test_data = pd.concat([test_data, time_encoded_df], axis=1).drop(['측정 시간대'], axis=1)

# Predict the wind speeds using the best model
X_test = test_data.drop('ID', axis=1)  # Test input features
y_pred = best_model.predict(X_test)  # Predict wind speeds using the test data

# Submit / 제출
submission = pd.read_csv('./sample_submission.csv')
submission['풍속 (m/s)'] = y_pred

# Save the predictions to submission.csv in the appropriate format
submission.to_csv('submission.csv', index=False)

print("풍속 예측이 완료되었습니다. 결과가 submission.csv에 저장되었습니다.")

# Print the best hyperparameters and the best MAE score
print("최적의 하이퍼파라미터:", best_params)
print("최적의 평균 MAE:", study.best_value)


  if sys.path[0] == "":
[I 2023-07-27 05:02:18,189] A new study created in memory with name: no-name-bdbd9f9b-0575-4998-a19d-f56af82683fc
[I 2023-07-27 05:04:36,881] Trial 0 finished with value: 0.4143009408680956 and parameters: {'n_estimators': 175, 'max_depth': None, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': True}. Best is trial 0 with value: 0.4143009408680956.
[I 2023-07-27 05:06:18,288] Trial 1 finished with value: 0.6735887469996727 and parameters: {'n_estimators': 242, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 0 with value: 0.4143009408680956.
[I 2023-07-27 05:07:13,342] Trial 2 finished with value: 0.6793368399973639 and parameters: {'n_estimators': 186, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 0 with value: 0.4143009408680956.
[I 2023-07-27 05:08:56,217] Trial 3 finished with value: 0.4846149857403949 and parameters: {'n_estimators': 203, 'max_de

[I 2023-07-27 06:51:17,064] Trial 32 finished with value: 0.38887843126312127 and parameters: {'n_estimators': 263, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 29 with value: 0.3888514258494432.
[I 2023-07-27 06:52:50,458] Trial 33 finished with value: 0.6736467289434345 and parameters: {'n_estimators': 238, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 29 with value: 0.3888514258494432.
[I 2023-07-27 06:58:07,204] Trial 34 finished with value: 0.3884598979025625 and parameters: {'n_estimators': 258, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 34 with value: 0.3884598979025625.
[I 2023-07-27 07:01:55,999] Trial 35 finished with value: 0.4112470621039007 and parameters: {'n_estimators': 283, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True}. Best is trial 34 with value: 0.3884598979025625.
[I 2

[I 2023-07-27 08:30:48,326] Trial 65 finished with value: 0.38847285843033197 and parameters: {'n_estimators': 292, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 57 with value: 0.3884358411005984.
[I 2023-07-27 08:33:46,644] Trial 66 finished with value: 0.38861414495207175 and parameters: {'n_estimators': 289, 'max_depth': None, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 57 with value: 0.3884358411005984.
[I 2023-07-27 08:36:07,783] Trial 67 finished with value: 0.39855361394219724 and parameters: {'n_estimators': 272, 'max_depth': None, 'min_samples_split': 4, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 57 with value: 0.3884358411005984.
[I 2023-07-27 08:39:28,056] Trial 68 finished with value: 0.38847285843033197 and parameters: {'n_estimators': 292, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 57 with value: 0.3884358411

[I 2023-07-27 09:57:18,781] Trial 98 finished with value: 0.6740811337695344 and parameters: {'n_estimators': 146, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 86 with value: 0.3884176477472228.
[I 2023-07-27 10:00:31,127] Trial 99 finished with value: 0.38854216497343314 and parameters: {'n_estimators': 283, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 86 with value: 0.3884176477472228.
[I 2023-07-27 10:01:50,447] Trial 100 finished with value: 0.43654880964027704 and parameters: {'n_estimators': 260, 'max_depth': None, 'min_samples_split': 7, 'min_samples_leaf': 1, 'bootstrap': True}. Best is trial 86 with value: 0.3884176477472228.
[I 2023-07-27 10:04:55,403] Trial 101 finished with value: 0.38843215534682274 and parameters: {'n_estimators': 268, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 86 with value: 0.388417647747

[I 2023-07-27 11:28:25,812] Trial 131 finished with value: 0.3884484211164801 and parameters: {'n_estimators': 258, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 123 with value: 0.3884143172874491.
[I 2023-07-27 11:31:29,469] Trial 132 finished with value: 0.38843215534682274 and parameters: {'n_estimators': 268, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 123 with value: 0.3884143172874491.
[I 2023-07-27 11:34:36,013] Trial 133 finished with value: 0.3884660420041145 and parameters: {'n_estimators': 270, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 123 with value: 0.3884143172874491.
[I 2023-07-27 11:37:46,735] Trial 134 finished with value: 0.3885216592950974 and parameters: {'n_estimators': 277, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 123 with value: 0.38841

[W 2023-07-27 13:14:56,846] Trial 154 failed with value None.


KeyboardInterrupt: 