In [1]:
import os
from pathlib import Path

import optuna
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Directory holding the dataset CSVs. Defaults to the original local folder;
# set the ML_FINAL_DATA_DIR environment variable to run this notebook on
# another machine without editing the cell.
DATA_DIR = Path(os.environ.get(
    'ML_FINAL_DATA_DIR',
    r'C:\Users\Liao\Desktop\learning\ml\final\datasets',
))

# Load the training, test, and future CSVs.
train_data = pd.read_csv(DATA_DIR / 'train_data_A.csv')
test_data = pd.read_csv(DATA_DIR / 'test_data_A.csv')
future_data = pd.read_csv(DATA_DIR / 'future_data_A.csv')

# Features are every column except the row id (and, for training, the target).
X_train = train_data.drop(columns=['id', 'home_team_win'])
y_train = train_data['home_team_win']
X_test = test_data.drop(columns=['id'])

In [2]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated accuracy of a random forest.

    Samples one hyperparameter configuration from ``trial``, builds a
    ``RandomForestClassifier`` with it, and scores it with ``cross_val_score``
    on the notebook-level ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean accuracy over the 5 cross-validation folds.
    """
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 10, 200, log=True),  # number of trees
        'max_depth': trial.suggest_int('max_depth', 10, 30, log=True),  # maximum tree depth
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),  # min samples to split a node
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),  # min samples in a leaf
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),  # fraction of features tried per split
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])  # sample with replacement or not
    }

    # n_jobs=-1 fits the trees on all available CPU cores; it only affects how
    # the forest is built, not which hyperparameters are evaluated.
    model = RandomForestClassifier(random_state=42, n_jobs=-1, **params)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()
    return score


# Create the Optuna study. The sampler is seeded so that re-running this cell
# proposes the same sequence of trials instead of a different search each time.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

# Report the best configuration found and its score.
print("Best_param:", study.best_params)
print("Best_accuracy:", study.best_value)

# Refit on the full training set using the best parameters.
best_params = study.best_params
final_model = RandomForestClassifier(random_state=42, **best_params)
final_model.fit(X_train, y_train)

[I 2024-12-20 11:44:05,610] A new study created in memory with name: no-name-537fb048-102a-4e6e-828a-b9520b8098a2
[I 2024-12-20 11:51:09,764] Trial 0 finished with value: 0.5447716152112567 and parameters: {'n_estimators': 147, 'max_depth': 14, 'min_samples_split': 10, 'min_samples_leaf': 18, 'max_features': 0.4311926117937329, 'bootstrap': False}. Best is trial 0 with value: 0.5447716152112567.
[I 2024-12-20 11:53:13,972] Trial 1 finished with value: 0.5376346798563633 and parameters: {'n_estimators': 42, 'max_depth': 27, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 0.3480397410035191, 'bootstrap': True}. Best is trial 0 with value: 0.5447716152112567.
[I 2024-12-20 11:55:18,883] Trial 2 finished with value: 0.5362786866308187 and parameters: {'n_estimators': 20, 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 0.8853298387217058, 'bootstrap': True}. Best is trial 0 with value: 0.5447716152112567.
[I 2024-12-20 12:02:39,304] Trial 3 f

Best_param: {'n_estimators': 43, 'max_depth': 17, 'min_samples_split': 6, 'min_samples_leaf': 14, 'max_features': 0.8985277297867666, 'bootstrap': True}
Best_accuracy: 0.5477545635525642


In [3]:
# Predict with the fitted forest; the element-wise `> 0` comparison produces
# the boolean column that is written to the CSV.
y_test_pred = final_model.predict(X_test)
predicted_results = y_test_pred > 0

# Build the output table. The ids are a 0-based running index over the rows
# of X_test, not an id column read from the data.
output_df = pd.DataFrame(
    {'id': range(len(X_test)), 'home_team_win': predicted_results}
)

# Show a preview of the first rows, then save without the DataFrame index.
print("預測結果範例:", output_df.head(), sep="\n")
output_df.to_csv('rf_predictions1-2.csv', index=False)

預測結果範例:
   id  home_team_win
0   0           True
1   1          False
2   2           True
3   3          False
4   4          False


In [3]:
# NOTE(review): `train_data2` and `test_data2` are not defined in the visible
# cells — presumably created in a cell that is no longer present; confirm
# before running on a fresh kernel.
# Separate features from the target; target values 1 / -1 are mapped to
# True / False.
X_train = train_data2.drop(['id', 'home_team_win'], axis=1)
y_train = train_data2['home_team_win'].map({1: True, -1: False})
X_test = test_data2.drop(['id'], axis=1)

In [5]:
# Predict with the current `final_model`; the element-wise `> 0` comparison
# produces the boolean column that is written to the CSV.
y_test_pred = final_model.predict(X_test)
predicted_results = y_test_pred > 0

# Build the output table. The ids are a 0-based running index over the rows
# of X_test, not an id column read from the data.
output_df = pd.DataFrame(
    {'id': range(len(X_test)), 'home_team_win': predicted_results}
)

# Show a preview of the first rows, then save without the DataFrame index.
print("預測結果範例:", output_df.head(), sep="\n")
output_df.to_csv('rf_predictions1.csv', index=False)

預測結果範例:
   id  home_team_win
0   0           True
1   1          False
2   2           True
3   3          False
4   4          False


In [11]:
# NOTE(review): `train_data2` and `future_data2` are not defined in the
# visible cells — presumably created in a cell that is no longer present;
# confirm before running on a fresh kernel.
# Same feature/target split as before, but additionally dropping `season`
# from both the training features and the future-set features.
X_train = train_data2.drop(['id', 'home_team_win', 'season'], axis=1)
y_train = train_data2['home_team_win'].map({1: True, -1: False})
future_X_test = future_data2.drop(['id', 'season'], axis=1)

In [13]:
def objective(trial):
    """Optuna objective: hold-out accuracy of a random forest.

    Splits the notebook-level ``X_train`` / ``y_train`` into an 80/20
    train/validation partition, fits a ``RandomForestClassifier`` with the
    hyperparameters sampled from ``trial`` on the 80% part, and scores it on
    the 20% part.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted model on the validation split.
    """
    # Fixed random_state: every trial is scored on the same validation rows.
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42)

    params = {
        'n_estimators': trial.suggest_int("n_estimators", 10, 250, log=True),  # number of trees
        'max_depth': trial.suggest_int('max_depth', 10, 100, log=True),  # maximum tree depth
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),  # min samples to split a node
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),  # min samples in a leaf
        'max_features': trial.suggest_float('max_features', 0.1, 1),  # fraction of features tried per split
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])  # sample with replacement or not
    }

    # n_jobs=-1 fits the trees on all available CPU cores; it only affects how
    # the forest is built, not which hyperparameters are evaluated.
    model = RandomForestClassifier(random_state=42, n_jobs=-1, **params)

    model.fit(X_train_split, y_train_split)

    y_val_pred = model.predict(X_val_split)
    accuracy = accuracy_score(y_val_split, y_val_pred)
    return accuracy

# Create the Optuna study. The sampler is seeded so that re-running this cell
# proposes the same sequence of trials instead of a different search each time.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# Report the best configuration found and its score.
print("Best_param:", study.best_params)
print("Best_accuracy:", study.best_value)

# Refit on the full training set using the best parameters.
best_params = study.best_params
final_model = RandomForestClassifier(random_state=42, **best_params)
final_model.fit(X_train, y_train)

[I 2024-12-13 14:18:07,942] A new study created in memory with name: no-name-2bf973cd-c9b0-40d7-8db0-5e974117e650
[I 2024-12-13 14:18:33,003] Trial 0 finished with value: 0.5492321589882565 and parameters: {'n_estimators': 91, 'max_depth': 86, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 0.10394778309470266, 'bootstrap': False}. Best is trial 0 with value: 0.5492321589882565.
[I 2024-12-13 14:19:17,068] Trial 1 finished with value: 0.5460704607046071 and parameters: {'n_estimators': 31, 'max_depth': 99, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_features': 0.6933359963919009, 'bootstrap': False}. Best is trial 0 with value: 0.5492321589882565.
[I 2024-12-13 14:19:35,282] Trial 2 finished with value: 0.5578139114724481 and parameters: {'n_estimators': 15, 'max_depth': 29, 'min_samples_split': 8, 'min_samples_leaf': 7, 'max_features': 0.5686188688443872, 'bootstrap': True}. Best is trial 2 with value: 0.5578139114724481.
[I 2024-12-13 14:20:40,969] Trial 3 fin

Best_param: {'n_estimators': 58, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 0.5162179948644036, 'bootstrap': False}
Best_accuracy: 0.573170731707317


In [14]:
# Predict on the future-set features; the element-wise `> 0` comparison
# produces the boolean column that is written to the CSV.
y_test_pred = final_model.predict(future_X_test)
predicted_results = y_test_pred > 0

# Build the output table. The ids are a 0-based running index over the rows
# of future_X_test, not an id column read from the data.
output_df = pd.DataFrame(
    {'id': range(len(future_X_test)), 'home_team_win': predicted_results}
)

# Show a preview of the first rows, then save without the DataFrame index.
print("預測結果範例:", output_df.head(), sep="\n")
output_df.to_csv('rf_predictions2-2.csv', index=False)

預測結果範例:
   id  home_team_win
0   0           True
1   1           True
2   2           True
3   3          False
4   4           True
