In [2]:
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import pandas as pd
from collections import OrderedDict
import numpy as np
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.datasets import make_classification

train_data2 = pd.read_csv('clean_train_data.csv')
test_data2 = pd.read_csv('clean_test_data.csv')
future_data2 = pd.read_csv('clean_future_test_data.csv')

In [2]:
X_train = train_data2.drop(columns=['id', 'home_team_win', 'season'])
y_train = train_data2['home_team_win']
X_test = future_data2.drop(columns=['id', 'season'])

In [4]:
params = {
    'max_depth': 4,
    'n_estimators': 175,
    'subsample': 0.75,
    'learning_rate': 0.01,
    'alpha': 0.0011688861452798418,
    'lambda': 0.0012674126399294001,
    'gamma': 1,
    'min_child_weight': 8,
    'colsample_bytree': 0.567110634660098,
}

model = XGBClassifier(**params, eval_metric='logloss')
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print(f"Average CV accuracy: {cv_scores.mean():.4f}")

y_val_pred = cross_val_predict(model, X_train, y_train, cv=5)
val_accuracy = accuracy_score(y_train, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")

model.fit(X_train, y_train) 
y_test_pred = model.predict(X_test)
predicted_results = y_test_pred > 0

output_df = pd.DataFrame({
    'id': range(len(X_test)),
    'prediction': predicted_results
})

print("模型訓練完成，預測結果範例：")
print(output_df.head())
output_df.to_csv('xgb_predictions2-4.csv', index=False)


Average CV accuracy: 0.5544
Validation Accuracy: 0.5544
模型訓練完成，預測結果範例：
   id  prediction
0   0        True
1   1        True
2   2        True
3   3       False
4   4        True


In [3]:
def xgb_evaluate(max_depth, learning_rate, n_estimators, min_child_weight, gamma, subsample, colsample_bytree, alpha):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': int(max_depth), 
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators), 
        'min_child_weight': min_child_weight,
        'gamma': gamma,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'alpha': alpha,
    }
    # 初始化模型
    model = xgb.XGBClassifier(**params)
    # 交叉驗證計算平均準確率
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return scores.mean()

# 定義超參數範圍
pbounds = {
    'max_depth': (7, 10),
    'n_estimators': (150, 200),
    'subsample': (0.7, 0.8),
    'learning_rate': (0.01, 0.05),
    'alpha': (0, 0.002),
    'gamma': (0, 10),
    'min_child_weight': (6, 10),
    'colsample_bytree': (0.5, 0.6),
}

optimizer = BayesianOptimization(
    f=xgb_evaluate,
    pbounds=pbounds,
    random_state=42,
)

optimizer.maximize(init_points=5, n_iter=30)


|   iter    |  target   |   alpha   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.5487   [39m | [39m0.0007491[39m | [39m0.5951   [39m | [39m7.32     [39m | [39m0.03395  [39m | [39m7.468    [39m | [39m6.624    [39m | [39m152.9    [39m | [39m0.7866   [39m |
| [39m2        [39m | [39m0.5413   [39m | [39m0.001202 [39m | [39m0.5708   [39m | [39m0.2058   [39m | [39m0.0488   [39m | [39m9.497    [39m | [39m6.849    [39m | [39m159.1    [39m | [39m0.7183   [39m |
| [35m3        [39m | [35m0.552    [39m | [35m0.0006085[39m | [35m0.5525   [39m | [35m4.319    [39m | [35m0.02165  [39m | [35m8.836    [39m | [35m6.558    [39m | [35m164.6    [39m | [35m0.7366   [39m |
| [39m4        [39m | [39m0.5448   [39m | [39m0.0009121[39m | [39m0.5785   [39m | [39m1.997    [39m | [39m0.03057  [39m | [39m8.777    [39m | [39m6.186    [39m | [39m180.4    [39m | [39m0.7171   [39m |
| [35m5        [39m | [35m0.559    [39m | [35m0.0001301[39m | [35m0.5949   [39m | [35m9.656    [39m | 

In [4]:
best_result = optimizer.max
best_params = best_result['params']
best_validation_accuracy = best_result['target']

print(f"Best Validation Accuracy: {best_validation_accuracy:.4f}")
print("Best Parameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

# 使用最佳參數訓練模型
final_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=int(best_params['max_depth']),
    learning_rate=best_params['learning_rate'],
    n_estimators=int(best_params['n_estimators']),
    min_child_weight=best_params['min_child_weight'],
    gamma=best_params['gamma'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    alpha=best_params['alpha']
)

final_model.fit(X_train, y_train)

Best Validation Accuracy: 0.5609
Best Parameters:
alpha: 0.0019652387557311116
colsample_bytree: 0.528598125673456
gamma: 9.577409626580337
learning_rate: 0.021779728095541996
max_depth: 7.236061318662626
min_child_weight: 6.1336271753170575
n_estimators: 187.44994833096283
subsample: 0.7047730756872099


In [5]:
y_test_pred = final_model.predict(X_test)
predicted_results = y_test_pred > 0

output_df = pd.DataFrame({
    'id': range(len(X_test)),
    'home_team_win': predicted_results
})

print("模型訓練完成，預測結果範例：")
print(output_df.head())
output_df.to_csv('xgb_predictions2-4.csv', index=False)

模型訓練完成，預測結果範例：
   id  home_team_win
0   0           True
1   1           True
2   2           True
3   3          False
4   4           True


Average CV accuracy: 0.5552
Validation Accuracy: 0.5552
模型訓練完成，預測結果範例：
   id  prediction
0   0        True
1   1       False
2   2        True
3   3        True
4   4       False


In [3]:
def xgb_evaluate(max_depth, learning_rate, n_estimators, min_child_weight, gamma, subsample, colsample_bytree):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': int(max_depth),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'min_child_weight': min_child_weight,
        'gamma': gamma,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
    }

    model = xgb.XGBClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return scores.mean()

pbounds = {
    'n_estimators': (50, 200),
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.1),
    'subsample': (0.7, 0.9),
    'colsample_bytree': (0.5, 1),
    'gamma': (0, 1),
    'min_child_weight': (1, 10),
}
optimizer = BayesianOptimization(
    f=xgb_evaluate,  
    pbounds=pbounds,  
    random_state=42
)

optimizer.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.538    [39m | [39m0.6873   [39m | [39m0.9507   [39m | [39m0.07588  [39m | [39m7.191    [39m | [39m2.404    [39m | [39m73.4     [39m | [39m0.7116   [39m |
| [35m2        [39m | [35m0.5496   [39m | [35m0.9331   [39m | [35m0.6011   [39m | [35m0.07373  [39m | [35m3.144    [39m | [35m9.729    [39m | [35m174.9    [39m | [35m0.7425   [39m |
| [35m3        [39m | [35m0.5541   [39m | [35m0.5909   [39m | [35m0.1834   [39m | [35m0.03738  [39m | [35m6.673    [39m | [35m4.888    [39m | [35m93.68    [39m | [35m0.8224   [39m |
| [39m4        [39m | [39m0.5527   [39m | [39m0.5697   [39m | [39m0.2921   [39m | [39m0.04297  [39m | [39m6.192    [39m | [39m8.067    [39m | [39m79.95    [39m | [39m0.8028   [39m |
| [39m5        [39m | [39m0.5506   [39m | [39m0.7962   [39m | [39m0.04645  [39m | [39m0.06468  [39m | [39m4.194    [39m | [39m1.585    [39m | [39m192.3    [39m | [39m0.8931   [39m |


In [4]:
best_result = optimizer.max
best_params = best_result['params']
best_validation_accuracy = best_result['target']

print(f"Best Validation Accuracy: {best_validation_accuracy:.4f}")
print("Best Parameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

# 使用最佳參數訓練模型
final_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=int(best_params['max_depth']),
    learning_rate=best_params['learning_rate'],
    n_estimators=int(best_params['n_estimators']),
    min_child_weight=best_params['min_child_weight'],
    gamma=best_params['gamma'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree']
)

final_model.fit(X_train, y_train)

Best Validation Accuracy: 0.5590
Best Parameters:
colsample_bytree: 0.8922665485053749
gamma: 0.6492196885419822
learning_rate: 0.04585937135163588
max_depth: 3.4120814346797497
min_child_weight: 9.281372078619311
n_estimators: 82.01197644627693
subsample: 0.8323098468063317


In [5]:
y_test_pred = final_model.predict(future_X_test)
predicted_results = y_test_pred > 0

output_df = pd.DataFrame({
    'id': range(len(future_X_test)),
    'home_team_win': predicted_results
})

print("模型訓練完成，預測結果範例：")
print(output_df.head())
output_df.to_csv('xgb_predictions2-4.csv', index=False)

模型訓練完成，預測結果範例：
   id  home_team_win
0   0           True
1   1           True
2   2           True
3   3          False
4   4           True


In [9]:
y_test_pred = final_model.predict(X_test)
predicted_results = y_test_pred > 0

output_df = pd.DataFrame({
    'id': range(len(X_test)),
    'home_team_win': predicted_results
})

print("模型訓練完成，預測結果範例：")
print(output_df.head())
output_df.to_csv('xgb_predictions2-4.csv', index=False)

模型訓練完成，預測結果範例：
   id  home_team_win
0   0           True
1   1           True
2   2           True
3   3          False
4   4           True


In [8]:
def objective(trial, X=X_train, y=y_train):

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'max_depth': trial.suggest_int('max_depth', 3, 10),        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.05, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.05, 1.0),
        'eta': trial.suggest_float('eta', 1e-3, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-3, 1.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 20),
        'grow_policy': trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    }

    model = XGBClassifier(**params, eval_metric='logloss')
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()
    
    return score

study = optuna.create_study(direction='maximize') 
study.optimize(objective, n_trials=30) 

print("Best_param:", study.best_params)
print("Best_accuracy:", study.best_value)

best_params = study.best_params
final_model = XGBClassifier(**best_params, eval_metric='logloss')
final_model.fit(X_train, y_train)

[I 2024-12-14 13:05:42,771] A new study created in memory with name: no-name-ea31c9c4-4bc5-4958-8969-9fbe48a8f459
[I 2024-12-14 13:06:02,941] Trial 0 finished with value: 0.5229880019268377 and parameters: {'n_estimators': 131, 'max_depth': 6, 'learning_rate': 0.1838494735816237, 'subsample': 0.34680786266869545, 'colsample_bytree': 0.7662293681594029, 'eta': 0.00729708864174367, 'alpha': 0.0011401469435815932, 'lambda': 0.013876239047696703, 'gamma': 0.030484574403201747, 'min_child_weight': 17, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.5229880019268377.
[I 2024-12-14 13:06:14,032] Trial 1 finished with value: 0.532701177476417 and parameters: {'n_estimators': 66, 'max_depth': 6, 'learning_rate': 0.10300244619191695, 'subsample': 0.21080994051975765, 'colsample_bytree': 0.6636335151752781, 'eta': 0.06702111955263605, 'alpha': 0.005051443481210759, 'lambda': 0.1416883463643405, 'gamma': 0.15997665126503005, 'min_child_weight': 12, 'grow_policy': 'lossguide'}. Best is t

Best_param: {'n_estimators': 85, 'max_depth': 3, 'learning_rate': 0.02319983111142826, 'subsample': 0.882289511962925, 'colsample_bytree': 0.2730567691076503, 'eta': 0.4120104707640015, 'alpha': 0.15444373821622273, 'lambda': 0.0010071217986628495, 'gamma': 0.0011340023529327415, 'min_child_weight': 15, 'grow_policy': 'depthwise'}
Best_accuracy: 0.5522437130543247


In [25]:
X_train = train_data2.drop(columns=['id', 'home_team_win', 'season'])
y_train = train_data2['home_team_win']
future_X_test = future_data2.drop(columns=['id', 'season'])

In [3]:
train_data = pd.read_csv(r'C:\Users\Liao\Desktop\learning\ml\final\datasets\train_data_A.csv')
test_data = pd.read_csv(r'C:\Users\Liao\Desktop\learning\ml\final\datasets\test_data_A.csv')
future_data = pd.read_csv(r'C:\Users\Liao\Desktop\learning\ml\final\datasets\future_data_A.csv')
X_train = train_data.drop(columns=['id', 'home_team_win','season'])
y_train = train_data['home_team_win']
X_test = future_data.drop(columns=['id','season'])

In [4]:
# 定義目標函數
def objective(trial, X=X_train, y=y_train):
    params = {
        'n_estimators': 190,
        'max_depth': 9,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'eta': trial.suggest_float('eta', 1e-3, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-3, 1.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 20),
        'grow_policy': trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    }
    model = XGBClassifier(**params, eval_metric='logloss')
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()
    return score

# 創建 Study
study = optuna.create_study(direction='maximize')  # 目標是最大化準確率
study.optimize(objective, n_trials=20)  # 進行 50 次試驗

# 輸出最佳結果
print("最佳參數:", study.best_params)
print("最佳驗證準確率:", study.best_value)

best_params = study.best_params
final_model = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss')
final_model.fit(X_train, y_train)  # 在整個訓練集上訓練模型

[I 2024-12-20 11:22:52,579] A new study created in memory with name: no-name-53381090-f35d-4547-88ff-4ba527d938bb
[I 2024-12-20 11:23:10,238] Trial 0 finished with value: 0.523990617975166 and parameters: {'learning_rate': 0.2966031807915316, 'subsample': 0.8768665150299515, 'colsample_bytree': 0.9260595366661355, 'eta': 0.00813147863769441, 'alpha': 0.2624032603165584, 'lambda': 0.01774218943241022, 'gamma': 0.1669988342426915, 'min_child_weight': 7, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.523990617975166.
[I 2024-12-20 11:24:02,160] Trial 1 finished with value: 0.5365509139351071 and parameters: {'learning_rate': 0.11363042190907098, 'subsample': 0.9351822677480746, 'colsample_bytree': 0.5428739787204209, 'eta': 0.3051732384649888, 'alpha': 0.0024384575692800565, 'lambda': 0.04352658648757632, 'gamma': 0.6043014157038241, 'min_child_weight': 6, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.5365509139351071.
[I 2024-12-20 11:24:21,701] Trial 2 finished 

最佳參數: {'learning_rate': 0.021638547561702377, 'subsample': 0.5086510194871081, 'colsample_bytree': 0.5911778667016948, 'eta': 0.07067754231207002, 'alpha': 0.77333147645985, 'lambda': 0.006253859069907904, 'gamma': 0.29484705103798725, 'min_child_weight': 20, 'grow_policy': 'lossguide'}
最佳驗證準確率: 0.5556140503414373


In [5]:
y_test_pred = final_model.predict(X_test)
predicted_results = y_test_pred > 0

output_df = pd.DataFrame({
    'id': range(len(X_test)),
    'home_team_win': predicted_results
})

print("預測結果範例:")
print(output_df.head())
output_df.to_csv('xgb_predictions2.csv', index=False)

預測結果範例:
   id  home_team_win
0   0           True
1   1           True
2   2           True
3   3          False
4   4           True


In [20]:
X_train = train_data2.drop(columns=['id', 'home_team_win', 'season'])
y_train = train_data2['home_team_win']
future_X_test = future_data2.drop(columns=['id', 'season'])

In [3]:
X_train_slpit, X_val, y_train_slpit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
params = {
    'max_depth': 9,
    'n_estimators': 190,
    'subsample': 0.7362501984120449,
    'eta': 0.012419546340276555,
    'alpha': 0.0011688861452798418,
    'lambda': 0.0012674126399294001,
    'gamma': 8.572539672595768,
    'min_child_weight': 8,
    'grow_policy': 'depthwise',
    'colsample_bytree': 0.567110634660098,
}

model = XGBClassifier(**params, eval_metric='logloss')
model.fit(X_train_slpit, y_train_slpit)
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"accuracy: {val_accuracy:.4f}")
y_test_pred = model.predict(future_X_test)
predicted_results = y_test_pred > 0

output_df = pd.DataFrame({
    'id': range(len(future_X_test)),
    'prediction': predicted_results
})

print("模型訓練完成，預測結果範例：")
print(output_df.head())
output_df.to_csv('xgb_predictions2-2.csv', index=False)


accuracy: 0.5587
模型訓練完成，預測結果範例：
   id  prediction
0   0        True
1   1        True
2   2        True
3   3       False
4   4        True
