In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

In [2]:
train_data = pd.read_csv('clean_train_data.csv')
test_data = pd.read_csv('clean_test_data.csv')
future_data = pd.read_csv('clean_future_test_data.csv')

### Logistic Regression

In [3]:
X_train = train_data.drop(columns=['id', 'home_team_win', 'season'])
y_train = train_data['home_team_win']
X_test = future_data.drop(columns=['id', 'season'])

In [4]:
def lr_evaluate(C, penalty, solver):
    model = LogisticRegression(
        C=C,
        penalty=penalty,
        solver=solver,
        max_iter=1000,
        random_state=42,
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return scores.mean()
pbounds = {
    'C': (0.01, 10),  # 正則化強度，C 越小正則化越強
    'penalty': (0, 1),  # 0 表示 'l1'，1 表示 'l2'
    'solver': (0, 1),  # 0 表示 'liblinear'，1 表示 'saga'
}

penalty_map = {0: 'l1', 1: 'l2'}
solver_map = {0: 'liblinear', 1: 'saga'}

def wrapped_lr_evaluate(C, penalty, solver):
    penalty = penalty_map[int(round(penalty))]
    solver = solver_map[int(round(solver))]
    return lr_evaluate(C, penalty, solver)

optimizer = BayesianOptimization(
    f=wrapped_lr_evaluate,
    pbounds=pbounds,
    random_state=42,
)

optimizer.maximize(init_points=5, n_iter=20)

best_params = optimizer.max['params']
best_penalty = penalty_map[int(round(best_params['penalty']))]
best_solver = solver_map[int(round(best_params['solver']))]

print("Best_param:")
print(f"C: {best_params['C']:.4f}")
print(f"penalty: {best_penalty}")
print(f"solver: {best_solver}")

LR_model = LogisticRegression(
    C=best_params['C'],
    penalty=best_penalty,
    solver=best_solver,
    max_iter=1000,
    random_state=42,
)
LR_model.fit(X_train, y_train)
print("模型訓練完成！")

|   iter    |  target   |     C     |  penalty  |  solver   |
-------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.5511   [39m | [39m3.752    [39m | [39m0.9507   [39m | [39m0.732    [39m |
| [39m2        [39m | [39m0.5511   [39m | [39m5.991    [39m | [39m0.156    [39m | [39m0.156    [39m |
| [35m3        [39m | [35m0.5512   [39m | [35m0.5903   [39m | [35m0.8662   [39m | [35m0.6011   [39m |
| [39m4        [39m | [39m0.5509   [39m | [39m7.084    [39m | [39m0.02058  [39m | [39m0.9699   [39m |
| [39m5        [39m | [39m0.551    [39m | [39m8.326    [39m | [39m0.2123   [39m | [39m0.1818   [39m |
| [39m6        [39m | [39m0.5512   [39m | [39m0.6466   [39m | [39m0.846    [39m | [39m0.6075   [39m |
| [35m7        [39m | [35m0.5513   [39m | [35m0.1722   [39m | [35m0.01481  [39m | [35m0.02931  [39m |
| [35m8        [39m | [35m0.5566   [39m | [35m0.04734  [39m | [35m0.02316  [39m | [35m0.9865   [39m |
| [39m9        [39m | [39m0.5504   [39m | [39m0.3525   [39m | [39m0.01743  [39m | [39m0.9355   

### Random Forest

In [5]:
def rf_evaluate(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features):
    model = RandomForestClassifier(
        n_estimators=int(n_estimators),  # 森林中的樹的數量
        max_depth=int(max_depth),  # 樹的最大深度
        min_samples_split=int(min_samples_split),  # 每個節點的最小分割數量
        min_samples_leaf=int(min_samples_leaf),  # 每個葉子節點的最小樣本數量
        max_features=max_features,  # 每次分割考慮的最大特徵數量
        random_state=42,
        n_jobs=-1  # 使用所有 CPU 核心加速計算
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return scores.mean()

pbounds = {
    'n_estimators': (50, 300),  # 樹的數量範圍
    'max_depth': (5, 20),  # 樹的最大深度範圍
    'min_samples_split': (2, 20),  # 最小分割樣本數量
    'min_samples_leaf': (1, 10),  # 最小葉子樣本數量
    'max_features': (0.1, 1.0),  # 最大特徵數比例
}

optimizer = BayesianOptimization(
    f=rf_evaluate,
    pbounds=pbounds,
    random_state=42,
)

optimizer.maximize(init_points=5, n_iter=10)

best_params = optimizer.max['params']
print("Best_param:")
print(f"n_estimators: {int(best_params['n_estimators'])}")
print(f"max_depth: {int(best_params['max_depth'])}")
print(f"min_samples_split: {int(best_params['min_samples_split'])}")
print(f"min_samples_leaf: {int(best_params['min_samples_leaf'])}")
print(f"max_features: {best_params['max_features']:.4f}")

RF_model = RandomForestClassifier(
    n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']),
    min_samples_split=int(best_params['min_samples_split']),
    min_samples_leaf=int(best_params['min_samples_leaf']),
    max_features=best_params['max_features'],
    random_state=42,
    n_jobs=-1,
)
RF_model.fit(X_train, y_train)
print("模型訓練完成！")

|   iter    |  target   | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.5427   [39m | [39m10.62    [39m | [39m0.9556   [39m | [39m7.588    [39m | [39m12.78    [39m | [39m89.0     [39m |
| [35m2        [39m | [35m0.5567   [39m | [35m7.34     [39m | [35m0.1523   [39m | [35m8.796    [39m | [35m12.82    [39m | [35m227.0    [39m |
| [39m3        [39m | [39m0.554    [39m | [39m5.309    [39m | [39m0.9729   [39m | [39m8.492    [39m | [39m5.822    [39m | [39m95.46    [39m |
| [39m4        [39m | [39m0.5534   [39m | [39m7.751    [39m | [39m0.3738   [39m | [39m5.723    [39m | [39m9.775    [39m | [39m122.8    [39m |
| [39m5        [39m | [39m0.5465   [39m | [39m14.18    [39m | [39m0.2255   [39m | [39m3.629    [39m | [39m8.595    [39m | [39m164.0    [39m |
| [39m6        [39m | [39m0.5526   [39m | [39m7.245    [39m | [39m0.3037   [39m | [39m5.217    [39m | [39m9.648    [39m | [39m122.2    [39m |
| [39m7        [39m | [39m0.5534   [39m | [39m6.409    [39

### Support Vector Machine

In [12]:
from sklearn.preprocessing import StandardScaler
best_params = {'C': 6296.748648379015, 'kernel': 'rbf', 'gamma': 'scale'}
SVM_model = SVC(**best_params, probability=True)
SVM_model.fit(X_train, y_train)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
SVM_model.fit(X_train_scaled, y_train)
print("模型訓練完成！")

模型訓練完成！


### XGBoost..


In [9]:
def xgb_evaluate(max_depth, learning_rate, n_estimators, min_child_weight, gamma, subsample, colsample_bytree, alpha, reg_lambda):
    params = {
        'objective': 'binary:logistic',  # 二分类问题
        'eval_metric': 'logloss',  # 评估标准为logloss
        'max_depth': int(max_depth),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'min_child_weight': min_child_weight,
        'gamma': gamma,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'alpha': alpha, 
        'reg_lambda': reg_lambda
    }
    
    model = xgb.XGBClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return scores.mean()

pbounds = {
    'max_depth': (7, 11),  # 设定max_depth范围
    'n_estimators': (50, 150),  # n_estimators范围
    'subsample': (0.7, 1),  # 样本采样比例范围
    'learning_rate': (0.01, 0.2),  # 学习率范围
    'alpha': (0.0001, 0.2),  # L1正则化系数范围
    'reg_lambda': (0.0001, 0.2),  # L2正则化系数范围
    'gamma': (0, 5),  # gamma范围
    'min_child_weight': (1, 10),  # 子节点的最小样本权重
    'colsample_bytree': (0.3, 1)  # 每棵树的特征采样比例范围
}

optimizer = BayesianOptimization(
    f=xgb_evaluate,  # 目标函数为xgb_evaluate
    pbounds=pbounds,  # 超参数范围
    random_state=42,
)

optimizer.maximize(init_points=5, n_iter=20)

# 获取最佳参数
best_params = optimizer.max['params']
print("Best_param:")
print(f"max_depth: {int(best_params['max_depth'])}")
print(f"learning_rate: {best_params['learning_rate']:.4f}")
print(f"n_estimators: {int(best_params['n_estimators'])}")
print(f"min_child_weight: {best_params['min_child_weight']:.4f}")
print(f"gamma: {best_params['gamma']:.4f}")
print(f"subsample: {best_params['subsample']:.4f}")
print(f"colsample_bytree: {best_params['colsample_bytree']:.4f}")
print(f"alpha: {best_params['alpha']:.4f}")
print(f"lambda: {best_params['reg_lambda']:.4f}")

XGB_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=int(best_params['max_depth']),
    learning_rate=best_params['learning_rate'],
    n_estimators=int(best_params['n_estimators']),
    min_child_weight=best_params['min_child_weight'],
    gamma=best_params['gamma'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    alpha=best_params['alpha'],
    reg_lambda=best_params['reg_lambda'],
    random_state=42
)

XGB_model.fit(X_train, y_train)

print("XGBoost模型训练完成！")


|   iter    |  target   |   alpha   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.5417   [39m | [39m0.07497  [39m | [39m0.9655   [39m | [39m3.66     [39m | [39m0.1237   [39m | [39m7.624    [39m | [39m2.404    [39m | [39m55.81    [39m | [39m0.1732   [39m | [39m0.8803   [39m |
| [39m2        [39m | [39m0.5384   [39m | [39m0.1416   [39m | [39m0.3144   [39m | [39m4.85     [39m | [39m0.1682   [39m | [39m7.849    [39m | [39m2.636    [39m | [39m68.34    [39m | [39m0.06092  [39m | [39m0.8574   [39m |
| [35m3        [39m | [35m0.5435   [39m | [35m0.08645  [39m | [35m0.5039   [39m | [35m3.059    [39m | [35m0.0365   [39m | [35m8.169    [39m | [35m4.297    [39m | [35m95.61    [39m | [35m0.1571   [39m | [35m0.7599   [39m |
| [39m4        [39m | [39m0.5363   [39m | [39m0.1029   [39m | [39m0.7147   [39m | [39m0.2323   [39m | [39m0.1254   [39m | [39m7.682    [39m | [39m1.585    [39m | [39m144.9    [39m | [39m0.1931   [39m | [39m0.9425   [39m |
| [35m5        [39m | 

### Blending

In [13]:
X_train = train_data.drop(columns=['id', 'home_team_win', 'season'])
y_train = train_data['home_team_win']
X_test = future_data.drop(columns=['id', 'season'])

xgb_train_pred = XGB_model.predict_proba(X_train)[:, 1]
rf_train_pred = RF_model.predict_proba(X_train)[:, 1]
svm_train_pred = SVM_model.predict_proba(X_train_scaled)[:, 1]
lr_train_pred = LR_model.predict_proba(X_train)[:, 1]

# 堆疊訓練集預測
stacked_train_predictions = np.column_stack((xgb_train_pred, rf_train_pred, svm_train_pred, lr_train_pred))

# 訓練堆疊模型
stacked_lr = LogisticRegression(max_iter=1000)
stacked_lr.fit(stacked_train_predictions, y_train)

# 生成測試集概率預測
xgb_test_pred = XGB_model.predict_proba(X_test)[:, 1]
rf_test_pred = RF_model.predict_proba(X_test)[:, 1]
svm_test_pred = SVM_model.predict_proba(X_test_scaled)[:, 1]
lr_test_pred = LR_model.predict_proba(X_test)[:, 1]

# 堆疊測試集預測
stacked_test_predictions = np.column_stack((xgb_test_pred, rf_test_pred, svm_test_pred, lr_test_pred))

# 最終預測
final_predictions = stacked_lr.predict(stacked_test_predictions)
final_predictions = final_predictions.astype(bool)

# 保存結果
output_df = pd.DataFrame({
    'id': future_data['id'],
    'home_team_win': final_predictions
})

output_df.to_csv('blended_predictions.csv', index=False)

print("模型訓練完成，預測結果示例：")
print(output_df.head())

模型訓練完成，預測結果示例：
   id  home_team_win
0   0          False
1   1           True
2   2           True
3   3          False
4   4           True
