In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Load the training, test and future-season tables from the working directory.
train_data, test_data, future_data = map(
    pd.read_csv,
    ('train.csv', 'test.csv', 'future.csv'),
)

In [2]:
#past
# Separate the label column from the training frame; the test frame only
# has its label column removed.
y_train = train_data['home_team_win']
X_train = train_data.drop(columns=['home_team_win'])
X_test = test_data.drop(columns=['home_team_win'])

# Hold out 20% of the training rows for validation (fixed seed).
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# L1-penalised logistic regression: handed to the selector below and later
# fitted as the final classifier.
lr_model = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)

# Feature selection driven by the model, with the median importance as the
# threshold; the selector is fitted on the training split only.
selector = SelectFromModel(lr_model, threshold="median").fit(
    X_train_split, y_train_split
)

# Reduce every feature matrix to the selected columns.
X_train_selected, X_val_selected, X_test_selected = (
    selector.transform(features)
    for features in (X_train_split, X_val_split, X_test)
)

# Report which columns survived the selection.
selected_features = X_train.columns[selector.get_support()]
print(f"選中的特徵: {list(selected_features)}")

# Fit the classifier on the reduced training split.
lr_model.fit(X_train_selected, y_train_split)

# Accuracy on the held-out validation split.
y_val_pred = lr_model.predict(X_val_selected)
val_accuracy = accuracy_score(y_val_split, y_val_pred)
print(f"驗證準確率: {val_accuracy:.4f}")

# Predict the test set with the fitted classifier.
y_test_pred = lr_model.predict(X_test_selected)

# Prediction table: ids are the row positions 0..n-1 of X_test.
output_df = pd.DataFrame(
    {'id': range(len(X_test)), 'home_team_win': y_test_pred}
)

print("預測結果範例:", output_df.head(), sep="\n")

# Write the predictions to disk without the DataFrame index.
output_df.to_csv('lr_predictions1.csv', index=False)

選中的特徵: ['is_night_game', 'home_team_rest', 'home_batting_batting_avg_10RA', 'home_batting_onbase_perc_10RA', 'home_batting_onbase_plus_slugging_10RA', 'away_batting_onbase_perc_10RA', 'away_batting_leverage_index_avg_10RA', 'home_pitching_earned_run_avg_10RA', 'home_pitching_SO_batters_faced_10RA', 'home_pitching_BB_batters_faced_10RA', 'away_pitching_earned_run_avg_10RA', 'away_pitching_SO_batters_faced_10RA', 'away_pitching_H_batters_faced_10RA', 'home_pitcher_earned_run_avg_10RA', 'away_pitcher_earned_run_avg_10RA', 'away_pitcher_H_batters_faced_10RA', 'home_team_errors_mean', 'home_team_errors_std', 'home_team_errors_skew', 'away_team_errors_skew', 'home_team_spread_mean', 'home_team_spread_std', 'home_team_spread_skew', 'away_team_spread_mean', 'away_team_spread_skew', 'home_team_wins_std', 'home_team_wins_skew', 'away_team_wins_skew', 'home_batting_batting_avg_mean', 'home_batting_batting_avg_std', 'home_batting_onbase_perc_mean', 'home_batting_onbase_plus_slugging_mean', 'home_b

In [3]:
#future
# Separate the label from the features; 'season' is removed from both the
# training and the future feature matrices.
y_train = train_data['home_team_win']
X_train = train_data.drop(columns=['home_team_win','season'])
X_test = future_data.drop(columns=['home_team_win', 'season'])

# Hold out 20% of the training rows for validation (fixed seed).
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Logistic regression with the default penalty: handed to the selector below
# and later fitted as the final classifier.
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Feature selection driven by the model, with the median importance as the
# threshold; the selector is fitted on the training split only.
selector = SelectFromModel(lr_model, threshold="median").fit(
    X_train_split, y_train_split
)

# Reduce every feature matrix to the selected columns.
X_train_selected, X_val_selected, X_test_selected = (
    selector.transform(features)
    for features in (X_train_split, X_val_split, X_test)
)

# Report which columns survived the selection.
selected_features = X_train.columns[selector.get_support()]
print(f"選中的特徵: {list(selected_features)}")

# Fit the classifier on the reduced training split.
lr_model.fit(X_train_selected, y_train_split)

# Accuracy on the held-out validation split.
y_val_pred = lr_model.predict(X_val_selected)
val_accuracy = accuracy_score(y_val_split, y_val_pred)
print(f"驗證準確率: {val_accuracy:.4f}")

# Predict the future games with the fitted classifier.
y_test_pred = lr_model.predict(X_test_selected)

# Prediction table: ids are the row positions 0..n-1 of X_test.
output_df = pd.DataFrame(
    {'id': range(len(X_test)), 'home_team_win': y_test_pred}
)

print("預測結果範例:", output_df.head(), sep="\n")

# Write the predictions to disk without the DataFrame index.
output_df.to_csv('lr_predictions2.csv', index=False)

選中的特徵: ['is_night_game', 'home_team_rest', 'home_batting_batting_avg_10RA', 'home_batting_onbase_perc_10RA', 'home_batting_onbase_plus_slugging_10RA', 'away_batting_onbase_perc_10RA', 'away_batting_leverage_index_avg_10RA', 'home_pitching_earned_run_avg_10RA', 'home_pitching_SO_batters_faced_10RA', 'home_pitching_BB_batters_faced_10RA', 'away_pitching_earned_run_avg_10RA', 'away_pitching_SO_batters_faced_10RA', 'away_pitching_H_batters_faced_10RA', 'home_pitcher_earned_run_avg_10RA', 'away_pitcher_earned_run_avg_10RA', 'away_pitcher_H_batters_faced_10RA', 'away_pitcher_BB_batters_faced_10RA', 'home_team_errors_mean', 'home_team_errors_std', 'home_team_errors_skew', 'away_team_errors_skew', 'home_team_spread_mean', 'home_team_spread_std', 'away_team_spread_mean', 'away_team_spread_skew', 'home_team_wins_std', 'home_team_wins_skew', 'away_team_wins_skew', 'home_batting_batting_avg_mean', 'home_batting_batting_avg_std', 'home_batting_onbase_perc_mean', 'home_batting_onbase_plus_slugging_m

In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
from sklearn.feature_selection import SelectFromModel
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the "A" variant of the train / test / future tables.
# NOTE(review): these are absolute, machine-specific paths; the cell will not
# run on another machine without editing them.
train_data, test_data, future_data = map(
    pd.read_csv,
    (
        r'C:\Users\Liao\Desktop\learning\ml\final\datasets\train_data_A.csv',
        r'C:\Users\Liao\Desktop\learning\ml\final\datasets\test_data_A.csv',
        r'C:\Users\Liao\Desktop\learning\ml\final\datasets\future_data_A.csv',
    ),
)

In [11]:
# Target column first, then the feature matrices with the non-feature
# columns removed ('id' everywhere, plus the label for the training frame).
y_train = train_data['home_team_win']
X_train = train_data.drop(['id', 'home_team_win'], axis='columns')
X_test = test_data.drop(['id'], axis='columns')

In [12]:
scaler = StandardScaler()


def objective(trial):
    """Optuna objective: mean 5-fold ROC AUC of a scaled logistic regression.

    Samples ``penalty``, ``C`` and ``solver`` from ``trial`` (plus
    ``l1_ratio`` when the penalty is elastic net), prunes the
    penalty/solver pairs this search does not allow, and scores the
    remaining configurations on the module-level ``X_train`` / ``y_train``.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The mean of the five cross-validated ``roc_auc`` scores.

    Raises:
        optuna.exceptions.TrialPruned: for a disallowed penalty/solver pair.
    """
    # Sampling order is kept fixed: penalty, C, solver, then l1_ratio.
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet', 'none'])
    C = trial.suggest_float('C', 1e-3, 1e2, log=True)
    solver = trial.suggest_categorical('solver', ['lbfgs', 'saga', 'liblinear'])
    l1_ratio = (
        trial.suggest_float('l1_ratio', 0.0, 1.0)
        if penalty == 'elasticnet'
        else None
    )

    # Solvers allowed per penalty; a penalty missing from the table ('l2')
    # is never pruned.
    permitted_solvers = {
        'none': ('lbfgs',),
        'l1': ('liblinear', 'saga'),
        'elasticnet': ('saga',),
    }.get(penalty)
    if permitted_solvers is not None and solver not in permitted_solvers:
        raise optuna.exceptions.TrialPruned()

    # Standardise the features, then fit the sampled logistic regression.
    classifier = LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        l1_ratio=l1_ratio,
        max_iter=1000,
        random_state=42,
    )
    model = Pipeline([('scaler', scaler), ('lr', classifier)])

    return cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()

# Run the hyperparameter search. The sampler is seeded so that repeated runs
# of this cell sample the same sequence of trials; without a seed the chosen
# hyperparameters (and the saved predictions) change from run to run.
# TPESampler is Optuna's default sampler, so only the seeding is new.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# Best configuration found and its mean cross-validated ROC AUC.
best_params = study.best_params
best_score = study.best_value
print("Best hyperparameters:", best_params)
print(f"Best validation AUC: {best_score:.4f}")

# Refit on the full training set with the best hyperparameters. best_params
# holds exactly the names sampled in `objective` (penalty, C, solver and,
# for elastic net, l1_ratio), so it can be unpacked into the estimator.
final_model = Pipeline([
    ('scaler', scaler),
    ('lr', LogisticRegression(
        **best_params,
        max_iter=1000,
        random_state=42
    ))
])
final_model.fit(X_train, y_train)

[I 2024-12-20 03:58:51,881] A new study created in memory with name: no-name-1cd4e35d-db92-44f3-8b5d-cc4acd404211
[I 2024-12-20 03:58:51,885] Trial 0 pruned. 
[I 2024-12-20 03:59:09,221] Trial 1 finished with value: 0.5682312450376781 and parameters: {'penalty': 'elasticnet', 'C': 11.38762736895537, 'solver': 'saga', 'l1_ratio': 0.8655887290558352}. Best is trial 1 with value: 0.5682312450376781.
[I 2024-12-20 03:59:09,877] Trial 2 finished with value: 0.5166046362832428 and parameters: {'penalty': 'l1', 'C': 0.00261755286004732, 'solver': 'liblinear'}. Best is trial 1 with value: 0.5682312450376781.
[I 2024-12-20 03:59:09,880] Trial 3 pruned. 
[I 2024-12-20 03:59:09,881] Trial 4 pruned. 
[I 2024-12-20 03:59:09,884] Trial 5 pruned. 
[I 2024-12-20 03:59:10,831] Trial 6 finished with value: 0.5682869698734248 and parameters: {'penalty': 'l2', 'C': 0.2840230792703287, 'solver': 'lbfgs'}. Best is trial 6 with value: 0.5682869698734248.
[I 2024-12-20 03:59:25,031] Trial 7 finished with valu

Best hyperparameters: {'penalty': 'l1', 'C': 0.02326970407861033, 'solver': 'saga'}
Best validation AUC: 0.5753


In [13]:
# Predict with the tuned pipeline and turn the result into booleans.
# NOTE(review): `predict` returns class labels, so `> 0` only converts them
# to True/False — confirm boolean output is what the submission expects.
y_test_pred = final_model.predict(X_test)
predicted_results = y_test_pred > 0

# Build the prediction table.
# NOTE(review): ids are regenerated as 0..n-1 instead of reusing the 'id'
# column dropped from test_data — confirm the two agree.
output_df = pd.DataFrame(
    {'id': range(len(X_test)), 'home_team_win': predicted_results}
)

print("預測結果範例:", output_df.head(), sep="\n")
output_df.to_csv('lr_prediction1-2.csv', index=False)

預測結果範例:
   id  home_team_win
0   0           True
1   1          False
2   2           True
3   3           True
4   4          False


In [14]:
# Target column first, then the feature matrices for the future-season run:
# 'id' and 'season' are removed from both frames, plus the label for training.
y_train = train_data['home_team_win']
X_train = train_data.drop(['id', 'home_team_win','season'], axis='columns')
X_test = future_data.drop(['id','season'], axis='columns')

In [15]:
scaler = StandardScaler()


def objective(trial):
    """Optuna objective: mean 5-fold ROC AUC of a scaled logistic regression.

    Draws ``penalty``, ``C`` and ``solver`` from ``trial`` (and ``l1_ratio``
    for the elastic-net penalty), prunes penalty/solver pairs this search
    does not allow, then cross-validates the pipeline on the module-level
    ``X_train`` / ``y_train``.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The mean of the five cross-validated ``roc_auc`` scores.

    Raises:
        optuna.exceptions.TrialPruned: for a disallowed penalty/solver pair.
    """
    # Sampling order is kept fixed: penalty, C, solver, then l1_ratio.
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet', 'none'])
    C = trial.suggest_float('C', 1e-3, 1e2, log=True)
    solver = trial.suggest_categorical('solver', ['lbfgs', 'saga', 'liblinear'])
    l1_ratio = None
    if penalty == 'elasticnet':
        l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)

    # Decide whether the sampled solver is allowed for the sampled penalty.
    if penalty == 'none':
        solver_allowed = solver == 'lbfgs'
    elif penalty == 'l1':
        solver_allowed = solver in ['liblinear', 'saga']
    elif penalty == 'elasticnet':
        solver_allowed = solver == 'saga'
    else:
        # 'l2' is accepted with every sampled solver.
        solver_allowed = True
    if not solver_allowed:
        raise optuna.exceptions.TrialPruned()

    # Standardise the features, then fit the sampled logistic regression.
    steps = [
        ('scaler', scaler),
        ('lr', LogisticRegression(
            penalty=penalty,
            C=C,
            solver=solver,
            l1_ratio=l1_ratio,
            max_iter=1000,
            random_state=42,
        )),
    ]
    fold_scores = cross_val_score(
        Pipeline(steps), X_train, y_train, cv=5, scoring='roc_auc'
    )
    return fold_scores.mean()

# Run the hyperparameter search. The sampler is seeded so that repeated runs
# of this cell sample the same sequence of trials; without a seed the chosen
# hyperparameters (and the saved predictions) change from run to run.
# TPESampler is Optuna's default sampler, so only the seeding is new.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# Best configuration found and its mean cross-validated ROC AUC.
best_params = study.best_params
best_score = study.best_value
print("Best hyperparameters:", best_params)
print(f"Best validation AUC: {best_score:.4f}")

# Refit on the full training set with the best hyperparameters. best_params
# holds exactly the names sampled in `objective` (penalty, C, solver and,
# for elastic net, l1_ratio), so it can be unpacked into the estimator.
final_model = Pipeline([
    ('scaler', scaler),
    ('lr', LogisticRegression(
        **best_params,
        max_iter=1000,
        random_state=42
    ))
])
final_model.fit(X_train, y_train)

[I 2024-12-20 04:00:58,004] A new study created in memory with name: no-name-59f7cb6b-11bf-42aa-a73d-a2713ae22ed7
[I 2024-12-20 04:00:58,008] Trial 0 pruned. 
[I 2024-12-20 04:00:58,878] Trial 1 finished with value: 0.5685547475703273 and parameters: {'penalty': 'none', 'C': 1.6450837697210967, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.5685547475703273.
[I 2024-12-20 04:00:58,880] Trial 2 pruned. 
[I 2024-12-20 04:00:59,837] Trial 3 finished with value: 0.5685547475703273 and parameters: {'penalty': 'none', 'C': 12.432644903832825, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.5685547475703273.
[I 2024-12-20 04:01:13,157] Trial 4 finished with value: 0.5689689276727375 and parameters: {'penalty': 'l2', 'C': 0.048250938694890694, 'solver': 'saga'}. Best is trial 4 with value: 0.5689689276727375.
[I 2024-12-20 04:01:14,127] Trial 5 finished with value: 0.5685547475703273 and parameters: {'penalty': 'none', 'C': 2.092201147706196, 'solver': 'lbfgs'}. Best is trial 4 with value

Best hyperparameters: {'penalty': 'l2', 'C': 0.0010118268662031615, 'solver': 'lbfgs'}
Best validation AUC: 0.5752


In [17]:
# Predict with the tuned pipeline and turn the result into booleans.
# NOTE(review): `predict` returns class labels, so `> 0` only converts them
# to True/False — confirm boolean output is what the submission expects.
y_test_pred = final_model.predict(X_test)
predicted_results = y_test_pred > 0

# Build the prediction table.
# NOTE(review): ids are regenerated as 0..n-1 instead of reusing the 'id'
# column dropped from future_data — confirm the two agree.
output_df = pd.DataFrame(
    {'id': range(len(X_test)), 'home_team_win': predicted_results}
)

print("預測結果範例:", output_df.head(), sep="\n")
output_df.to_csv('lr_prediction2-2.csv', index=False)

預測結果範例:
   id  home_team_win
0   0           True
1   1           True
2   2           True
3   3          False
4   4           True
