In [14]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

In [5]:
matches = pd.read_csv('gold_FPro_data_combination.csv')
matches

Unnamed: 0,Date,shots,shots_on_target,shots_on_target_pct,average_shot_distance,shots_free_kicks,pens_made,pens_att,xg,npxg,...,Season,form_home,form_away,xg_rolling,npxg_rolling,shots_rolling,tackles_rolling,h2h_avg_result_3,h2h_avg_goals_3,h2h_avg_goals_against_3
0,2017-08-12,14,4,28.6,19.5,2,0,0,1.9,1.9,...,1,2.514085,0.110982,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2017-08-12,6,2,33.3,18.1,0,0,0,0.3,0.3,...,1,0.027746,1.763046,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2017-08-13,18,6,33.3,19.0,0,0,0,2.5,2.5,...,1,1.500000,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,2017-08-13,21,5,23.8,18.3,0,0,0,2.1,2.1,...,1,1.568401,0.365653,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2017-08-13,9,1,11.1,17.8,0,0,0,0.5,0.5,...,1,0.002654,1.425330,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,2024-05-19,5,2,40.0,20.3,2,0,0,0.6,0.6,...,6,0.335759,4.058811,1.366667,1.133333,11.666667,15.333333,0.333333,0.333333,3.000000
1535,2024-05-19,26,5,19.2,14.1,0,0,0,2.9,2.9,...,6,4.712486,0.083940,1.933333,1.933333,15.666667,24.000000,0.666667,3.000000,0.666667
1536,2024-05-19,11,4,36.4,17.0,0,0,0,1.3,1.3,...,6,0.874124,0.149489,0.800000,0.800000,12.666667,19.666667,0.333333,0.666667,2.333333
1537,2024-05-19,3,2,66.7,10.3,0,0,0,0.4,0.4,...,6,0.156381,5.273781,0.966667,0.966667,9.333333,20.666667,0.000000,0.666667,2.333333


In [6]:
matches = matches.sort_values('Date')

In [16]:
tscv = TimeSeriesSplit(n_splits=10)

In [17]:
X = matches.drop(columns=['target', 'Date'])
y = matches['target']

## Áp dụng TimeSeriesSplit trên 3 mô hình tham số tối ưu đã tìm được với GridSearch

In [18]:
lr = LogisticRegression(C=10, max_iter=1000, multi_class='multinomial', penalty='l2', solver='lbfgs')
rf = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=500)
xgb = XGBClassifier(colsample_bytree=0.8, learning_rate=0.01, max_depth=10, n_estimators=500, subsample=0.8)

In [19]:
models = {'Logistic Regression': lr, 'Random Forest': rf, 'XGBoost': xgb}

In [20]:
for name, model in models.items():
    print(f"\n=== {name} ===")
    acc_scores = []

    for fold, (train_index, test_index) in enumerate(tscv.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        acc_scores.append(acc)
        print(f"Fold {fold + 1}: Accuracy = {acc:.4f}")

    print(f"Average Accuracy: {np.mean(acc_scores):.4f}")


=== Logistic Regression ===
Fold 1: Accuracy = 0.8201
Fold 2: Accuracy = 0.8777
Fold 3: Accuracy = 0.8633
Fold 4: Accuracy = 0.9137
Fold 5: Accuracy = 0.8633
Fold 6: Accuracy = 0.9209
Fold 7: Accuracy = 0.9353
Fold 8: Accuracy = 0.9712
Fold 9: Accuracy = 0.9281
Fold 10: Accuracy = 0.9496
Average Accuracy: 0.9043

=== Random Forest ===
Fold 1: Accuracy = 0.7626
Fold 2: Accuracy = 0.7914
Fold 3: Accuracy = 0.7266
Fold 4: Accuracy = 0.8058
Fold 5: Accuracy = 0.8129
Fold 6: Accuracy = 0.7914
Fold 7: Accuracy = 0.8345
Fold 8: Accuracy = 0.9137
Fold 9: Accuracy = 0.8058
Fold 10: Accuracy = 0.8777
Average Accuracy: 0.8122

=== XGBoost ===
Fold 1: Accuracy = 0.8417
Fold 2: Accuracy = 0.8417
Fold 3: Accuracy = 0.8921
Fold 4: Accuracy = 0.9353
Fold 5: Accuracy = 0.9137
Fold 6: Accuracy = 0.8993
Fold 7: Accuracy = 0.9209
Fold 8: Accuracy = 0.9568
Fold 9: Accuracy = 0.9065
Fold 10: Accuracy = 0.9281
Average Accuracy: 0.9036


* Dựa trên kết quả từ TimeSeriesSplit với số lần split là 10, nhóm nhận thấy rằng hai mô hình Logistic Regression và XGBoost cho thấy độ ổn định tốt khi độ chính xác trung bình lần lượt là 90.43% và 90.36%, tuy nhiên dựa trên kết quả về confusion matrix ở file huấn luyện với cả hai đặc trưng là Rolling và Form thì nhóm vẫn lựa chọn XGBoost làm mô hình cuối cùng cho việc dự đoán vì độ đo Precision ổn định hơn Logistic Regression