In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
matches = pd.read_csv('gold_FPro_data_combination.csv')
matches

Unnamed: 0,Date,shots,shots_on_target,shots_on_target_pct,average_shot_distance,shots_free_kicks,pens_made,pens_att,xg,npxg,npxg_per_shot,xg_net,npxg_net,tackles,tackles_won,tackles_def_3rd,tackles_mid_3rd,tackles_att_3rd,challenge_tackles,challenges,challenge_tackles_pct,challenges_lost,blocks,blocked_shots,blocked_passes,interceptions,tackles_interceptions,clearances,errors,venue_code,time,day_code,hometeam_code,opponent_code,target,Season,form_home,form_away,xg_rolling,npxg_rolling,shots_rolling,tackles_rolling,h2h_avg_result_3,h2h_avg_goals_3,h2h_avg_goals_against_3
0,2017-08-12,14,4,28.6,19.5,2,0,0,1.9,1.9,0.17,-0.9,-0.9,10,6,6,4,0,2,9,22.2,7,9,2,7,9,19,8,1,0,17.5,5,6,7,1,1,2.514085,0.110982,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2017-08-12,6,2,33.3,18.1,0,0,0,0.3,0.3,0.05,-0.3,-0.3,10,8,9,1,0,6,12,50.0,6,10,5,5,12,22,43,1,1,17.5,5,7,6,0,1,0.027746,1.763046,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2017-08-13,18,6,33.3,19.0,0,0,0,2.5,2.5,0.14,-0.5,-0.5,16,11,8,6,2,2,6,33.3,4,5,0,5,5,21,26,0,0,13.5,6,0,8,1,1,1.500000,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,2017-08-13,21,5,23.8,18.3,0,0,0,2.1,2.1,0.10,1.9,1.9,14,8,3,6,5,2,5,40.0,3,6,1,5,11,25,21,0,1,16.0,6,2,9,1,1,1.568401,0.365653,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2017-08-13,9,1,11.1,17.8,0,0,0,0.5,0.5,0.05,-0.5,-0.5,21,16,12,7,2,8,31,25.8,23,14,8,6,12,33,27,1,0,16.0,6,9,2,0,1,0.002654,1.425330,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,2024-05-19,5,2,40.0,20.3,2,0,0,0.6,0.6,0.13,0.4,0.4,17,11,14,3,0,3,8,37.5,5,21,10,11,5,22,29,2,0,16.0,6,4,5,0,6,0.335759,4.058811,1.366667,1.133333,11.666667,15.333333,0.333333,0.333333,3.000000
1535,2024-05-19,26,5,19.2,14.1,0,0,0,2.9,2.9,0.12,-0.9,-0.9,7,6,5,2,0,2,12,16.7,10,16,1,15,8,15,5,0,1,16.0,6,5,4,1,6,4.712486,0.083940,1.933333,1.933333,15.666667,24.000000,0.666667,3.000000,0.666667
1536,2024-05-19,11,4,36.4,17.0,0,0,0,1.3,1.3,0.12,0.7,0.7,21,14,8,11,2,12,15,80.0,3,16,9,7,7,28,16,0,0,16.0,6,2,7,1,6,0.874124,0.149489,0.800000,0.800000,12.666667,19.666667,0.333333,0.666667,2.333333
1537,2024-05-19,3,2,66.7,10.3,0,0,0,0.4,0.4,0.12,0.6,0.6,14,9,12,2,0,7,15,46.7,8,9,7,2,4,18,18,0,0,16.0,6,9,6,0,6,0.156381,5.273781,0.966667,0.966667,9.333333,20.666667,0.000000,0.666667,2.333333


# Splitting for Training and Testing

In [3]:
train = matches[matches['Date'] <= '2023-05-29'].copy()
test = matches[matches['Date'] > '2023-05-29'].copy()

train.drop(columns='Date', inplace=True)
test.drop(columns='Date', inplace=True)

In [4]:
print(f'Percentage of Training Data: {(train.shape[0] / matches.shape[0]) * 100}')
print(f'Percentage of Testing Data: {(test.shape[0] / matches.shape[0]) * 100}')

Percentage of Training Data: 85.7050032488629
Percentage of Testing Data: 14.294996751137102


In [5]:
X_train = train.drop(columns='target')
y_train = train['target']

X_test = test.drop(columns='target')
y_test = test['target']

# Model

## Default Parameter

In [6]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs')
rf = RandomForestClassifier()
xgb = XGBClassifier(objective='multi:softprob', num_class=3)

In [7]:
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)

In [8]:
lr_pred = lr.predict(X_test)
rf_pred = rf.predict(X_test)
xgb_pred = xgb.predict(X_test)

In [9]:
lr_acc = accuracy_score(y_test, lr_pred)
rf_acc = accuracy_score(y_test, rf_pred)
xgb_acc = accuracy_score(y_test, xgb_pred)

In [10]:
print(f'Accuracy of Logistic Regression: {lr_acc}')

combined_lr = pd.DataFrame(dict(actual=y_test, prediction=lr_pred))
pd.crosstab(index=combined_lr['actual'], columns=combined_lr['prediction'])

Accuracy of Logistic Regression: 0.8409090909090909


prediction,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,70,3,10
1,0,78,5
2,4,13,37


In [11]:
print(f'Accuracy of RandomForest: {rf_acc}')

combined_rf = pd.DataFrame(dict(actual=y_test, prediction=rf_pred))
pd.crosstab(index=combined_rf['actual'], columns=combined_rf['prediction'])

Accuracy of RandomForest: 0.8045454545454546


prediction,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,70,5,8
1,0,79,4
2,8,18,28


In [12]:
print(f'Accuracy of XGBoost: {xgb_acc}')

combined_xgb = pd.DataFrame(dict(actual=y_test, prediction=xgb_pred))
pd.crosstab(index=combined_xgb['actual'], columns=combined_xgb['prediction'])

Accuracy of XGBoost: 0.9136363636363637


prediction,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,74,1,8
1,0,79,4
2,3,3,48


## Using GridSearchCV for best parameters

In [13]:
def tune_model(model_name, X_train, y_train):
    if model_name == 'lr':
        model = LogisticRegression(random_state=42)
        param_grid = {
            'penalty': ['l2'],
            'C': [0.01, 0.1, 1, 10],
            'solver': ['lbfgs'],
            'multi_class': ['multinomial'],
            'max_iter': [1000]
        }

    elif model_name == 'rf':
        model = RandomForestClassifier(random_state=42)
        param_grid = {
            'n_estimators': [100, 300, 500],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

    elif model_name == 'xgb':
        model = XGBClassifier(
            objective='multi:softprob',
            num_class=3,
            eval_metric='mlogloss',
            use_label_encoder=False,
            random_state=42
        )
        param_grid = {
            'n_estimators': [100, 300, 500],
            'max_depth': [3, 6, 10],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 1],
            'colsample_bytree': [0.8, 1]
        }

    else:
        raise ValueError("model_name must be one of: 'lr', 'rf', 'xgb'")

    grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)

    print(f"[{model_name.upper()}] Best params:", grid.best_params_)
    print(f"[{model_name.upper()}] Best score: {grid.best_score_:.4f}")
    return grid.best_estimator_

In [14]:
best_lr = tune_model('lr', X_train, y_train)
best_rf = tune_model('rf', X_train, y_train)
best_xgb = tune_model('xgb', X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[LR] Best params: {'C': 10, 'max_iter': 1000, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'}
[LR] Best score: 0.9447
Fitting 5 folds for each of 81 candidates, totalling 405 fits
[RF] Best params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}
[RF] Best score: 0.8416
Fitting 5 folds for each of 108 candidates, totalling 540 fits
[XGB] Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500, 'subsample': 0.8}
[XGB] Best score: 0.9356


In [15]:
lr_best_model_pred = best_lr.predict(X_test)
rf_best_model_pred = best_rf.predict(X_test)
xgb_best_model_pred = best_xgb.predict(X_test)

In [16]:
lr_best_model_acc = accuracy_score(y_test, lr_best_model_pred)
rf_best_model_acc = accuracy_score(y_test, rf_best_model_pred)
xgb_best_model_acc = accuracy_score(y_test, xgb_best_model_pred)

In [17]:
print(f'Accuracy of Logistic Regression with best parameters: {lr_best_model_acc}')

combined_best_lr = pd.DataFrame(dict(actual=y_test, predictions=lr_best_model_pred))
pd.crosstab(index=combined_best_lr['actual'], columns=combined_best_lr['predictions'])

Accuracy of Logistic Regression with best parameters: 0.9181818181818182


predictions,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,72,0,11
1,0,78,5
2,1,1,52


In [18]:
print(f'Accuracy of Random Forest with best parameters: {rf_best_model_acc}')

combined_best_rf = pd.DataFrame(dict(actual=y_test, predictions=rf_best_model_pred))
pd.crosstab(index=combined_best_rf['actual'], columns=combined_best_rf['predictions'])

Accuracy of Random Forest with best parameters: 0.8227272727272728


predictions,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,72,5,6
1,0,77,6
2,8,14,32


In [19]:
print(f'Accuracy of XGBoost with best parameters: {xgb_best_model_acc}')

combined_best_xgb = pd.DataFrame(dict(actual=y_test, predictions=xgb_best_model_pred))
pd.crosstab(index=combined_best_xgb['actual'], columns=combined_best_xgb['predictions'])

Accuracy of XGBoost with best parameters: 0.9


predictions,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,75,2,6
1,0,79,4
2,2,8,44


## Comparison

In [20]:
print('Default Logistic Regression Classification Report')
print('=' * 30)
default_lr_classification_report = classification_report(y_test, lr_pred)
print(default_lr_classification_report)
print('\n' * 2)
print('GridSearch-applied Logistic Regression Classification Report')
print('=' * 30)
best_lr_classification_report = classification_report(y_test, lr_best_model_pred)
print(best_lr_classification_report)


Default Logistic Regression Classification Report
              precision    recall  f1-score   support

           0       0.95      0.84      0.89        83
           1       0.83      0.94      0.88        83
           2       0.71      0.69      0.70        54

    accuracy                           0.84       220
   macro avg       0.83      0.82      0.82       220
weighted avg       0.84      0.84      0.84       220




GridSearch-applied Logistic Regression Classification Report
              precision    recall  f1-score   support

           0       0.99      0.87      0.92        83
           1       0.99      0.94      0.96        83
           2       0.76      0.96      0.85        54

    accuracy                           0.92       220
   macro avg       0.91      0.92      0.91       220
weighted avg       0.93      0.92      0.92       220



In [21]:
print('Default Random Forest Classification Report')
print('=' * 30)
default_rf_classification_report = classification_report(y_test, rf_pred)
print(default_rf_classification_report)
print('\n' * 2)
print('GridSearch-applied Random Forest Classification Report')
print('=' * 30)
best_rf_classification_report = classification_report(y_test, rf_best_model_pred)
print(best_rf_classification_report)


Default Random Forest Classification Report
              precision    recall  f1-score   support

           0       0.90      0.84      0.87        83
           1       0.77      0.95      0.85        83
           2       0.70      0.52      0.60        54

    accuracy                           0.80       220
   macro avg       0.79      0.77      0.77       220
weighted avg       0.80      0.80      0.80       220




GridSearch-applied Random Forest Classification Report
              precision    recall  f1-score   support

           0       0.90      0.87      0.88        83
           1       0.80      0.93      0.86        83
           2       0.73      0.59      0.65        54

    accuracy                           0.82       220
   macro avg       0.81      0.80      0.80       220
weighted avg       0.82      0.82      0.82       220



In [22]:
print('Default XGBoost Classification Report')
print('=' * 30)
default_xgb_classification_report = classification_report(y_test, xgb_pred)
print(default_xgb_classification_report)
print('\n' * 2)
print('GridSearch-applied XGBoost Classification Report')
print('=' * 30)
best_xgb_classification_report = classification_report(y_test, xgb_best_model_pred)
print(best_xgb_classification_report)


Default XGBoost Classification Report
              precision    recall  f1-score   support

           0       0.96      0.89      0.93        83
           1       0.95      0.95      0.95        83
           2       0.80      0.89      0.84        54

    accuracy                           0.91       220
   macro avg       0.90      0.91      0.91       220
weighted avg       0.92      0.91      0.91       220




GridSearch-applied XGBoost Classification Report
              precision    recall  f1-score   support

           0       0.97      0.90      0.94        83
           1       0.89      0.95      0.92        83
           2       0.81      0.81      0.81        54

    accuracy                           0.90       220
   macro avg       0.89      0.89      0.89       220
weighted avg       0.90      0.90      0.90       220



In [None]:
def predict_future_match(home_team_name, opponent_name, date, df_hist, model, team_mapping, window=3):
    """
    Hàm predict_future_match sẽ nhận các tham số đầu vào gồm:
    - Tên các đội chủ nhà, đội khách
    - Ngày diễn ra trận đấu
    - Dữ liệu lịch sử
    - Mô hình dự đoán
    - Danh sách ánh xạ tên các đội bóng sang giá trị số
    Hàm này thực hiện việc dự đoán kết quả trận đấu dựa trên trung bình 3 trận đấu trước đó
    """
    home_code = team_mapping.get(home_team_name)
    opp_code = team_mapping.get(opponent_name)
    if home_code is None or opp_code is None:
        raise ValueError("Tên đội không nằm trong danh sách team_mapping.")
    elif home_code == opp_code:
        raise ValueError("Hai đội không thể cùng tên")

    df = df_hist.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df = df[df['Date'] < pd.to_datetime(date)]

    recent_home = df[df['hometeam_code'] == home_code].sort_values(by='Date').tail(window)

    h2h = df[(df['hometeam_code'] == home_code) & (df['opponent_code'] == opp_code)].sort_values(by='Date').tail(window)

    features = df.drop(columns=['Date', 'target']).columns.tolist()

    input_dict = {}

    for feat in features:
        if feat == 'hometeam_code':
            input_dict[feat] = home_code
        elif feat == 'opponent_code':
            input_dict[feat] = opp_code
        elif feat.startswith('h2h_avg_'):
            input_dict[feat] = h2h[feat].mean() if feat in h2h.columns and not h2h.empty else 0.0
        elif feat.endswith('_rolling'):
            input_dict[feat] = recent_home[feat].mean() if feat in recent_home.columns and not recent_home.empty else 0.0
        elif feat == 'venue_code':
            input_dict[feat] = 1
        else:
            input_dict[feat] = recent_home[feat].mean() if feat in recent_home.columns and not recent_home.empty else 0.0

    X_input = pd.DataFrame([input_dict])
    prediction = model.predict(X_input)[0]
    probas = model.predict_proba(X_input)[0]
    label_map = {0: 'L', 1: 'W', 2: 'D'}

    return {
        'prediction': label_map[prediction],
        'probabilities': {
            'L': round(probas[0], 3),
            'W': round(probas[1], 3),
            'D': round(probas[2], 3),
        }
    }

In [24]:
team_mapping = {
    'Tottenham': 0,
    'Chelsea': 1,
    'Manchester Utd': 2,
    'Liverpool': 3,
    'Everton': 4,
    'Arsenal': 5,
    'Manchester City': 6,
    'Brighton': 7,
    'Newcastle Utd': 8,
    'West Ham': 9,
    'Crystal Palace': 10
}

In [26]:
import joblib
joblib.dump(best_xgb, 'Final_chosen_model.pkl')

['Final_chosen_model.pkl']

In [27]:
true_label_2425 = [0, 0, 1, 1, 0, 2, 0, 1, 2, 0,
                   0, 0, 2, 2, 2, 1, 1, 0, 0, 2, 1,
                   1, 0, 1, 1, 1, 1, 2, 1, 1]

matches_2425 = [
    ['Everton', 'Brighton', '2024-08-17'],
    ['Chelsea', 'Manchester City', '2024-08-18'],
    ['Brighton', 'Manchester Utd', '2024-08-24'],
    ['Tottenham', 'Everton', '2024-08-24'],
    ['Crystal Palace', 'West Ham', '2024-08-24'],
    ['Arsenal', 'Brighton', '2024-08-31'],
    ['West Ham', 'Manchester City', '2024-08-31'],
    ['Newcastle Utd', 'Tottenham', '2024-09-01'],
    ['Chelsea', 'Crystal Palace', '2024-09-01'],
    ['Manchester Utd', 'Liverpool', '2024-09-01'],
    ['Tottenham', 'Arsenal', '2024-09-15'],
    ['West Ham', 'Chelsea', '2024-09-21'],
    ['Crystal Palace', 'Manchester Utd', '2024-09-21'],
    ['Manchester City', 'Arsenal', '2024-09-22'],
    ['Newcastle Utd', 'Manchester City', '2024-09-28'],
    ['Everton', 'Crystal Palace', '2024-09-28'],
    ['Chelsea', 'Brighton', '2024-09-28'],
    ['Manchester Utd', 'Tottenham', '2024-09-29'],
    ['Crystal Palace', 'Liverpool', '2024-10-05'],
    ['Everton', 'Newcastle Utd', '2024-10-05'],
    ['Brighton', 'Tottenham', '2024-10-06'],
    ['Tottenham', 'West Ham', '2024-10-19'],
    ['Newcastle Utd', 'Brighton', '2024-10-19'],
    ['Liverpool', 'Chelsea', '2024-10-20'],
    ['West Ham', 'Manchester Utd', '2024-10-27'],
    ['Chelsea', 'Newcastle Utd', '2024-10-27'],
    ['Crystal Palace', 'Tottenham', '2024-10-27'],
    ['Arsenal', 'Liverpool', '2024-10-27'],
    ['Newcastle Utd', 'Arsenal', '2024-11-02'],
    ['Liverpool', 'Brighton', '2024-11-02']
]

In [None]:
predicted_labels = []

# Dự đoán kết quả từng trận đấu trong mùa giải 2024-2025
for match in matches_2425:
    home_team, opponent_team, match_date = match
    result = predict_future_match(
        home_team_name=home_team,
        opponent_name=opponent_team,
        date=match_date,
        df_hist=matches,
        model=best_lr,
        team_mapping=team_mapping
    )
    
    # Lấy nhãn dự đoán (dựa vào xác suất cao nhất)
    predicted_label = max(
        result['probabilities'], 
        key=result['probabilities']
        .get)
    
    label_to_int = {'L': 0, 'W': 1, 'D': 2}
    predicted_labels.append(label_to_int[predicted_label])

accuracy = accuracy_score(true_label_2425, predicted_labels)
print(f"Accuracy on 30 matches: {accuracy:.3f}")


Accuracy on 30 matches: 0.333


In [29]:
for i, match in enumerate(matches_2425):
    print(f"Match: {match}, Predicted: {predicted_labels[i]}, True: {true_label_2425[i]}")


Match: ['Everton', 'Brighton', '2024-08-17'], Predicted: 0, True: 0
Match: ['Chelsea', 'Manchester City', '2024-08-18'], Predicted: 1, True: 0
Match: ['Brighton', 'Manchester Utd', '2024-08-24'], Predicted: 0, True: 1
Match: ['Tottenham', 'Everton', '2024-08-24'], Predicted: 0, True: 1
Match: ['Crystal Palace', 'West Ham', '2024-08-24'], Predicted: 1, True: 0
Match: ['Arsenal', 'Brighton', '2024-08-31'], Predicted: 1, True: 2
Match: ['West Ham', 'Manchester City', '2024-08-31'], Predicted: 0, True: 0
Match: ['Newcastle Utd', 'Tottenham', '2024-09-01'], Predicted: 2, True: 1
Match: ['Chelsea', 'Crystal Palace', '2024-09-01'], Predicted: 1, True: 2
Match: ['Manchester Utd', 'Liverpool', '2024-09-01'], Predicted: 0, True: 0
Match: ['Tottenham', 'Arsenal', '2024-09-15'], Predicted: 0, True: 0
Match: ['West Ham', 'Chelsea', '2024-09-21'], Predicted: 0, True: 0
Match: ['Crystal Palace', 'Manchester Utd', '2024-09-21'], Predicted: 1, True: 2
Match: ['Manchester City', 'Arsenal', '2024-09-22']

In [None]:
predicted_labels = []

# Dự đoán từng trận trong mùa giải 2024-2025
for match in matches_2425:
    home_team, opponent_team, match_date = match
    result = predict_future_match(
        home_team_name=home_team,
        opponent_name=opponent_team,
        date=match_date,
        df_hist=matches,
        model=best_rf,
        team_mapping=team_mapping
    )
    predicted_label = max(
        result['probabilities'], 
        key=result['probabilities']
        .get)
    
    label_to_int = {'L': 0, 'W': 1, 'D': 2}
    predicted_labels.append(label_to_int[predicted_label])

accuracy = accuracy_score(true_label_2425, predicted_labels)
print(f"Accuracy on 30 matches: {accuracy:.3f}")


Accuracy on 30 matches: 0.433


In [31]:
for i, match in enumerate(matches_2425):
    print(f"Match: {match}, Predicted: {predicted_labels[i]}, True: {true_label_2425[i]}")

Match: ['Everton', 'Brighton', '2024-08-17'], Predicted: 0, True: 0
Match: ['Chelsea', 'Manchester City', '2024-08-18'], Predicted: 1, True: 0
Match: ['Brighton', 'Manchester Utd', '2024-08-24'], Predicted: 0, True: 1
Match: ['Tottenham', 'Everton', '2024-08-24'], Predicted: 0, True: 1
Match: ['Crystal Palace', 'West Ham', '2024-08-24'], Predicted: 1, True: 0
Match: ['Arsenal', 'Brighton', '2024-08-31'], Predicted: 1, True: 2
Match: ['West Ham', 'Manchester City', '2024-08-31'], Predicted: 0, True: 0
Match: ['Newcastle Utd', 'Tottenham', '2024-09-01'], Predicted: 1, True: 1
Match: ['Chelsea', 'Crystal Palace', '2024-09-01'], Predicted: 1, True: 2
Match: ['Manchester Utd', 'Liverpool', '2024-09-01'], Predicted: 0, True: 0
Match: ['Tottenham', 'Arsenal', '2024-09-15'], Predicted: 0, True: 0
Match: ['West Ham', 'Chelsea', '2024-09-21'], Predicted: 0, True: 0
Match: ['Crystal Palace', 'Manchester Utd', '2024-09-21'], Predicted: 1, True: 2
Match: ['Manchester City', 'Arsenal', '2024-09-22']

In [None]:
predicted_labels = []

# Dự đoán các trận đấu trong mùa giải 2024 - 2025
for match in matches_2425:
    home_team, opponent_team, match_date = match
    result = predict_future_match(
        home_team_name=home_team,
        opponent_name=opponent_team,
        date=match_date,
        df_hist=matches,
        model=best_xgb,
        team_mapping=team_mapping
    )
    
    # Lấy nhãn dự đoán (dựa vào xác suất cao nhất)
    predicted_label = max(
        result['probabilities'], 
        key=result['probabilities']
        .get)
    
    label_to_int = {'L': 0, 'W': 1, 'D': 2}
    predicted_labels.append(label_to_int[predicted_label])

accuracy = accuracy_score(true_label_2425, predicted_labels)
print(f"Accuracy on 30 matches: {accuracy:.3f}")


Accuracy on 30 matches: 0.333


In [33]:
for i, match in enumerate(matches_2425):
    print(f"Match: {match}, Predicted: {predicted_labels[i]}, True: {true_label_2425[i]}")


Match: ['Everton', 'Brighton', '2024-08-17'], Predicted: 0, True: 0
Match: ['Chelsea', 'Manchester City', '2024-08-18'], Predicted: 1, True: 0
Match: ['Brighton', 'Manchester Utd', '2024-08-24'], Predicted: 0, True: 1
Match: ['Tottenham', 'Everton', '2024-08-24'], Predicted: 0, True: 1
Match: ['Crystal Palace', 'West Ham', '2024-08-24'], Predicted: 1, True: 0
Match: ['Arsenal', 'Brighton', '2024-08-31'], Predicted: 1, True: 2
Match: ['West Ham', 'Manchester City', '2024-08-31'], Predicted: 0, True: 0
Match: ['Newcastle Utd', 'Tottenham', '2024-09-01'], Predicted: 2, True: 1
Match: ['Chelsea', 'Crystal Palace', '2024-09-01'], Predicted: 1, True: 2
Match: ['Manchester Utd', 'Liverpool', '2024-09-01'], Predicted: 0, True: 0
Match: ['Tottenham', 'Arsenal', '2024-09-15'], Predicted: 0, True: 0
Match: ['West Ham', 'Chelsea', '2024-09-21'], Predicted: 0, True: 0
Match: ['Crystal Palace', 'Manchester Utd', '2024-09-21'], Predicted: 1, True: 2
Match: ['Manchester City', 'Arsenal', '2024-09-22']