## Load the Files Directly with Pandas

In [11]:
import pandas as pd


# Raw-content base URL of the pre-split datasets; all four CSV files live in the same folder.
BASE_URL = (
    'https://raw.githubusercontent.com/cafrontadoe/laliga-match-prediction/'
    'refs/heads/main/phase3_model_training'
)

X_train_url = f'{BASE_URL}/X_train.csv'
X_test_url = f'{BASE_URL}/X_test.csv'
y_train_url = f'{BASE_URL}/y_train.csv'
y_test_url = f'{BASE_URL}/y_test.csv'

try:
    # Feature matrices are kept as DataFrames.
    X_train: pd.DataFrame = pd.read_csv(X_train_url)
    print("X_train loaded successfully.")

    X_test: pd.DataFrame = pd.read_csv(X_test_url)
    print("X_test loaded successfully.")

    # Targets: squeeze only the column axis so a one-column file becomes a Series,
    # and a file with a single row is never collapsed into a scalar.
    y_train: pd.Series = pd.read_csv(y_train_url).squeeze("columns")
    print("y_train loaded successfully.")

    y_test: pd.Series = pd.read_csv(y_test_url).squeeze("columns")
    print("y_test loaded successfully.")

except Exception as e:
    print(f"An error occurred while loading the files: {e}")
    print("Please double-check that the URLs are correct and accessible (raw URLs).")
    print("Also, ensure the files are truly public and the URLs point to the raw content, not the HTML page.")
    # Re-raise: every later cell needs these variables, so stopping here is clearer
    # than continuing and failing with an unrelated NameError.
    raise

# Quick sanity preview of what was loaded.
print("\nFirst 5 rows of X_train:")
print(X_train.head())
print("\nFirst 5 values of y_train:")
print(y_train.head())

X_train loaded successfully.
X_test loaded successfully.
y_train loaded successfully.
y_test loaded successfully.

First 5 rows of X_train:
   home_avg_goals_scored_last_5  home_avg_goals_conceded_last_5  \
0                           0.0                             0.0   
1                           0.0                             0.0   
2                           0.0                             0.0   
3                           0.0                             0.0   
4                           0.0                             0.0   

   home_win_rate_last_5  home_draw_rate_last_5  home_loss_rate_last_5  \
0                   0.0                    0.0                    0.0   
1                   0.0                    0.0                    0.0   
2                   0.0                    0.0                    0.0   
3                   0.0                    0.0                    0.0   
4                   0.0                    0.0                    0.0   

   away_avg_goals_

## Feature Scaling

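Two common options are `StandardScaler` (standardizes each feature to zero mean and unit variance) and `MinMaxScaler` (scales features to a specific range, usually 0 to 1). `StandardScaler` is a good general choice.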

In [12]:
from sklearn.preprocessing import StandardScaler

# Decide which columns get standardized: the int64/float64 features.
# Boolean columns are looked up separately and explicitly left out of scaling
# (presumably these would be one-hot encoded flags -- confirm against the feature set).
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
boolean_cols = X_train.select_dtypes(include=['bool']).columns

boolean_names = set(boolean_cols)
cols_to_scale = [name for name in numerical_cols if name not in boolean_names]

print("cols_to_scale", cols_to_scale)

# Learn the scaling statistics from the training split only, so nothing about
# the test split leaks into the transformation.
scaler = StandardScaler()
scaler.fit(X_train[cols_to_scale])

# Work on copies so the raw X_train / X_test stay available to later cells.
X_train_scaled = X_train.copy()
X_train_scaled[cols_to_scale] = scaler.transform(X_train[cols_to_scale])

# The test split is transformed with the statistics fitted on the training split.
X_test_scaled = X_test.copy()
X_test_scaled[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

print("\n--- Feature Scaling ---")
print("First 5 rows of X_train_scaled (features after scaling):")
print(X_train_scaled.head())
print("-" * 50)

cols_to_scale ['home_avg_goals_scored_last_5', 'home_avg_goals_conceded_last_5', 'home_win_rate_last_5', 'home_draw_rate_last_5', 'home_loss_rate_last_5', 'away_avg_goals_scored_last_5', 'away_avg_goals_conceded_last_5', 'away_win_rate_last_5', 'away_draw_rate_last_5', 'away_loss_rate_last_5']

--- Feature Scaling ---
First 5 rows of X_train_scaled (features after scaling):
   home_avg_goals_scored_last_5  home_avg_goals_conceded_last_5  \
0                     -1.924194                       -2.087476   
1                     -1.924194                       -2.087476   
2                     -1.924194                       -2.087476   
3                     -1.924194                       -2.087476   
4                     -1.924194                       -2.087476   

   home_win_rate_last_5  home_draw_rate_last_5  home_loss_rate_last_5  \
0             -1.440168              -1.273225               -1.53813   
1             -1.440168              -1.273225               -1.53813   
2

## Model Selection & Model Training
### Logistic Regression Model & Random Forest

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier # You'd need to install xgboost: pip install xgboost

# --- Logistic Regression ---
print("\n--- Model Training (Logistic Regression) ---")
# max_iter is raised so the lbfgs solver has room to converge.
# `multi_class` is deliberately not passed: the argument is deprecated in recent
# scikit-learn releases, and with the lbfgs solver a target with more than two
# classes is already fitted as a multinomial model by default.
log_reg_model = LogisticRegression(max_iter=1000, random_state=42, solver='lbfgs')

# Train on the scaled training features.
log_reg_model.fit(X_train_scaled, y_train)
print("Logistic Regression model trained successfully.")
print("-" * 50)

# --- Random Forest Classifier (another option) ---
print("\n--- Model Training (Random Forest Classifier) ---")
# n_estimators is the number of trees; random_state makes the forest reproducible;
# n_jobs=-1 uses all available cores.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Train on the same scaled features so both models see identical inputs.
rf_model.fit(X_train_scaled, y_train)
print("Random Forest Classifier model trained successfully.")
print("-" * 50)


--- Model Training (Logistic Regression) ---




Logistic Regression model trained successfully.
--------------------------------------------------

--- Model Training (Random Forest Classifier) ---
Random Forest Classifier model trained successfully.
--------------------------------------------------


## Model Evaluation

In [14]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np # Ensure numpy is imported

def evaluate_model(model, X, y_true, label, title=None):
    """Predict with a fitted classifier and print its accuracy, report and confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : features to predict on (must match what the model was trained on).
    y_true : true labels aligned with ``X``.
    label : short model name used in the metric lines (e.g. "Random Forest").
    title : name used in the section header; defaults to ``label``.

    Returns
    -------
    tuple
        ``(y_pred, accuracy, conf_matrix)`` so callers can reuse the results.
    """
    title = label if title is None else title
    print(f"\n--- Evaluating {title} Model ---")

    y_pred = model.predict(X)

    accuracy = accuracy_score(y_true, y_pred)
    print(f"{label} Accuracy: {accuracy:.4f}")

    print(f"\n{label} Classification Report:")
    print(classification_report(y_true, y_pred))

    # Rows are true classes, columns are predicted classes.
    print(f"\n{label} Confusion Matrix:")
    conf_matrix = confusion_matrix(y_true, y_pred)
    print(conf_matrix)

    return y_pred, accuracy, conf_matrix


# Label encoding as stated by the original author (adjust if your encoding differs):
#   0: Away Win, 1: Draw, 2: Home Win
# NOTE(review): in the recorded output class 0 has the largest support (304 of 684);
# confirm that 0 really is "Away Win" before interpreting the per-class scores.

# Both models were trained on the scaled features, so both are evaluated on X_test_scaled.
y_pred_log_reg, accuracy_log_reg, conf_matrix_log_reg = evaluate_model(
    log_reg_model, X_test_scaled, y_test, label="Logistic Regression"
)

y_pred_rf, accuracy_rf, conf_matrix_rf = evaluate_model(
    rf_model, X_test_scaled, y_test, label="Random Forest", title="Random Forest Classifier"
)
print("-" * 50)

# Optional: model.predict_proba(X_test_scaled) gives class probabilities
# (useful for calibration or custom decision thresholds).


--- Evaluating Logistic Regression Model ---
Logistic Regression Accuracy: 0.5190

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.83      0.65       304
           1       0.34      0.07      0.12       184
           2       0.51      0.46      0.49       196

    accuracy                           0.52       684
   macro avg       0.46      0.45      0.42       684
weighted avg       0.48      0.52      0.46       684


Logistic Regression Confusion Matrix:
[[252  14  38]
 [124  13  47]
 [ 95  11  90]]

--- Evaluating Random Forest Classifier Model ---
Random Forest Accuracy: 0.5058

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.83      0.65       304
           1       0.29      0.09      0.13       184
           2       0.49      0.39      0.44       196

    accuracy                           0.51       684
   macro avg   

## Synthetic Augmentation (SMOTE) for DRAW


In [15]:
pip install imbalanced-learn



In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline # Use imblearn's Pipeline for resampling steps
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN # A combination of SMOTE and Edited Nearest Neighbours

# Inputs: X_train, X_test, y_train, y_test as loaded from the CSV files earlier in
# this notebook. They are passed in unscaled; the pipeline below does its own scaling.
print("Data loaded. Proceeding with scaling and SMOTE integration.")
print("-" * 50)

# Names of the int64/float64 feature columns.
numeric_frame = X_train.select_dtypes(include=['int64', 'float64'])
numerical_cols = numeric_frame.columns.tolist()

# Cross-validation splitter shared by the grid searches in this cell.
tscv = TimeSeriesSplit(n_splits=5)

# --- 1. Random Forest with SMOTE in a Pipeline ---
print("\n--- Tuning Random Forest with SMOTE ---")

# Step order matters: scale -> oversample -> classify. Using imblearn's Pipeline
# means the SMOTE step takes part in fitting only, not in predict().
pipeline_rf_smote = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('oversampler', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

# Search space: the SMOTE neighbourhood size plus the main forest hyper-parameters.
param_grid_rf_smote = dict(
    oversampler__k_neighbors=[3, 5, 7],
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
)

grid_search_rf_smote = GridSearchCV(
    estimator=pipeline_rf_smote,
    param_grid=param_grid_rf_smote,
    scoring='f1_weighted',
    cv=tscv,
    n_jobs=-1,
    verbose=2,
)

# Scaling and oversampling are re-fitted inside every CV fold by the pipeline.
grid_search_rf_smote.fit(X_train, y_train)

print(f"\nBest parameters for Random Forest with SMOTE: {grid_search_rf_smote.best_params_}")
print(f"Best cross-validation score (Random Forest with SMOTE): {grid_search_rf_smote.best_score_:.4f}")

# best_estimator_ is the whole pipeline, so the raw X_test is the right input here.
best_rf_smote_model = grid_search_rf_smote.best_estimator_
y_pred_rf_smote = best_rf_smote_model.predict(X_test)

print("\n--- Evaluated Tuned Random Forest Model with SMOTE ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf_smote):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf_smote))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf_smote))
print("-" * 50)


# --- 2. XGBoost with SMOTE in a Pipeline (Optional, but powerful) ---
print("\n--- Tuning XGBoost with SMOTE ---")

# Same scale -> oversample -> classify layout as the Random Forest pipeline.
# `use_label_encoder` is no longer passed: the recorded run reported
# 'Parameters: { "use_label_encoder" } are not used.', so it had no effect and only
# produced a warning on every fit.
pipeline_xgb_smote = Pipeline([
    ('scaler', StandardScaler()),
    ('oversampler', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss',
                                 random_state=42, n_jobs=-1))
])

# Search space: SMOTE neighbourhood size plus the main boosting hyper-parameters.
param_grid_xgb_smote = {
    'oversampler__k_neighbors': [3, 5],
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.05, 0.1, 0.2],
    # 'classifier__scale_pos_weight': [1, 2] # More complex with multi-class, often better to use sample_weight or SMOTE
}

grid_search_xgb_smote = GridSearchCV(
    pipeline_xgb_smote,
    param_grid_xgb_smote,
    cv=tscv,  # splitter defined earlier in this cell
    scoring='f1_weighted', # Or 'accuracy'
    n_jobs=-1,
    verbose=2
)

# Fit on the unscaled training data; the pipeline scales and oversamples per fold.
grid_search_xgb_smote.fit(X_train, y_train)

print(f"\nBest parameters for XGBoost with SMOTE: {grid_search_xgb_smote.best_params_}")
print(f"Best cross-validation score (XGBoost with SMOTE): {grid_search_xgb_smote.best_score_:.4f}")

# best_estimator_ is the full pipeline, so it is given the raw X_test.
best_xgb_smote_model = grid_search_xgb_smote.best_estimator_
y_pred_xgb_smote = best_xgb_smote_model.predict(X_test)

print("\n--- Evaluated Tuned XGBoost Model with SMOTE ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb_smote):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb_smote))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb_smote))
print("-" * 50)

Data loaded. Proceeding with scaling and SMOTE integration.
--------------------------------------------------

--- Tuning Random Forest with SMOTE ---
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Best parameters for Random Forest with SMOTE: {'classifier__max_depth': 10, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200, 'oversampler__k_neighbors': 5}
Best cross-validation score (Random Forest with SMOTE): 0.4606

--- Evaluated Tuned Random Forest Model with SMOTE ---
Accuracy: 0.5102

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.75      0.66       304
           1       0.34      0.22      0.26       184
           2       0.46      0.42      0.44       196

    accuracy                           0.51       684
   macro avg       0.46      0.46      0.45       684
weighted avg       0.48      0.51      0.49       684


Confusion Matrix:
[[227  35  42]
 [ 91  40  53]
 [ 70  44  82]]
---

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best parameters for XGBoost with SMOTE: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'oversampler__k_neighbors': 5}
Best cross-validation score (XGBoost with SMOTE): 0.4644

--- Evaluated Tuned XGBoost Model with SMOTE ---
Accuracy: 0.4883

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.73      0.64       304
           1       0.28      0.17      0.22       184
           2       0.46      0.40      0.43       196

    accuracy                           0.49       684
   macro avg       0.43      0.44      0.43       684
weighted avg       0.46      0.49      0.46       684


Confusion Matrix:
[[223  38  43]
 [101  32  51]
 [ 74  43  79]]
--------------------------------------------------


## Refine the Random Forest Tuning

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

print("Data loaded. Proceeding with more refined tuning.")
print("-" * 50)

# Names of the int64/float64 feature columns.
numeric_frame = X_train.select_dtypes(include=['int64', 'float64'])
numerical_cols = numeric_frame.columns.tolist()

# Cross-validation splitter for the refined search.
tscv = TimeSeriesSplit(n_splits=5)

# --- Refined Random Forest with SMOTE in a Pipeline ---
print("\n--- Refined Tuning Random Forest with SMOTE ---")

# Same scale -> oversample -> classify layout as the first Random Forest search.
pipeline_rf_smote_refined = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('oversampler', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

# Narrower grid centred on the previous best result (max_depth=10, n_estimators=200,
# min_samples_split=2, k_neighbors=5), with max_features added as a new dimension.
# 3 * 3 * 3 * 3 * 3 = 243 candidates, i.e. 1215 fits with 5 folds -- expect a long run.
param_grid_rf_smote_refined = dict(
    oversampler__k_neighbors=[3, 5, 7],
    classifier__n_estimators=[150, 200, 250],
    classifier__max_depth=[8, 10, 12],
    classifier__min_samples_split=[2, 3, 5],
    classifier__max_features=['sqrt', 'log2', 0.8],
)

grid_search_rf_smote_refined = GridSearchCV(
    estimator=pipeline_rf_smote_refined,
    param_grid=param_grid_rf_smote_refined,
    scoring='f1_weighted',
    cv=tscv,
    n_jobs=-1,
    verbose=2,
)

# Fit on the unscaled training data; the pipeline scales and oversamples per fold.
grid_search_rf_smote_refined.fit(X_train, y_train)

print(f"\nBest parameters for Refined Random Forest with SMOTE: {grid_search_rf_smote_refined.best_params_}")
print(f"Best cross-validation score (Refined Random Forest with SMOTE): {grid_search_rf_smote_refined.best_score_:.4f}")

# best_estimator_ is the full pipeline, so it is given the raw X_test.
best_rf_smote_refined_model = grid_search_rf_smote_refined.best_estimator_
y_pred_rf_smote_refined = best_rf_smote_refined_model.predict(X_test)

print("\n--- Evaluated Refined Tuned Random Forest Model with SMOTE ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf_smote_refined):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf_smote_refined))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf_smote_refined))
print("-" * 50)

Data loaded. Proceeding with more refined tuning.
--------------------------------------------------

--- Refined Tuning Random Forest with SMOTE ---
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
