In [1]:
!pip install catboost


Collecting catboost
  Obtaining dependency information for catboost from https://files.pythonhosted.org/packages/1c/e1/78e635a1e5f0066bd02a1ecfd658ad09fe30d275c65c2d0dd76fe253e648/catboost-1.2.7-cp311-cp311-win_amd64.whl.metadata
  Using cached catboost-1.2.7-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Using cached catboost-1.2.7-cp311-cp311-win_amd64.whl (101.7 MB)
Installing collected packages: catboost
Successfully installed catboost-1.2.7



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_absolute_error
import numpy as np

# Load dataset
df = pd.read_csv(r"C:\Users\91798\Desktop\html\jypter\Football-main\Football-main\matches.csv")

# Select relevant columns
df = df[['team', 'opponent', 'venue', 'xg', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']]

# Convert categorical values to numeric using Label Encoding
label_encoders = {}
for col in ['team', 'opponent']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Manually map venue to 0 (Home) and 1 (Away)
venue_mapping = {'Home': 0, 'Away': 1}
df['venue'] = df['venue'].map(venue_mapping)

# Feature Engineering: Calculate accuracy and efficiency metrics
df['shot_accuracy'] = df['sot'] / df['sh'].replace(0, np.nan)  # Shots on target / Total shots
df['penalty_conversion'] = df['pk'] / df['pkatt'].replace(0, np.nan)  # Penalties scored / Penalties attempted
df.fillna(0, inplace=True)  # Replace NaN values with 0

# Feature Engineering: Rolling averages for last 5 games
df['team_xg_5g'] = df.groupby('team')['xg'].transform(lambda x: x.rolling(5, min_periods=1).mean())
df['opponent_xg_5g'] = df.groupby('opponent')['xg'].transform(lambda x: x.rolling(5, min_periods=1).mean())

# Feature Engineering: Head-to-head xG averages
df['h2h_xg'] = df.groupby(['team', 'opponent'])['xg'].transform(lambda x: x.expanding().mean())

# Define features and target
X = df[['team', 'opponent', 'venue', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt', 'shot_accuracy', 'penalty_conversion', 'team_xg_5g', 'opponent_xg_5g', 'h2h_xg']]
y = df['xg']  # Predicting Expected Goals (xG)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train different ML models
models = {
    'CatBoost': CatBoostRegressor(iterations=500, depth=8, learning_rate=0.05, verbose=0),
    'LightGBM': LGBMRegressor(n_estimators=300, learning_rate=0.03, max_depth=12),
    'SVR': SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
}

# Train Neural Network Model
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer
])

nn_model.compile(optimizer='adam', loss='mae')
nn_model.fit(X_train, y_train, epochs=100, batch_size=8, verbose=0)

# Evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    error = mean_absolute_error(y_test, y_pred)
    print(f"{name} MAE: {error:.2f}")

# Predict with Neural Network
nn_pred = nn_model.predict(X_test).flatten()
nn_error = mean_absolute_error(y_test, nn_pred)
print(f"Neural Network MAE: {nn_error:.2f}")

# Function to predict xG using Ensemble Model
def predict_xg(team, opponent, venue):
    input_data = pd.DataFrame([[team, opponent, venue, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], columns=['team', 'opponent', 'venue', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt', 'shot_accuracy', 'penalty_conversion', 'team_xg_5g', 'opponent_xg_5g', 'h2h_xg'])
    catboost_pred = models['CatBoost'].predict(input_data)[0]
    lgbm_pred = models['LightGBM'].predict(input_data)[0]
    svr_pred = models['SVR'].predict(input_data)[0]
    nn_pred = nn_model.predict(input_data)[0][0]
    
    # Weighted averaging for better accuracy
    ensemble_pred = (0.3 * catboost_pred + 0.3 * lgbm_pred + 0.2 * svr_pred + 0.2 * nn_pred)
    print(f"Ensemble Predicted xG for Team {team} vs Team {opponent} at Venue {venue}: {ensemble_pred:.2f}")
    
# Example usage
predict_xg(12, 4, 0)  # Example: Team 12 vs Team 4 at Home

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


CatBoost MAE: 0.23
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 633
[LightGBM] [Info] Number of data points in the train set: 1111, number of used features: 14
[LightGBM] [Info] Start training from score 1.317462
LightGBM MAE: 0.24
SVR MAE: 0.41
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Neural Network MAE: 0.25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
Ensemble Predicted xG for Team 12 vs Team 4 at Venue 0: 0.41


In [14]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in ['team', 'opponent']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [18]:
team_name = 12
opponent_name = 13

# Convert team names to encoded values
team_encoded = label_encoders['team'].transform([team_name])[0]
opponent_encoded = label_encoders['team'].transform([opponent_name])[0]

# Find past xG values
h2h_xg_matches = df[(df['team'] == team_encoded) & (df['opponent'] == opponent_encoded) & (df['venue'] == 0)]
print(h2h_xg_matches[['team', 'opponent', 'venue', 'xg']])


     team  opponent  venue   xg
27     12        13      0  2.6
656    12        13      0  1.5


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_absolute_error
import numpy as np

# Load dataset
df = pd.read_csv(r"C:\Users\91798\Desktop\html\jypter\Football-main\Football-main\matches.csv")

# Select relevant columns
df = df[['team', 'opponent', 'venue', 'xg']]

# Convert categorical values to numeric using Label Encoding
label_encoders = {}
for col in ['team', 'opponent']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Manually map venue to 0 (Home) and 1 (Away)
venue_mapping = {'Home': 0, 'Away': 1}
df['venue'] = df['venue'].map(venue_mapping)

# Define features and target
X = df[['team', 'opponent', 'venue']]
y = df['xg']  # Predicting Expected Goals (xG)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train different ML models
models = {
    'CatBoost': CatBoostRegressor(iterations=500, depth=8, learning_rate=0.05, verbose=0),
    'LightGBM': LGBMRegressor(n_estimators=300, learning_rate=0.03, max_depth=12),
    'SVR': SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
}

# Train Neural Network Model
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer
])

nn_model.compile(optimizer='adam', loss='mae')
nn_model.fit(X_train, y_train, epochs=100, batch_size=8, verbose=0)

# Evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    error = mean_absolute_error(y_test, y_pred)
    print(f"{name} MAE: {error:.2f}")

# Predict with Neural Network
nn_pred = nn_model.predict(X_test).flatten()
nn_error = mean_absolute_error(y_test, nn_pred)
print(f"Neural Network MAE: {nn_error:.2f}")

# Function to predict xG using Ensemble Model
def predict_xg(team, opponent, venue):
    input_data = pd.DataFrame([[team, opponent, venue]], columns=['team', 'opponent', 'venue'])
    catboost_pred = models['CatBoost'].predict(input_data)[0]
    lgbm_pred = models['LightGBM'].predict(input_data)[0]
    svr_pred = models['SVR'].predict(input_data)[0]
    nn_pred = nn_model.predict(input_data)[0][0]
    
    # Weighted averaging for better accuracy
    ensemble_pred = (0.3 * catboost_pred + 0.3 * lgbm_pred + 0.2 * svr_pred + 0.2 * nn_pred)
    print(f"Ensemble Predicted xG for Team {team} vs Team {opponent} at Venue {venue}: {ensemble_pred:.2f}")
    
# Example usage
predict_xg(12, 4, 1)  # Example: Team 12 vs Team 4 at Home

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


CatBoost MAE: 0.55
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 1111, number of used features: 3
[LightGBM] [Info] Start training from score 1.317462
LightGBM MAE: 0.55
SVR MAE: 0.60
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Neural Network MAE: 0.59
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Ensemble Predicted xG for Team 12 vs Team 4 at Venue 1: 2.04


In [16]:
predict_xg(12, 13, 1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Ensemble Predicted xG for Team 12 vs Team 13 at Venue 1: 1.85


In [19]:
predict_xg(12, 13, 0)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Ensemble Predicted xG for Team 12 vs Team 13 at Venue 0: 1.76
