In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_absolute_error
import numpy as np

# Load dataset
df = pd.read_csv(r"C:\Users\91798\Desktop\html\jypter\Football-main\Football-main\matches.csv")

# Select relevant columns
df = df[['team', 'opponent', 'venue', 'sot']]

# Convert categorical values to numeric using Label Encoding
label_encoders = {}
for col in ['team', 'opponent']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Manually map venue to 0 (Home) and 1 (Away)
venue_mapping = {'Home': 0, 'Away': 1}
df['venue'] = df['venue'].map(venue_mapping)

# Define features and target
X = df[['team', 'opponent', 'venue']]
y = df['sot']  # Predicting Shots on Target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train different ML models
models = {
    'CatBoost': CatBoostRegressor(iterations=500, depth=8, learning_rate=0.05, verbose=0),
    'LightGBM': LGBMRegressor(n_estimators=300, learning_rate=0.03, max_depth=12),
    'SVR': SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
}

# Train Neural Network Model
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer
])

nn_model.compile(optimizer='adam', loss='mae')
nn_model.fit(X_train, y_train, epochs=100, batch_size=8, verbose=0)

# Evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    error = mean_absolute_error(y_test, y_pred)
    print(f"{name} MAE: {error:.2f}")

# Predict with Neural Network
nn_pred = nn_model.predict(X_test).flatten()
nn_error = mean_absolute_error(y_test, nn_pred)
print(f"Neural Network MAE: {nn_error:.2f}")

# Function to predict Shots on Target using Ensemble Model
def predict_sot(team, opponent, venue):
    input_data = pd.DataFrame([[team, opponent, venue]], columns=['team', 'opponent', 'venue'])
    catboost_pred = models['CatBoost'].predict(input_data)[0]
    lgbm_pred = models['LightGBM'].predict(input_data)[0]
    svr_pred = models['SVR'].predict(input_data)[0]
    nn_pred = nn_model.predict(input_data)[0][0]
    
    # Weighted averaging for better accuracy
    ensemble_pred = (0.3 * catboost_pred + 0.3 * lgbm_pred + 0.2 * svr_pred + 0.2 * nn_pred)
    
    print(f"Ensemble Predicted Shots on Target for Team {team} vs Team {opponent} at Venue {venue}: {ensemble_pred:.2f}")
    
# Example usage
predict_sot(12, 4, 0)  # Example: Team 12 vs Team 4 at Home


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


CatBoost MAE: 1.80
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 1111, number of used features: 3
[LightGBM] [Info] Start training from score 4.071107
LightGBM MAE: 1.78
SVR MAE: 1.99
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Neural Network MAE: 1.87
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Ensemble Predicted Shots on Target for Team 12 vs Team 4 at Venue 0: 6.49


In [10]:
# Apply label encoding again (if not done already)
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in ['team', 'opponent']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [11]:
team_name = 12
opponent_name = 13

# Convert team names to encoded values
team_encoded = label_encoders['team'].transform([team_name])[0]
opponent_encoded = label_encoders['team'].transform([opponent_name])[0]

# Find past head-to-head matches
h2h_matches = df[(df['team'] == team_encoded) & (df['opponent'] == opponent_encoded)]
print(h2h_matches[['team', 'opponent', 'venue', 'sot']])

     team  opponent  venue   sot
10     12        13      1   5.0
27     12        13      0  10.0
639    12        13      1   2.0
656    12        13      0   6.0


In [12]:
predict_sot(12, 13, 1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Ensemble Predicted Shots on Target for Team 12 vs Team 13 at Venue 1: 5.05
