In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_absolute_error
import numpy as np

# Load dataset
df = pd.read_csv(r"C:\Users\91798\Desktop\html\jypter\Football-main\Football-main\matches.csv")

# Select relevant columns
df = df[['team', 'opponent', 'venue', 'sh']]

# Convert categorical values to numeric using Label Encoding
label_encoders = {}
for col in ['team', 'opponent']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Manually map venue to 0 (Home) and 1 (Away)
df['venue'] = df['venue'].map({'Home': 0, 'Away': 1})

# Define features and target
X = df[['team', 'opponent', 'venue']]
y = df['sh']  # Predicting total shots

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train different ML models
models = {
    'CatBoost': CatBoostRegressor(iterations=500, depth=8, learning_rate=0.05, verbose=0),
    'LightGBM': LGBMRegressor(n_estimators=300, learning_rate=0.03, max_depth=12),
    'XGBoost': XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=10),
    'Random Forest': RandomForestRegressor(n_estimators=200, random_state=42),
    'Linear Regression': LinearRegression(),
    'SVR': SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
}

# Train Neural Network Model
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer
])

nn_model.compile(optimizer='adam', loss='mae')
nn_model.fit(X_train, y_train, epochs=100, batch_size=8, verbose=0)

# Evaluate models
mae_scores = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    error = mean_absolute_error(y_test, y_pred)
    mae_scores[name] = error
    print(f"{name} MAE: {error:.2f}")

# Predict with Neural Network
nn_pred = nn_model.predict(X_test).flatten()
nn_error = mean_absolute_error(y_test, nn_pred)
mae_scores['Neural Network'] = nn_error
print(f"Neural Network MAE: {nn_error:.2f}")

# Select the best model
best_model_name = min(mae_scores, key=mae_scores.get)
best_model = models.get(best_model_name, nn_model)
print(f"Best Model Selected: {best_model_name}")

# Function to predict shots using the best model
def predict_shots(team, opponent, venue):
    input_data = pd.DataFrame([[team, opponent, venue]], columns=['team', 'opponent', 'venue'])
    predicted_shots = best_model.predict(input_data)[0]
    print(f"Predicted Shots for Team {team} vs Team {opponent} at Venue {venue}: {predicted_shots:.2f}")
    
# Example usage
predict_shots(12, 4, 0)  # Example: Team 12 vs Team 4 at Home

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


CatBoost MAE: 3.91
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 1111, number of used features: 3
[LightGBM] [Info] Start training from score 12.272727
LightGBM MAE: 3.86
XGBoost MAE: 4.84
Random Forest MAE: 4.22
Linear Regression MAE: 4.13
SVR MAE: 4.33
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Neural Network MAE: 4.09
Best Model Selected: LightGBM
Predicted Shots for Team 12 vs Team 4 at Venue 0: 20.59


In [4]:
predict_shots(12, 4, 1)

Predicted Shots for Team 12 vs Team 4 at Venue 1: 15.94


In [12]:
# Apply label encoding again (if not done already)
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in ['team', 'opponent']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    team_name = 0
opponent_name = 4

# Convert team names to encoded values
team_encoded = label_encoders['team'].transform([team_name])[0]
opponent_encoded = label_encoders['team'].transform([opponent_name])[0]

# Find past head-to-head matches
h2h_matches = df[(df['team'] == team_encoded) & (df['opponent'] == opponent_encoded)]
print(h2h_matches[['team', 'opponent', 'venue', 'sh']])


     team  opponent  venue    sh
69      0         4      1  13.0
85      0         4      0  20.0
906     0         4      0  18.0
921     0         4      1  15.0


In [9]:
predict_shots(12, 13, 1)

Predicted Shots for Team 12 vs Team 13 at Venue 1: 15.27


In [10]:
predict_shots(12, 4, 0)

Predicted Shots for Team 12 vs Team 4 at Venue 0: 20.59


In [13]:
predict_shots(0, 4, 0)

Predicted Shots for Team 0 vs Team 4 at Venue 0: 19.73
