In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

# Hand-entered match records: the first ten rows are labelled 'Man City',
# the remaining eleven 'Brighton'. Every list below has 21 entries.
data = {
    'Team': ['Man City'] * 10 + ['Brighton'] * 11,
    'Home_xG': [
        3.3, 2.1, 2.0, 1.6, 2.9, 2.4, 2.0, 2.4, 2.1, 2.2,
        1.8, 1.7, 2.1, 1.8, 1.6, 1.5, 1.3, 1.9, 2.0, 1.5, 2.0,
    ],
    'Away_xG': [
        0.3, 1.0, 0.9, 1.3, 0.2, 1.0, 1.5, 2.4, 1.6, 1.8,
        1.0, 1.5, 1.8, 1.4, 1.2, 1.1, 1.2, 1.5, 1.4, 1.8, 1.3,
    ],
    'Home_xGA': [
        0.3, 1.0, 1.4, 1.6, 0.2, 0.9, 1.3, 1.0, 1.2, 1.5,
        2.0, 2.1, 1.8, 1.9, 2.1, 2.3, 2.0, 2.1, 2.4, 2.3, 2.2,
    ],
    'Away_xGA': [
        0.1, 1.0, 0.9, 1.2, 0.3, 1.5, 1.8, 2.0, 1.7, 1.8,
        1.2, 1.3, 1.1, 1.4, 1.2, 1.3, 1.4, 1.7, 1.4, 1.5, 1.3,
    ],
    'Total_Goals': [
        4, 3, 4, 3, 4, 4, 3, 4, 3, 4,
        3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 3,
    ],
}
df = pd.DataFrame(data)

# Inputs are the four xG / xGA columns; the target is the goal total.
# 'Team' is carried in the frame but is not used as a feature.
X = df.loc[:, ['Home_xG', 'Away_xG', 'Home_xGA', 'Away_xGA']]
y = df['Total_Goals']

# Standardise every feature (zero mean, unit variance) using all rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train the gradient-boosting regressor on the complete dataset; there is
# no hold-out set in this cell, so nothing here measures accuracy.
model = GradientBoostingRegressor(random_state=42, n_estimators=100)
model.fit(X_scaled, y)

# Single-row input describing tomorrow's fixture, with the same column
# names and order as the training features.
tomorrow_data = dict(
    Home_xG=[2.2],   # Man City Home xG (latest)
    Away_xG=[1.4],   # Brighton Away xG (latest)
    Home_xGA=[1.5],  # Man City Home xGA (latest)
    Away_xGA=[1.3],  # Brighton Away xGA (latest)
)
tomorrow_df = pd.DataFrame(tomorrow_data)

# Reuse the scaler fitted above so the new row is on the training scale,
# then ask the model for its estimate.
tomorrow_scaled = scaler.transform(tomorrow_df)
predicted_goals = model.predict(tomorrow_scaled)

print(f"Predicted total goals for tomorrow's match: {predicted_goals[0]}")

Predicted total goals for tomorrow's match: 3.9612221527173377


In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Hand-entered match records: the first ten rows are labelled 'Man City',
# the remaining eleven 'Brighton'. Every list below has 21 entries.
data = {
    'Team': ['Man City'] * 10 + ['Brighton'] * 11,
    'Home_xG': [3.3, 2.1, 2.0, 1.6, 2.9, 2.4, 2.0, 2.4, 2.1, 2.2, 1.8, 1.7, 2.1, 1.8, 1.6, 1.5, 1.3, 1.9, 2.0, 1.5, 2.0],
    'Away_xG': [0.3, 1.0, 0.9, 1.3, 0.2, 1.0, 1.5, 2.4, 1.6, 1.8, 1.0, 1.5, 1.8, 1.4, 1.2, 1.1, 1.2, 1.5, 1.4, 1.8, 1.3],
    'Home_xGA': [0.3, 1.0, 1.4, 1.6, 0.2, 0.9, 1.3, 1.0, 1.2, 1.5, 2.0, 2.1, 1.8, 1.9, 2.1, 2.3, 2.0, 2.1, 2.4, 2.3, 2.2],
    'Away_xGA': [0.1, 1.0, 0.9, 1.2, 0.3, 1.5, 1.8, 2.0, 1.7, 1.8, 1.2, 1.3, 1.1, 1.4, 1.2, 1.3, 1.4, 1.7, 1.4, 1.5, 1.3],
    'Total_Goals': [4, 3, 4, 3, 4, 4, 3, 4, 3, 4, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 3],
    # Point totals repeated on every row. NOTE: both columns are constant
    # across the dataset, so they have zero variance and a tree can never
    # split on them; in this form they add no information to the model.
    'Man_City_Home_Points': [26] * 21,
    'Brighton_Away_Points': [22] * 21,
}

# Convert the data into a DataFrame
df = pd.DataFrame(data)

# Feature selection (xG / xGA plus the two points columns)
X = df[['Home_xG', 'Away_xG', 'Home_xGA', 'Away_xGA', 'Man_City_Home_Points', 'Brighton_Away_Points']]
y = df['Total_Goals']  # Target variable

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# held-out rows influence the training-time mean/std (data leakage).
# train_test_split picks rows from n_samples and random_state only, so the
# same rows land in the test set as when the scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both sides.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix on the training scale; retained so that any code
# still referring to X_scaled keeps working.
X_scaled = scaler.transform(X)

# Initialize and fit the Gradient Boosting Regressor on the training data
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict the total goals on the test set
y_pred = model.predict(X_test)

# Hold-out performance metrics (only a handful of test rows, so these
# numbers are very noisy).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Cross-validation (to assess model consistency). The scaler lives inside
# the pipeline so each fold re-fits it on that fold's training rows instead
# of reusing statistics computed from rows it is about to be scored on.
# Note: an integer cv on a regressor uses contiguous, unshuffled folds, and
# the rows here are ordered by team.
cv_model = make_pipeline(
    StandardScaler(),
    GradientBoostingRegressor(n_estimators=100, random_state=42),
)
cv_scores = cross_val_score(cv_model, X, y, cv=5, scoring='neg_mean_squared_error')

# Print the results
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r2)
print("Cross-validation (CV) Mean RMSE:", np.mean(np.sqrt(-cv_scores)))

# Display actual vs predicted results
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results)

# Prediction for tomorrow's match (with the updated features)
# Here we manually provide the xG and xGA for tomorrow's match
tomorrow_data = np.array([[2.2, 1.8, 1.5, 1.2, 26, 22]])  # Home_xG, Away_xG, Home_xGA, Away_xGA, Man_City_Home_Points, Brighton_Away_Points
# The scaler was fitted on a DataFrame, so label the row with the training
# column names: this avoids sklearn's "X does not have valid feature names"
# warning and makes a column-order mistake fail loudly instead of silently.
tomorrow_df = pd.DataFrame(tomorrow_data, columns=X.columns)
tomorrow_data_scaled = scaler.transform(tomorrow_df)
predicted_goals_tomorrow = model.predict(tomorrow_data_scaled)

print("Predicted total goals for tomorrow's match:", predicted_goals_tomorrow[0])


Root Mean Squared Error (RMSE): 0.7318298098941333
R-squared: -0.3389371766242082
Cross-validation (CV) Mean RMSE: 0.6278869012689781
    Actual  Predicted
0        4   3.999975
17       3   2.000028
15       2   3.000002
1        3   3.823364
8        3   3.000002
Predicted total goals for tomorrow's match: 3.9999750986885454


