<a href="https://colab.research.google.com/github/adarzhh/Football-match-prediction/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Read the match data from disk
data = pd.read_csv('football.csv')

# 'Possession' holds text with a trailing percent sign: strip the sign,
# convert to float, and divide by 100
data['Possession'] = data['Possession'].str.rstrip('%').astype(float).div(100.0)

# The 'Outcome' column is the target; every remaining column is a feature
y = data['Outcome']
X = data.drop('Outcome', axis=1)

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Encode the target variable as numeric codes for the regressors below.
# NOTE(review): these codes are fitted as a continuous target, so their order
# carries meaning; Draw (2) currently ranks above Win (1) -- confirm intended.
label_map = {'Win': 1, 'Loss': 0, 'Draw': 2}

# Series.map converts any label missing from label_map into NaN without
# warning, which only surfaces later as an opaque error inside fit() or the
# metric functions. Fail here instead and name the offending labels.
_unmapped_labels = (set(y_train.unique()) | set(y_test.unique())) - set(label_map)
if _unmapped_labels:
    raise ValueError(
        "Unexpected 'Outcome' labels not present in label_map: "
        f"{sorted(map(str, _unmapped_labels))}"
    )

y_train_encoded = y_train.map(label_map)
y_test_encoded = y_test.map(label_map)

# Linear regression: fit on the training split, then predict the test rows.
# fit() returns the estimator itself, so construction and training are chained.
linear_model = LinearRegression().fit(X_train, y_train_encoded)
linear_predictions = linear_model.predict(X_test)

# Random forest regression (seeded): same train/predict sequence
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train_encoded)
rf_predictions = rf_model.predict(X_test)


# Score both models on the held-out test split.

# R^2 for each model
linear_r2 = r2_score(y_test_encoded, linear_predictions)
rf_r2 = r2_score(y_test_encoded, rf_predictions)

# RMSE for each model, computed as sqrt(MSE). The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so
# taking the square root explicitly keeps this working on old and new versions.
linear_rmse = np.sqrt(mean_squared_error(y_test_encoded, linear_predictions))
rf_rmse = np.sqrt(mean_squared_error(y_test_encoded, rf_predictions))

print("Linear Regression Metrics:")
print("R^2 Score:", linear_r2)
print("Root Mean Squared Error:", linear_rmse)

print("\nRandom Forest Regression Metrics:")
print("R^2 Score:", rf_r2)
print("Root Mean Squared Error:", rf_rmse)

# Actual-vs-predicted scatter plots, one side-by-side panel per model
plt.figure(figsize=(10, 5))
for _panel, (_predicted, _colour, _heading) in enumerate(
    [
        (linear_predictions, 'blue', 'Linear Regression: Actual vs. Predicted'),
        (rf_predictions, 'green', 'Random Forest Regression: Actual vs. Predicted'),
    ],
    start=1,
):
    plt.subplot(1, 2, _panel)
    plt.scatter(y_test_encoded, _predicted, color=_colour)
    # Dashed red diagonal: points on it have predicted == actual
    plt.plot([0, 2], [0, 2], linestyle='--', color='red')
    plt.title(_heading)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')

plt.tight_layout()
plt.show()

# Residuals (actual minus predicted) for each model
linear_residuals = y_test_encoded - linear_predictions
rf_residuals = y_test_encoded - rf_predictions

# Residual-vs-actual plots, one side-by-side panel per model
plt.figure(figsize=(10, 5))
for _panel, (_residuals, _colour, _heading) in enumerate(
    [
        (linear_residuals, 'blue', 'Linear Regression: Residual Plot'),
        (rf_residuals, 'green', 'Random Forest Regression: Residual Plot'),
    ],
    start=1,
):
    _axes = plt.subplot(1, 2, _panel)
    _axes.scatter(y_test_encoded, _residuals, color=_colour)
    # Dashed red line at zero residual for reference
    _axes.axhline(y=0, color='red', linestyle='--')
    _axes.set_title(_heading)
    _axes.set_xlabel('Actual')
    _axes.set_ylabel('Residual')

plt.tight_layout()
plt.show()

# Put the encoded target next to the training features so the correlation
# matrix covers feature-target pairs as well as feature-feature pairs
train_data = pd.concat([X_train, y_train_encoded], axis='columns')
correlation_matrix = train_data.corr()

# Annotated heatmap of the pairwise correlations
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Heatmap')
plt.show()

# Share of each outcome label in the training target, drawn as a pie chart
plt.figure(figsize=(8, 8))
y_train.value_counts().plot.pie(
    autopct='%1.1f%%',
    colors=['skyblue', 'lightgreen', 'lightcoral'],
)
plt.title('Distribution of Classes in Target Variable')
plt.ylabel('')  # blank out the y-axis label
plt.show()

# Test-set error for the linear model: MSE and MAE
linear_mse, linear_mae = (
    mean_squared_error(y_test_encoded, linear_predictions),
    mean_absolute_error(y_test_encoded, linear_predictions),
)

# Test-set error for the random forest model: MSE and MAE
rf_mse, rf_mae = (
    mean_squared_error(y_test_encoded, rf_predictions),
    mean_absolute_error(y_test_encoded, rf_predictions),
)

# Print one report per model; the second header starts with a blank line
for _header, _mse, _mae in (
    ("Linear Regression Metrics:", linear_mse, linear_mae),
    ("\nRandom Forest Regression Metrics:", rf_mse, rf_mae),
):
    print(_header)
    print("Mean Squared Error:", _mse)
    print("Mean Absolute Error:", _mae)