# Data Hackathon - Player Goals Per 90 In the Premier League

## Analysis Questions

1. Which players have the highest Goals per 90 across different teams?
2. Do certain countries produce players with higher scoring rates?
3. How does total goals vary by number of matches played?
4. Which teams have the highest average Goals per 90 across their players?

## Hypotheses

1. Players with more minutes played tend to score fewer total goals.

2. Players from top-performing teams have higher Goals per 90 on average than those from lower-ranked teams.

## Target variable for Machine Learning

- Total Goals

In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the per-player scoring table. The path is relative, so it is resolved
# against the kernel's current working directory.
DATA_FILE = 'player_goals_per_90.csv'
df = pd.read_csv(DATA_FILE)

In [0]:
# Peek at the first rows of the freshly loaded table.
df.head()

In [0]:
# Remove the 'Rank' column. Reassigning the result (instead of inplace=True)
# together with errors='ignore' keeps this cell re-runnable: executing it a
# second time is a no-op rather than a KeyError for the already-removed column.
df = df.drop(columns=['Rank'], errors='ignore')

In [0]:
# Preview again to confirm the frame no longer carries the 'Rank' column.
df.head()

In [0]:
# Summarise columns, dtypes and non-null counts.
df.info()

In [0]:
# Store the goal tally as a plain integer column; all other columns keep
# their dtypes.
df = df.astype({'Total Goals': int})

In [0]:
# Re-inspect the dtypes after the 'Total Goals' cast.
df.info()

In [0]:
# Preview the frame after the dtype change.
df.head()

In [0]:
# List the distinct values present in the 'Country' column.
df['Country'].unique()

In [0]:
# Lookup table from three-letter country codes to display names; it is applied
# to the 'Country' column right after this literal.
country_mapping = {
    'COL': 'Colombia',
    'NOR': 'Norway', 
    'SWE': 'Sweden',
    'ENG': 'England',
    'POR': 'Portugal',
    'NZL': 'New Zealand',
    'FRA': 'France',
    'BEL': 'Belgium',
    'BRA': 'Brazil',
    'EGY': 'Egypt',
    'KOR': 'South Korea',
    'NGA': 'Nigeria',
    'URU': 'Uruguay',
    'SEN': 'Senegal',
    'MEX': 'Mexico',
    'GER': 'Germany',
    'NED': 'Netherlands',
    'COD': 'DR Congo',
    'JAM': 'Jamaica',
    'DEN': 'Denmark',
    'SCO': 'Scotland',
    'CMR': 'Cameroon',
    'IRL': 'Ireland',
    'ARG': 'Argentina',
    'ESP': 'Spain',
    'GHA': 'Ghana',
    'UKR': 'Ukraine',
    'MLI': 'Mali',
    'CIV': 'Ivory Coast',
    'RSA': 'South Africa',
    'SUI': 'Switzerland',
    'CZE': 'Czech Republic',
    'WAL': 'Wales',
    'ITA': 'Italy',
    'HAI': 'Haiti',
    'JPN': 'Japan',
    'ALB': 'Albania',
    'CRO': 'Croatia',
    'PAR': 'Paraguay',
    'ECU': 'Ecuador',
    'HUN': 'Hungary',
    'GAB': 'Gabon',
    'IRN': 'Iran',
    'POL': 'Poland',
    'ALG': 'Algeria',
    'ISL': 'Iceland',
    'SRB': 'Serbia',
    'BIH': 'Bosnia and Herzegovina',
    'BFA': 'Burkina Faso',
    'GRE': 'Greece',
    'NIR': 'Northern Ireland',
    'MAR': 'Morocco',
    'USA': 'United States'
}


# Translate country codes to display names. Values that have no entry in
# country_mapping keep their current value instead of becoming NaN. That also
# makes the cell safe to re-run: names that were already translated are left
# untouched rather than being wiped out by a second .map() pass.
df['Country'] = df['Country'].map(country_mapping).fillna(df['Country'])

print(df['Country'].unique())

In [0]:
# Preview the frame after the 'Country' values were mapped to display names.
df.head()

In [0]:
# For every team, locate the row of the player with the highest scoring rate,
# then order those rows from lowest to highest rate for plotting.
best_row_index = df.groupby('Team')['Goals per 90'].idxmax()
top_player_per_team = df.loc[best_row_index].sort_values('Goals per 90', ascending=True)

# One "Team (Player)" label per selected row, in plotting order.
team_player_labels = [f"{row['Team']} ({row['Player']})" for _, row in top_player_per_team.iterrows()]

ax = sns.barplot(x=top_player_per_team['Goals per 90'], y=team_player_labels, palette='viridis')
ax.set_title('Highest Goals per 90 Player from Each Team')
ax.set_xlabel('Goals per 90')
ax.set_ylabel('Team (Player)')
plt.tight_layout()
plt.show()

In [0]:
# Mean scoring rate and player count per country.
country_avg = (
    df.groupby('Country')['Goals per 90']
    .agg(['mean', 'count'])
    .reset_index()
)

# Keep countries represented by at least 3 players, then take the 10 best means.
country_filtered = (
    country_avg[country_avg['count'] >= 3]
    .sort_values('mean', ascending=False)
    .head(10)
)

# Plot by the 'Country' column itself. Using the frame's numeric index as x
# would make seaborn order the bars by sorted index value, so tick labels
# applied afterwards in descending-mean order would not match their bars.
# With string categories the bars keep the row order (best mean first) and
# are labelled automatically.
sns.barplot(data=country_filtered, x='Country', y='mean', palette='plasma')
plt.title('Average Goals per 90 by Country (Top 10, min 3 players)')
plt.xlabel('Country')
plt.ylabel('Average Goals per 90')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [0]:
# One point per row of df. No palette is passed: without a `hue` variable
# seaborn has nothing to map colours to and would ignore it with a warning.
sns.scatterplot(data=df, x='Matches', y='Total Goals')
plt.title('Total Goals vs Matches Played')
plt.xlabel('Matches Played')
plt.ylabel('Total Goals')
plt.tight_layout()
plt.show()

In [0]:
# Mean scoring rate of each team's players, highest average first.
team_avg = (
    df.groupby('Team')['Goals per 90']
    .mean()
    .sort_values(ascending=False)
)

ax = sns.barplot(x=team_avg.index, y=team_avg.values, palette='Set2')
ax.set_title('Average Goals per 90 by Team')
ax.set_xlabel('Team')
ax.set_ylabel('Average Goals per 90')
plt.xticks(rotation=90)  # vertical tick labels
plt.tight_layout()
plt.show()

In [0]:
# Line plot of total goals against minutes played.
ax = sns.lineplot(data=df, x='Minutes', y='Total Goals')
ax.set_title('Total Goals vs Minutes Played')
ax.set_xlabel('Minutes Played')
ax.set_ylabel('Total Goals')
plt.tight_layout()
plt.show()

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

In [0]:
# Feature matrix and regression target as plain NumPy arrays. The column order
# matters: later cells index X positionally and build new samples in this order.
# NOTE(review): 'Total Goals' looks derivable from 'Minutes' and 'Goals per 90'
# (goals ~ rate * minutes / 90) — confirm the features do not leak the target.
feature_columns = ['Minutes', 'Matches', 'Goals per 90']
X = df[feature_columns].values
y = df['Total Goals'].values


In [0]:
# Reassemble the feature matrix and the target into one frame for a quick look
# at what the model will be trained on.
ml_df = pd.DataFrame(
    X, columns=['Minutes', 'Matches_Played', 'Goals_per_90']
).assign(Total_Goals=y)
ml_df.head()

In [0]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
# Fit a linear regression on the training split only.
model = LinearRegression()
model.fit(X_train, y_train)

In [0]:
# Predict the target for the held-out rows.
y_pred = model.predict(X_test)

# Bare expression so the notebook displays the prediction array.
y_pred

In [0]:
# Spot-check up to five held-out rows. Bounding the range by the test-set size
# means a split with fewer than five rows prints what is available instead of
# raising IndexError.
for i in range(min(5, len(y_test))):
    print(f"Actual: {y_test[i]}, Predicted: {y_pred[i]:.1f}")

In [0]:
# Predicted against actual goals on the test split; points lying on the dashed
# red diagonal are exact predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, color='purple', alpha=0.6, label='Predictions')

lowest, highest = y_test.min(), y_test.max()
ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2, label='Perfect Prediction')

ax.set_title('Actual vs Predicted Total Goals')
ax.set_xlabel('Actual Total Goals')
ax.set_ylabel('Predicted Total Goals')
ax.legend()
ax.grid(True)
plt.show()

In [0]:
# Score the held-out predictions: mean squared error and R2.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (('Mean Squared Error:', mse), ('R2 Score:', r2)):
    print(label, value)

In [0]:
# A single hand-written sample. Values follow the column order used to build
# the feature matrix X: minutes, matches, goals per 90.
new_player = [[2000, 25, 0.5]] 
prediction = model.predict(new_player)

# Echo the inputs next to the model's estimate (rounded to one decimal).
print(f"Prediction for new player:")
print(f"Minutes: {new_player[0][0]}, Matches: {new_player[0][1]}, Goals per 90: {new_player[0][2]}")
print(f"Predicted Total Goals: {prediction[0]:.1f}")