In [13]:
from datetime import datetime
import numpy as np
import pandas as pd
import pandasql as psql
from sklearn.linear_model import LinearRegression

### Expected goals (xG) vs. goals analysis: year-to-year variation

#### Data taken from https://understat.com/

In [2]:
# Read the per-team, per-season table from the CSV that sits next to the
# notebook, then preview the first rows to check the columns.
xg_csv_path = "Premier_League_xG.csv"
df = pd.read_csv(xg_csv_path)
df.head()

Unnamed: 0,Season,№,Team,M,G,GA,XG,XGA
0,2020,1,Liverpool,37,82,32,74.63,39.23
1,2020,2,Manchester City,37,97,35,98.64,36.21
2,2020,3,Manchester United,37,64,36,64.78,37.3
3,2020,4,Chelsea,37,67,54,75.07,40.92
4,2020,5,Leicester,37,67,39,60.26,46.48


In [24]:
# Adjust for number of matches: rescale each counting stat from the `M`
# matches actually recorded to a common 38-match basis, so rows with a
# different number of matches are comparable.
MATCHES_BASIS = 38
for stat in ['G', 'GA', 'XG', 'XGA']:
    df[stat] = df[stat] * MATCHES_BASIS / df['M']

# The stats above are now expressed per 38 matches, so record that in `M`.
# This also makes the cell idempotent: without it, re-running the cell would
# multiply every row with M != 38 by 38/M a second time and silently inflate
# the numbers. With M == 38 a re-run scales by 38/38 and changes nothing.
df['M'] = MATCHES_BASIS

In [25]:
# Self-join `df` on team so that each (team, season) row is paired with the
# same team's row from the season before (cur.season = prev.season + 1).
# The comma join plus WHERE clause keeps only rows with a match, so a
# team-season with no preceding season in `df` is left out of `combined`.
year_over_year_sql = """
select cur.team, cur.season, cur.G, cur.GA, prev.G as prev_g, prev.GA as prev_ga, prev.XG as prev_xg, prev.XGA as prev_xga
from df as cur, df as prev
where cur.team = prev.team
and cur.season = prev.season + 1
"""
# sqldf resolves the table name `df` from the namespace passed in here.
combined = psql.sqldf(year_over_year_sql, locals())
combined.head()

Unnamed: 0,Team,Season,G,GA,prev_g,prev_ga,prev_xg,prev_xga
0,Liverpool,2020,84.216216,32.864865,89.0,22.0,79.46,29.15
1,Manchester City,2020,99.621622,35.945946,95.0,23.0,93.72,25.73
2,Manchester United,2020,65.72973,36.972973,65.0,54.0,68.62,52.3
3,Chelsea,2020,68.810811,55.459459,63.0,39.0,63.97,38.11
4,Leicester,2020,68.810811,40.054054,51.0,48.0,52.11,44.64


In [26]:
# For each target (this season's goals for / against), fit a one-feature
# linear regression on a previous-season predictor and report R^2.
# Each target is tried with two predictors: last season's actual total and
# last season's expected total, so the two R^2 values can be compared.
#
# Each tuple is (target column, predictor column, label used in the printout).
r2_comparisons = [
    ('G', 'prev_g', 'Goals'),
    ('G', 'prev_xg', 'Expected Goals'),
    ('GA', 'prev_ga', 'Goals Against'),
    ('GA', 'prev_xga', 'Expected Goals Against'),
]

for target_col, predictor_col, label in r2_comparisons:
    # Double brackets keep X as a 2-D, single-column frame for the estimator.
    X = combined[[predictor_col]]
    y = combined[target_col]
    # X, y and g_reg are deliberately left in the notebook namespace holding
    # the last fit (prev_xga -> GA), matching what the unrolled version left.
    g_reg = LinearRegression().fit(X, y)
    # Scored on the same rows used for fitting, i.e. an in-sample R^2.
    print(f"{label} R^2: {g_reg.score(X, y)}")

Goals R^2: 0.6067729074968935
Expected Goals R^2: 0.6322451358342138
Goals Against R^2: 0.370401381662048
Expected Goals Against R^2: 0.4016001705432335
