<a href="https://colab.research.google.com/github/bbqgonewrong/FootyAnalytics/blob/main/premier_league_predictions_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score

In [2]:
standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'

In [4]:
data = requests.get(standings_url)

In [8]:
soup = BeautifulSoup(data.text)

In [10]:
standings_table = soup.select('table.stats_table')[0]

In [11]:
links = standings_table.find_all('a')

In [13]:
links = [l.get('href') for l in links]

In [15]:
links = [l for l in links if '/squads/' in l]

In [31]:
team_urls = [f"https://fbref.com{l}" for l in links]

In [None]:
team_urls
#https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats

In [None]:
team_url = team_urls[0]
data = requests.get(team_url)
data.text

In [35]:
matches = pd.read_html(data.text,match = 'Scores & Fixtures')

In [65]:
matches = matches[0]

In [41]:
soup = BeautifulSoup(data.text)
links = soup.find_all('a')
links = [l.get('href') for l in links]
links = [l for l in links if l and 'all_comps/shooting' in l]
data = requests.get(f'https://fbref.com{links[0]}')

In [None]:
data.text

In [57]:
shooting = pd.read_html(data.text,match='Shooting')[0]

In [None]:
shooting.head()

In [58]:
shooting.columns = shooting.columns.droplevel()

In [63]:
type(matches),type(shooting)

(list, pandas.core.frame.DataFrame)

In [66]:
#Joining the two tables

team_data = matches.merge(shooting[['Date','Sh','SoT','Dist','FK','PK','PKatt']],on = 'Date')

In [69]:
team_data.shape,matches.shape,shooting.shape

((58, 25), (58, 19), (59, 26))

In [72]:
years = list(range(2022,2020,-1))

In [73]:
years

[2022, 2021]

In [74]:
all_matches = []

#Creating the formal loop for scraping shooting data and team stats in a single loop

In [78]:
for year in years:
  data = requests.get(standings_url)
  soup = BeautifulSoup(data.text)
  standings_table = soup.select('table.stats_table')[0]
  links = [l.get('href') for l in standings_table.find_all('a')]
  links = [l for l in links if '/squads/' in l]
  team_urls = [f'https://fbref.com{l}' for l in links]

  #Getting statistics for the previous season 
  previous_season = soup.select('a.prev')[0].get('href')
  standings_url = f'https://fbref.com/{previous_season}'

  for team_url in team_urls:
    team_name = team_url.split('/')[-1].replace('-Stats','').replace('-','')

    data = requests.get(team_url)
    matches = pd.read_html(data.text,match = 'Scores & Fixtures')[0]

    soup = BeautifulSoup(data.text)
    links = [l.get('href') for l in soup.find_all('a')]
    links = [l for l in links if l and 'all_comps/shooting' in l]
    data = requests.get(f'https://fbref.com{links[0]}')
    shooting = pd.read_html(data.text,match='Shooting')[0]
    shooting.columns = shooting.columns.droplevel()
    try:
      team_data = matches.merge(shooting[['Date','Sh','SoT','Dist','FK','PK','PKatt']],on='Date')
    except ValueError:
      continue
    team_data = team_data[team_data['Comp']=='Premier League']
    team_data['Season'] = year
    team_data['Team'] = team_name
    all_matches.append(team_data)
    time.sleep(1) #Allow organic website traffic
  

In [79]:
match_df = pd.concat(all_matches)
match_df.columns = [c.lower() for c in match_df.columns]
match_df.to_csv('matches.csv')

In [81]:
from google.colab import files
files.download('matches.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#Calculating the match winner from the data created

In [2]:
matches = pd.read_csv('matches.csv',index_col = 0)

In [7]:
matches.comp.unique()

array(['Premier League'], dtype=object)

In [5]:
matches.shape

(1558, 27)

In [None]:
matches['team'].value_counts()

In [18]:
liv_matches = matches[matches['team']=='Liverpool']

In [None]:
matches['round'].value_counts()

In [None]:
matches.dtypes

In [22]:
#Covert datetime from object to datetime
matches['date'] = pd.to_datetime(matches['date'])

In [None]:
matches

#Creating a few basic predictors

In [25]:
#Convert home and away into tokens
matches['venue_code'] = matches['venue'].astype('category').cat.codes

In [30]:
matches['opp_code'] = matches['opponent'].astype('category').cat.codes

In [31]:
matches['hour'] = matches['time'].str.replace(':.+','',regex=True).astype('int')

In [33]:
matches['day_code'] = matches['date'].dt.dayofweek

In [37]:
#Combines L and D as a single unit. Can try to change this around
matches['target'] = (matches['result']=='W').astype('int')

#Defining a Random Forest Classifier

In [52]:
rf = RandomForestClassifier(n_estimators=50,min_samples_split=10,random_state=42)

In [53]:
train = matches[matches['date']<'2021-01-01']

In [54]:
test = matches[matches['date']>'2021-01-01']

In [55]:
predictors = ['venue_code','opp_code','hour','day_code']

In [56]:
rf.fit(train[predictors],train['target'])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=42)

In [57]:
len(train),len(test)

(1534, 24)

In [58]:
preds = rf.predict(test[predictors])

In [59]:
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 1])

##Adding accuracy to the predictions

In [60]:
accuracy = accuracy_score(test['target'],preds)

In [61]:
accuracy

0.16666666666666666

In [64]:
combined = pd.DataFrame(dict(actual = test['target'],predictions=preds))

In [None]:
combined

In [None]:
pd.crosstab(index=combined['actual'],columns = combined['predictions'])

##Adding precision to the predictions

In [None]:
precision_score(test['target'],preds)

#Creating rolling average metrics

In [69]:
grouped_matches = matches.groupby('team')

In [71]:
group = grouped_matches.get_group('ManchesterCity')

In [None]:
group

In [73]:
def rolling_averages(group,cols,new_cols):
  group = group.sort_values('date')
  rolling_stats = group[cols].rolling(3,closed='left').mean()
  #Closed left allows the rolling average not to be calculated for the present week
  group[new_cols] = rolling_stats
  group = group.dropna(subset=new_cols)
  return group


In [74]:
cols = ['gf','ga','sh','sot','dist','fk','pk','pkatt']
new_cols = [f'{c}_rolling' for c in cols]

In [75]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [None]:
rolling_averages(group,cols,new_cols)

In [78]:
matches_rolling = matches.groupby('team').apply(lambda x:rolling_averages(x,cols,new_cols))

In [None]:
matches_rolling

In [80]:
matches_rolling = matches_rolling.droplevel('team')

In [82]:
matches_rolling.index = range(matches_rolling.shape[0])

In [131]:
def make_predictions(data,predictors):
  train = data[data['date'] < '2020-06-10']
  test = data[data['date'] > '2020-06-10']
  rf.fit(train[predictors],train['target'])
  preds = rf.predict(test[predictors])
  combined = pd.DataFrame(dict(actual = test['target'],predictions = preds),index = test.index)
  precision = precision_score(test['target'],preds)
  return combined,precision


In [132]:
cols_preds = predictors+new_cols

In [133]:
train[predictors]

Unnamed: 0,venue_code,opp_code,hour,day_code
0,0,24,20,0
2,1,12,16,6
4,0,11,17,5
5,1,0,17,5
7,0,23,12,5
...,...,...,...,...
35,0,20,12,5
36,1,21,15,5
37,0,13,20,4
38,1,15,14,6


In [134]:
combined,precision = make_predictions(matches_rolling,cols_preds)

In [135]:
combined = combined.merge(matches_rolling[['date','team','opponent','result']],left_index = True,right_index = True)

In [136]:
len(combined)

222

#Combining Home and Away

##Creating a class for fuzzy matching exact team names to their vernacular use

In [137]:
class MissingDict(dict):
  __missing__ = lambda self,key:key

map_values = {
    'Brighton and Hove Albion': 'Brighton',
    'ManchesterUnited' :'ManchesterUtd',
    'NewcastleUnited' : 'NewcastleUtd',
    'Tottenham Hotspurs' : 'Tottenham',
    'West Ham United' : 'West Ham',
    'Wolverhampton Wanderers' : 'Wolves'

}
mapping = MissingDict(**map_values)

In [None]:
mapping["Arsenal"]

In [138]:
combined['new_team'] = combined['team'].map(mapping)

In [None]:
combined

In [140]:
merged = combined.merge(combined,left_on = ['date','new_team'],right_on = ['date','opponent'])

In [None]:
combined['new_team']

In [None]:
merged

##Checking the values where one team was predicted to win while the other was predicted to lose or draw

In [None]:
merged[(merged['predictions_x']==1)& (merged['predictions_y']==0)]['actual_x'].value_counts()

In [148]:
9/14

0.6428571428571429