In [50]:
import requests, time, random
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
import numpy as np
from scipy.special import expit as sigmoid
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam


### Scraping

In [2]:
url = "https://fbref.com/en/comps/9/Premier-League-Stats"
try:
    response = requests.get(url)
    response.raise_for_status()  # Raise an error for bad responses
    soup = BeautifulSoup(response.text, 'html.parser')
    standings_table = soup.select('table.stats_table')[0]
    links = standings_table.find_all('a', href=True)
    links = [link['href'] for link in links if 'href' in link.attrs]
    links = [link for link in links if '/squads/' in link]
    team_urls = [f"https://fbref.com{link}" for link in links]

except:
    print(f"An error occurred.")

An error occurred.


In [3]:
team_url = team_urls[0]

response = requests.get(team_url)
response.raise_for_status()  # Raise an error for bad responses
matches = pd.read_html(StringIO(response.text), match='Scores & Fixtures')[0]


NameError: name 'team_urls' is not defined

In [17]:
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a', href=True)
links = [link['href'] for link in links if 'href' in link.attrs]
links = [link for link in links if link and '/all_comps/shooting/' in link]


response = requests.get(f"https://fbref.com{links[0]}")
response.raise_for_status()  # Raise an error for bad responses
shooting = pd.read_html(StringIO(response.text), match='Shooting')[0]

shooting.columns = shooting.columns.droplevel()  # Drop the multi-level index


In [18]:
team_data = matches.merge(shooting[['Date', 'Sh', 'SoT', 'Dist','FK', 'PK', 'PKatt']], on='Date')

In [None]:
def send_request(url,penalty):
    while True:
        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise an error for bad responses
            return response, penalty
        except:
            print(f"Error fetching {url}. Status code: {getattr(response, 'status_code', 'N/A')}. Retrying in {penalty} seconds...")
            penalty *= 2
            if penalty > 60:  # Cap the penalty to avoid excessive waiting
                penalty = 60
            time.sleep(penalty)
    



years = list(range(2025,2020,-1))
all_matches = []
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

for year in years:
    penalty = 2
    response, penalty = send_request(standings_url,penalty)
    soup = BeautifulSoup(response.text, 'html.parser')
    standings_table = soup.select('table.stats_table')[0]

    links = [link.get('href') for link in standings_table.find_all('a', href=True)]
    links = [link for link in links if '/squads/' in link]
    team_urls = [f"https://fbref.com{link}" for link in links]

    previous_season = soup.select('a.prev')[0].get('href')
    standings_url = f"https://fbref.com/{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split('/')[-1].replace('-Stats', '').replace('-', ' ').title()

        response, penalty = send_request(team_url, penalty)
        matches = pd.read_html(StringIO(response.text), match='Scores & Fixtures')[0]

        soup = BeautifulSoup(response.text, 'html.parser')
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        links = [link for link in links if link and '/all_comps/shooting/' in link]
        response, penalty = send_request(f"https://fbref.com{links[0]}",penalty)
        shooting = pd.read_html(StringIO(response.text), match='Shooting')[0]
        shooting.columns = shooting.columns.droplevel()

        try:
            team_data = matches.merge(shooting[['Date', 'Sh', 'SoT', 'Dist','FK', 'PK', 'PKatt']], on='Date')
        except ValueError:
            continue

        team_data = team_data[team_data['Comp'] == 'Premier League']
        team_data['Team'] = team_name
        team_data['Season'] = year
        all_matches.append(team_data)
        


match_df = pd.concat(all_matches)
match_df.columns = [c.lower() for c in match_df.columns]

match_df.to_csv('matches.csv', index=False)


Error fetching https://fbref.com/en/comps/9/Premier-League-Stats. Status code: 429. Retrying in 2 seconds...
Error fetching https://fbref.com/en/comps/9/Premier-League-Stats. Status code: 429. Retrying in 4 seconds...
Error fetching https://fbref.com/en/comps/9/Premier-League-Stats. Status code: 429. Retrying in 8 seconds...
Error fetching https://fbref.com/en/comps/9/Premier-League-Stats. Status code: 429. Retrying in 16 seconds...


### ML

#### Data Preprocessing

In [141]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}

mapping = MissingDict(**map_values)

matches = pd.read_csv('datasets/final_matches.csv')

matches['team'] = matches['team'].map(mapping)
matches['opponent'] = matches['opponent'].map(mapping)

matches['date'] = pd.to_datetime(matches['date'])
matches = matches.sort_values('date').reset_index(drop=True)

matches['venue_code'] = matches['venue'].astype('category').cat.codes
matches['team_code'] = matches['team'].astype('category').cat.codes
matches['hour'] = matches['time'].str.replace(':.+',"",regex=True).astype(int)
matches['day_code'] = matches['date'].dt.dayofweek
matches['target'] = matches['result'].apply(lambda x: 1 if x == 'W' else -1)


In [143]:
def rolling_average(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats

    # Calculate points for form
    result_to_points = {'W': 3, 'D': 1, 'L': 0}
    points = group['result'].map(result_to_points)
    group['form_rolling'] = points.rolling(3, closed='left').sum()

    group = group.dropna(subset=new_cols + ['form_rolling'])
    return group

cols = ['gf', 'ga', 'xga', 'xg', 'poss', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']  
new_cols = [f'{col}_rolling' for col in cols]  

grouped_matches = matches.groupby('team')
group = grouped_matches.get_group('Arsenal')

matches_rolling = matches.groupby('team').apply(lambda x: rolling_average(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team')
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling.sort_values('date', inplace=True)

  matches_rolling = matches.groupby('team').apply(lambda x: rolling_average(x, cols, new_cols))


In [144]:
matches_rolling[(matches_rolling['date'] == '2020-10-03') & (((matches_rolling['team'] == 'Crystal Palace')) | (matches_rolling['opponent'] == 'Crystal Palace'))]

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,opp formation,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,team,season,venue_code,team_code,hour,day_code,target,gf_rolling,ga_rolling,xga_rolling,xg_rolling,poss_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling,form_rolling
1119,2020-10-03,12:30,Premier League,Matchweek 4,Sat,Away,L,0,4,Chelsea,0.2,2.7,30.0,,Wilfried Zaha,4-4-2,4-2-3-1,Michael Oliver,Match Report,,4.0,0.0,22.5,0.0,0,0,Crystal Palace,2021,0,7,12,5,-1,1.666667,1.0,1.333333,1.033333,32.666667,8.666667,2.666667,15.833333,0.0,0.333333,0.333333,6.0
932,2020-10-03,12:30,Premier League,Matchweek 4,Sat,Home,W,4,0,Crystal Palace,2.7,0.2,70.0,,César Azpilicueta,4-2-3-1,4-4-2,Michael Oliver,Match Report,,15.0,4.0,17.2,0.0,2,2,Chelsea,2021,1,6,12,5,1,2.0,2.0,1.266667,1.366667,54.0,11.666667,5.333333,21.133333,0.333333,0.333333,0.666667,4.0


In [None]:
df = matches_rolling.iloc[:, [0,9] + list(range(26, matches_rolling.shape[1]))]

def find_paired_row(df):
    # Returns the index of the paired row for each row in df
    paired_indices = []
    for idx, row in df.iterrows():
        mask = (df['date'] == row['date']) & (df['team'] == row['opponent']) & (df['opponent'] == row['team'])
        paired = df[mask]
        if not paired.empty:
            paired_indices.append(paired.index[0])
        else:
            paired_indices.append(None)
    return paired_indices

paired_row_indices = find_paired_row(df)
paired_row_indices

In [99]:
data = matches_rolling.iloc[:, [0,9] + list(range(26, matches_rolling.shape[1]))]
data['date'] = pd.to_datetime(data['date'])
data = data.sort_values('date')

merged = data.merge(
    data,
    left_on=['date', 'team','opponent'],
    right_on=['date', 'opponent','team'],
    suffixes=('_home', '_away')
)
data = merged.drop(columns=['opponent_home', 'season_away','venue_code_away', 'hour_away','day_code_away','target_away',
                     'opponent_away'])
['season', 'team_home', 'team_away',]
pd.set_option('display.max_columns', None)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['date'] = pd.to_datetime(data['date'])


Unnamed: 0,date,team_home,season_home,venue_code_home,opp_code_home,hour_home,day_code_home,target_home,gf_rolling_home,ga_rolling_home,xga_rolling_home,xg_rolling_home,poss_rolling_home,sh_rolling_home,sot_rolling_home,dist_rolling_home,fk_rolling_home,pk_rolling_home,pkatt_rolling_home,form_rolling_home,team_away,opp_code_away,gf_rolling_away,ga_rolling_away,xga_rolling_away,xg_rolling_away,poss_rolling_away,sh_rolling_away,sot_rolling_away,dist_rolling_away,fk_rolling_away,pk_rolling_away,pkatt_rolling_away,form_rolling_away
0,2020-10-03,Crystal Palace,2021,0,6,12,5,-1,1.666667,1.0,1.333333,1.033333,32.666667,8.666667,2.666667,15.833333,0.0,0.333333,0.333333,6.0,Chelsea,7,2.0,2.0,1.266667,1.366667,54.0,11.666667,5.333333,21.133333,0.333333,0.333333,0.666667,4.0
1,2020-10-03,Chelsea,2021,1,7,12,5,1,2.0,2.0,1.266667,1.366667,54.0,11.666667,5.333333,21.133333,0.333333,0.333333,0.666667,4.0,Crystal Palace,6,1.666667,1.0,1.333333,1.033333,32.666667,8.666667,2.666667,15.833333,0.0,0.333333,0.333333,6.0
2,2020-10-17,Liverpool,2021,0,8,12,5,-1,2.333333,2.666667,1.633333,2.2,65.333333,17.666667,7.0,17.466667,0.666667,0.0,0.0,6.0,Everton,13,3.666667,1.666667,0.666667,2.466667,57.333333,12.333333,5.666667,15.166667,0.666667,0.333333,0.333333,9.0
3,2020-10-17,Southampton,2021,0,6,15,5,-1,1.666667,1.666667,0.933333,1.266667,53.333333,10.333333,4.666667,17.933333,0.333333,0.333333,0.333333,6.0,Chelsea,21,2.333333,1.666667,0.966667,1.866667,61.333333,13.666667,5.333333,19.766667,0.333333,0.666667,1.0,4.0
4,2020-10-17,Chelsea,2021,1,21,15,5,-1,2.333333,1.666667,0.966667,1.866667,61.333333,13.666667,5.333333,19.766667,0.333333,0.666667,1.0,4.0,Southampton,6,1.666667,1.666667,0.933333,1.266667,53.333333,10.333333,4.666667,17.933333,0.333333,0.333333,0.333333,6.0


#### Modeling

In [51]:
rf = RandomForestClassifier(n_estimators=100, min_samples_split=10, random_state=1)
predictors = ['venue_code', 'xga_rolling', 'xg_rolling', 'poss_rolling', 'opp_code', 'hour', 'day_code', 'form_rolling']

data = matches_rolling.iloc[:, 27:]

def make_predictions(data, predictors):
    train = data[data['season'] < 2025]
    test = data[data['season'] == 2025]
    rf.fit(train[predictors], train['target'])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame({
        'actual': test['target'],
        'predicted': preds
    })
    precision = precision_score(test['target'], preds)
    acc = accuracy_score(test['target'], preds)
    return combined, precision, acc

combined, precision, acc = make_predictions(data, predictors + new_cols)



print(f"Precision: {precision:.2f}, Accuracy: {acc:.2f}")


Precision: 0.54, Accuracy: 0.64


In [55]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}

mapping = MissingDict(**map_values)

combined_merged = combined.merge(matches_rolling[['date', 'team', 'opponent', 'form_rolling']], left_index=True, right_index=True)

combined_merged['team'] = combined_merged['team'].map(mapping)

rows_to_drop = set()
for idx, row in combined_merged.iterrows():
    # Find the paired row: same date, team is opponent and opponent is team
    mask = (
        (combined_merged['date'] == row['date']) &
        (combined_merged['team'] == row['opponent']) &
        (combined_merged['opponent'] == row['team'])
    )
    paired = combined_merged[mask]
    if not paired.empty:
        paired_idx = paired.index[0]
        # Only process each pair once
        if paired_idx in rows_to_drop or idx in rows_to_drop or paired_idx == idx:
            continue
        # If predicted values are equal, set predicted to 0 for the kept row
        if row['predicted'] == paired.iloc[0]['predicted']:
            combined_merged.at[idx, 'predicted'] = 0 if random.random() < 2/3 else -1
        # Mark the paired row for removal
        rows_to_drop.add(paired_idx)

# Drop the marked rows
combined_merged = combined_merged.drop(list(rows_to_drop)).reset_index(drop=True)

combined_merged

Unnamed: 0,actual,predicted,date,team,opponent,form_rolling
0,1,1,2024-08-17,Arsenal,Wolves,9.0
1,1,1,2024-08-24,Arsenal,Aston Villa,9.0
2,-1,1,2024-08-31,Arsenal,Brighton,9.0
3,1,1,2024-09-15,Arsenal,Tottenham,7.0
4,-1,-1,2024-09-22,Arsenal,Manchester City,7.0
...,...,...,...,...,...,...
483,-1,-1,2024-09-15,Wolves,Newcastle Utd,1.0
484,-1,-1,2024-10-26,Wolves,Brighton,0.0
485,-1,-1,2025-01-06,Wolves,Nott'ham Forest,7.0
486,-1,-1,2025-01-15,Wolves,Newcastle Utd,4.0


In [56]:
combined_merged[(combined_merged['team'] == 'Liverpool') | (combined_merged['opponent'] == 'Liverpool')]

Unnamed: 0,actual,predicted,date,team,opponent,form_rolling
8,-1,0,2024-10-27,Arsenal,Liverpool,6.0
35,-1,0,2025-05-11,Arsenal,Liverpool,4.0
47,-1,-1,2024-11-09,Aston Villa,Liverpool,4.0
61,-1,0,2025-02-19,Aston Villa,Liverpool,2.0
78,-1,0,2024-09-21,Bournemouth,Liverpool,4.0
95,-1,-1,2025-02-01,Bournemouth,Liverpool,7.0
109,-1,-1,2024-08-25,Brentford,Liverpool,6.0
126,-1,-1,2025-01-18,Brentford,Liverpool,4.0
149,-1,-1,2024-11-02,Brighton And Hove Albion,Liverpool,7.0
176,1,-1,2025-05-19,Brighton And Hove Albion,Liverpool,7.0


In [58]:
combined_merged['predicted_points'] = combined_merged['predicted'].map({1: 3, 0: 1, -1: 0})

combined_merged.groupby('team')['predicted_points'].sum().sort_values(ascending=False)



team
Arsenal                     74
Chelsea                     50
Fulham                      37
Liverpool                   37
Aston Villa                 35
Bournemouth                 35
Newcastle                   33
Manchester City             33
Crystal Palace              25
Brighton And Hove Albion    21
Everton                     18
Brentford                   17
Nottingham Forest           15
Leicester City              13
Ipswich Town                10
Manchester Utd              10
Tottenham                   10
West Ham                     7
Southampton                  4
Wolves                       0
Name: predicted_points, dtype: int64

In [23]:
123 / (123 + 83)

0.5970873786407767