In [12]:
import requests, time, random
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import numpy as np
from scipy.special import expit as sigmoid
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam


### Scraping

In [2]:
url = "https://fbref.com/en/comps/9/Premier-League-Stats"
try:
    response = requests.get(url)
    response.raise_for_status()  # Raise an error for bad responses
    soup = BeautifulSoup(response.text, 'html.parser')
    standings_table = soup.select('table.stats_table')[0]
    links = standings_table.find_all('a', href=True)
    links = [link['href'] for link in links if 'href' in link.attrs]
    links = [link for link in links if '/squads/' in link]
    team_urls = [f"https://fbref.com{link}" for link in links]

except:
    print(f"An error occurred.")

An error occurred.


In [3]:
team_url = team_urls[0]

response = requests.get(team_url)
response.raise_for_status()  # Raise an error for bad responses
matches = pd.read_html(StringIO(response.text), match='Scores & Fixtures')[0]


NameError: name 'team_urls' is not defined

In [17]:
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a', href=True)
links = [link['href'] for link in links if 'href' in link.attrs]
links = [link for link in links if link and '/all_comps/shooting/' in link]


response = requests.get(f"https://fbref.com{links[0]}")
response.raise_for_status()  # Raise an error for bad responses
shooting = pd.read_html(StringIO(response.text), match='Shooting')[0]

shooting.columns = shooting.columns.droplevel()  # Drop the multi-level index


In [18]:
team_data = matches.merge(shooting[['Date', 'Sh', 'SoT', 'Dist','FK', 'PK', 'PKatt']], on='Date')

In [None]:
def send_request(url,penalty):
    while True:
        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise an error for bad responses
            return response, penalty
        except:
            print(f"Error fetching {url}. Status code: {getattr(response, 'status_code', 'N/A')}. Retrying in {penalty} seconds...")
            penalty *= 2
            if penalty > 60:  # Cap the penalty to avoid excessive waiting
                penalty = 60
            time.sleep(penalty)
    



years = list(range(2025,2020,-1))
all_matches = []
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

for year in years:
    penalty = 2
    response, penalty = send_request(standings_url,penalty)
    soup = BeautifulSoup(response.text, 'html.parser')
    standings_table = soup.select('table.stats_table')[0]

    links = [link.get('href') for link in standings_table.find_all('a', href=True)]
    links = [link for link in links if '/squads/' in link]
    team_urls = [f"https://fbref.com{link}" for link in links]

    previous_season = soup.select('a.prev')[0].get('href')
    standings_url = f"https://fbref.com/{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split('/')[-1].replace('-Stats', '').replace('-', ' ').title()

        response, penalty = send_request(team_url, penalty)
        matches = pd.read_html(StringIO(response.text), match='Scores & Fixtures')[0]

        soup = BeautifulSoup(response.text, 'html.parser')
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        links = [link for link in links if link and '/all_comps/shooting/' in link]
        response, penalty = send_request(f"https://fbref.com{links[0]}",penalty)
        shooting = pd.read_html(StringIO(response.text), match='Shooting')[0]
        shooting.columns = shooting.columns.droplevel()

        try:
            team_data = matches.merge(shooting[['Date', 'Sh', 'SoT', 'Dist','FK', 'PK', 'PKatt']], on='Date')
        except ValueError:
            continue

        team_data = team_data[team_data['Comp'] == 'Premier League']
        team_data['Team'] = team_name
        team_data['Season'] = year
        all_matches.append(team_data)
        


match_df = pd.concat(all_matches)
match_df.columns = [c.lower() for c in match_df.columns]

match_df.to_csv('matches.csv', index=False)


Error fetching https://fbref.com/en/comps/9/Premier-League-Stats. Status code: 429. Retrying in 2 seconds...
Error fetching https://fbref.com/en/comps/9/Premier-League-Stats. Status code: 429. Retrying in 4 seconds...
Error fetching https://fbref.com/en/comps/9/Premier-League-Stats. Status code: 429. Retrying in 8 seconds...
Error fetching https://fbref.com/en/comps/9/Premier-League-Stats. Status code: 429. Retrying in 16 seconds...


### ML

#### Data Preprocessing

In [2]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion": "Brighton",
    'Brighton And Hove Albion': "Brighton",
    'Manchester Utd': "Manchester United",
    "Newcastle United": "Newcastle",
    'Newcastle Utd': "Newcastle",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
    'Sheffield Utd': "Sheffield United",
    'West Bromwich Albion': "West Brom",
    "Nott'ham Forest": "Nottingham Forest",
    'Aston Villa': "Aston Villa",
}

mapping = MissingDict(**map_values)

matches = pd.read_csv('datasets/final_matches.csv')

matches['team'] = matches['team'].map(mapping)
matches['opponent'] = matches['opponent'].map(mapping)

matches['date'] = pd.to_datetime(matches['date'])
matches = matches.sort_values('date').reset_index(drop=True)

matches['venue_code'] = matches['venue'].astype('category').cat.codes
matches['team_code'] = matches['team'].astype('category').cat.codes
matches['hour'] = matches['time'].str.replace(':.+',"",regex=True).astype(int)
matches['day_code'] = matches['date'].dt.dayofweek
matches['target'] = matches['result'].apply(lambda x: 1 if x == 'W' else -1)


In [3]:
def rolling_average(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats

    # Calculate points for form
    result_to_points = {'W': 3, 'D': 1, 'L': 0}
    points = group['result'].map(result_to_points)
    group['form_rolling'] = points.rolling(3, closed='left').sum()

    group = group.dropna(subset=new_cols + ['form_rolling'])
    return group

cols = ['gf', 'ga', 'xga', 'xg', 'poss', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']  
new_cols = [f'{col}_rolling' for col in cols]  

grouped_matches = matches.groupby('team')

matches_rolling = matches.groupby('team').apply(lambda x: rolling_average(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team')
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling.sort_values('date', inplace=True)

  matches_rolling = matches.groupby('team').apply(lambda x: rolling_average(x, cols, new_cols))


In [4]:
# Create a match_id by sorting team/opponent and combining with date
def get_match_id(row):
    teams = sorted([row['team'], row['opponent']])
    # Ensure date is a string in YYYY-MM-DD format
    # 2020-10-03_Leeds United_Manchester City
    if (row['date'] == '2020-10-03'):
        print(True)
    date_str = row['date'].strftime('%Y-%m-%d') if hasattr(row['date'], 'strftime') else str(row['date'])
    return f"{date_str}_{teams[0]}_{teams[1]}"

matches_rolling['match_id'] = matches_rolling.apply(get_match_id, axis=1)


match_id_counts = matches_rolling['match_id'].value_counts()
pd.set_option('display.max_rows', None)


matches_rolling = matches_rolling[matches_rolling['match_id'].map(match_id_counts) > 1].reset_index(drop=True)

In [23]:
df = matches_rolling.iloc[:, [0,9] + list(range(26, matches_rolling.shape[1]))]

# Merge the DataFrame with itself on match_id
merged_df = pd.merge(
    df,
    df,
    on='match_id',
    suffixes=('_home', '_away')
)

# Filter out self-merging by ensuring home and away teams are different
merged_df = merged_df[merged_df['team_home'] != merged_df['team_away']].reset_index(drop=True)

merged_df_sliced = merged_df.drop_duplicates(subset='match_id', keep='first').reset_index(drop=True)

merged_df.head(1)


Unnamed: 0,date_home,opponent_home,team_home,season_home,venue_code_home,team_code_home,hour_home,day_code_home,target_home,gf_rolling_home,...,xga_rolling_away,xg_rolling_away,poss_rolling_away,sh_rolling_away,sot_rolling_away,dist_rolling_away,fk_rolling_away,pk_rolling_away,pkatt_rolling_away,form_rolling_away
0,2020-10-03,Brighton,Everton,2021,1,8,15,5,1,2.666667,...,1.066667,1.8,52.666667,14.0,4.0,18.866667,0.0,0.666667,0.666667,3.0


#### Modeling

In [60]:
rf = SVC(kernel='rbf', C=1, random_state=1)
predictors = ['venue_code_home', 'team_code_home', 'team_code_away', 'day_code_home','gf_rolling_home', 'gf_rolling_away',
              'ga_rolling_home', 'ga_rolling_away', 'xga_rolling_home', 'xga_rolling_away', 'xg_rolling_home', 'xg_rolling_away',
              'sh_rolling_home', 'sh_rolling_away', 'sot_rolling_home', 'sot_rolling_away', 'dist_rolling_home', 'dist_rolling_away',
              'fk_rolling_home', 'fk_rolling_away', 'pk_rolling_home', 'pk_rolling_away', 'pkatt_rolling_home', 'pkatt_rolling_away',
              'form_rolling_home', 'form_rolling_away']


season = 2023

train = merged_df[merged_df['season_home'] < season]
test = merged_df[merged_df['season_home'] == season]

scores = cross_val_score(rf, train[predictors], train['target_home'], cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {scores.mean():.2f}")

rf.fit(train[predictors], train['target_home'])
preds = rf.predict(test[predictors])
acc = accuracy_score(test['target_home'], preds)
precision = precision_score(test['target_home'], preds)
print(f"Test Accuracy: {acc:.2f}, Test Precision: {precision:.2f}")


Cross-validation accuracy: 0.62
Test Accuracy: 0.62, Test Precision: 0.58


In [39]:
season = 2025

train_sliced = merged_df_sliced[merged_df_sliced['season_home'] != season]
test_sliced = merged_df_sliced[merged_df_sliced['season_home'] == season]

scores_sliced = cross_val_score(rf, train_sliced[predictors], train_sliced['target_home'], cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {scores.mean():.2f}")

rf.fit(train_sliced[predictors], train_sliced['target_home'])
preds_sliced = rf.predict(test_sliced[predictors])
acc = accuracy_score(test_sliced['target_home'], preds_sliced)
precision = precision_score(test_sliced['target_home'], preds_sliced)
print(f"Test Accuracy: {acc:.2f}, Test Precision: {precision:.2f}")

Cross-validation accuracy: 0.64
Test Accuracy: 0.64, Test Precision: 0.59


In [40]:
results_df_sliced = pd.DataFrame({
    'date': test_sliced['date_home'],
    'Team': test_sliced['team_home'],
    'Opponent': test_sliced['team_away'],
    'Actual': test_sliced['target_home'],
    'Predicted': preds_sliced,
}).sort_values('date').reset_index(drop=True)

team_list = results_df_sliced['Team'].unique().tolist()

team_points = []
for team in team_list:
    point = 0
    team_home_data = results_df_sliced[results_df_sliced['Team'] == team]
    team_away_data = results_df_sliced[results_df_sliced['Opponent'] == team]

    for index, row in team_home_data.iterrows():
        if row['Predicted'] == 1:
            point += 3
        elif row['Predicted'] == 0:
            point += 1
        
    for index, row in team_away_data.iterrows():
        if row['Predicted'] == -1:
            point += 3
        elif row['Predicted'] == 0:
            point += 1
    
    team_points.append({'Team': team, 'Points': point})
    
team_points = sorted(team_points, key=lambda x: x['Points'], reverse=True)

team_ranking = pd.DataFrame(team_points)


team_ranking


Unnamed: 0,Team,Points
0,Chelsea,81
1,Liverpool,78
2,Bournemouth,69
3,Brighton,69
4,Everton,69
5,Aston Villa,66
6,Manchester City,66
7,Nottingham Forest,60
8,Newcastle,60
9,Manchester United,60


In [61]:
results_df = pd.DataFrame({
    'date': test['date_home'],
    'Team': test['team_home'],
    'Opponent': test['team_away'],
    'Actual': test['target_home'],
    'Predicted': preds,
}).sort_values('date').reset_index(drop=True)

team_list = results_df['Team'].unique().tolist()
team_points = []

for team in team_list:
    point = 0
    team_home_data = results_df[results_df['Team'] == team]
    team_away_data = results_df[results_df['Opponent'] == team]

    for index, row in team_home_data.iterrows():
        similar_game = team_away_data[team_away_data['date'] == row['date']].iloc[0]
        pred1 = row['Predicted']
        pred2 = similar_game['Predicted']
        if pred1 == pred2:
            if random.random() < 0.5:
                    
                if random.random() < 1/3:
                    point += 3
                else:
                    point += 1

        elif row['Predicted'] == 1:
            point += 3
    team_points.append({'Team': team, 'Points': point})

team_points = sorted(team_points, key=lambda x: x['Points'], reverse=True)

team_ranking = pd.DataFrame(team_points)

team_ranking
        
    

Unnamed: 0,Team,Points
0,Manchester City,55
1,Tottenham,54
2,Arsenal,45
3,Everton,42
4,Manchester United,42
5,Liverpool,40
6,Chelsea,36
7,Aston Villa,35
8,Leeds United,34
9,Brighton,34
