# Premier League Prediction Project

In [1]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

stats_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'
seasons = [2023, 2024, 2025]

### Accessing the website through Selenium to get data

In [2]:
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options = options)

driver.get(stats_url)
time.sleep(6)
html = driver.page_source
driver.quit()

soup = BeautifulSoup(html, "html.parser")
standing_table = soup.select_one("table.stats_table") # we only need the first one among table.stats_table elements

### Example of accessing each team's data

In [3]:

links = [l.get('href') for l in standing_table.find_all('a') if '/squads' in l.get('href')] # parse the team_url from Premier League Standings Table
team_url = [f'https://fbref.com{link}' for link in links]
liverpool_url = team_url[0] # first team (winner) of season 2024-2025
liverpool_url


'https://fbref.com/en/squads/822bd0ba/2024-2025/Liverpool-Stats'

In [6]:
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options = options)

driver.get(liverpool_url)
time.sleep(6)
liverpool_html = driver.page_source
driver.quit()

liverpool_df = pd.read_html(liverpool_html, match="Scores & Fixtures")[0] # get the team's Scores & Fixtures table and convert it to pandas df
print(liverpool_df.head())

         Date           Time            Comp         Round  Day Venue Result  \
0  2024-08-17  12:30 (20:30)  Premier League   Matchweek 1  Sat  Away      W   
1  2024-08-25  16:30 (00:30)  Premier League   Matchweek 2  Sun  Home      W   
2  2024-09-01  16:00 (00:00)  Premier League   Matchweek 3  Sun  Away      W   
3  2024-09-14  15:00 (23:00)  Premier League   Matchweek 4  Sat  Home      L   
4  2024-09-17  21:00 (04:00)    Champions Lg  League phase  Tue  Away      W   

  GF GA         Opponent   xG  xGA Poss Attendance          Captain Formation  \
0  2  0     Ipswich Town  2.6  0.5   62      30014  Virgil van Dijk   4-2-3-1   
1  2  0        Brentford  2.5  0.5   62      60017  Virgil van Dijk   4-2-3-1   
2  3  0   Manchester Utd  1.8  1.4   47      73738  Virgil van Dijk   4-2-3-1   
3  0  1  Nott'ham Forest  0.9  0.4   68      60344  Virgil van Dijk   4-2-3-1   
4  3  1         it Milan  3.1  0.6   51      59826  Virgil van Dijk   4-2-3-1   

  Opp Formation         Referee 

  liverpool_df = pd.read_html(liverpool_html, match="Scores & Fixtures")[0]


## Getting All Team's Data Using Same Method

In [13]:
stats_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'
seasons = [2023, 2024, 2025]
prem_matches = []

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options = options)

for season in seasons:
    driver.get(stats_url)
    time.sleep(6)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    standing_table = soup.select_one('table.stats_table')

    links = [l.get('href') for l in standing_table.find_all('a') if '/squads' in l.get('href')] # get all links that's in <a> tag with path '/squads' from standing_table
    team_urls = [f'https://fbref.com{link}' for link in links] # convert it to team page links

    prev = soup.select_one('a.prev').get('href')
    stats_url = f'https://fbref.com{prev}' # set the stats_url link to previous season stats page to access previous season's data on next iteration

    for team_url in team_urls:
        team_name = team_url.split('/')[-1].replace('-Stats', '')
        driver.get(team_url)
        time.sleep(6)
        html = driver.page_source
        scores_fixtures_table = pd.read_html(html, match = 'Scores & Fixtures')[0]

        shooting_url = '/'.join(team_url.split('/')[:-1]) + f'/matchlogs/all_comps/shooting/{team_name}-Match-Logs-All-Competitions'
        print(shooting_url)
        # shooting_url: https://fbref.com/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions form
        driver.get(shooting_url)
        time.sleep(6)
        shooting_html = driver.page_source
        shooting_table = pd.read_html(shooting_html, match = 'Shooting')[0]
        shooting_table.columns = shooting_table.columns.droplevel()

        defense_url = '/'.join(team_url.split('/')[:-1]) + f'/matchlogs/all_comps/defense/{team_name}-Match-Logs-All-Competitions'
        print(defense_url)
        driver.get(defense_url)
        time.sleep(6)
        defense_html = driver.page_source
        defense_table = pd.read_html(defense_html, match = 'Defensive Actions')[0]
        defense_table.columns = defense_table.columns.droplevel()

        team_df = (scores_fixtures_table.merge(shooting_table[["Date", "Sh", "SoT", "Dist", "FK", "PK", "xG"]], on="Date")
        .merge(defense_table[["Date", "Tkl", "TklW", "Int", "Clr", "Blocks", "Err"]]))
        team_df = team_df[team_df["Comp"] == "Premier League"]
        team_df['Season'] = season
        team_df['Team'] = team_name
        prem_matches.append(team_df)
        time.sleep(6)
    
driver.quit()
print(len(prem_matches))


  scores_fixtures_table = pd.read_html(html, match = 'Scores & Fixtures')[0]


https://fbref.com/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions


  shooting_table = pd.read_html(shooting_html, match = 'Shooting')[0]


https://fbref.com/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/defense/Liverpool-Match-Logs-All-Competitions


  defense_table = pd.read_html(defense_html, match = 'Defensive Actions')[0]
  scores_fixtures_table = pd.read_html(html, match = 'Scores & Fixtures')[0]


https://fbref.com/en/squads/18bb7c10/2024-2025/matchlogs/all_comps/shooting/Arsenal-Match-Logs-All-Competitions


  shooting_table = pd.read_html(shooting_html, match = 'Shooting')[0]


https://fbref.com/en/squads/18bb7c10/2024-2025/matchlogs/all_comps/defense/Arsenal-Match-Logs-All-Competitions


  defense_table = pd.read_html(defense_html, match = 'Defensive Actions')[0]
  scores_fixtures_table = pd.read_html(html, match = 'Scores & Fixtures')[0]


https://fbref.com/en/squads/b8fd03ef/2024-2025/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions


  shooting_table = pd.read_html(shooting_html, match = 'Shooting')[0]


https://fbref.com/en/squads/b8fd03ef/2024-2025/matchlogs/all_comps/defense/Manchester-City-Match-Logs-All-Competitions


  defense_table = pd.read_html(defense_html, match = 'Defensive Actions')[0]
  scores_fixtures_table = pd.read_html(html, match = 'Scores & Fixtures')[0]


https://fbref.com/en/squads/cff3d9bb/2024-2025/matchlogs/all_comps/shooting/Chelsea-Match-Logs-All-Competitions


  shooting_table = pd.read_html(shooting_html, match = 'Shooting')[0]


https://fbref.com/en/squads/cff3d9bb/2024-2025/matchlogs/all_comps/defense/Chelsea-Match-Logs-All-Competitions


  defense_table = pd.read_html(defense_html, match = 'Defensive Actions')[0]
  scores_fixtures_table = pd.read_html(html, match = 'Scores & Fixtures')[0]


https://fbref.com/en/squads/b2b47a98/2024-2025/matchlogs/all_comps/shooting/Newcastle-United-Match-Logs-All-Competitions


  shooting_table = pd.read_html(shooting_html, match = 'Shooting')[0]


https://fbref.com/en/squads/b2b47a98/2024-2025/matchlogs/all_comps/defense/Newcastle-United-Match-Logs-All-Competitions


  defense_table = pd.read_html(defense_html, match = 'Defensive Actions')[0]
  scores_fixtures_table = pd.read_html(html, match = 'Scores & Fixtures')[0]


https://fbref.com/en/squads/8602292d/matchlogs/all_comps/shooting/Aston-Villa-Match-Logs-All-Competitions


  shooting_table = pd.read_html(shooting_html, match = 'Shooting')[0]


ValueError: No tables found matching pattern 'Shooting'