## Getting Premier League Data from ESPN

- GP: Games Played
- W: Wins
- D: Draws
- L: Losses
- F: Goals For
- A: Goals Against
- GD: Goal Difference
- P: Points

In [23]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

### Standings

In [24]:
all_years_standings = []
for year in range(2009, 2023):
    # Go through all pages since the url only differs by suffix of year
    url_base = "https://www.espn.com/soccer/standings/_/league/ENG.1/sort/gamesplayed/dir/asc/season/"
    year_str = str(year)
    year_url = url_base + year_str
    page = requests.get(year_url)
    soup = BeautifulSoup(page.content, "html.parser")
    # Extract table to store it in the list with all the dataframes
    table = soup.find_all("table")
    teams = pd.read_html(str(table))[0]
    data = pd.read_html(str(table))[1]
    season_results = pd.concat([teams, data], axis = 1)
    season_results["Season"] = season_results.columns[0]
    season_results.rename(columns = {year_str + "-" + str(year + 1):"Team"}, inplace = True)
    season_results["Rank"] = np.arange(1, len(season_results) + 1)
    # Get rid of the team abbrievation in front of the Team Name
    season_results["Team"] = season_results["Team"].str.replace(r"^[A-Z]{3}", "", regex = True)
    all_years_standings.append(season_results)
    
prem_standings = pd.concat(all_years_standings)

In [25]:
prem_standings

Unnamed: 0,Team,GP,W,D,L,F,A,GD,P,Season,Rank
0,Chelsea,38,27,5,6,103,32,71,86,2009-2010,1
1,Manchester United,38,27,4,7,86,28,58,85,2009-2010,2
2,Arsenal,38,23,6,9,83,41,42,75,2009-2010,3
3,Tottenham Hotspur,38,21,7,10,67,41,26,70,2009-2010,4
4,Manchester City,38,18,13,7,73,45,28,67,2009-2010,5
...,...,...,...,...,...,...,...,...,...,...,...
15,Southampton,22,4,3,15,18,40,-22,15,2022-2023,16
16,Manchester City,23,16,3,4,59,23,36,51,2022-2023,17
17,Manchester United,23,14,4,5,38,28,10,46,2022-2023,18
18,Tottenham Hotspur,23,12,3,8,42,35,7,39,2022-2023,19


In [26]:
prem_standings.to_excel("premier_standings.xlsx", index = False)

### Squad & Stats
- Man City: https://www.espn.com/soccer/team/stats/_/id/382/league/ENG.1/season/
- Arsenal: https://www.espn.com/soccer/team/stats/_/id/359/league/ENG.1/season/
etc.


- RK: Ranking
- P: Games Played
- G: Goals scored
- A: Assists

In [27]:
# Through the ESPN website I manually found the team ids through the urls
team_id = {"Manchester City" : 382, "Arsenal": 359, "Manchester United": 360, "Newcastle United": 361,
"Tottenham Hotspur" : 367, "Brighton & Hove Albion" : 331, "Fulham": 370, "Brentford" : 337, "Liverpool" : 64,
"Chelsea" : 363, "Aston Villa" : 362, "Crystal Palace" : 384, "Leicester City" : 375, "Nottingham Forest" : 393,
"Wolverhampton Wanderers" : 380, "West Ham United": 371, "Leeds United" : 357, "Everton" : 368, "AFC Bournemouth": 349,
"Southampton" : 376, "Leicester City" : 375}
# Then realized having id to team name is more convenient
id_to_team = {val: key for key, val in team_id.items()}

In [53]:
def squad_to_df(team_id, year_lb = 2009, year_ub = 2022):
    """
    Get a dataframe of Top Scorers and Top Assists every year for a given 
    team's url page on the ESPN website.
    """
    list_of_dfs = []
    for year in range(year_lb, year_ub + 1):
        year_str = str(year)
        page_url = ("https://www.espn.com/soccer/team/stats/_/id/" + 
                            str(team_id) + "/league/ENG.1/season/" + year_str)
        page = requests.get(page_url)
        soup = BeautifulSoup(page.content, "html.parser")
        try: 
            # The table will be read in as two separate dfs, concatenate them
            table_l_r = pd.read_html(str(soup.find_all("table")))
            scorers = pd.DataFrame(table_l_r[0])
            scorers = scorers.add_prefix("Scorer_")
            assisters = pd.DataFrame(table_l_r[1])
            assisters = assisters.add_prefix("Assister_")
            table = pd.concat([scorers, assisters], axis = 1)
            table["Season"] = year_str + "-" + str(year + 1)
            # Front fill the Rank columns because ties are not given an int. rank
            table["Scorer_RK"] = table["Scorer_RK"].ffill().astype(int)
            table["Assister_RK"] = table["Assister_RK"].ffill().astype(int)
            if len(table) > 0: 
                list_of_dfs.append(table)
        except ValueError: 
            # Data is missing for some years, just move on if that happens
            continue
    is_data = len(list_of_dfs) > 0 
    # If that team has at least one year of data, concatenate to one dataframe
    if is_data:
        return pd.concat(list_of_dfs)
    # Otherwise return an empty dataframe
    return pd.DataFrame()

Assemble a dictionary with keys as team names and values as dataframes with top scorer/assister data for each year where data could be retrieved.

In [54]:
scorers_assist = []
for t_id in id_to_team:
    key = id_to_team[t_id]
    df = squad_to_df(t_id)
    df["Team"] = key
    scorers_assist.append(df)

In [55]:
del scorers_assist[8]

In [56]:
[df.shape for df in scorers_assist]

[(316, 10),
 (314, 10),
 (321, 10),
 (271, 10),
 (311, 10),
 (124, 10),
 (173, 10),
 (50, 10),
 (318, 10),
 (242, 10),
 (223, 10),
 (170, 10),
 (25, 10),
 (174, 10),
 (295, 10),
 (49, 10),
 (312, 10),
 (149, 10),
 (244, 10)]

In [57]:
player_data = pd.concat(scorers_assist)
player_data.to_excel("espn_scorer_assister.xlsx", index = False)

Save this scorers_assist Excel, to be merged with game summaries and odds data to act as additional predictors for ML models

In [None]:
# Save to one excel file with one sheet per team
# file_path = "espn_premier_score_assist.xlsx"
# my_writer = pd.ExcelWriter(file_path)

# for team in scorers_assist:
#     df = scorers_assist[team]
#     df.to_excel(my_writer, sheet_name = team, index = False)

# my_writer.close()