In [None]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
def parse_def(df):
    """Flatten and rename a "Defensive Actions" match-log table.

    Args:
        df: DataFrame with multi-level column headers (presumably the two
            header rows produced by ``pd.read_html``). It is left unmodified.

    Returns:
        A new DataFrame with ``Date`` followed by the sixteen defensive
        columns, renamed to the short labels assigned below.

    Raises:
        KeyError: if one of the fixture columns to drop is missing.
        ValueError: if the number of remaining columns differs from the
            number of new labels.
    """
    # DataFrame.droplevel returns a new frame. Assigning to ``df.columns``
    # instead would strip the header level from the caller's object too.
    df = df.droplevel(0, axis=1)
    # These fixture columns are removed so only Date and the stats remain.
    df = df.drop(
        columns=["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Match Report"]
    )
    # Positional rename: relies on the table's column order.
    df.columns = [
        "Date", "Tkl", "TklW", "TklDef3rd", "TklMid3rd", "TklAtt3rd", "DriTkl", "DriChall", "Tkl%",
        "ChallLost", "Blocks", "ShBlk", "PassBlk", "Int", "Tkl+Int", "Clr", "Err",
    ]
    return df

def parse_gca(df):
    """Flatten and rename a "Goal and Shot Creation" match-log table.

    Args:
        df: DataFrame with multi-level column headers (presumably the two
            header rows produced by ``pd.read_html``). It is left unmodified.

    Returns:
        A new DataFrame with ``Date`` followed by fourteen columns renamed to
        the ``SCA*`` / ``GCA*`` labels assigned below.

    Raises:
        KeyError: if one of the fixture columns to drop is missing.
        ValueError: if the number of remaining columns differs from the
            number of new labels.
    """
    # Non-mutating replacement for ``df.columns = df.columns.droplevel()``,
    # which altered the caller's DataFrame.
    df = df.droplevel(0, axis=1)
    df = df.drop(
        columns=["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Match Report"]
    )
    # Positional rename: relies on the table's column order.
    df.columns = [
        "Date", "SCA", "SCALivePass", "SCADeadPass", "SCADri", "SCASh", "SCAFls", "SCADefAc",
        "GCA", "GCALivePass", "GCADeadPass", "GCADri", "GCASh", "GCAFls", "GCADefAc",
    ]
    return df

def parse_gk(df):
    """Flatten and rename a "Goalkeeping" match-log table.

    Args:
        df: DataFrame with multi-level column headers (presumably the two
            header rows produced by ``pd.read_html``). It is left unmodified.

    Returns:
        A new DataFrame with ``Date`` followed by twenty-five goalkeeping
        columns, renamed to the short labels assigned below.

    Raises:
        KeyError: if one of the fixture columns to drop is missing.
        ValueError: if the number of remaining columns differs from the
            number of new labels.
    """
    # Non-mutating replacement for ``df.columns = df.columns.droplevel()``,
    # which altered the caller's DataFrame.
    df = df.droplevel(0, axis=1)
    # drop() removes every column carrying one of these labels, so a label
    # that occurs more than once after flattening is removed everywhere.
    df = df.drop(
        columns=["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Match Report"]
    )
    # Positional rename: relies on the table's column order.
    df.columns = [
        "Date", "SoTA", "Saves", "Save%", "CS", "PSxG", "PSxG+/-", "PKattAg", "PKConc", "PKsvAg",
        "PKMissAg", "LauCmp", "LauAtt", "LauCmp%", "PassAtt", "ThrowsAtt", "PassLaunch%",
        "PassAvgLen", "GKAtt", "GKLaunch%", "GKAvgLen", "CrossesFaced", "CrossesStp", "CrossesStp%",
        "DefActionOutBox", "AvgDistOfDefAction",
    ]
    return df

def parse_misc(df):
    """Flatten and rename a "Miscellaneous Stats" match-log table.

    Args:
        df: DataFrame with multi-level column headers (presumably the two
            header rows produced by ``pd.read_html``). It is left unmodified.

    Returns:
        A new DataFrame with ``Date`` followed by twelve columns renamed to
        the short labels assigned below.

    Raises:
        KeyError: if one of the columns to drop is missing.
        ValueError: if the number of remaining columns differs from the
            number of new labels.
    """
    # Non-mutating replacement for ``df.columns = df.columns.droplevel()``,
    # which altered the caller's DataFrame.
    df = df.droplevel(0, axis=1)
    # Besides the fixture columns, "2CrdY", "Int", "TklW" and "OG" are
    # discarded from this table.
    df = df.drop(
        columns=[
            "Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent",
            "2CrdY", "Int", "TklW", "OG", "Match Report",
        ]
    )
    # Positional rename: relies on the table's column order.
    df.columns = [
        "Date", "Yell", "Red", "Fouls", "FoulsDrawn", "Off", "Crosses", "PKwon", "PKcon",
        "Recov", "AerialWon", "AerialLost", "AerialWon%",
    ]
    return df

def parse_pass(df):
    """Flatten and rename a "Passing" match-log table.

    Args:
        df: DataFrame with multi-level column headers (presumably the two
            header rows produced by ``pd.read_html``). It is left unmodified.

    Returns:
        A new DataFrame with ``Date`` followed by twenty-two passing columns,
        renamed to the short labels assigned below.

    Raises:
        KeyError: if one of the fixture columns to drop is missing.
        ValueError: if the number of remaining columns differs from the
            number of new labels.
    """
    # Non-mutating replacement for ``df.columns = df.columns.droplevel()``,
    # which altered the caller's DataFrame.
    df = df.droplevel(0, axis=1)
    df = df.drop(
        columns=["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Match Report"]
    )
    # Positional rename: relies on the table's column order.
    # NOTE(review): "SPassCmp" and "PassdIntoBox" look like typos next to
    # their neighbours, but are kept as-is because they become CSV headers.
    df.columns = [
        "Date", "TotPassCmp", "TotPassAtt", "TotPassCmp%", "TotPassDist", "PrgPassDist",
        "SPassCmp", "SPasAtt", "SPasCmp%", "MPasCmp", "MPasAtt", "MPasCmp%",
        "LPasCmp", "LPasAtt", "LPasCmp%", "Ast", "xAG", "xA", "KeyPasses",
        "PassIntoFinal1/3", "PassdIntoBox", "CrsIntoBox", "PrgPass",
    ]
    return df

def parse_poss(df):
    """Flatten and rename a "Possession" match-log table.

    Args:
        df: DataFrame with multi-level column headers (presumably the two
            header rows produced by ``pd.read_html``). It is left unmodified.

    Returns:
        A new DataFrame with ``Date`` followed by twenty-three possession
        columns, renamed to the short labels assigned below.

    Raises:
        KeyError: if one of the fixture columns to drop is missing.
        ValueError: if the number of remaining columns differs from the
            number of new labels.
    """
    # Non-mutating replacement for ``df.columns = df.columns.droplevel()``,
    # which altered the caller's DataFrame.
    df = df.droplevel(0, axis=1)
    df = df.drop(
        columns=["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Match Report"]
    )
    # Positional rename: relies on the table's column order.
    df.columns = [
        "Date", "Poss", "Touches", "DefBoxTouches", "Def3rdTouches", "Mid3rdTouches",
        "Att3rdTouches", "AttBoxTouches", "Live", "AttTakeOns", "SuccTakeOns", "Succ%TakeOns",
        "TkldInTakeOns", "Tkld%InTakeOns", "Carries", "TotDistCarried", "PrgDistCarried",
        "PrgCarries", "CarriesInto1/3", "CarriesIntoBox", "Miscontrols", "Dispossessed",
        "PassesRec", "PrgPassesRec",
    ]
    return df

def parse_passTypes(df):
    """Flatten and rename a "Pass Types" match-log table.

    Args:
        df: DataFrame with multi-level column headers (presumably the two
            header rows produced by ``pd.read_html``). It is left unmodified.

    Returns:
        A new DataFrame with ``Date`` followed by thirteen pass-type columns,
        renamed to the short labels assigned below.

    Raises:
        KeyError: if one of the columns to drop is missing.
        ValueError: if the number of remaining columns differs from the
            number of new labels.
    """
    # Non-mutating replacement for ``df.columns = df.columns.droplevel()``,
    # which altered the caller's DataFrame.
    df = df.droplevel(0, axis=1)
    # Besides the fixture columns, "Att" and "Cmp" are discarded from this
    # table.
    df = df.drop(
        columns=[
            "Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent",
            "Att", "Cmp", "Match Report",
        ]
    )
    # Positional rename: relies on the table's column order.
    df.columns = [
        "Date", "LivePass", "DeadPass", "PassesFK", "TB", "Switch", "Cross", "TI", "CK",
        "CKIn", "CKOut", "CKStraight", "OffPasses", "BlockedPasses",
    ]
    return df

def parse_shooting(df):
    """Flatten and rename a "Shooting" match-log table.

    Args:
        df: DataFrame with multi-level column headers (presumably the two
            header rows produced by ``pd.read_html``). It is left unmodified.

    Returns:
        A new DataFrame with ``Date`` followed by fourteen shooting columns,
        renamed to the short labels assigned below.

    Raises:
        KeyError: if one of the columns to drop is missing.
        ValueError: if the number of remaining columns differs from the
            number of new labels.
    """
    # Non-mutating replacement for ``df.columns = df.columns.droplevel()``,
    # which altered the caller's DataFrame.
    df = df.droplevel(0, axis=1)
    # Besides the fixture columns, "Gls" is discarded from this table.
    df = df.drop(
        columns=[
            "Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent",
            "Gls", "Match Report",
        ]
    )
    # Positional rename: relies on the table's column order.
    df.columns = [
        "Date", "Sh", "SoT", "SoT%", "G/Sh", "G/SoT", "AvgDistOfSh", "FK", "PK", "PKattFor",
        "xG", "npxG", "npxG/Sh", "G-xG", "np:G-xG",
    ]
    return df

In [None]:
def parse_url(url, parse, text):
    """Download a page and return one of its HTML tables, cleaned by ``parse``.

    Args:
        url: Address of the page to fetch.
        parse: Callable that takes the raw DataFrame and returns the cleaned
            one (e.g. one of the ``parse_*`` functions in this notebook).
        text: Forwarded to ``pd.read_html(match=...)``; the first table
            containing matching text is used.

    Returns:
        The DataFrame returned by ``parse``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within 30 seconds.
        ValueError: if ``pd.read_html`` finds no table matching ``text``.
    """
    # Without a timeout, requests.get can block indefinitely on a stalled
    # connection.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors here; otherwise an error page only shows up later
    # as a confusing "no tables found" from read_html.
    response.raise_for_status()
    # read_html expects a path/URL or file-like object; passing the HTML text
    # directly is deprecated in recent pandas, hence the StringIO wrapper.
    tables = pd.read_html(StringIO(response.text), match=text)
    return parse(tables[0])

In [None]:
# Parsers and the table captions they are applied to, in matching order.
parse_fcns = [parse_def, parse_gca, parse_gk, parse_misc, parse_pass, parse_passTypes, parse_poss, parse_shooting]
table_names = ["Defensive Actions", "Goal and Shot Creation", "Goalkeeping", "Miscellaneous Stats", "Passing", "Pass Types", "Possession", "Shooting"]
# URL path segment that follows "all_comps/" in each category's link, in the
# same order as the two lists above. Pairing by slug replaces the earlier
# "sort the links and zip" approach, which silently attached the wrong parser
# whenever a link was missing or an extra one matched.
stat_slugs = ["defense", "gca", "keeper", "misc", "passing", "passing_types", "possession", "shooting"]

BASE_URL = "https://fbref.com"
REQUEST_TIMEOUT_SECONDS = 30
# Pause before every request after the first. Previously only the team-page
# request was followed by a pause and the eight stat pages were fetched
# back-to-back.
# NOTE(review): confirm this delay against the site's current scraping policy.
REQUEST_DELAY_SECONDS = 10

all_matches = []
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
standings_response = requests.get(standings_url, timeout=REQUEST_TIMEOUT_SECONDS)
standings_response.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
standings_soup = BeautifulSoup(standings_response.text, "html.parser")
standings_table = standings_soup.select('table.stats_table')[0]

squad_hrefs = [anchor.get("href") for anchor in standings_table.find_all('a')]
# Anchors without an href yield None; skip them instead of raising TypeError.
squad_hrefs = [href for href in squad_hrefs if href and '/squads/' in href]
team_urls = [f"{BASE_URL}{href}" for href in squad_hrefs]

# Get team data
for team_url in team_urls:
    # Get match data
    team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
    time.sleep(REQUEST_DELAY_SECONDS)
    team_response = requests.get(team_url, timeout=REQUEST_TIMEOUT_SECONDS)
    team_response.raise_for_status()

    matches = pd.read_html(StringIO(team_response.text), match="Scores & Fixtures")[0]
    matches = matches.drop(columns=["Poss", "xG", "Attendance", "Captain", "Match Report", "Referee", "Notes"])

    # Get stats links: keep the first link found for each known category slug.
    team_soup = BeautifulSoup(team_response.text, "html.parser")
    stat_urls = {}
    for anchor in team_soup.find_all('a'):
        href = anchor.get("href")
        if not href or "all_comps/" not in href:
            continue
        slug = href.split("all_comps/", 1)[1].split("/", 1)[0]
        if slug in stat_slugs:
            stat_urls.setdefault(slug, f"{BASE_URL}{href}")

    # Download and clean each category table with its own parser.
    stat_frames = []
    for slug, category, parser in zip(stat_slugs, table_names, parse_fcns):
        stat_url = stat_urls.get(slug)
        if stat_url is None:
            print(f"{team_name}: no '{slug}' link found, skipping {category}")
            continue
        time.sleep(REQUEST_DELAY_SECONDS)
        stat_frames.append((category, parse_url(stat_url, parser, category)))

    team_data = matches
    for category, stat_frame in stat_frames:
        try:
            team_data = team_data.merge(stat_frame, how="left", on="Date")
        except ValueError as err:
            # Still best-effort, but report the table that was left out
            # rather than dropping it silently.
            print(f"{team_name}: could not merge {category} ({err}), skipping it")

    # .copy() so the column assignment below writes to an independent frame,
    # not to a slice of the merged one (avoids SettingWithCopyWarning).
    team_data = team_data[team_data["Comp"] == "Premier League"].copy()

    # Add team name (the original also stubbed out a "Season" column, but no
    # `year` value is defined in this notebook).
    team_data["Team"] = team_name
    all_matches.append(team_data)

In [None]:
# Convert to CSV
# A bare `len(all_matches)` in the middle of a cell is evaluated and thrown
# away (only a cell's last expression is displayed), so print it explicitly.
print(f"Teams scraped: {len(all_matches)}")
match_df = pd.concat(all_matches)
match_df.columns = [c.lower() for c in match_df.columns]
# to_csv keeps its default index=True, so the row index is still written as
# the first, unnamed column; the file layout is unchanged.
match_df.to_csv("matches.csv")