In [1]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [29]:
# Parse the defensive actions table
def parse_def(df):
    """Flatten and trim a "Defensive Actions" match-log table.

    Args:
        df: DataFrame whose columns are a MultiIndex; the outermost level is
            discarded. The frame itself is left untouched.

    Returns:
        A new DataFrame with the match-info columns removed and the 17
        remaining columns relabelled positionally (``Date`` first).

    Raises:
        KeyError: if one of the match-info columns to drop is absent.
        ValueError: if the columns are not multi-level, or if the number of
            remaining columns differs from the 17 names assigned below.
    """
    # DataFrame.droplevel returns a new frame. Assigning to df.columns instead
    # would rewrite the caller's object, so a second call on it would fail.
    flat = df.droplevel(0, axis=1)
    flat = flat.drop(columns=["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Match Report"])
    flat.columns = ["Date", "Tkl", "TklW", "Def 3rd", "Mid 3rd", "Att 3rd", "DriTkl", "Att", "Tkl%", "Lost", "Blocks", "Sh", "PassBlk", "Int", "Tkl+Int", "Clr", "Err"]
    return flat

def parse_gca(df):
    """Flatten and trim a goal/shot-creation match-log table.

    Args:
        df: DataFrame whose columns are a MultiIndex; the outermost level is
            discarded. The frame itself is left untouched.

    Returns:
        A new DataFrame without the match-info columns. Labels are kept as
        they are after flattening, so any label repeated across header groups
        stays duplicated.

    Raises:
        KeyError: if one of the match-info columns to drop is absent.
        ValueError: if the columns are not multi-level.
    """
    # Work on a flattened copy so the caller's frame keeps its original header
    # and the function can be called on it again.
    flat = df.droplevel(0, axis=1)
    return flat.drop(columns=["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Match Report"])

def parse_gk(df):
    """Flatten and trim a goalkeeping match-log table.

    The names assigned below contain a goalkeeping ``GA`` column, yet ``GA``
    is also one of the match-info labels being removed. Dropping by label
    would delete both and leave one column too few for the rename, so only
    the leftmost occurrence of each match-info label is removed.
    NOTE(review): this assumes the match-info block precedes the goalkeeping
    stats, as the ``Date``-first ordering of the assigned names suggests;
    confirm against a live page.

    Args:
        df: DataFrame whose columns are a MultiIndex; the outermost level is
            discarded. The frame itself is left untouched.

    Returns:
        A new DataFrame with 27 positionally assigned column names.
        NOTE(review): ``Att``, ``Launch%`` and ``AvgLen`` appear more than
        once in that list; they are kept as-is so existing consumers see the
        same labels.

    Raises:
        KeyError: if one of the match-info columns to drop is absent.
        ValueError: if the columns are not multi-level, or if the number of
            remaining columns differs from the 27 names assigned below.
    """
    match_info = ["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Match Report"]

    # Flatten into a new frame instead of overwriting the caller's header.
    flat = df.droplevel(0, axis=1)
    labels = list(flat.columns)

    # Keep the failure mode of DataFrame.drop for missing labels.
    absent = [label for label in match_info if label not in labels]
    if absent:
        raise KeyError(f"{absent} not found in axis")

    # list.index yields the leftmost position, so a later column that reuses a
    # match-info label (the goalkeeping GA) survives.
    discard = {labels.index(label) for label in match_info}
    keep = [pos for pos in range(len(labels)) if pos not in discard]
    flat = flat.iloc[:, keep]

    flat.columns = ["Date", "SoTA", "GA", "Saves", "Save%", "CS", "PSxG", "PSxG+/-", "PKatt", "PKA", "PKsv", "PKm", "Cmp", "Att", "Cmp%", "Att", "Thr", "Launch%", "AvgLen", "Att", "Launch%", "AvgLen", "Opp", "Stp", "Stp%", "#OPA", "AvgDist"]
    return flat

def parse_misc(df):
    """Flatten and trim a miscellaneous-stats match-log table.

    Besides the match-info columns, the ``2CrdY``, ``Int``, ``TklW`` and
    ``OG`` columns are removed as well.

    Args:
        df: DataFrame whose columns are a MultiIndex; the outermost level is
            discarded. The frame itself is left untouched.

    Returns:
        A new DataFrame whose 13 remaining columns are relabelled
        positionally (``Date`` first).

    Raises:
        KeyError: if one of the columns to drop is absent.
        ValueError: if the columns are not multi-level, or if the number of
            remaining columns differs from the 13 names assigned below.
    """
    # Flatten into a new frame; assigning to df.columns would alter the
    # caller's object and break a repeated call.
    flat = df.droplevel(0, axis=1)
    flat = flat.drop(columns=["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "2CrdY", "Int", "TklW", "OG", "Match Report"])
    flat.columns = ["Date", "Yell", "Red", "Fouls", "FoulsDrawn", "Off", "Crosses", "PKwon", "PKcon", "Recov", "AerialWon", "AerialLost", "AerialWon%"]
    return flat

def parse_pass(df):
    """Flatten and trim a passing match-log table.

    Args:
        df: DataFrame whose columns are a MultiIndex; the outermost level is
            discarded. The frame itself is left untouched.

    Returns:
        A new DataFrame with the match-info columns removed and the 23
        remaining columns relabelled positionally (``Date`` first).

    Raises:
        KeyError: if one of the match-info columns to drop is absent.
        ValueError: if the columns are not multi-level, or if the number of
            remaining columns differs from the 23 names assigned below.
    """
    # Build a flattened copy rather than overwriting the caller's header, so
    # the input can be parsed again.
    flat = df.droplevel(0, axis=1)
    flat = flat.drop(columns=["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Match Report"])
    flat.columns = ["Date", "TotCmp", "TotAtt", "TotCmp%", "TotDist", "PrgDist", "SCmp", "SAtt", "SCmp%", "MCmp", "MAtt", "MCmp%", "LCmp", "LAtt", "LCmp%", "Ast", "xAG", "xA", "KeyPasses", "PasIntoFin1/3", "PasIntoBox", "CrsIntoBox", "PrgP"]
    return flat

def parse_poss(df):
    """Flatten and trim a possession match-log table.

    Args:
        df: DataFrame whose columns are a MultiIndex; the outermost level is
            discarded. The frame itself is left untouched.

    Returns:
        A new DataFrame with the match-info columns removed and the 24
        remaining columns relabelled positionally (``Date`` first).

    Raises:
        KeyError: if one of the match-info columns to drop is absent.
        ValueError: if the columns are not multi-level, or if the number of
            remaining columns differs from the 24 names assigned below.
    """
    # droplevel on the frame yields a new object; the caller's header stays
    # two-level, which keeps repeated calls working.
    flat = df.droplevel(0, axis=1)
    flat = flat.drop(columns=["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Match Report"])
    flat.columns = ["Date", "Poss", "Touches", "DefBoxTouches", "Def3rdTouches", "Mid3rdTouches", "Att3rdTouches", "AttBoxTouches", "Live", "AttTakeOns", "SuccTakeOns", "Succ%TakeOns", "TkldInTakeOns", "Tkld%InTakeOns", "Carries", "TotDistCarried", "PrgDistCarried", "PrgCarries", "CarriesInto1/3", "CarriesIntoBox", "Miscontrols", "Dispossessed", "PassesRec", "PrgPassesRec"]
    return flat

def parse_passTypes(df):
    """Flatten and trim a pass-types match-log table.

    Besides the match-info columns, the ``Att`` and ``Cmp`` columns are
    removed as well.

    Args:
        df: DataFrame whose columns are a MultiIndex; the outermost level is
            discarded. The frame itself is left untouched.

    Returns:
        A new DataFrame whose 14 remaining columns are relabelled
        positionally (``Date`` first).

    Raises:
        KeyError: if one of the columns to drop is absent.
        ValueError: if the columns are not multi-level, or if the number of
            remaining columns differs from the 14 names assigned below.
    """
    # Flatten a copy; writing to df.columns would change the caller's frame
    # and make a second call on it raise.
    flat = df.droplevel(0, axis=1)
    flat = flat.drop(columns=["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Att", "Cmp", "Match Report"])
    flat.columns = ["Date", "Live", "Dead", "FK", "TB", "Switch", "Cross", "TI", "CK", "CKIn", "CKOut", "CKShort", "OffPasses", "BlockedPasses"]
    return flat

def parse_shooting(df):
    """Flatten and trim a shooting match-log table.

    Args:
        df: DataFrame whose columns are a MultiIndex; the outermost level is
            discarded. The frame itself is left untouched.

    Returns:
        A new DataFrame with the match-info columns removed and the 15
        remaining columns relabelled positionally (``Date`` first).

    Raises:
        KeyError: if one of the match-info columns to drop is absent.
        ValueError: if the columns are not multi-level, or if the number of
            remaining columns differs from the 15 names assigned below.
    """
    # Flatten into a new frame so the caller's header is preserved and the
    # function stays safe to call twice on the same input.
    flat = df.droplevel(0, axis=1)
    flat = flat.drop(columns=["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Match Report"])
    # NOTE(review): the rename is positional. If the source table also has a
    # goals column ahead of "Sh" (presumably "Gls" - verify on a live page),
    # 16 columns remain and this assignment raises a length mismatch.
    flat.columns = ["Date", "Sh", "SoT", "SoT%", "G/Sh", "G/SoT", "AvgDist", "FK", "PK", "PKatt", "xG", "npxG", "npxG/Sh", "G-xG", "np:G-xG"]
    return flat

In [30]:
# Sanity check: download one defensive-actions match log and run it through
# parse_def to inspect the resulting column names.
data = requests.get(
    "https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/all_comps/defense/Manchester-City-Match-Logs-All-Competitions",
    timeout=30,  # do not hang the kernel if the server never answers
)
# Fail on an HTTP error here; otherwise an error page reaches read_html and
# surfaces as a misleading "no tables found".
data.raise_for_status()
# read_html is given a file-like object: passing the HTML as a plain string is
# a deprecated input form in recent pandas releases.
df = pd.read_html(StringIO(data.text), match="Defensive Actions")[0]
df = parse_def(df)
df.columns

Index(['Date', 'Tkl', 'TklW', 'Def 3rd', 'Mid 3rd', 'Att 3rd', 'DriTkl', 'Att',
       'Tkl%', 'Lost', 'Blocks', 'Sh', 'PassBlk', 'Int', 'Tkl+Int', 'Clr',
       'Err'],
      dtype='object')

In [None]:
years = list(range(2022, 2020, -1))
all_matches = []
standings_url = "https://fbref.com/en/comps/9/standings/Premier-League-Stats"

# Match-log URL segment (the part right after "all_comps/") -> text used to
# locate that page's table. Pairing by name replaces the previous zip() of
# alphabetically sorted links against `categories`: sorted order puts
# "passing_types" before "possession", while `categories` lists "Possession"
# before "Pass Types", so those two pages were matched with each other's text.
category_by_slug = {
    "defense": "Defensive Actions",
    "gca": "Goal and Shot Creation",
    "keeper": "Goalkeeping",
    "misc": "Miscellaneous Stats",
    "passing": "Passing",
    "possession": "Possession",  # the old filter looked for "possesion" and never matched
    "passing_types": "Pass Types",
    "shooting": "Shooting",
}
categories = list(category_by_slug.values())

# Raw second-level headers per category. Not read by this cell; kept unchanged
# (including the empty trailing entry) in case another cell relies on it.
stats = [
    ["Tkl", "TklW", "Def 3rd", "Mid 3rd", "Att 3rd", "Tkl", "Att", "Tkl%", "Lost", "Blocks", "Sh", "Pass", "Int", "Tkl+Int", "Clr", "Err"],
    ["SCA", "PassLive", "PassDead", "TO", "Sh", "Fld", "Def", "GCA", "PassLive", "PassDead", "TO", "Sh", "Fld", "Def"],
    ["SoTA", "GA", "Saves", "Save%", "CS", "PSxG", "PSxG+/-", "PKatt", "PKA", "PKsv", "PKm", "Cmp", "Att", "Cmp%", "Att", "Thr", "Launch%", "AvgLen", "Att", "Launch%", "AvgLen", "Opp", "Stp", "Stp%", "#OPA", "AvgDist"],
    ["CrdY", "CrdR", "2CrdY", "Fls", "Fld", "Off", "Crs", "Int", "TklW", "PKwon", "PKcon", "OG", "Recov", "Won", "Lost", "Won%"],
    [],
]

# Teams processed per season. The cell used to `break` unconditionally after
# the first team, which made the merge/append code below it unreachable and
# left `all_matches` empty. The same one-team scope is kept as the default but
# is now explicit; set to None to scrape every team.
TEAMS_PER_SEASON = 1
REQUEST_TIMEOUT = 30  # seconds before a stalled request is abandoned
# Pause after every request, in seconds.
# NOTE(review): confirm this satisfies the site's published rate limit.
REQUEST_DELAY = 3


def _fetch_html(url):
    """Download `url`, raise on an HTTP error, then pause before returning the body."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    time.sleep(REQUEST_DELAY)
    return response.text


# NOTE(review): standings_url starts at the site's default (current) season
# and walks backwards via the "prev" link, so `year` only labels the right
# season if years[0] is that default season - confirm before a full run.
for year in years:
    # Parser named explicitly so results do not depend on what is installed.
    standings_soup = BeautifulSoup(_fetch_html(standings_url), "html.parser")
    standings_table = standings_soup.select('table.stats_table')[0]

    squad_hrefs = [a.get("href") for a in standings_table.find_all('a')]
    # Anchors without an href yield None, which cannot be searched with `in`.
    squad_hrefs = [h for h in squad_hrefs if h and '/squads/' in h]
    team_urls = [f"https://fbref.com{h}" for h in squad_hrefs]

    previous_season = standings_soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls[:TEAMS_PER_SEASON]:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        team_html = _fetch_html(team_url)
        matches = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0]

        hrefs = [a.get("href") for a in BeautifulSoup(team_html, "html.parser").find_all('a')]
        hrefs = [h for h in hrefs if h]

        shooting = None
        for slug, category in category_by_slug.items():
            # The trailing slash keeps "passing" from matching "passing_types".
            marker = f"all_comps/{slug}/"
            href = next((h for h in hrefs if marker in h), None)
            if href is None:
                print(f"{team_name} ({year}): no {category} match-log link found, skipping")
                continue

            table = pd.read_html(StringIO(_fetch_html(f"https://fbref.com{href}")), match=category)[0]
            # NOTE(review): the file name has no season, so a team present in
            # several seasons overwrites its own files. Left unchanged so
            # existing consumers still find them.
            table.to_csv(f"{team_name}_{category}.csv")
            if slug == "shooting":
                # Reused for the merge below instead of downloading it again
                # (the old code re-requested it through an already absolute,
                # doubly prefixed URL).
                shooting = table

        if shooting is None:
            continue

        # Flatten the shooting header so its columns can be selected by name.
        shooting = shooting.droplevel(0, axis=1)
        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except ValueError:
            print(f"{team_name} ({year}): could not merge shooting data, skipping")
            continue
        team_data = team_data[team_data["Comp"] == "Premier League"]

        # assign() returns a new frame, avoiding writes into a filtered slice.
        all_matches.append(team_data.assign(Season=year, Team=team_name))

In [10]:
# Combine every scraped team-season frame and write the result as one CSV.
if not all_matches:
    # pd.concat fails with an opaque "No objects to concatenate" on an empty
    # list; raise the same exception type with an actionable message instead.
    raise ValueError("all_matches is empty - run the scraping cell before exporting")

match_df = pd.concat(all_matches)
match_df.columns = [c.lower() for c in match_df.columns]
match_df.to_csv("matches.csv")