In [6]:
from pathlib import Path

import pandas as pd
import soccerdata as sd  # requiere internet para scrapear

def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels are flat strings without spaces.

    For MultiIndex columns, the levels of each label are converted to ``str``,
    empty levels are dropped and the rest are joined with ``"_"``. For any
    other kind of column index, each label is simply converted to ``str``.
    In both cases every space in the resulting name becomes ``"_"``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the renamed columns.
    """
    flat = df.copy()
    columns = flat.columns

    # Single-level columns: only stringify and replace spaces.
    if not isinstance(columns, pd.MultiIndex):
        flat.columns = [str(label).replace(" ", "_") for label in columns]
        return flat

    # Multi-level columns: collapse each tuple into one name.
    names = []
    for levels in columns:
        parts = [str(level) for level in levels]
        joined = "_".join(part for part in parts if part != "")
        names.append(joined.replace(" ", "_"))
    flat.columns = names
    return flat

# Names of the player stat tables.
# NOTE(review): presumably the values accepted as `stat_type` by
# read_player_match_stats — confirm against the soccerdata docs.
STAT_TYPES = [
    "summary",
    "keepers",
    "passing",
    "passing_types",
    "defense",
    "possession",
    "misc",
]

# Match-level schedule columns that get copied onto the player stats.
META_COLS = [
    "game_id",
    "date",
    "time",
    "week",
    "home_team",
    "away_team",
    "score",
    "venue",
    "referee",
]

# NOTE(review): presumably the keys that identify one player-match row;
# not referenced by any cell visible here — confirm it is still needed.
ID_KEYS = [
    "league",
    "season",
    "game",
    "team",
    "player",
]

In [7]:
LEAGUE = "ESP-La Liga"
SEASON = "20-21"    # <-- change this to the season you want (e.g. "21-22", "22-23", etc.)
OUT_DIR = "out/la_liga_player_stats"  # folder where output files are saved

In [8]:
# Build the FBref reader for the configured league and season; the cells below
# use it to read the schedule and the per-match player stats.
fb = sd.FBref(leagues=LEAGUE, seasons=SEASON)

In [9]:
# Fetch the season schedule and flatten its column labels
schedule = flatten_columns(fb.read_schedule().reset_index())

# Keep only the useful columns that the schedule actually provides
cols_sched = [
    c
    for c in (
        "game", "game_id", "date", "time", "week",
        "home_team", "away_team", "score", "venue", "referee",
    )
    if c in schedule.columns
]

# One row per game, with the two key columns normalised to str to be safe
matches = (
    schedule[cols_sched]
    .drop_duplicates("game")
    .astype({"game": str, "game_id": str})
)

print(matches.shape)
matches.head()

(380, 10)


Unnamed: 0,game,game_id,date,time,week,home_team,away_team,score,venue,referee
0,2020-09-12 Cádiz-Osasuna,15d5602e,2020-09-12,21:00,1,Cádiz,Osasuna,0–2,Estadio Ramón de Carranza,Isidro Díaz de Mera
1,2020-09-12 Eibar-Celta Vigo,056f7848,2020-09-12,16:00,1,Eibar,Celta Vigo,0–0,Estadio Municipal de Ipurúa,Valentín Pizarro
2,2020-09-12 Granada-Athletic Club,083135b8,2020-09-12,18:30,1,Granada,Athletic Club,2–0,Estadio Nuevo Los Cármenes,Antonio Matéu Lahoz
3,2020-09-13 Alavés-Betis,8c3b5490,2020-09-13,14:00,1,Alavés,Betis,0–1,Estadio de Mendizorroza,Pablo González
4,2020-09-13 Valencia-Levante,49d3b582,2020-09-13,21:00,1,Valencia,Levante,4–2,Estadio de Mestalla,José Sánchez


In [10]:
# Stat table to download; change it to another stat type (e.g. any entry of STAT_TYPES)
stat_type = "summary"
game_ids = matches["game_id"].tolist()

pstats = fb.read_player_match_stats(stat_type=stat_type, match_id=game_ids)

# Move every index level into regular columns, then flatten the column labels.
# After this reset the index is a plain RangeIndex, so no further reset is
# needed to expose 'game' as a column.
pstats = flatten_columns(pstats.reset_index())

# 'game' is the lookup key into `matches`; cast it to str so it matches the
# str-typed matches["game"]
pstats["game"] = pstats["game"].astype(str)

# Attach the match metadata column by column with .map (avoids MergeError).
# `matches` holds one row per game, so the lookup index is unique.
meta_by_game = matches.set_index("game")
for col in META_COLS:
    if col in meta_by_game.columns:
        pstats[col] = pstats["game"].map(meta_by_game[col])

print(pstats.shape)
pstats.head()

(11607, 44)


Unnamed: 0,league,season,game,team,player,jersey_number,nation,pos,age,min,...,Take-Ons_Succ,game_id,date,time,week,home_team,away_team,score,venue,referee
0,ESP-La Liga,2021,2020-09-12 Cádiz-Osasuna,Cádiz,Alfonso Espino,22,URU,LB,28-251,90,...,0,15d5602e,2020-09-12,21:00,1,Cádiz,Osasuna,0–2,Estadio Ramón de Carranza,Isidro Díaz de Mera
1,ESP-La Liga,2021,2020-09-12 Cádiz-Osasuna,Cádiz,Anthony Lozano,9,HON,FW,27-140,13,...,0,15d5602e,2020-09-12,21:00,1,Cádiz,Osasuna,0–2,Estadio Ramón de Carranza,Isidro Díaz de Mera
2,ESP-La Liga,2021,2020-09-12 Cádiz-Osasuna,Cádiz,Cala,16,ESP,CB,30-291,90,...,0,15d5602e,2020-09-12,21:00,1,Cádiz,Osasuna,0–2,Estadio Ramón de Carranza,Isidro Díaz de Mera
3,ESP-La Liga,2021,2020-09-12 Cádiz-Osasuna,Cádiz,David Gil,13,ESP,GK,26-245,90,...,0,15d5602e,2020-09-12,21:00,1,Cádiz,Osasuna,0–2,Estadio Ramón de Carranza,Isidro Díaz de Mera
4,ESP-La Liga,2021,2020-09-12 Cádiz-Osasuna,Cádiz,Filip Malbašić,24,SRB,AM,27-299,45,...,2,15d5602e,2020-09-12,21:00,1,Cádiz,Osasuna,0–2,Estadio Ramón de Carranza,Isidro Díaz de Mera


In [12]:
# Save the player stats (with match metadata) as CSV in the current working
# directory, without the row index.
# NOTE(review): the file name says "ALL", but `pstats` was downloaded for the
# single `stat_type` chosen above, and the file is not written under OUT_DIR —
# confirm the intended name and location.
pstats.to_csv(f"la_liga_{SEASON}_players_ALL.csv", index=False)


In [None]:
# Put the identifying and match-metadata columns first; every other column
# follows in its original relative order.
preferred_cols = [
    "league", "season", "game_id", "date", "time", "week", "home_team", "away_team", "score", "venue", "referee",
    "team", "player", "jersey_number", "pos", "age", "min"
]
present = [c for c in preferred_cols if c in pstats.columns]
leading = set(present)
pstats_ordered = pstats[present + [c for c in pstats.columns if c not in leading]].copy()

# Chronological order (rows with missing sort keys go last)
sort_cols = [c for c in ["date", "time", "game_id", "team", "player"] if c in pstats_ordered.columns]
if sort_cols:
    pstats_ordered = pstats_ordered.sort_values(sort_cols, na_position="last").reset_index(drop=True)

# Save: create the output folder if needed and write one CSV per stat type.
# `Path` comes from pathlib (imported in the notebook's import cell).
out_dir = Path(OUT_DIR)
out_dir.mkdir(parents=True, exist_ok=True)
pstats_ordered.to_csv(out_dir / f"la_liga_{SEASON}_players_{stat_type}.csv", index=False)
# pstats_ordered.to_parquet(out_dir / f"la_liga_{SEASON}_players_{stat_type}.parquet", index=False)
pstats_ordered.head()

In [None]:
# Write `wide` as CSV under OUT_DIR, without the row index.
# NOTE(review): `wide` is not defined in any cell visible here, so this cell
# raises NameError on a fresh kernel unless it is created elsewhere —
# presumably it should be a combined table of all stat types; confirm.
wide.to_csv(f"{OUT_DIR}/la_liga_{SEASON}_players_ALL.csv", index=False)
