In [11]:
import pandas as pd

# Load the two CSV inputs: the per-match table and the standings table.
odds_csv = "../datasets/combined_odds_cleaned.csv"
standings_csv = "../datasets/clasificaciones.csv"
df = pd.read_csv(odds_csv)
clasif = pd.read_csv(standings_csv)

# Order the matches by their Date column so later "history" look-ups can rely on row order.
# NOTE(review): Date is not parsed on read, so if it is stored as text the ordering is
# lexicographic -- confirm the CSV uses a sortable format (e.g. ISO yyyy-mm-dd).
df = df.sort_values("Date")


In [12]:
import pandas as pd

# Preprocessing: keep matches ordered by Date and make the season labels plain strings
# so the match table and the standings table can be compared on them.
df = df.sort_values("Date").assign(
    Season=lambda matches: matches["Season"].astype(str)
)
# Standings: season as string, team names without leading/trailing whitespace.
clasif = clasif.assign(
    season=lambda table: table["season"].astype(str),
    team=lambda table: table["team"].str.strip(),
)

# Helper: label of the season that precedes a given one
def temporada_anterior(season):
    """Return the label of the season immediately before ``season``.

    Parameters
    ----------
    season : str
        Season label made of two hyphen-separated integers, e.g. ``"2004-05"``.

    Returns
    -------
    str
        The label with both parts decremented by one; the second part is
        zero-padded to at least two digits (``"2004-05"`` -> ``"2003-04"``).

    Raises
    ------
    ValueError
        If ``season`` does not split into exactly two parts on ``"-"`` or a
        part is not an integer.
    """
    y1, y2 = season.split("-")
    # Wrap the trailing part instead of letting it go negative: without the
    # modulo, "1999-00" would produce "1998--1" rather than "1998-99".
    # The modulus follows the width of the part (minimum two digits), so
    # labels such as "2004-2005" keep producing "2003-2004".
    wrap = 10 ** max(len(y2), 2)
    prev_end = (int(y2) - 1) % wrap
    return f"{int(y1) - 1}-{str(prev_end).zfill(2)}"

# Helper: summary statistics of a set of matches from one team's point of view
def calcular_estadisticas(partidos, equipo):
    """Summarise ``partidos`` from the perspective of ``equipo``.

    Each row must provide ``HomeTeam``, ``FTHG`` and ``FTAG``. A row whose
    ``HomeTeam`` equals ``equipo`` is read as a home match (own goals = FTHG);
    every other row is read as an away match (own goals = FTAG).

    Returns a 4-tuple:
        (share of matches won, goals scored per match, goals conceded per match,
         total goals scored minus total goals conceded)
    The first three values are rounded to two decimals. An empty frame yields
    ``(0, 0.0, 0.0, 0)``.
    """
    if partidos.empty:
        return 0, 0.0, 0.0, 0

    # (own goals, opponent goals) for every match, in row order.
    scorelines = [
        (match["FTHG"], match["FTAG"])
        if match["HomeTeam"] == equipo
        else (match["FTAG"], match["FTHG"])
        for _, match in partidos.iterrows()
    ]

    wins = sum(1 for scored, conceded in scorelines if scored > conceded)
    goals_for = sum(scored for scored, _ in scorelines)
    goals_against = sum(conceded for _, conceded in scorelines)
    played = len(partidos)

    win_share = round(wins / played, 2)
    scored_avg = round(goals_for / played, 2)
    conceded_avg = round(goals_against / played, 2)
    return win_share, scored_avg, conceded_avg, goals_for - goals_against

records = []


def _recent_matches(matches, selector, before, how_many):
    """Return the last ``how_many`` rows of ``matches`` selected by ``selector`` with Date < ``before``."""
    return matches[selector & (matches["Date"] < before)].tail(how_many)


def _previous_position(table, season_label, club):
    """Return ``club``'s position in ``table`` for ``season_label``; 21 when no row matches."""
    found = table[
        (table["season"] == season_label) & (table["team"] == club)
    ]["position"]
    if found.empty:
        return 21
    return int(found.values[0])


# Build one record per row of df, always from the point of view of the HomeTeam.
for _, match in df.iterrows():
    kickoff = match["Date"]
    season_label = match["Season"]
    home, away = match["HomeTeam"], match["AwayTeam"]
    home_goals, away_goals = match["FTHG"], match["FTAG"]

    # Match outcome for the home side: 1 = win, -1 = loss, 0 = draw.
    if home_goals > away_goals:
        outcome = 1
    elif home_goals < away_goals:
        outcome = -1
    else:
        outcome = 0

    home_col, away_col = df["HomeTeam"], df["AwayTeam"]

    # Form of the home side: its last 10 matches (home or away) dated before this one.
    home_history = _recent_matches(
        df, (home_col == home) | (away_col == home), kickoff, 10
    )
    home_win_share, home_scored, home_conceded, home_diff = calcular_estadisticas(
        home_history, home
    )

    # Form of the away side over the same kind of window.
    away_history = _recent_matches(
        df, (home_col == away) | (away_col == away), kickoff, 10
    )
    away_win_share, away_scored, away_conceded, away_diff = calcular_estadisticas(
        away_history, away
    )

    # Head-to-head: last 5 earlier meetings of the two sides, in either venue,
    # summarised from the home side's perspective.
    meetings = _recent_matches(
        df,
        ((home_col == home) & (away_col == away))
        | ((home_col == away) & (away_col == home)),
        kickoff,
        5,
    )
    h2h_win_share, h2h_scored, h2h_conceded, h2h_diff = calcular_estadisticas(
        meetings, home
    )

    # Standings position of each side in the season before this match's season.
    earlier_season = temporada_anterior(season_label)
    home_rank = _previous_position(clasif, earlier_season, home)
    away_rank = _previous_position(clasif, earlier_season, away)

    records.append({
        "season": season_label,
        "date": kickoff,
        "team": home,
        "rival_team": away,
        "home_adv": 1,
        "last_season_team": home_rank,
        "last_season_rival": away_rank,
        "pct_wins": home_win_share,
        "avg_goals_scored": home_scored,
        "avg_goals_received": home_conceded,
        "goal_difference": home_diff,
        "pct_wins_rival": away_win_share,
        "avg_goals_scored_rival": away_scored,
        "avg_goals_received_rival": away_conceded,
        "goal_difference_rival": away_diff,
        "pct_wins_vs_rival": h2h_win_share,
        "avg_goals_scored_vs_rival": h2h_scored,
        "avg_goals_received_vs_rival": h2h_conceded,
        "goal_difference_vs_rival": h2h_diff,
        "AvgH": match["AvgH"],
        "AvgD": match["AvgD"],
        "AvgA": match["AvgA"],
        "goals_team": home_goals,
        "goals_rival": away_goals,
        "result": outcome,
    })

# Build the final DataFrame: one row per record, one column per record key.
df_final = pd.DataFrame(records)

# Drop every row labelled with season "2003-04".
# NOTE(review): presumably excluded because it is the first season in the data and
# has no earlier history to build features from -- confirm.
# .copy() makes the filtered frame an independent object, so the column assignment
# below is unambiguous and does not raise pandas' SettingWithCopyWarning.
df_final = df_final[df_final["season"] != "2003-04"].copy()

# Round every float column to two decimals.
float_cols = df_final.select_dtypes(include=['float']).columns
df_final[float_cols] = df_final[float_cols].round(2)

# Write the result to the working directory, without the index column.
df_final.to_csv("dataset_transformado.csv", index=False)
