In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV written at the end of this
# cell is saved under that mount point.
drive.mount('/content/drive')

# HTTP headers sent with the Transfermarkt request in scrape_laliga_season.
# A browser-like User-Agent replaces the default one from `requests`
# (presumably because the site rejects non-browser clients -- TODO confirm).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

def _parse_score(result):
    """Split a score cell like "2:1" into a pair of integer goal counts.

    Returns ``(None, None)`` when the text contains no ``":"``. Raises
    ``ValueError`` when a ``":"`` is present but the text is not exactly
    two integers (for example a placeholder such as ``"-:-"``).
    """
    if ":" not in result:
        return None, None
    home_goals, away_goals = result.split(":")
    return int(home_goals.strip()), int(away_goals.strip())


def scrape_laliga_season(season_id=2016, matchday_from=1, matchday_to=38, *, timeout=30):
    """
    Scrape LaLiga match results for a given season from Transfermarkt.

    Args:
        season_id: Starting year of the season (2016 is labelled
            "2016/2017" in the output).
        matchday_from: Value sent as the ``spieltagVon`` query parameter.
        matchday_to: Value sent as the ``spieltagBis`` query parameter.
        timeout: Seconds ``requests`` waits for the server before giving
            up. Keyword-only; existing positional callers are unaffected.

    Returns:
        A DataFrame with the columns ``season``, ``date``, ``home_team``,
        ``away_team``, ``home_team_goal`` and ``away_team_goal``, one row
        per parsed match row. ``date`` is a ``YYYY-MM-DD`` string taken
        from the most recent date row, or ``None`` if that row could not
        be parsed. Goal columns are ``None`` when the score cell holds no
        ``":"``. Rows whose score cell contains ``":"`` but is not two
        integers are skipped. The columns are present even when no match
        was parsed.

    Raises:
        requests.exceptions.HTTPError: The server answered with an error
            status.
        requests.exceptions.Timeout: No response arrived within
            ``timeout`` seconds.
    """
    columns = [
        "season",
        "date",
        "home_team",
        "away_team",
        "home_team_goal",
        "away_team_goal",
    ]
    url = (
        f"https://www.transfermarkt.com/laliga/gesamtspielplan/wettbewerb/ES1"
        f"?saison_id={season_id}&spieltagVon={matchday_from}&spieltagBis={matchday_to}"
    )

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection and the whole multi-season scrape would hang.
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    season_label = f"{season_id}/{season_id+1}"
    data = []
    current_date = None

    for table in soup.find_all("table"):
        for tbody in table.find_all("tbody"):
            for tr in tbody.find_all("tr"):
                if "bg_blau_20" in tr.get("class", []):  # date row
                    date_link = tr.find("a", href=True)
                    if date_link:
                        try:
                            current_date = datetime.strptime(
                                date_link.text.strip(), "%m/%d/%y"
                            ).strftime("%Y-%m-%d")
                        except ValueError:
                            # Unparseable date text: do not carry the
                            # previous date over to the following matches.
                            current_date = None
                    continue

                tds = tr.find_all("td")
                if len(tds) < 7:
                    # Too few cells to hold home team, score and away team.
                    continue

                home_team = tds[2].get_text(strip=True)
                result = tds[4].get_text(strip=True)
                away_team = tds[6].get_text(strip=True)

                try:
                    home_goals, away_goals = _parse_score(result)
                except ValueError:
                    # Best-effort scrape: skip rows with a malformed score
                    # instead of aborting the whole season. Only ValueError
                    # is caught so interrupts and real bugs still surface.
                    continue

                data.append({
                    "season": season_label,
                    "date": current_date,
                    "home_team": home_team,
                    "away_team": away_team,
                    "home_team_goal": home_goals,
                    "away_team_goal": away_goals,
                })

    # Passing the columns explicitly keeps the schema stable even when no
    # rows were parsed.
    return pd.DataFrame(data, columns=columns)

def scrape_multiple_seasons(start_year=2016, end_year=2024):
    """
    Scrape several consecutive seasons and concatenate them into one DataFrame.

    Args:
        start_year: Starting year of the first season to scrape.
        end_year: Starting year of the last season to scrape (inclusive),
            so ``end_year=2024`` includes the 2024/2025 season.

    Returns:
        A single DataFrame containing the rows of every scraped season with
        a fresh 0..n-1 index, or an empty DataFrame when the year range is
        empty (``start_year > end_year``).
    """
    all_data = []
    for year in range(start_year, end_year + 1):
        print(f"Scraping season {year}/{year+1}...")
        all_data.append(scrape_laliga_season(season_id=year))

    # pd.concat raises ValueError on an empty list, so an empty year range
    # returns an empty frame instead of crashing.
    if not all_data:
        return pd.DataFrame()

    return pd.concat(all_data, ignore_index=True)

# Scrape and save single CSV
# end_year is inclusive, so this covers seasons 2016/2017 through 2024/2025.
df_all = scrape_multiple_seasons(2016, 2024)
#print(f"Total matches scraped: {len(df_all)}")

# Destination inside the mounted Google Drive; the target folder is assumed
# to exist already (nothing here creates it).
save_path = "/content/drive/My Drive/master_IA/TFM/laliga_2016_2024_matches.csv"
# index=False keeps the DataFrame's row index out of the CSV.
df_all.to_csv(save_path, index=False)
print(f"File saved to: {save_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Scraping season 2016/2017...
Scraping season 2017/2018...
Scraping season 2018/2019...
Scraping season 2019/2020...
Scraping season 2020/2021...
Scraping season 2021/2022...
Scraping season 2022/2023...
Scraping season 2023/2024...
Scraping season 2024/2025...
File saved to: /content/drive/My Drive/master_IA/TFM/laliga_2016_2024_matches.csv
