In [None]:
from datetime import datetime, timedelta

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

In [None]:
# Anchor date treated as the start of gameweek 1, keyed by the calendar year
# in which the season begins. Each of these dates falls on a Thursday, and
# gameweek numbers are counted in 7-day windows measured from it.
GW1_START_DATES = {
    2022: datetime(year=2022, month=8, day=4),
    2023: datetime(year=2023, month=8, day=10),
    2024: datetime(year=2024, month=8, day=15),
}

In [None]:
# English weekday name -> index in the numbering used by datetime.weekday(),
# where Monday is 0 and Sunday is 6.
day_map = {
    name: index
    for index, name in enumerate(
        (
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
            "Sunday",
        )
    )
}

In [None]:
def get_weekdays_between(start, end, day_of_week="Thursday"):
    """Return every date that falls on ``day_of_week`` between two dates.

    Args:
        start: First datetime to consider (inclusive).
        end: Last datetime to consider (inclusive).
        day_of_week: English weekday name; must be a key of ``day_map``.
            Defaults to "Thursday".

    Returns:
        A list of datetimes in ascending order, each carrying the same
        time-of-day as ``start``. Empty when no matching day lies in range.

    Raises:
        KeyError: If ``day_of_week`` is not a key of ``day_map``.
    """
    target_weekday = day_map[day_of_week]

    # Jump straight to the first matching weekday on or after ``start``
    # instead of testing every single day in the range.
    days_until_first = (target_weekday - start.weekday()) % 7
    current = start + timedelta(days=days_until_first)

    one_week = timedelta(days=7)
    days = []
    while current <= end:
        days.append(current)
        current += one_week
    return days


def get_gameweek_number(date, gw1_start):
    """Map a date onto its 1-based gameweek, counted in 7-day windows.

    Args:
        date: The datetime to classify.
        gw1_start: The datetime on which gameweek 1 begins.

    Returns:
        The gameweek number (1 for the first seven days, 2 for the next
        seven, and so on), or ``None`` when ``date`` precedes ``gw1_start``.
    """
    elapsed = date - gw1_start
    # Floor division keeps dates before the start in a negative week.
    whole_weeks = elapsed.days // 7
    if whole_weeks < 0:
        return None
    return whole_weeks + 1


def list_articles_for_date(date: datetime):
    """List injury / team-news article URLs found on the archive page for ``date``.

    Requests the site's ``/YYYY/MM/DD/`` page, collects every link whose
    address starts with that same page URL, keeps those containing "injury"
    or "team-news", and discards any containing "comments".

    Args:
        date: The day whose archive page should be inspected.

    Returns:
        A sorted list of unique matching URLs. If the request or parsing
        fails for any reason, the error is printed and an empty list is
        returned so callers can carry on with the next date.
    """
    month_and_day = date.strftime("%m/%d")
    base_url = f"https://www.fantasyfootballscout.co.uk/{date.year}/{month_and_day}/"
    try:
        response = requests.get(base_url, timeout=10)
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")

        # A set removes links that appear more than once on the page.
        same_day_links = {
            anchor["href"]
            for anchor in page.find_all("a", href=True)
            if anchor["href"].startswith(base_url)
        }

        return sorted(
            candidate
            for candidate in same_day_links
            if ("injury" in candidate or "team-news" in candidate)
            and "comments" not in candidate
        )

    except Exception as e:
        print(f"Failed to fetch or parse {base_url}: {e}")
        return []


def scrape_article(url):
    """Download an article page and return its paragraph text.

    Args:
        url: Address of the article to fetch.

    Returns:
        The stripped text of every non-empty ``<p>`` inside the page's first
        ``<article>`` element, joined by blank lines. ``None`` when the page
        has no ``<article>`` element or when fetching/parsing fails (the
        failure is printed rather than raised).
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        article = BeautifulSoup(response.text, "html.parser").find("article")
        if article:
            paragraph_texts = (
                paragraph.get_text(strip=True)
                for paragraph in article.find_all("p")
            )
            # Skip paragraphs that are empty once whitespace is stripped.
            return "\n\n".join(text for text in paragraph_texts if text)
        return None
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return None

In [None]:
# One record per successfully scraped article, accumulated across all seasons.
articles = []

# For each season, visit every Thursday and Saturday from the gameweek-1
# anchor date up to 31 May of the following year.
for season_year, gw1_date in GW1_START_DATES.items():
    print(season_year)
    season_end = datetime(season_year + 1, 5, 31)
    thursdays = get_weekdays_between(gw1_date, season_end)
    saturdays = get_weekdays_between(gw1_date, season_end, "Saturday")
    update_dates = thursdays + saturdays
    # The notebook imports the package (`import tqdm`), so the progress-bar
    # class must be reached as `tqdm.tqdm`; calling the module itself raises
    # "TypeError: 'module' object is not callable".
    for date in tqdm.tqdm(update_dates):
        gw = get_gameweek_number(date, gw1_date)

        # Skip dates before gameweek 1 and anything past the 38th gameweek.
        if gw is None or gw > 38:
            continue

        urls = list_articles_for_date(date)

        day_of_week = date.strftime("%A")

        for url in urls:
            content = scrape_article(url)
            # Failed or empty scrapes return None / "" and are not recorded.
            if content:
                articles.append(
                    {
                        "date": date.strftime("%Y-%m-%d"),
                        "gameweek": gw,
                        "day_of_week": day_of_week,
                        "url": url,
                        "content": content,
                    }
                )

In [None]:
# Name the columns explicitly so that an empty scrape still produces a frame
# with the expected columns; without this, later column lookups such as
# df["date"] raise KeyError when `articles` is empty.
df = pd.DataFrame(
    articles,
    columns=["date", "gameweek", "day_of_week", "url", "content"],
)

In [None]:
# Parse the "date" strings into datetimes, then order rows chronologically.
df = df.assign(date=pd.to_datetime(df["date"])).sort_values(by="date")

df.head()