In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
import time
import datetime as dt
import re
from dateutil import parser as dtparse   # pip install python-dateutil

GAP

In [2]:
# Lift pandas' row-display limit so displayed DataFrames are shown in full
# (max_rows=None means "no truncation").
pd.set_option('display.max_rows', None)

In [3]:
import requests, re, pandas as pd
from bs4 import BeautifulSoup
# HTTP headers passed to the requests.get calls of the scraping helpers;
# only sets a browser-style User-Agent string.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# ─────────────────────────── maps 1 & 2 resolver ────────────────────────────
def first_two_games(base_url: str):
    """
    Resolve the game ids and map names of Map 1 and Map 2 of a match.

    • looks ONLY at the first element with class ``vm-stats-gamesnav``
      (the top nav bar)
    • skips the first ``.js-map-switch`` entry (the "All" tab, index 0)
    • collects entries 1 and 2   → Map 1, Map 2

    Parameters
    ----------
    base_url : str
        URL of the match page to download.

    Returns
    -------
    list[tuple[str, str]]
        Up to two ``(game_id, map_name)`` pairs, in nav-bar order.  The list
        is shorter when the nav bar has fewer than three entries.

    Raises
    ------
    requests.RequestException
        On network failure, timeout or a non-2xx HTTP status.
    ValueError
        If the page has no games nav bar, or a nav entry's label does not
        contain a map name after its index.
    KeyError
        If a nav entry lacks the ``data-game-id`` attribute.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(base_url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    top_nav = soup.select_one(".vm-stats-gamesnav")     # top bar only
    if top_nav is None:
        raise ValueError(f"No map navigation bar found on {base_url}")

    games = []
    for div in top_nav.select(".js-map-switch")[1:3]:   # skip 'All', take 2
        gid = div["data-game-id"]
        label = " ".join(div.stripped_strings)          # e.g. "1 Haven"
        parts = label.split(maxsplit=1)
        if len(parts) < 2:
            raise ValueError(
                f"Unexpected map label {label!r} for game ID {gid} on {base_url}"
            )
        games.append((gid, parts[1]))                   # "Haven"
    return games


# ─────────────────────────── one‑map scraper ────────────────────────────────
def scrape_one_map(match_url: str, game_id: str, map_name: str) -> pd.DataFrame:
    """
    Scrape the per-player overview stats of one map of a match.

    Parameters
    ----------
    match_url : str
        URL of the match page; any query string is dropped and replaced by
        ``?game=<game_id>&tab=overview``.
    game_id : str
        Value of the ``data-game-id`` attribute identifying the map block.
    map_name : str
        Map name, copied verbatim into the ``Map`` column.

    Returns
    -------
    pd.DataFrame
        One row per player found in the first two ``table.wf-table-inset``
        tables of the map block, with columns ``Map``, ``Team``, ``Player``,
        ``Agent``, ``ACS``, ``Kills``, ``Deaths``, ``Assists``,
        ``FirstKills``, ``Winner`` and ``MapScore``.  Stat values are the
        raw cell texts (strings); a missing cell yields ``""``.

    Raises
    ------
    requests.RequestException
        On network failure, timeout or a non-2xx HTTP status.
    ValueError
        If the map block or its score header is missing, or a score text is
        not an integer.
    """
    def both(node):
        """Stripped text of the ``.mod-both`` span inside ``node``, else ""."""
        if node is None:
            return ""
        span = node.select_one(".side.mod-both") or node.select_one(".mod-both")
        return span.text.strip() if span else ""

    # Drop the query string AND any trailing slash so the rebuilt URL never
    # contains "//?game=".
    base_path = match_url.split("?", 1)[0].rstrip("/")
    response = requests.get(f"{base_path}/?game={game_id}&tab=overview",
                            headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # ── locate the specific map block ────────────────────────────────────────
    block = soup.find("div", class_="vm-stats-game", attrs={"data-game-id": game_id})
    if not block:
        raise ValueError(f"Could not find stats for game ID {game_id}")

    # ── header with score / winner ───────────────────────────────────────────
    header = block.select_one(".vm-stats-game-header")
    if header is None:
        raise ValueError(f"Could not find score header for game ID {game_id}")
    scores = [int(s.text) for s in header.select(".score")]

    # Index (0 or 1) of the winning side; None on a tie or incomplete score.
    winner_idx = None
    map_score = ""
    if len(scores) >= 2:
        map_score = f"{scores[0]}‑{scores[1]}"
        if scores[0] != scores[1]:
            winner_idx = 0 if scores[0] > scores[1] else 1

    # ── iterate over the first two tables (Team A, Team B) ───────────────────
    # Assumes table order matches the header's team order — TODO confirm.
    records = []
    for tbl_idx, tbl in enumerate(block.select("table.wf-table-inset")[:2]):
        is_winner = tbl_idx == winner_idx               # flag by table index
        for row in tbl.select("tbody tr"):
            name_tag  = row.select_one(".text-of")
            agent_tag = row.select_one(".mod-agents img")
            if not (name_tag and agent_tag):
                continue                                # not a player row

            team_tag   = row.select_one(".ge-text-light")
            stat_cells = row.select("td.mod-stat")
            # The second generic stat cell is the one reported as ACS.
            acs_cell   = stat_cells[1] if len(stat_cells) > 1 else None

            records.append({
                "Map"        : map_name,
                "Team"       : team_tag.text.strip() if team_tag else "",
                "Player"     : name_tag.text.strip(),
                "Agent"      : agent_tag.get("alt", "").strip(),
                "ACS"        : both(acs_cell),
                "Kills"      : both(row.select_one(".mod-vlr-kills")),
                "Deaths"     : both(row.select_one(".mod-vlr-deaths")),
                "Assists"    : both(row.select_one(".mod-vlr-assists")),
                "FirstKills" : both(row.select_one(".mod-fb")),
                "Winner"     : is_winner,
                "MapScore"   : map_score,
            })

    return pd.DataFrame(records)


In [4]:
def extract_match_date(match_url: str):
    """
    Best-effort extraction of a match's calendar date from its page.

    Reads the first ``.match-header-date .moment-tz-convert`` element and
    tries, in order:

    1. its ``data-utc-ts`` attribute as a Unix timestamp (all digits),
       interpreted in UTC;
    2. the same attribute parsed as a free-form date string;
    3. the element's visible text parsed as a free-form date string.

    Parameters
    ----------
    match_url : str
        URL of the match page to download.

    Returns
    -------
    datetime.date | None
        The parsed date, or ``None`` when the element is missing or nothing
        could be parsed.

    Raises
    ------
    requests.RequestException
        On network failure or timeout.
    """
    response = requests.get(match_url, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    tag = soup.select_one(".match-header-date .moment-tz-convert")
    if not tag:
        return None

    raw = (tag.get("data-utc-ts") or "").strip()
    if raw.isdigit():                          # Unix timestamp
        try:
            # Timezone-aware replacement for the deprecated utcfromtimestamp().
            return dt.datetime.fromtimestamp(int(raw), tz=dt.timezone.utc).date()
        except (OverflowError, OSError, ValueError):
            # Out-of-range value, or "digits" int() cannot read: fall through
            # to the generic parsers instead of crashing.
            pass

    # try ISO or other common formats
    for candidate in (raw, tag.text.strip()):
        if not candidate:
            continue
        try:
            return dtparse.parse(candidate, fuzzy=True).date()
        except (ValueError, TypeError, OverflowError):
            continue
    return None

def vlr_first_two_maps(match_url: str) -> pd.DataFrame:
    """
    Assemble the player rows of a match's first two maps into one DataFrame.

    The match date is looked up once with ``extract_match_date``; each map
    returned by ``first_two_games`` is scraped with ``scrape_one_map`` and
    gets that date in a ``Date`` column.  The per-map frames are stacked
    with a fresh 0..n-1 index.
    """
    played_on = extract_match_date(match_url)

    per_map_frames = []
    # first_two_games yields pairs like ('195773', 'Haven'), ('195774', 'Split')
    for game_id, map_name in first_two_games(match_url):
        map_frame = scrape_one_map(match_url, game_id, map_name)
        per_map_frames.append(map_frame.assign(Date=played_on))

    return pd.concat(per_map_frames, ignore_index=True)


def scrape_data(df, match_urls):
    """
    Append the first-two-maps stats of several matches to ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Existing rows to extend (may be empty).  It is not modified.
    match_urls : iterable of str
        Match page URLs.  Each is scraped with ``vlr_first_two_maps`` and its
        rows get a ``Match`` column holding the URL's 0-based position.

    Returns
    -------
    pd.DataFrame
        ``df`` followed by the scraped rows, re-indexed 0..n-1.  When
        ``match_urls`` is empty, ``df`` itself is returned unchanged.
    """
    # Collect the pieces and concatenate once: calling pd.concat inside the
    # loop re-copies all accumulated rows on every iteration.
    frames = [df]
    for match_idx, url in enumerate(match_urls):
        frames.append(vlr_first_two_maps(url).assign(Match=match_idx))

    if len(frames) == 1:            # nothing scraped
        return df
    return pd.concat(frames, ignore_index=True)


In [5]:
# ─────────────────────────── 1. grab ALL match URLs for an event ────────────
def event_match_urls(event_url: str) -> list[str]:
    """
    Returns a list of absolute match URLs from an event page such as
    https://www.vlr.gg/event/matches/2276/... .

    Every anchor whose ``href`` starts with ``/<digits>/`` is treated as a
    match link, whatever the match's state; no other filtering is applied.
    Query strings are stripped and duplicates are dropped.

    Parameters
    ----------
    event_url : str
        URL of the event's matches page.

    Returns
    -------
    list[str]
        Unique ``https://www.vlr.gg/...`` URLs in ascending string order.

    Raises
    ------
    requests.RequestException
        On network failure, timeout or a non-2xx HTTP status (instead of
        silently reporting zero matches for an error page).
    """
    response = requests.get(event_url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    links = {
        "https://www.vlr.gg" + a["href"].split("?")[0]      # keep slug
        for a in soup.select("a[href]")                     # anchors with an href
        if re.match(r"^/\d+/", a["href"])                   # /427995/... pattern
    }
    return sorted(links)                                    # deterministic order


# ─────────────────────────── 2. scrape event end‑to‑end ─────────────────────
def scrape_event_first_two_maps(event_url: str,
                                pause_sec: float = 1.0) -> pd.DataFrame:
    """
    For every match listed on the event page:
        • scrape Map 1 & Map 2 stats (+ outcome, date, etc.)
    Returns one big tidy DataFrame.

    Parameters
    ----------
    event_url : str
        URL of the event's matches page, passed to ``event_match_urls``.
    pause_sec : float, default 1.0
        Seconds to sleep between two consecutive matches.

    Returns
    -------
    pd.DataFrame
        Rows of all successfully scraped matches, re-indexed 0..n-1, with two
        extra columns: ``MatchID`` (1-based position of the match in the URL
        list) and ``MatchURL``.  Empty when no match could be scraped.

    Notes
    -----
    A failure on one match is reported on stdout and that match is skipped;
    it does not abort the run.
    """
    all_matches = event_match_urls(event_url)
    total = len(all_matches)
    print(f"Found {total} matches")

    # Gather per-match frames and concatenate once at the end; concatenating
    # inside the loop re-copies every accumulated row on each iteration.
    frames = []
    for i, m_url in enumerate(all_matches, 1):
        try:
            frames.append(
                vlr_first_two_maps(m_url).assign(MatchID=i, MatchURL=m_url)
            )
            print(f"[{i}/{total}]  ✅  scraped {m_url}")
        except Exception as e:      # deliberate best-effort: log and move on
            print(f"[{i}/{total}]  ⚠️  {m_url}  -> {e}")
        if i < total:               # no need to wait after the last match
            time.sleep(pause_sec)   # be polite

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [14]:
# Smoke test: scrape one known match, save its rows to CSV, preview them.
match_urls = [
    "https://www.vlr.gg/482509/team-liquid-vs-natus-vincere-champions-tour-2025-emea-stage-1-playoffs-lr2/?game=all&tab=overview",
]

test = scrape_data(pd.DataFrame(), match_urls)
test.to_csv("test.csv", index=False)
# Kept as the cell's last expression so the notebook actually renders the
# preview (a bare expression in the middle of a cell is discarded).
test.head()

In [7]:
# emea_url = ("https://www.vlr.gg/event/matches/2380/champions-tour-2025-emea-stage-1/?series_id=all")
# emea_df = scrape_event_first_two_maps(emea_url)
# apac_url = ("https://www.vlr.gg/event/matches/2379/champions-tour-2025-pacific-stage-1/?series_id=all")
# apac_df = scrape_event_first_two_maps(apac_url)
# emea_df.to_csv("emea.csv", index=False)
# apac_df.to_csv("apac.csv", index=False)

In [8]:
# cn_url = ("https://www.vlr.gg/event/matches/2359/champions-tour-2025-china-stage-1/?series_id=all")
# cn_df = scrape_event_first_two_maps(cn_url)
# amer_url = ("https://www.vlr.gg/event/matches/2347/champions-tour-2025-americas-stage-1/?series_id=all")
# amer_df = scrape_event_first_two_maps(amer_url)
# cn_df.to_csv("cn.csv", index=False)
# amer_df.to_csv("amer.csv", index=False)