Zum Scrapen der Review-Links. Problem: Reviews werden erst durch Scrollen nachgeladen; deshalb wird Selenium verwendet, um das Scrollen zu automatisieren. Über die normale Suche werden jedoch nicht alle Reviews angezeigt. Daher wird über das Score-Dropdown gefiltert – dann werden alle Reviews innerhalb eines Score-Bereichs geladen.

In [6]:
import csv
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service as ChromiumService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.os_manager import ChromeType

In [7]:
# Setup: start a maximized Chromium session and open the IGN game-review listing.
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")

# The result of install() is handed to the Service as the chromedriver to launch.
chromedriver_path = ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install()
chromium_service = ChromiumService(chromedriver_path)

driver = webdriver.Chrome(service=chromium_service, options=options)
driver.get("https://www.ign.com/reviews/games")

In [8]:
# Dismiss the cookie banner (best effort).
# Only WebDriver failures (timeout while waiting, click intercepted, element
# gone, ...) are treated as "no banner"; KeyboardInterrupt and programming
# errors are no longer swallowed as they were by the former bare `except:`.
try:
    accept_btn = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Accept')]"))
    )
    accept_btn.click()
except WebDriverException:
    print("ℹ️ Kein Cookie-Banner gefunden oder bereits akzeptiert")
else:
    print("🍪 Cookie-Banner akzeptiert")
    time.sleep(2)  # give the overlay a moment to disappear

🍪 Cookie-Banner akzeptiert


In [9]:
# Collect review links for every score filter, checkpointing to reviews.csv.

ITEM_SELECTOR = "div.content-item"
MAX_SCROLLS = 500       # upper bound on scroll rounds per filter
MAX_TOTAL_TIME = 300    # seconds; upper bound on scrolling time per filter
STABLE_ROUNDS = 2       # consecutive rounds without new items before stopping


def _scroll_until_loaded(driver):
    """Scroll to the page bottom until the number of review items stops growing.

    Stops after STABLE_ROUNDS consecutive rounds with an unchanged item count,
    or when MAX_TOTAL_TIME / MAX_SCROLLS is exceeded.
    """
    last_count = 0
    scroll_round = 0
    no_new_rounds = 0
    scroll_start_time = time.time()

    while True:
        current_count = len(driver.find_elements(By.CSS_SELECTOR, ITEM_SELECTOR))
        print(f"⏳ Scroll {scroll_round + 1}: {current_count} Reviews")

        if current_count == last_count:
            no_new_rounds += 1
            print(f"⚠️  Keine neuen Inhalte (x{no_new_rounds})")
            if no_new_rounds >= STABLE_ROUNDS:
                # Stop right away instead of scrolling and sleeping once more.
                break
        else:
            no_new_rounds = 0

        if (time.time() - scroll_start_time) > MAX_TOTAL_TIME:
            print(f"🛑 Zeitlimit von {MAX_TOTAL_TIME}s erreicht – breche Scrollen ab.")
            break

        if scroll_round >= MAX_SCROLLS:
            print(f"⚠️ Maximale Scrollanzahl ({MAX_SCROLLS}) erreicht – möglicherweise unvollständig.")
            break

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # wait for the next batch to be appended
        last_count = current_count
        scroll_round += 1


def _collect_new_links(driver, known):
    """Add the id -> href of every review item not yet in *known* (mutated in place).

    The items are looked up once after scrolling has finished; the previous
    implementation re-queried the full item list for every single index,
    which made the read-out quadratic in the number of reviews.
    """
    items = driver.find_elements(By.CSS_SELECTOR, ITEM_SELECTOR)
    for index, item in enumerate(items):
        try:
            review_id = item.get_attribute("data-id")
        except WebDriverException as e:
            print(f"⚠️  Element {index} ist stale oder nicht auffindbar – übersprungen. Fehler: {e}")
            continue

        if not review_id or review_id in known:
            continue

        try:
            link_element = item.find_element(By.CSS_SELECTOR, 'a[data-cy="item-body"]')
            link = link_element.get_attribute("href")
        except WebDriverException as e:
            print(f"⚠️  Fehler beim Link holen: {e}")
            continue

        known[review_id] = link
        print(f"🔗 {link}")


def _save_reviews(reviews, path="reviews.csv"):
    """Overwrite *path* with all reviews collected so far (columns: id, link)."""
    with open(path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["id", "link"])
        writer.writerows(reviews.items())


# All reviews seen so far, keyed by their data-id
all_reviews = {}

# Shared explicit wait
wait = WebDriverWait(driver, 10)

try:
    # Wait for the score dropdown
    wait.until(EC.presence_of_element_located((By.ID, "scoreRange")))

    # Capture value/label as plain strings up front: the <option> elements may
    # become stale once a filter is applied and the page re-renders.
    score_filters = []
    for opt in Select(driver.find_element(By.ID, "scoreRange")).options:
        value = opt.get_attribute("value")
        if value and value != "All Scores":
            score_filters.append((value, opt.text))

    for value, label in score_filters:
        print(f"\n🔍 Filter anwenden: {label} ({value})")

        # Re-locate the dropdown for every filter so a re-rendered <select>
        # does not leave us holding a stale reference.
        Select(driver.find_element(By.ID, "scoreRange")).select_by_value(value)
        time.sleep(1)  # short pause until the reload kicks in

        # NOTE(review): items of the previous filter may still satisfy this
        # presence check – confirm the list is replaced within the pause above.
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ITEM_SELECTOR)))

        _scroll_until_loaded(driver)

        print("🔍 Auslesen der Reviews...")
        _collect_new_links(driver, all_reviews)

        # Checkpoint after every filter
        _save_reviews(all_reviews)

        print(f"💾 Zwischenspeicherung nach {label}: {len(all_reviews)} Reviews in 'reviews.csv'")
        print(f"✅ Gesammelt: {len(all_reviews)} eindeutige Reviews bisher")

    print(f"\n🏁 Gesamt: {len(all_reviews)} eindeutige Reviews")
finally:
    # Close the browser even on errors or a manual interrupt
    driver.quit()

# Optional: preview of the collected links
for idx, (review_id, link) in enumerate(all_reviews.items(), 1):
    print(f"{idx}. ID: {review_id} | 🔗 {link}")



🔍 Filter anwenden: 10 (10,10)
⏳ Scroll 1: 10 Reviews
⏳ Scroll 2: 20 Reviews
⏳ Scroll 3: 30 Reviews
⏳ Scroll 4: 40 Reviews
⏳ Scroll 5: 50 Reviews
⏳ Scroll 6: 59 Reviews
⏳ Scroll 7: 68 Reviews
⏳ Scroll 8: 74 Reviews
⏳ Scroll 9: 74 Reviews
⚠️  Keine neuen Inhalte (x1)
⏳ Scroll 10: 74 Reviews
⚠️  Keine neuen Inhalte (x2)
🔍 Auslesen der Reviews...
🔗 https://www.ign.com/articles/arcs-board-game-review
🔗 https://www.ign.com/articles/elden-ring-shadow-of-the-erdtree-dlc-review
🔗 https://www.ign.com/articles/stardew-valley-review-2024
🔗 https://www.ign.com/articles/asgards-wrath-2-review
🔗 https://www.ign.com/articles/baldurs-gate-3-review
🔗 https://www.ign.com/articles/the-legend-of-zelda-tears-of-the-kingdom-review
🔗 https://www.ign.com/articles/resident-evil-4-remake-review
🔗 https://www.ign.com/articles/dead-cells-return-to-castlevania-dlc-review
🔗 https://www.ign.com/articles/metroid-prime-remastered-review
🔗 https://www.ign.com/articles/dwarf-fortress-review
🔗 https://www.ign.com/article

KeyboardInterrupt: 