In [2]:
# Install the Selenium Python bindings into the notebook environment.
!pip install selenium
# The apt commands below only exist on Debian/Ubuntu-style hosts; on Windows
# they fail with "'apt' is not recognized" (see the recorded output of this cell).
!apt update
!apt install chromium-chromedriver

Collecting selenium
  Downloading selenium-4.38.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Downloading trio-0.32.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.10.5 (from selenium)
  Downloading certifi-2025.10.5-py3-none-any.whl.metadata (2.5 kB)
Collecting sortedcontainers (from trio<1.0,>=0.31.0->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio<1.0,>=0.31.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket<1.0,>=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pysocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]<3.0,>=2.5.0->selenium)
  Downloading PySocks-1.7.1-py3-none-any.whl.metadata (13 kB)
Downloading selenium-4.38.0


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
'apt' is not recognized as an internal or external command,
operable program or batch file.
'apt' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
import threading
import os
from urllib3.exceptions import MaxRetryError
import logging
import sys
import json

def setup_logging(config_file='config_selenium.json'):
    """Configure root logging from a JSON config file and return a module logger.

    The file is expected to hold a ``"logging"`` object with the optional keys
    ``enabled``, ``level``, ``log_to_file``, ``filename`` and ``log_to_console``.

    Every ``basicConfig`` call uses ``force=True`` so that re-running the
    notebook cell replaces the handlers installed by a previous run instead of
    being silently ignored.

    Args:
        config_file: Path to the JSON configuration file.

    Returns:
        A ``logging.Logger`` named after this module, or ``None`` when the
        config sets ``enabled`` to a falsy value (the root logger is disabled
        in that case).
    """
    root_logger = logging.getLogger()

    try:
        with open(config_file, 'r', encoding='utf-8') as f:
            config = json.load(f)

        log_config = config.get('logging', {})

        if not log_config.get('enabled', True):
            root_logger.disabled = True
            return None

        # Accept level names in any case. A plain getattr() on e.g. "debug"
        # would return the function logging.debug instead of a level number.
        requested_level = str(log_config.get('level', 'INFO'))
        level = getattr(logging, requested_level.upper(), None)
        level_is_valid = isinstance(level, int) and not isinstance(level, bool)
        if not level_is_valid:
            level = logging.INFO

        handlers = []
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )

        if log_config.get('log_to_file', True):
            file_handler = logging.FileHandler(
                log_config.get('filename', 'api_client.log'),
                encoding='utf-8'
                )
            file_handler.setFormatter(formatter)
            handlers.append(file_handler)

        if log_config.get('log_to_console', True):
            console_handler = logging.StreamHandler(sys.stdout)
            console_handler.setFormatter(formatter)
            handlers.append(console_handler)

        logging.basicConfig(level=level, handlers=handlers, force=True)

        # Undo a "disabled" state left behind by an earlier run of this cell.
        root_logger.disabled = False
        logger = logging.getLogger(__name__)
        logger.disabled = False

        if not level_is_valid:
            logger.warning(
                "Неизвестный уровень логирования %s, используется INFO",
                requested_level
            )
        logger.info('Логирование настроено из %s', config_file)
        return logger

    except FileNotFoundError:
        # No config file: log to a default file and to stdout at INFO level.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('selenium_scraper.log', encoding='utf-8'),
                logging.StreamHandler(sys.stdout)
            ],
            force=True
        )
        root_logger.disabled = False
        logger = logging.getLogger(__name__)
        logger.disabled = False
        logger.warning("Файл конфигурации %s не найден, используется стандартная конфигурация", config_file)
        return logger

    except Exception as e:
        # Malformed JSON, unwritable log file, etc.: fall back to the default
        # stream handler so the failure itself is still reported.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            force=True
        )
        root_logger.disabled = False
        logger = logging.getLogger(__name__)
        logger.disabled = False
        logger.error("Ошибка настройки логирования: %s", e)
        return logger

# Module-level logger used by every function in this cell.
logger = setup_logging()

# setup_logging() returns None when the config disables logging; substitute a
# disabled logger so that the logger.* calls below remain valid.
if logger is None:
    logger = logging.getLogger(__name__)
    logger.disabled = True

# Lock acquired by process_single_game() around navigation of the WebDriver.
driver_lock = threading.Lock()

def setup_driver():
    """Create a headless Chrome WebDriver configured for scraping.

    The browser runs headless with the automation switches hidden, a fixed
    desktop user agent, the ``eager`` page-load strategy, and content
    preferences that set images, stylesheets, cookies and notifications to 2
    and JavaScript to 1. Page-load and script timeouts are set to 25 seconds.

    Returns:
        The configured ``webdriver.Chrome`` instance.

    Raises:
        Exception: Whatever Selenium raised while starting or configuring the
            browser. The error is logged and re-raised. If the browser had
            already started, it is quit first so no Chrome process is left
            behind.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # NOTE(review): the single quotes are passed to Chrome literally (no shell
    # is involved); the documented form is --proxy-server=direct:// — confirm
    # whether the quotes are intentional.
    chrome_options.add_argument("--proxy-server='direct://'")
    chrome_options.add_argument("--proxy-bypass-list=*")

    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    chrome_options.page_load_strategy = 'eager'

    # NOTE(review): cookies are set to 2 here — verify the target site still
    # renders its content with that setting.
    prefs = {
        "profile.managed_default_content_settings.images": 2,
        "profile.default_content_setting_values.notifications": 2,
        "profile.managed_default_content_settings.stylesheets": 2,
        "profile.managed_default_content_settings.cookies": 2,
        "profile.managed_default_content_settings.javascript": 1,
    }
    chrome_options.add_experimental_option("prefs", prefs)

    driver = None
    try:
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(25)
        driver.set_script_timeout(25)
        logger.info("Драйвер успешно создан")
        return driver
    except Exception as e:
        logger.error(f"Ошибка создания драйвера: {e}")
        # The browser may already be running if only the timeout
        # configuration failed; shut it down before propagating the error.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                logger.warning(f"Ошибка при закрытии драйвера: {quit_error}")
        raise

def check_driver(driver):
    """Report whether the WebDriver session still answers commands.

    Reads ``driver.current_url`` as a probe.

    Args:
        driver: The WebDriver instance to probe.

    Returns:
        True if the probe succeeds; False (after logging a warning) if it
        raises ``WebDriverException`` or ``MaxRetryError``.
    """
    try:
        _ = driver.current_url
    except (WebDriverException, MaxRetryError):
        logger.warning("Драйвер не отвечает")
        return False
    else:
        return True

def parse_metacritic(driver):
    """Extract game details from the page currently loaded in ``driver``.

    The metascore and title are looked up through lists of CSS selectors.
    Developer, publisher, genres, release date and platforms are taken from
    the visible body text: when a line contains one of the known labels, the
    following line is used as the value. The age rating is searched first
    after a "Details" line and then anywhere on the page.

    Args:
        driver: WebDriver with the game page already opened.

    Returns:
        A dict with the keys ``platforms``, ``release_date``, ``developer``,
        ``publisher``, ``genres``, ``rating``, ``metascore`` and ``title``.
        Anything that could not be found is ``'N/A'``; if parsing fails
        altogether every value is ``'N/A'``.
    """
    # (label keywords, target field, debug-message prefix), checked in order;
    # only the first matching entry is applied to a given line.
    labelled_fields = (
        (('developer:', 'developed by'), 'developer', "Найден разработчик"),
        (('publisher:', 'published by'), 'publisher', "Найден издатель"),
        (('genre:', 'genres:', 'category:'), 'genres', "Найдены жанры"),
        (('release date:', 'released:'), 'release_date', "Найдена дата релиза"),
        (('platform:', 'platforms:'), 'platforms', "Найдены платформы"),
    )
    section_ends = ('Related Games', 'Game Stats', 'Critic Reviews', 'User Reviews')
    rating_keywords = ('ESRB:', 'RATED', 'MATURE', 'TEEN', 'EVERYONE')
    output_fields = ('platforms', 'release_date', 'developer', 'publisher',
                     'genres', 'rating', 'metascore', 'title')

    game_data = {}

    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        page_text = driver.find_element(By.TAG_NAME, "body").text
        lines = [
            stripped
            for stripped in (raw.strip() for raw in page_text.split('\n'))
            if stripped
        ]

        # --- metascore: first selector yielding an integer in 0..100 wins ---
        for selector in (
            "div.c-siteReviewScore span",
            "span.c-siteReviewScore",
            "div[class*='metascore_w']",
            ".c-productScoreInfo_scoreNumber",
        ):
            try:
                for node in driver.find_elements(By.CSS_SELECTOR, selector):
                    text = node.text.strip()
                    if text.isdigit() and 0 <= int(text) <= 100:
                        game_data['metascore'] = text
                        logger.debug(f"Найден metascore: {text} с селектором {selector}")
                        break
            except Exception as e:
                logger.debug(f"Ошибка при поиске metascore с селектором {selector}: {e}")
                continue
            if 'metascore' in game_data:
                break

        # --- title: first selector with text longer than one character ---
        for selector in ("h1.c-productHero_title", "h1.c-pageTitle", "h1"):
            try:
                title = driver.find_element(By.CSS_SELECTOR, selector).text.strip()
            except Exception as e:
                logger.debug(f"Ошибка при поиске заголовка с селектором {selector}: {e}")
                continue
            if len(title) > 1:
                game_data['title'] = title
                logger.debug(f"Найден заголовок: {title} с селектором {selector}")
                break

        # --- labelled fields: the value is the line after the label ---
        try:
            # Pairing each line with its successor leaves the last line out,
            # since it has no following value line.
            for line, following in zip(lines, lines[1:]):
                line_lower = line.lower()
                for keywords, field, message in labelled_fields:
                    if any(keyword in line_lower for keyword in keywords):
                        next_line = following.strip()
                        if len(next_line) > 1:
                            game_data[field] = next_line
                            logger.debug(f"{message}: {next_line}")
                        break
        except Exception as e:
            logger.warning(f"Ошибка при текстовом поиске: {e}")

        # --- rating, pass 1: a "Rated ..." line after the first "Details" ---
        if 'Details' in lines:
            for line in lines[lines.index('Details') + 1:]:
                if line.startswith('Rated'):
                    game_data['rating'] = line
                    logger.debug(f"Найден рейтинг из Details: {line}")
                    break
                if line in section_ends:
                    break

        # --- rating, pass 2: anywhere on the page, by prefix or keyword ---
        if 'rating' not in game_data:
            for line in lines:
                if line.startswith('Rated'):
                    game_data['rating'] = line
                    logger.debug(f"Найден рейтинг: {line}")
                    break
                line_upper = line.upper()
                if len(line) > 3 and any(keyword in line_upper for keyword in rating_keywords):
                    game_data['rating'] = line
                    logger.debug(f"Найден рейтинг по ключевым словам: {line}")
                    break

        # Fill every field that is missing or empty with the placeholder.
        for field in output_fields:
            if not game_data.get(field):
                game_data[field] = 'N/A'

        found = {k: v for k, v in game_data.items() if v != 'N/A'}
        logger.info(f"Успешно распарсены данные: {found}")
        return game_data

    except Exception as e:
        logger.error(f"Ошибка парсинга: {e}")
        return dict.fromkeys(
            ('metascore', 'title', 'release_date', 'platforms',
             'developer', 'publisher', 'genres', 'rating'),
            'N/A'
        )

def generate_meta_url(game_slug):
    """Build the Metacritic game-page URL for a slug.

    Args:
        game_slug: Slug taken from the dataset. Surrounding whitespace is
            stripped before the URL is built.

    Returns:
        The URL string, or ``None`` (after logging a warning) when the slug is
        missing, NaN, falsy, or consists only of whitespace.
    """
    if pd.isna(game_slug) or not game_slug:
        logger.warning("Пустой slug для игры")
        return None

    # A whitespace-only or padded slug would otherwise produce a malformed URL.
    slug = str(game_slug).strip()
    if not slug:
        logger.warning("Пустой slug для игры")
        return None

    url = f"https://www.metacritic.com/game/{slug}/"
    logger.debug(f"Сгенерирован URL: {url}")
    return url

def process_single_game(game_info):
    """Open one game's Metacritic page and parse it.

    Args:
        game_info: Tuple ``(index, total_games, game_name, game_slug, driver)``.

    Returns:
        A ``(game_name, data)`` tuple. On success ``data`` is the dict from
        ``parse_metacritic`` extended with ``metacritic_url``, ``game_name``
        and ``slug``. On failure it holds an ``error`` message instead of the
        parsed fields. Exceptions are never propagated to the caller.
    """
    index, total_games, game_name, game_slug, driver = game_info

    logger.info(f"[{index+1}/{total_games}] Обработка {game_name} (slug: {game_slug})")

    # Defined up front so the error handler can report it without inspecting
    # locals().
    metacritic_url = None

    try:
        metacritic_url = generate_meta_url(game_slug)

        if not metacritic_url:
            logger.warning(f"Пустой URL для игры {game_name}")
            return game_name, {'error': 'Empty slug', 'metacritic_url': 'N/A', 'game_name': game_name}

        # Navigation and parsing both operate on the browser's current page,
        # so the lock is held until parsing is done; releasing it after get()
        # would let another holder navigate away mid-parse.
        with driver_lock:
            if not check_driver(driver):
                raise WebDriverException("Драйвер не отвечает")
            logger.debug(f"Переход по URL: {metacritic_url}")
            driver.get(metacritic_url)

            # Fixed pause before the page text is read.
            time.sleep(2)

            game_data = parse_metacritic(driver)

        game_data['metacritic_url'] = metacritic_url
        game_data['game_name'] = game_name
        game_data['slug'] = game_slug

        logger.info(f"Успешно обработана игра {game_name}")
        return game_name, game_data

    except Exception as e:
        logger.error(f"Ошибка обработки {game_name}: {e}")
        return game_name, {
            'error': str(e),
            'metacritic_url': metacritic_url or 'N/A',
            'game_name': game_name,
            'slug': game_slug,
            'metascore': 'N/A',
            'title': 'N/A'
        }

def scrape_games_batch(df, batch_size=30, start_from=0, max_retries=2):
    """Scrape one contiguous batch of games with a dedicated WebDriver.

    Rows ``start_from`` .. ``start_from + batch_size - 1`` (clipped to the
    frame length) are processed in order. If the driver cannot be created, or
    stops responding in the middle of the batch, it is closed, recreated, and
    processing resumes at the first game that has not completed. At most
    ``max_retries`` driver sessions are used.

    Args:
        df: DataFrame with ``name`` and ``slug`` columns.
        batch_size: Number of rows in the batch.
        start_from: Positional index of the first row of the batch.
        max_retries: Maximum number of driver sessions to attempt.

    Returns:
        Dict mapping game name to the data dict from ``process_single_game``.
        It may be partial when all attempts were used up.
    """
    games_data = {}

    total_games = len(df)
    end_index = min(start_from + batch_size, total_games)

    logger.info(f"Обработка игр с {start_from} по {end_index} из {total_games}")

    driver = None
    next_index = start_from   # first row not finished yet; survives driver restarts
    batch_completed = False

    try:
        for retry in range(max_retries):
            try:
                if driver is None:
                    driver = setup_driver()
                    logger.info("Драйвер инициализирован для батча")

                while next_index < end_index:
                    index = next_index
                    try:
                        row = df.iloc[index]
                        game_info = (index, total_games, row['name'], row['slug'], driver)
                        game_name, game_data = process_single_game(game_info)
                        games_data[game_name] = game_data
                    except Exception as e:
                        logger.error(f"Ошибка при обработке игры {index}: {e}")
                        next_index = index + 1
                        continue

                    # process_single_game() reports failures in its result
                    # instead of raising, so a dead driver has to be detected
                    # here. next_index is left untouched so this game is
                    # retried with the fresh driver.
                    if 'error' in game_data and not check_driver(driver):
                        raise WebDriverException("Драйвер не отвечает")

                    next_index = index + 1

                    if index < end_index - 1:
                        time.sleep(1.5)

                    if (index - start_from + 1) % 30 == 0:
                        logger.info(f"Обработано {index - start_from + 1} игр в текущем батче")

                batch_completed = True
                return games_data

            except (WebDriverException, MaxRetryError, TimeoutException) as e:
                logger.error(f"Ошибка драйвера (попытка {retry + 1}/{max_retries}): {e}")
                if driver is not None:
                    try:
                        driver.quit()
                        logger.info("Драйвер принудительно закрыт после ошибки")
                    except Exception as quit_error:
                        logger.warning(f"Ошибка при закрытии драйвера: {quit_error}")
                    driver = None
                time.sleep(3)

        logger.error(f"Не удалось обработать батч после {max_retries} попыток")
        return games_data

    finally:
        # Runs on normal return, on unexpected exceptions and on a kernel
        # interrupt, so no Chrome process is left behind. A failure while
        # quitting is only logged and never causes the batch to be re-scraped.
        if driver is not None:
            try:
                driver.quit()
                if batch_completed:
                    logger.info("Драйвер закрыт после успешного выполнения батча")
                else:
                    logger.info("Драйвер принудительно закрыт после ошибки")
            except Exception as quit_error:
                logger.warning(f"Ошибка при закрытии драйвера: {quit_error}")

def update_dataset(df, metacritic_data):
    """Write scraped Metacritic fields into the ``metacritic_*`` columns of ``df``.

    Missing ``metacritic_*`` columns are created first. Every row whose
    ``name`` is a key of ``metacritic_data`` is updated; fields absent from a
    game's dict are written as an empty string.

    Rows are addressed by position for both reading and writing, so the
    function also works when ``df`` does not have a default 0..n-1 index.

    Args:
        df: DataFrame with a ``name`` column. It is modified in place.
        metacritic_data: Dict mapping game name to its scraped data dict.

    Returns:
        The same ``df`` object.
    """
    logger.info("Начало обновления датасета")

    # Target column -> key in the scraped data dict.
    column_sources = {
        'metacritic_url': 'metacritic_url',
        'metacritic_title': 'title',
        'metacritic_score': 'metascore',
        'metacritic_release_date': 'release_date',
        'metacritic_platforms': 'platforms',
        'metacritic_developer': 'developer',
        'metacritic_publisher': 'publisher',
        'metacritic_genres': 'genres',
        'metacritic_rating': 'rating',
    }

    for col in column_sources:
        if col not in df.columns:
            df[col] = None

    # Resolve column positions once, after all columns exist.
    column_positions = {col: df.columns.get_loc(col) for col in column_sources}

    updated_count = 0
    for position in range(len(df)):
        game_name = df.iloc[position]['name']

        if game_name in metacritic_data:
            game_data = metacritic_data[game_name]

            for col, source_key in column_sources.items():
                df.iat[position, column_positions[col]] = game_data.get(source_key, '')

            updated_count += 1

    logger.info(f"Обновлено записей: {updated_count}/{len(df)}")
    return df

def save_progress(df, filename, batch_num=None):
    """Write ``df`` to a UTF-8 CSV file without the index column.

    Args:
        df: DataFrame to save.
        filename: Target file name.
        batch_num: When given, ``_batch_<batch_num>`` is inserted before the
            file extension.

    Returns:
        The name of the file that was written, or ``None`` if writing failed
        (the error is logged).
    """
    progress_filename = filename
    if batch_num is not None:
        stem, suffix = os.path.splitext(filename)
        progress_filename = f"{stem}_batch_{batch_num}{suffix}"

    try:
        df.to_csv(progress_filename, index=False, encoding='utf-8')
        logger.info(f"Прогресс сохранен: {progress_filename}")
    except Exception as e:
        logger.error(f"Ошибка сохранения файла {progress_filename}: {e}")
        return None
    return progress_filename

def main():
    """Run the full scraping pipeline over the input CSV.

    Loads ``games_parallel_2000_2000.csv``, scrapes Metacritic in batches of
    30 games, merges each batch into the frame, writes a per-batch CSV
    snapshot and finally writes ``final_result.csv``.

    Returns:
        The resulting DataFrame; the unmodified input frame if a required
        column (``slug`` or ``name``) is missing; ``None`` if the input file
        could not be loaded.
    """
    def count_scores(results):
        # A game counts as scored when its metascore is neither the 'N/A'
        # placeholder nor an empty string.
        return sum(
            1 for data in results.values()
            if data.get('metascore', 'N/A') not in ('N/A', '')
        )

    logger.info("Запуск основного процесса скрапинга")

    try:
        df = pd.read_csv('games_parallel_2000_2000.csv', engine='python', on_bad_lines='skip')
        logger.info(f"Загружено игр из файла: {len(df)}")

    except FileNotFoundError:
        logger.error("Файл games_parallel_2000_2000.csv не найден")
        return None
    except Exception as e:
        logger.error(f"Ошибка загрузки файла: {e}")
        return None

    # Both columns are read for every row by scrape_games_batch() and
    # update_dataset(); without either one every batch would fail.
    missing_columns = [col for col in ('slug', 'name') if col not in df.columns]
    if missing_columns:
        for col in missing_columns:
            logger.error(f"Отсутствует столбец '{col}' в датасете")
        return df

    batch_size = 30
    total_games = len(df)

    logger.info(f"Всего игр для обработки: {total_games}")
    logger.info(f"Размер батча: {batch_size}")

    metacritic_columns = [
        'metacritic_url', 'metacritic_title', 'metacritic_score',
        'metacritic_release_date', 'metacritic_platforms', 'metacritic_developer',
        'metacritic_publisher', 'metacritic_genres', 'metacritic_rating'
    ]

    # Create the output columns up front so they exist in the saved files
    # even if every batch fails.
    for col in metacritic_columns:
        if col not in df.columns:
            df[col] = None

    all_metacritic_data = {}
    batch_num = 0

    for start_index in range(0, total_games, batch_size):
        batch_num += 1
        logger.info(f"Обработка батча {batch_num}: игры {start_index} - {min(start_index + batch_size, total_games)}")

        try:
            batch_metacritic_data = scrape_games_batch(df, batch_size, start_from=start_index)
            all_metacritic_data.update(batch_metacritic_data)

            df = update_dataset(df, batch_metacritic_data)

            save_progress(df, 'result_with_metacritic.csv', batch_num)

            success_count = count_scores(batch_metacritic_data)
            logger.info(f"Успешно собрано оценок в батче {batch_num}: {success_count}/{len(batch_metacritic_data)}")

            if start_index + batch_size < total_games:
                logger.info("Пауза 10 секунд перед следующим батчем")
                time.sleep(10)

        except Exception as e:
            logger.error(f"Критическая ошибка в батче {batch_num}: {e}")
            logger.info("Продолжаем со следующим батчем")
            continue

    logger.info("Завершение обработки всех батчей, итоговое сохранение")

    try:
        final_filename = save_progress(df, 'final_result.csv')

        if final_filename:
            logger.info(f"Финальный файл сохранен: {final_filename}")

            total_success = count_scores(all_metacritic_data)
            logger.info(f"Итоговая статистика: успешно собрано оценок {total_success} из {len(all_metacritic_data)}")

        return df

    except Exception as e:
        logger.error(f"Ошибка при финальном сохранении: {e}")
        return df

# Entry point: run the scraper and report whether a dataset was produced.
if __name__ == "__main__":
    logger.info("Запуск скрапинга Metacritic")

    final_df = main()

    # main() returns None only when the input file could not be loaded.
    if final_df is None:
        logger.error("Скрапинг завершился с ошибкой. Датасет не был создан")
    else:
        logger.info(f"Скрапинг завершён успешно. Размер итогового датасета: {len(final_df)} строк")

2025-11-04 12:37:34,202 - __main__ - INFO - Логирование настроено из config_selenium.json
2025-11-04 12:37:34,208 - __main__ - INFO - Запуск скрапинга Metacritic
2025-11-04 12:37:34,209 - __main__ - INFO - Запуск основного процесса скрапинга
2025-11-04 12:37:34,268 - __main__ - INFO - Загружено игр из файла: 1000
2025-11-04 12:37:34,272 - __main__ - INFO - Всего игр для обработки: 1000
2025-11-04 12:37:34,273 - __main__ - INFO - Размер батча: 30
2025-11-04 12:37:34,278 - __main__ - INFO - Обработка батча 1: игры 0 - 30
2025-11-04 12:37:34,279 - __main__ - INFO - Обработка игр с 0 по 30 из 1000
2025-11-04 12:37:38,514 - __main__ - INFO - Драйвер успешно создан
2025-11-04 12:37:38,515 - __main__ - INFO - Драйвер инициализирован для батча
2025-11-04 12:37:38,517 - __main__ - INFO - [1/1000] Обработка Counter-Strike (slug: counter-strike)
