In [None]:
# Notebook setup. Lines starting with `!` are IPython shell escapes, so this
# cell only runs inside Jupyter/IPython, not as a plain Python script.
!pip install playwright
# Download the Chromium build that PhilharmoniaParser.start() launches.
!playwright install chromium
# System packages installed through apt-get (needs an apt-based image and
# sufficient privileges).
# NOTE(review): presumably these are shared libraries the Playwright browsers
# expect on this image — confirm the list is still required.
!apt-get update
!apt-get install -y libgstreamer-gl1.0-0 libgstreamer-plugins-bad1.0-0 libflite1 libavif13 libenchant-2-2 libsecret-1-0 libmanette-0.2-0 libgles2-mesa

import asyncio
import json
import logging
import re
import sys
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Any
from urllib.parse import urljoin

from playwright.async_api import async_playwright, Page, Browser, BrowserContext

# Configure logging: INFO and above go to stdout (rather than the default
# stderr) with a timestamp, logger name and level in front of each message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
# Logger shared by every function and method in this file.
logger = logging.getLogger("PhilharmoniaParser")

# Event calendar entry point; a month is selected by appending "&m=YYYYMM01".
BASE_URL = "https://filarmonia39.ru/?event"

# Russian month names mapped to month numbers. Each month is listed in both
# its genitive form (as used in dates, e.g. "25 января") and its nominative
# form. Keys are lowercase because normalize_date() lowercases its input
# before looking tokens up here.
MONTHS_RU = {
    "января": 1, "январь": 1,
    "февраля": 2, "февраль": 2,
    "марта": 3, "март": 3,
    "апреля": 4, "апрель": 4,
    "мая": 5, "май": 5,
    "июня": 6, "июнь": 6,
    "июля": 7, "июль": 7,
    "августа": 8, "август": 8,
    "сентября": 9, "сентябрь": 9,
    "октября": 10, "октябрь": 10,
    "ноября": 11, "ноябрь": 11,
    "декабря": 12, "декабрь": 12
}

def normalize_date(date_text: str) -> Optional[str]:
    """Convert a Russian free-text date such as "25 Января 2026" to ISO format.

    Tokens are matched case-insensitively and in any order: a four-digit
    number is taken as the year, a one- or two-digit number as the day, and
    a word found in ``MONTHS_RU`` as the month. Punctuation attached to the
    edges of a token (e.g. "2026," or "января.") is ignored.

    Args:
        date_text: Whitespace-separated date text.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``None`` when the input is
        not a string, a component is missing, or the components do not form
        a real calendar date (e.g. 31 February).
    """
    # Callers historically got None for non-string input; keep that contract
    # explicitly instead of relying on a catch-all exception handler.
    if not isinstance(date_text, str):
        return None

    day = month = year = None
    for token in date_text.lower().split():
        part = token.strip(".,;:()")
        # isdecimal() only accepts characters that int() can convert,
        # unlike isdigit(), which also accepts e.g. superscript digits.
        if part.isdecimal():
            if len(part) == 4:
                year = int(part)
            elif len(part) <= 2:
                day = int(part)
        elif part in MONTHS_RU:
            month = MONTHS_RU[part]

    if day is None or month is None or year is None:
        return None

    # Let datetime reject impossible dates (day 0, 31 February, year 0, ...).
    try:
        datetime(year, month, day)
    except ValueError:
        return None
    return f"{year}-{month:02d}-{day:02d}"

class PhilharmoniaParser:
    """Scrape the filarmonia39.ru event calendar with headless Chromium.

    Typical use: ``await start()``, ``await parse(...)``, read ``results``,
    then ``await stop()``. Every entry of ``results`` is a dict with the keys
    ``title``, ``url``, ``date_text``, ``normalized_date``, ``time``,
    ``age_restriction``, ``image_url``, ``description``, ``price_min``,
    ``price_max`` and ``ticket_status``.
    """

    # Root against which relative links and image paths are resolved.
    _SITE_ROOT = "https://filarmonia39.ru/"

    # Patterns are compiled once here because they are applied to every event.
    _AGE_RE = re.compile(r'(\d+\+)')
    _TIME_RE = re.compile(r'(\d{1,2}:\d{2})')
    _WEEKDAY_RE = re.compile(
        r'Понедельник|Вторник|Среда|Четверг|Пятница|Суббота|Воскресенье',
        flags=re.I,
    )
    # Besides the ASCII hyphen, en/em dashes are accepted between prices.
    # NOTE(review): a price written with a space as thousands separator
    # ("1 500") is read as two numbers (1 and 500) — confirm how the site
    # formats prices before relying on price_min.
    _PRICE_RE = re.compile(
        r'Цена:?\s*([\d\s\,\-\u2013\u2014]+)\s*(руб|₽)',
        re.IGNORECASE,
    )
    _SPACES_RE = re.compile(r'\s+')

    def __init__(self):
        # All handles stay None until start() succeeds, so stop() can be
        # called safely at any point.
        self.playwright: Optional[Any] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
        # Parsed events, in the order they were scraped.
        self.results: List[Dict[str, Any]] = []

    async def start(self):
        """Launch headless Chromium and open the page used for list views."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.context = await self.browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        self.page = await self.context.new_page()

    async def stop(self):
        """Close page, context, browser and Playwright, in that order.

        A failure while closing one handle is logged and does not prevent
        the remaining handles from being closed. Safe to call when start()
        was never called or failed part-way.
        """
        teardown = (
            ("page", "close"),
            ("context", "close"),
            ("browser", "close"),
            ("playwright", "stop"),
        )
        for attr, method in teardown:
            handle = getattr(self, attr, None)
            if not handle:
                continue
            try:
                await getattr(handle, method)()
            except Exception as e:
                logger.warning(f"Error while closing {attr}: {e}")
            finally:
                setattr(self, attr, None)

    @classmethod
    def _absolute_url(cls, href: Optional[str]) -> str:
        """Resolve a link or image path against the site root ("" if missing)."""
        if not href or not href.strip():
            return ""
        return urljoin(cls._SITE_ROOT, href.strip())

    @classmethod
    def _parse_date_block(cls, raw_text: str) -> Dict[str, str]:
        """Split the list item's date block into date text, time and age rating.

        Returns a dict with ``date_text`` (the block with age rating, weekday
        name and time removed), ``time`` ("00:00" when no HH:MM is present)
        and ``age_restriction`` ("" when no "N+" marker is present).
        """
        text = raw_text.replace("\n", " ").strip()
        age_match = cls._AGE_RE.search(text)
        time_match = cls._TIME_RE.search(text)

        date_text = cls._AGE_RE.sub('', text)
        date_text = cls._WEEKDAY_RE.sub('', date_text)
        date_text = cls._TIME_RE.sub('', date_text)
        date_text = cls._SPACES_RE.sub(' ', date_text).strip()

        return {
            "date_text": date_text,
            "time": time_match.group(1) if time_match else "00:00",
            "age_restriction": age_match.group(1) if age_match else "",
        }

    @classmethod
    def _parse_prices(cls, page_text: str) -> Dict[str, Optional[int]]:
        """Return the lowest and highest price following "Цена" (None if absent)."""
        prices: List[int] = []
        match = cls._PRICE_RE.search(page_text)
        if match:
            prices = [int(p) for p in re.findall(r'\d+', match.group(1))]
        return {
            "price_min": min(prices) if prices else None,
            "price_max": max(prices) if prices else None,
        }

    async def parse(self, months_to_scan: int = 5):
        """Scrape ``months_to_scan`` consecutive months starting with the current one.

        Each month is loaded directly through
        ``https://filarmonia39.ru/?event&m=YYYYMM01``. A month that fails to
        load is logged and skipped; no exception is propagated.
        """
        if self.page is None:
            logger.error("parse() called before start(); nothing to scan.")
            return

        try:
            current_date = datetime.now()

            for offset in range(months_to_scan):
                # Months are counted from zero so divmod rolls over the year.
                year_shift, month_index = divmod(current_date.month - 1 + offset, 12)
                target_year = current_date.year + year_shift
                target_month = month_index + 1

                url_param = f"{target_year}{target_month:02d}01"
                target_url = f"{BASE_URL}&m={url_param}"

                logger.info(f"Navigating to month view: {target_url} ({target_month}/{target_year})")

                try:
                    await self.page.goto(target_url, timeout=60000, wait_until="domcontentloaded")
                    await asyncio.sleep(2)  # Allow slight render time
                    await self.parse_current_list()
                except Exception as e:
                    logger.error(f"Failed to load month {target_month}/{target_year}: {e}")

        except Exception as e:
            logger.error(f"Error in main parse loop: {e}", exc_info=True)

    async def _scrape_detail(self, full_url: str) -> Dict[str, Any]:
        """Open the event's own page and collect description, prices, tickets.

        ``description`` is None when neither description container exists on
        the page. The temporary page is always closed, even on failure.
        """
        detail_page = await self.context.new_page()
        try:
            await detail_page.goto(full_url, timeout=45000, wait_until="domcontentloaded")

            description = None
            for selector in ("div.mer_item_info_text", "div.text"):
                container = detail_page.locator(selector)
                if await container.count() > 0:
                    description = await container.first.inner_text()
                    break

            body_text = await detail_page.locator("body").inner_text()
            has_ticket_btn = await detail_page.locator("a[href*='tickets=']").count() > 0
        finally:
            await detail_page.close()

        details: Dict[str, Any] = {
            "description": description,
            "ticket_status": "available" if has_ticket_btn else "unavailable",
        }
        details.update(self._parse_prices(body_text))
        return details

    async def parse_current_list(self):
        """Parses the list of events (.afisha_list_item) currently visible.

        For each item not yet in ``results`` (compared by URL) the detail
        page is visited and one result dict is appended. Items that fail are
        logged and skipped.
        """
        events = await self.page.locator("div.afisha_list_item").all()
        logger.info(f"Found {len(events)} events on current page.")

        # Built once per page so each duplicate check is a set lookup.
        seen_urls = {item["url"] for item in self.results}

        for event_el in events:
            try:
                # Title & URL
                title_link = event_el.locator("h1 a.mer_item_title")
                if await title_link.count() == 0:
                    continue

                title = await title_link.inner_text()
                full_url = self._absolute_url(await title_link.get_attribute("href"))
                if not full_url:
                    logger.warning(f"Skipping event without a link: {title.strip()}")
                    continue

                # Deduplication
                if full_url in seen_urls:
                    continue

                # Image extraction
                img = event_el.locator("img.mer_item_img").first
                img_src = ""
                if await img.count() > 0:
                    img_src = self._absolute_url(await img.get_attribute("src"))

                try:
                    # Date, time and age rating come from the list item.
                    date_raw = await event_el.locator(".date_block").inner_text()
                    date_info = self._parse_date_block(date_raw)
                    date_clean = date_info["date_text"]
                    norm_date = normalize_date(date_clean)

                    details = await self._scrape_detail(full_url)

                    full_desc_text = details["description"]
                    if full_desc_text is None:
                        # Fall back to the teaser in the list item, if any.
                        teaser = event_el.locator(".mer_item_list_progr")
                        if await teaser.count() > 0:
                            full_desc_text = await teaser.first.inner_text()
                        else:
                            full_desc_text = ""
                except Exception as e:
                    logger.error(f"Error parsing event detail {full_url}: {e}")
                    continue

                ticket_status = details["ticket_status"]
                logger.info(f"Parsed: {title} | Date: {date_clean} (Norm: {norm_date}) | Tickets: {ticket_status}")

                self.results.append({
                    "title": title.strip(),
                    "url": full_url,
                    "date_text": date_clean,
                    "normalized_date": norm_date,
                    "time": date_info["time"],
                    "age_restriction": date_info["age_restriction"],
                    "image_url": img_src,
                    "description": full_desc_text.strip(),
                    "price_min": details["price_min"],
                    "price_max": details["price_max"],
                    "ticket_status": ticket_status
                })
                seen_urls.add(full_url)

            except Exception as e:
                logger.error(f"Error processing event item: {e}")
                continue

async def main():
    """Scrape six months of events, print them as JSON and save them to disk.

    The results are written to ``philharmonia_results.json`` in the current
    working directory. The browser is shut down even if scraping fails.
    """
    parser = PhilharmoniaParser()
    try:
        # start() is inside the try block so that a browser which was only
        # partially started is still torn down by stop().
        await parser.start()

        # Scan 6 months to cover user requirement (Jan - May+)
        await parser.parse(months_to_scan=6)
        print(json.dumps(parser.results, ensure_ascii=False, indent=2))

        with open("philharmonia_results.json", "w", encoding="utf-8") as f:
            json.dump(parser.results, f, ensure_ascii=False, indent=2)

    finally:
        await parser.stop()

# Top-level `await` is only valid where an event loop is already running,
# e.g. in an IPython/Jupyter cell (this file also uses `!` shell escapes).
# A plain `python file.py` run would need asyncio.run(main()) instead.
if __name__ == "__main__":
    await main()
