In [5]:
import requests
from bs4 import BeautifulSoup
import re
import logging
from abc import ABC, abstractmethod

# Set up basic logging: configure the root logger so that INFO-level (and
# more severe) messages logged by the classes below are emitted.
logging.basicConfig(level=logging.INFO)

class TEDTalkScraper:
    """
    Responsible for scraping TED talk URLs and extracting the opening line
    from the transcript page.

    Every HTTP request is bounded by ``timeout``, and network-level failures
    are logged and skipped, so a single unreachable page can neither hang nor
    abort a whole scraping run.
    """

    # Seconds to wait for a response when the caller does not pick a timeout.
    DEFAULT_TIMEOUT = 10

    def __init__(self, timeout=DEFAULT_TIMEOUT):
        """
        Args:
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.base_url = "https://www.ted.com"
        self.talks_url = self.base_url + "/talks"
        self.timeout = timeout

    def _fetch_soup(self, url, failure_message):
        """
        Download ``url`` and parse the response body as HTML.

        Args:
            url: Absolute URL to request.
            failure_message: Text logged at ERROR level when the request
                raises or the response status is not 200.

        Returns:
            The parsed ``BeautifulSoup`` document, or ``None`` on failure.
        """
        # Fall back to the class default so instances created without running
        # __init__ (e.g. by a subclass that overrides it) still get a timeout.
        timeout = getattr(self, "timeout", self.DEFAULT_TIMEOUT)
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Timeouts, DNS failures and dropped connections are routine while
            # scraping; report them and let the caller carry on.
            logging.error("%s: %s", failure_message, exc)
            return None

        if response.status_code != 200:
            logging.error(failure_message)
            return None

        return BeautifulSoup(response.text, 'html.parser')

    def get_talk_urls(self, num_pages=1):
        """
        Fetch talk URLs from TED talks listing pages.

        Args:
            num_pages: Number of listing pages to request, starting at page 1.

        Returns:
            A list of absolute talk URLs without query strings, de-duplicated
            and in the order they were first seen. Pages that cannot be
            fetched are logged and skipped.
        """
        talk_urls = []
        seen = set()  # O(1) duplicate check; the list keeps discovery order.
        for page in range(1, num_pages + 1):
            url = f"{self.talks_url}?page={page}"
            logging.info(f"Fetching talks from: {url}")
            soup = self._fetch_soup(url, f"Failed to fetch page {page}")
            if soup is None:
                continue

            # Example approach: select anchors with href^="/talks/" and data-ga-context="talks"
            # Adjust the data-ga-* attributes or classes as needed to match the actual page.
            links = soup.select('a[data-ga-context="talks"][href^="/talks/"]')
            if not links:
                logging.warning("No talk links found with the given selector. Check page structure.")

            for link in links:
                href = link.get('href')
                if not href:
                    continue
                # Remove query parameters if present
                full_url = self.base_url + href.split('?')[0]
                if full_url not in seen:
                    seen.add(full_url)
                    talk_urls.append(full_url)

        return talk_urls

    def get_opening_line(self, talk_url):
        """
        Scrape the transcript page for a talk and extract the first paragraph,
        which we assume to be the opening line.

        Args:
            talk_url: Absolute URL of the talk page; a trailing slash is
                tolerated.

        Returns:
            The stripped text of the first non-empty ``<p>`` element on the
            transcript page, or ``None`` when the page cannot be fetched or
            contains no non-empty paragraph.
        """
        # NOTE(review): every <p> on the page is considered, so this may pick
        # up non-transcript text if the page has paragraphs ahead of the
        # transcript -- confirm against the live page structure.
        # Strip a trailing slash so the transcript path never contains "//".
        transcript_url = talk_url.rstrip("/") + "/transcript"
        logging.info(f"Fetching transcript from: {transcript_url}")
        soup = self._fetch_soup(
            transcript_url, f"Failed to fetch transcript for {talk_url}"
        )
        if soup is None:
            return None

        paragraphs = soup.find_all('p')
        if not paragraphs:
            logging.warning(f"No transcript paragraphs found for {talk_url}")
            return None

        # Use the first non-empty paragraph as the opening line.
        for p in paragraphs:
            line = p.get_text(strip=True)
            if line:
                return line
        return None

    def pull_all_opening_lines(self, num_pages=1):
        """
        Retrieve all opening lines from the scraped talk URLs.

        Args:
            num_pages: Number of listing pages to scan for talk URLs.

        Returns:
            A list of ``(talk_url, opening_line)`` tuples. Talks for which no
            opening line could be extracted are omitted.
        """
        talk_urls = self.get_talk_urls(num_pages=num_pages)
        logging.info(f"Found {len(talk_urls)} talk URLs")
        logging.info(f"Sample talk URLs: {talk_urls[:3]}")

        opening_lines = []
        for talk_url in talk_urls:
            line = self.get_opening_line(talk_url)
            if line:
                opening_lines.append((talk_url, line))
        return opening_lines


class TEDTalkProcessor:
    """
    Drives a scraper and reshapes the (url, opening line) pairs it returns
    into dictionaries.
    """
    def __init__(self, scraper: TEDTalkScraper):
        # The scraper is injected, so the caller decides which instance is used.
        self.scraper = scraper

    def process(self, num_pages=1):
        """
        Pull opening lines through the scraper and return them as records.

        Args:
            num_pages: Forwarded unchanged to the scraper's
                ``pull_all_opening_lines``.

        Returns:
            A list of dicts, one per pair returned by the scraper, each with
            the keys ``'url'`` and ``'opening_line'``.
        """
        pairs = self.scraper.pull_all_opening_lines(num_pages=num_pages)
        records = [
            {'url': talk_url, 'opening_line': first_line}
            for talk_url, first_line in pairs
        ]
        logging.info(f"Processed {len(records)} opening lines")
        logging.info(f"Sample results: {records[:3]}")
        return records


if __name__ == "__main__":
    # Wire a scraper into the processor that drives it.
    scraper = TEDTalkScraper()
    processor = TEDTalkProcessor(scraper)

    # One listing page keeps the demonstration short; raise num_pages to
    # scrape more.
    results = processor.process(num_pages=1)

    # Print each record as two labelled lines followed by a 40-dash rule.
    for item in results:
        print(
            f"Talk URL: {item['url']}",
            f"Opening Line: {item['opening_line']}",
            "-" * 40,
            sep="\n",
        )


INFO:root:Fetching talks from: https://www.ted.com/talks?page=1
INFO:root:Found 0 talk URLs
INFO:root:Sample talk URLs: []
INFO:root:Processed 0 opening lines
INFO:root:Sample results: []
