In [1]:
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urlparse

In [None]:

def scrape_islamqa_topic(url, delay=0.05, timeout=30):
    """Collect the title, description and question links of a topic listing.

    Starting at ``url``, each page is downloaded and parsed, and the
    pagination link inside ``li.next`` is followed until there is no such
    link, a page would be visited twice, or a request fails.

    Args:
        url: Address of the first page of the topic listing.
        delay: Seconds to pause before each follow-up page request.
        timeout: Seconds ``requests`` waits on the server before giving up.

    Returns:
        ``{topic_name: {"title": str, "description": str, "questions": list}}``
        where ``topic_name`` is the last non-empty path segment of ``url``
        (``""`` when the path has none) and ``questions`` holds the ``href``
        values of the matched question cards in page order, exactly as they
        appear in the markup. Title and description are read from the first
        page only and stay ``""`` when their tags are missing.

    A failed request is reported with ``print`` and ends the crawl; whatever
    was gathered up to that point is still returned.
    """
    # Imported locally so this function does not rely on the import cell
    # being edited as well.
    from urllib.parse import urljoin

    # Use the last non-empty path segment so trailing slashes, or a URL with
    # no path at all, cannot cause an IndexError or an accidental empty pick.
    path_segments = [segment for segment in urlparse(url).path.split('/') if segment]
    topic_name = path_segments[-1] if path_segments else ""

    topic = {
        "title": "",
        "description": "",
        "questions": []
    }
    result = {topic_name: topic}

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    current_url = url
    visited = set()  # pages already requested; guards against pagination loops
    first_page = True

    while current_url and current_url not in visited:
        visited.add(current_url)

        try:
            # Pause between requests only; the first request needs no delay.
            if not first_page and delay and delay > 0:
                time.sleep(delay)

            # Without a timeout a stalled server would hang the crawl forever.
            response = requests.get(current_url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {current_url}: {e}")
            break

        # A redirect may land on a different address than the one requested.
        visited.add(response.url)
        soup = BeautifulSoup(response.content, 'html.parser')

        if first_page:
            title_tag = soup.find('h1', class_='title')
            if title_tag is not None:
                topic['title'] = title_tag.get_text(strip=True)

            # The description text sits in a <p> nested inside p.subtitle.
            subtitle_tag = soup.find('p', class_='subtitle')
            description_tag = subtitle_tag.find('p') if subtitle_tag is not None else None
            if description_tag is not None:
                topic['description'] = description_tag.get_text(strip=True)

            first_page = False

        # Question links: keep every card that actually carries an href.
        for card in soup.select('div.single-topic a.post-card'):
            href = card.get('href')
            if href:
                topic['questions'].append(href)

        # Pagination: the href may be relative, so resolve it against the
        # address of the page it was found on before requesting it.
        next_li = soup.find('li', class_='next')
        next_anchor = next_li.a if next_li is not None else None
        next_href = next_anchor.get('href') if next_anchor is not None else None
        current_url = urljoin(response.url, next_href) if next_href else None

    return result

In [3]:

# Demo run: scrape one topic listing and print the collected dictionary.
if __name__ == "__main__":
    # Page at which the crawl starts; pagination is followed from here.
    target_url = "https://islamqa.info/en/categories/topics/269/islamic-politics"
    # Result is a dict keyed by the topic name taken from the URL path.
    scraped_data = scrape_islamqa_topic(target_url)
    print(scraped_data)

KeyError: 'href'