In [40]:
import json
import os
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

In [None]:
# Notebook-wide configuration shared by the scraping helpers.

# HTTP headers sent with every request (a browser-like User-Agent string).
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '}

# JSON file used to persist scraped topics between runs.
CHECKPOINT_FILE = "islamqa_data_en.json"

In [42]:

# Checkpoint Functionality

def load_checkpoint(filename):
    """
    Load already scraped data if the checkpoint file exists.

    Args:
        filename: Path of the JSON checkpoint file.

    Returns:
        The dict stored in the checkpoint, or an empty dict when the file is
        missing, cannot be opened or decoded, or does not contain a JSON
        object. Failures are reported with a printed message, never raised.
    """
    if not os.path.exists(filename):
        return {}

    try:
        # open() is inside the try so that an unreadable file is handled the
        # same best-effort way as a file with invalid JSON.
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error loading checkpoint: {e}")
        return {}

    # Callers test `str(topic_id) in data` and assign `data[str(topic_id)]`,
    # so anything other than a dict (e.g. a top-level JSON list) is unusable.
    if not isinstance(data, dict):
        print(
            "Error loading checkpoint: expected a JSON object, "
            f"got {type(data).__name__}"
        )
        return {}

    print(f"Loaded checkpoint with {len(data)} topics.")
    return data


def save_checkpoint(data, filename):
    """
    Save current scraped data to a checkpoint file.

    The data is first written to ``<filename>.tmp`` and then moved over the
    real checkpoint with ``os.replace``. If serialisation fails or the run is
    interrupted mid-write, the previous checkpoint is left intact instead of
    being truncated.

    Args:
        data: JSON-serialisable object to persist.
        filename: Path of the checkpoint file to create or overwrite.

    Raises:
        TypeError / ValueError: If ``data`` cannot be serialised to JSON.
        OSError: If the file cannot be written or replaced.
    """
    tmp_name = f"{filename}.tmp"
    try:
        with open(tmp_name, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        # The temp file sits next to the target, so this is a same-directory
        # rename rather than an in-place rewrite of the checkpoint.
        os.replace(tmp_name, filename)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up the partial file left behind by a failed or interrupted dump.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
    print(f"Checkpoint saved: {filename}")

In [47]:
def get_final_url(topic_id, timeout=10):
    """
    Resolve the redirect for a given topic ID and return the final URL.

    Args:
        topic_id: Numeric topic identifier appended to the topics URL.
        timeout: Seconds to wait for the server before giving up (default 10).
            requests applies no timeout unless one is passed, so without this
            a stalled connection would block the whole run.

    Returns:
        The URL reached after following redirects when the final response has
        status 200; otherwise None (non-200 status, timeout, or any request
        error, which is printed rather than raised).
    """
    base_url = f"https://islamqa.info/en/categories/topics/{topic_id}"
    try:
        response = requests.head(
            base_url,
            headers=HEADERS,
            allow_redirects=True,
            timeout=timeout,
        )
        if response.status_code == 200:
            return response.url
    except Exception as e:
        # Best-effort: a failing topic is reported and skipped by the caller.
        print(f"Error in get_final_url for topic {topic_id}: {e}")
    return None

In [48]:

def scrape_topic(final_url, timeout=10):
    """
    Scrape a single topic page, gathering its title, description,
    and list of question URLs.
    Supports pagination.

    Args:
        final_url: URL of the first page of the topic listing.
        timeout: Seconds to wait for each page request (default 10), so a
            stalled connection cannot block the run indefinitely.

    Returns:
        A ``(topic_name, result)`` tuple. ``topic_name`` is the last path
        segment of ``final_url`` (``"unknown_topic"`` when the path is empty).
        ``result`` is a dict with keys ``"title"``, ``"description"`` and
        ``"questions"`` (a list of absolute question URLs). If a page fails,
        the error is printed and whatever was collected so far is returned.
    """
    # Determine topic name from the URL. str.split never returns an empty
    # list, so the fallback has to test the segment itself, not the list.
    path = urlparse(final_url).path.strip("/")
    topic_name = path.split('/')[-1] or "unknown_topic"

    result = {
        "title": "",
        "description": "",
        "questions": []  # This will later hold full question data objects.
    }

    current_url = final_url
    visited = set()  # Pages already fetched; stops a pagination cycle.

    while current_url and current_url not in visited:
        visited.add(current_url)
        try:
            time.sleep(0.05)  # Be respectful with a small delay.
            response = requests.get(current_url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract title and description once (retried on later pages only
            # while no title has been found yet).
            if not result["title"]:
                title_tag = soup.find('h1', class_='title')
                result["title"] = title_tag.get_text(strip=True) if title_tag else ""

                subtitle_tag = soup.find('p', class_='subtitle')
                result["description"] = subtitle_tag.get_text(strip=True) if subtitle_tag else ""

            # Extract question card links (each <a> with class "post-card" inside a topic container)
            for card in soup.select('div.single-topic a.post-card'):
                href = card.get('href')
                if href:
                    # urljoin leaves absolute URLs untouched and resolves
                    # relative ones against the main domain, with or without
                    # a leading slash.
                    result["questions"].append(urljoin("https://islamqa.info", href))

            # Check for a "next" pagination link; resolve it against the page
            # it came from so that relative links keep working.
            next_li = soup.find('li', class_='next')
            next_href = next_li.a.get('href') if next_li and next_li.a else None
            current_url = urljoin(current_url, next_href) if next_href else None

        except Exception as e:
            print(f"Error processing topic page {current_url}: {e}")
            break

    return topic_name, result

In [49]:

def scrape_answer_page(url, timeout=10):
    """
    Scrape a single answer page.
    Extracts the page title, question text, answer body (without any <h3> tags),
    and summary (if available).

    Args:
        url: Absolute URL of the answer page.
        timeout: Seconds to wait for the server (default 10), so a stalled
            connection cannot block the run indefinitely.

    Returns:
        A dict with keys ``"url"``, ``"title"``, ``"question"``, ``"answer"``
        and ``"summary"``; each text field is None when its element is not
        found. Returns None if the request or parsing raises (the error is
        printed).
    """
    def text_of(tag):
        # separator=" " keeps text from adjacent or nested tags apart; with the
        # default empty separator, strip=True glues neighbouring words together
        # (e.g. "<p>a</p><p>b</p>" becomes "ab").
        return tag.get_text(separator=" ", strip=True) if tag is not None else None

    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract the h1 title from the answer page
        title = text_of(soup.find("h1", class_="title"))

        # Extract the question text (first <div> inside the question section).
        # NOTE(review): a multi-class class_ string presumably matches only the
        # exact attribute value in this order - confirm against the live markup.
        question = None
        question_section = soup.find("section", class_="single_fatwa__question text-justified")
        if question_section is not None:
            question = text_of(question_section.find("div"))

        # Extract the answer body, removing any <h3> tags that represent unwanted titles
        answer_section = soup.find("section", class_="single_fatwa__answer__body text-justified _pa--0")
        if answer_section is not None:
            for h3 in answer_section.find_all("h3"):
                h3.decompose()
        answer = text_of(answer_section)

        # Extract summary if available
        summary = text_of(soup.find("div", class_="single_fatwa__summary__body"))

        return {
            "url": url,
            "title": title,
            "question": question,
            "answer": answer,
            "summary": summary
        }
    except Exception as e:
        print(f"Error scraping answer page {url}: {e}")
        return None

In [50]:
def process_topic(topic_id, all_data):
    """
    Scrape one topic and record it in ``all_data``.

    Steps:
      1. Resolve the topic's final URL; if none is found, report it and stop.
      2. Scrape the topic page(s) for title, description and question URLs.
      3. Scrape every question URL, keeping only the pages that returned data.
      4. Store the topic under ``str(topic_id)`` in ``all_data`` (mutated in place).
    """
    resolved_url = get_final_url(topic_id)

    if resolved_url:
        print(f"Processing topic {topic_id}: {resolved_url}")
        _topic_name, topic_record = scrape_topic(resolved_url)

        scraped_questions = []
        for question_url in topic_record["questions"]:
            print(f"  Scraping question: {question_url}")
            details = scrape_answer_page(question_url)
            if details:
                scraped_questions.append(details)
            # Pause between question requests.
            time.sleep(0.05)

        # Swap the list of URLs for the list of scraped question records.
        topic_record["questions"] = scraped_questions
        all_data[str(topic_id)] = topic_record
    else:
        print(f"Skipping topic ID {topic_id} because final URL not found.")

In [51]:


def main(start=1, end=269):
    """
    Scrape every topic ID from ``start`` to ``end`` (inclusive).

    Begins from whatever the checkpoint file already holds, skips IDs that
    are present in it, and writes the checkpoint after each attempted topic,
    whether the attempt succeeded or raised.
    """
    collected = load_checkpoint(CHECKPOINT_FILE)

    for topic_id in range(start, end + 1):
        if str(topic_id) not in collected:
            try:
                process_topic(topic_id, collected)
                save_checkpoint(collected, CHECKPOINT_FILE)
            except Exception as err:
                # Report the failure, persist what we have, and move on.
                print(f"Error processing topic {topic_id}: {err}")
                save_checkpoint(collected, CHECKPOINT_FILE)
        else:
            # Already present in the checkpoint from an earlier run.
            print(f"Topic {topic_id} already processed, skipping.")

    print("Scraping complete.")

In [52]:

# Entry point: scrape topic IDs 1 through 269 when run as the main module.
if __name__ == "__main__":
    main(start=1, end=269)

Loaded checkpoint with 10 topics.
Topic 1 already processed, skipping.
Topic 2 already processed, skipping.
Topic 3 already processed, skipping.
Topic 4 already processed, skipping.
Topic 5 already processed, skipping.
Topic 6 already processed, skipping.
Topic 7 already processed, skipping.
Topic 8 already processed, skipping.
Topic 9 already processed, skipping.
Topic 10 already processed, skipping.
Processing topic 11: https://m.islamqa.info/en/categories/topics/11?traffic_source=main_islamqa
Checkpoint saved: islamqa_data_en.json
Processing topic 12: https://m.islamqa.info/en/categories/topics/12?traffic_source=main_islamqa


KeyboardInterrupt: 