In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import json
from pathlib import Path
import logging

# Logging configuration: records at INFO level and above are written to
# "scrape_log.log" (UTF-8) and also emitted on a stream handler (console),
# each stamped with time and severity.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
_log_handlers = [
    logging.FileHandler("scrape_log.log", encoding='utf-8'),  # persistent record
    logging.StreamHandler(),  # live console output
]
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=_log_handlers)

# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)

def _extract_answer_text(answer_div):
    """Flatten an answer container into plain text, one block per line.

    Direct ``<p>``, ``<section>`` and ``<div>`` children contribute their
    text; the items of direct ``<ul>``/``<ol>`` children are rendered as
    ``- item`` lines. When none of those yield any text (for example the
    answer is bare text placed directly inside the container), the whole
    text of the container is used instead so the answer is not lost.
    """
    lines = []
    for child in answer_div.children:
        if child.name in ('p', 'section', 'div'):
            lines.append(child.get_text(strip=True))
        elif child.name in ('ul', 'ol'):
            lines.extend(
                f"- {item.get_text(strip=True)}" for item in child.find_all('li')
            )

    text = "\n".join(lines).strip()
    if not text:
        # Nothing matched the known block elements: fall back to all text.
        text = answer_div.get_text(" ", strip=True)
    return text


def scrape_faq_from_website(url, accordion_id):
    """Scrape question/answer pairs from one accordion on a web page.

    The page at ``url`` is downloaded and the element whose ``id`` equals
    ``accordion_id`` is located. Each direct ``<div>`` child of that element
    is treated as one FAQ entry: the question is the text of the last
    ``<span>`` inside the entry's first ``<a>``, and the answer is taken from
    the first nested ``<div>`` that carries an ``id`` different from that
    anchor's ``id``.

    Args:
        url: Address of the page to download.
        accordion_id: Value of the ``id`` attribute of the accordion element.

    Returns:
        A list of dicts with the keys ``"Website"``, ``"Question"`` and
        ``"Answer"``. Entries where neither a question nor an answer could be
        found are omitted. The list is empty when the page cannot be
        retrieved, the accordion or its entries are missing, or an error
        occurs; errors are logged, never raised.
    """
    try:
        logger.info(f"Attempting to scrape {url} with accordion ID '{accordion_id}'")
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/91.0.4472.124 Safari/537.36'
            )
        }
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            logger.error(f"Failed to retrieve {url}. Status code: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, 'html.parser')
        accordion = soup.find(id=accordion_id)

        if not accordion:
            logger.warning(f"Accordion with ID '{accordion_id}' not found on {url}")
            return []

        faq_divs = accordion.find_all('div', recursive=False)
        if not faq_divs:
            logger.warning(f"No FAQ divs found in accordion '{accordion_id}' on {url}")
            return []

        faq_data = []
        for div in faq_divs:
            # Question: text of the last <span> inside the entry's first <a>.
            question_elem = div.find('a')
            if question_elem and question_elem.find('span'):
                question = question_elem.find_all('span')[-1].get_text(strip=True)
            else:
                question = "Question not found"
                logger.debug(f"Question not found in a div on {url}")

            # Answer: first nested <div> with an id other than the anchor's.
            # Without an anchor no answer container is looked up at all.
            answer_div = None
            if question_elem is not None:
                question_id = question_elem.get('id')
                answer_div = div.find(
                    'div',
                    id=lambda value: bool(value) and value != question_id,
                )

            if answer_div is not None:
                answer_text = _extract_answer_text(answer_div)
            else:
                answer_text = "Answer not found"
                logger.debug(f"Answer not found for a question on {url}")

            # Keep the entry as long as at least one half of it was found.
            if question != "Question not found" or answer_text != "Answer not found":
                faq_data.append({
                    "Website": url,
                    "Question": question,
                    "Answer": answer_text
                })

        logger.info(f"Found {len(faq_data)} FAQs on {url}")
        return faq_data

    except requests.RequestException as e:
        # Expected network-level failures (timeout, DNS, connection reset...).
        logger.error(f"Error scraping {url}: {e}")
        return []
    except Exception as e:
        # Unexpected failure (e.g. while parsing): keep the traceback in the
        # log, but still let the remaining sites be processed.
        logger.exception(f"Error scraping {url}: {e}")
        return []

def get_websites_from_user():
    """Collect website URLs and accordion IDs from console input.

    The user is asked for a URL and then for the matching accordion ID, over
    and over, until 'done' (in any letter case) is entered in place of a URL.

    Returns:
        A list of dicts, each with the keys ``"url"`` and ``"accordion_id"``,
        in the order they were entered.
    """
    entries = []
    logger.info("Starting interactive website input")
    print("Enter websites and their accordion IDs. Type 'done' when finished.")

    finished = False
    while not finished:
        entered_url = input("Enter website URL (or 'done' to finish): ").strip()
        if entered_url.lower() != 'done':
            entered_id = input(f"Enter accordion ID for {entered_url}: ").strip()
            entries.append(dict(url=entered_url, accordion_id=entered_id))
            logger.info(f"Added {entered_url} with accordion ID '{entered_id}'")
        else:
            finished = True

    logger.info(f"Collected {len(entries)} websites from user input")
    return entries

def get_websites_from_file(file_path="websites.json"):
    """Read websites and accordion IDs from a JSON file.

    When the file does not exist, a sample file with a single entry is
    written to that path and the sample data is returned.

    Args:
        file_path: Path of the JSON file. Its top-level value is expected to
            be a list of entries.

    Returns:
        The list loaded from the file (or the sample list when the file had
        to be created). An empty list is returned, and an error is logged,
        when the file does not contain valid JSON or its top-level value is
        not a list.
    """
    file_path = Path(file_path)
    if not file_path.exists():
        logger.warning(f"File '{file_path}' not found. Creating a sample file.")
        sample_data = [
            {"url": "https://www.irishimmigration.ie/coming-to-visit-ireland/frequently-asked-questions/", "accordion_id": "accordion-14617-1"}
        ]
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(sample_data, f, indent=4)
        logger.info(f"Created sample file '{file_path}' with 1 entry")
        return sample_data

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        # A hand-edited file can easily be malformed; report it and return
        # an empty list instead of failing with a raw traceback.
        logger.error(f"File '{file_path}' is not valid JSON: {e}")
        return []

    if not isinstance(data, list):
        # Callers iterate over the result and expect one entry per element.
        logger.error(
            f"File '{file_path}' must contain a JSON list of entries, "
            f"found {type(data).__name__}"
        )
        return []

    logger.info(f"Loaded {len(data)} websites from '{file_path}'")
    return data

def scrape_and_save_all_faqs(output_file="combined_faqs.csv", use_file=True, websites_file="websites.json"):
    """Scrape FAQs from multiple websites and save them to a single CSV.

    Args:
        output_file: Path of the CSV file to write. It is only written when
            at least one FAQ was collected.
        use_file: When true, the site list is read from ``websites_file``;
            otherwise the user is prompted for it interactively.
        websites_file: Path of the JSON file holding the site list (used only
            when ``use_file`` is true).

    Returns:
        None. The result is the CSV file, with the columns ``Website``,
        ``Question`` and ``Answer``, plus log messages.
    """
    logger.info("Starting FAQ scraping process")

    # Get website list
    if use_file:
        websites = get_websites_from_file(websites_file)
    else:
        websites = get_websites_from_user()

    if not websites:
        logger.error("No websites provided. Exiting.")
        return

    all_faq_data = []

    # Scrape each website
    for site in websites:
        # Entries come from user-editable input; anything that is not a
        # mapping cannot provide the two required keys.
        if not isinstance(site, dict):
            logger.warning(f"Skipping invalid entry: {site}")
            continue

        url = site.get("url")
        accordion_id = site.get("accordion_id")
        if not url or not accordion_id:
            logger.warning(f"Skipping invalid entry: {site}")
            continue

        all_faq_data.extend(scrape_faq_from_website(url, accordion_id))

    # Save to CSV
    if all_faq_data:
        # newline='' lets the csv module control line endings itself.
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ["Website", "Question", "Answer"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_faq_data)
        logger.info(f"Combined FAQs saved to '{output_file}'. Total FAQs: {len(all_faq_data)}")
    else:
        logger.warning("No FAQs found to save.")

    logger.info("Scraping process completed")

# Script entry point
if __name__ == "__main__":
    # Default mode: take the site list from the JSON file.
    scrape_and_save_all_faqs(
        use_file=True,
        websites_file="websites.json",
    )

    # Interactive mode (uncomment to use): ask for the sites on the console.
    # scrape_and_save_all_faqs(use_file=False)

2025-03-10 08:41:05,606 - INFO - Starting FAQ scraping process
2025-03-10 08:41:05,607 - INFO - Loaded 2 websites from 'websites.json'
2025-03-10 08:41:05,607 - INFO - Attempting to scrape https://www.irishimmigration.ie/registering-your-immigration-permission/frequently-asked-questions-for-registration/ with accordion ID 'accordion-17799-1'
2025-03-10 08:41:06,950 - INFO - Found 39 FAQs on https://www.irishimmigration.ie/registering-your-immigration-permission/frequently-asked-questions-for-registration/
2025-03-10 08:41:06,951 - INFO - Attempting to scrape https://www.irishimmigration.ie/coming-to-visit-ireland/frequently-asked-questions/ with accordion ID 'accordion-14617-1'
2025-03-10 08:41:07,167 - INFO - Found 17 FAQs on https://www.irishimmigration.ie/coming-to-visit-ireland/frequently-asked-questions/
2025-03-10 08:41:07,169 - INFO - Combined FAQs saved to 'combined_faqs.csv'. Total FAQs: 56
2025-03-10 08:41:07,169 - INFO - Scraping process completed
