In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import logging

# Configure logging once for the whole script: every record at INFO level or
# above is written both to a UTF-8 log file and to the console stream, using a
# "timestamp - level - message" layout.
_file_handler = logging.FileHandler("scrape_spain_visa_log.log", encoding='utf-8')
_console_handler = logging.StreamHandler()
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[_file_handler, _console_handler],
)
# Module-scoped logger used by the scraper below.
logger = logging.getLogger(__name__)

def _extract_answer_text(answer_elem):
    """Flatten an answer element into newline-separated plain text.

    Only direct children are inspected: ``<p>`` and ``<div>`` contribute their
    text as one line, ``<ul>`` items are emitted as ``- item`` lines and
    ``<ol>`` items as bare lines. If none of those produce any text, the
    element's whole text is used so that answers with unexpected markup are
    not written out as empty strings.
    """
    lines = []
    for elem in answer_elem.children:
        # Children may be text nodes as well as tags; getattr keeps the
        # dispatch safe for any node that does not expose a tag name.
        tag_name = getattr(elem, "name", None)
        if tag_name in ('p', 'div'):
            lines.append(elem.get_text(strip=True))
        elif tag_name == 'ul':
            lines.extend(f"- {li.get_text(strip=True)}" for li in elem.find_all('li'))
        elif tag_name == 'ol':
            lines.extend(li.get_text(strip=True) for li in elem.find_all('li'))

    text = "\n".join(lines).strip()
    if not text:
        # Fallback for answers whose content is not wrapped in p/ul/ol/div.
        text = answer_elem.get_text(" ", strip=True)
    return text


def scrape_spain_visa_faqs(url, output_file="spain_visa_faqs.csv", first_div=4, last_div=23):
    """Scrape FAQs from the specified Ireland.ie Spain visa FAQ page.

    The page is fetched with a browser-like User-Agent, the FAQ blocks are
    read from ``#main-content > div:first > div[first_div..last_div]`` and the
    question/answer pairs are written to ``output_file`` as CSV with the
    columns ``Website``, ``Question`` and ``Answer``.

    Args:
        url: Address of the FAQ page to download.
        output_file: Path of the CSV file to (over)write.
        first_div: 1-based position of the first FAQ div among the direct
            child divs of the container (inclusive).
        last_div: 1-based position of the last FAQ div (inclusive).

    Returns:
        None. Failures (bad status code, missing containers, any exception)
        are logged rather than raised, and no file is written in that case.
    """
    try:
        logger.info(f"Starting scrape for {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            logger.error(f"Failed to retrieve {url}. Status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        main_content = soup.find(id="main-content")

        if main_content is None:
            logger.error("Main content with ID 'main-content' not found.")
            return

        # Target the specific div structure: //*[@id="main-content"]/div[1]/div[4] to div[23]
        faq_container = main_content.find('div')
        if faq_container is None:
            logger.error("First div inside 'main-content' not found.")
            return

        faq_data = []
        for i in range(first_div, last_div + 1):  # inclusive on both ends
            # ':scope >' restricts the match to direct children of the
            # container, mirroring the XPath above. Without it the selector
            # also matches nested divs that happen to be the i-th div among
            # their own siblings, and the first such div in document order wins.
            faq_div = faq_container.select_one(f':scope > div:nth-of-type({i})')
            if faq_div is None:
                logger.warning(f"Div[{i}] not found in main-content.")
                continue

            # Find question (class: story__heading heading--2).
            # NOTE: a list passed to class_ matches an element carrying *any*
            # of the listed classes, not necessarily both.
            question_elem = faq_div.find(class_=["story__heading", "heading--2"])
            question = question_elem.get_text(strip=True) if question_elem is not None else "Question not found"

            # Find answer (class: rich_text__summary)
            answer_elem = faq_div.find(class_="rich_text__summary")
            if answer_elem is not None:
                answer_text = _extract_answer_text(answer_elem)
            else:
                answer_text = "Answer not found"
                logger.debug(f"No answer found for div[{i}]")

            # Skip divs that contain neither a question nor an answer.
            if question_elem is None and answer_elem is None:
                continue

            faq_data.append({
                "Website": url,
                "Question": question,
                "Answer": answer_text
            })

        # Save to CSV
        if faq_data:
            with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ["Website", "Question", "Answer"]
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(faq_data)
            logger.info(f"Saved {len(faq_data)} FAQs to '{output_file}'")
        else:
            logger.warning("No FAQs found to save.")

    except Exception as e:
        # Top-level boundary for the script: keep the best-effort behaviour
        # (never raise to the caller) but record the full traceback.
        logger.exception(f"Error during scrape: {e}")

# Page holding the Spain (Madrid) visa FAQs on ireland.ie; the literal is split
# across two lines for readability only and evaluates to a single URL string.
url = (
    "https://www.ireland.ie/en/spain/madrid/"
    "news-and-events/news-archive/visa-faqs/"
)

# Only scrape when executed as a script / notebook cell, not when imported.
if __name__ == "__main__":
    scrape_spain_visa_faqs(url=url)

2025-03-10 08:48:49,444 - INFO - Starting scrape for https://www.ireland.ie/en/spain/madrid/news-and-events/news-archive/visa-faqs/
2025-03-10 08:48:50,260 - INFO - Saved 20 FAQs to 'spain_visa_faqs.csv'
