# In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import logging
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Logging setup: INFO and above go to scraper.log, one timestamped line per event.
logging.basicConfig(
    level=logging.INFO,
    filename='scraper.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Page to scrape and the CSV file the results are written to.
URL = "https://pcimmigrationireland.ie/faq/"
CSV_FILE = "faq_data.csv"

# Desktop Chrome User-Agent string used for the plain HTTP request path.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
}

def scrape_faq_requests(url, timeout=30):
    """Download the page over plain HTTP and return the parsed FAQ rows.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        Whatever ``parse_faq`` produces for the downloaded HTML, or an empty
        list when the request fails (the error is logged, not raised).
    """
    try:
        # The context manager closes the session's pooled connections on exit,
        # including when the request raises.
        with requests.Session() as session:
            session.headers.update(HEADERS)
            response = session.get(url, timeout=timeout)
            # Turn 4xx/5xx responses into exceptions so they hit the handler below.
            response.raise_for_status()
            logging.info(f"Successfully accessed {url} using requests")
            return parse_faq(response.text)
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they are handled here too.
        logging.error(f"Requests method failed: {e}")
        return []

def scrape_faq_selenium(url):
    """Load the page in headless Chrome and return the parsed FAQ rows.

    Args:
        url: Address of the page to load in the browser.

    Returns:
        Whatever ``parse_faq`` produces for the rendered page source, or an
        empty list when any step fails (the error is logged, not raised).
    """
    try:
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        try:
            driver.get(url)
            html = driver.page_source
        finally:
            # Always shut the browser down, even when navigation fails;
            # otherwise the Chrome/chromedriver processes are left running.
            driver.quit()
        logging.info(f"Successfully accessed {url} using Selenium")
        return parse_faq(html)
    except Exception as e:
        # Deliberately broad: this is a best-effort fallback, so any failure
        # (driver download, browser start-up, navigation, parsing) is logged
        # and reported to the caller as "no data".
        logging.error(f"Selenium method failed: {e}")
        return []

def parse_faq(html, source_url=None):
    """Extract question/answer pairs from Elementor accordion markup.

    Args:
        html: HTML text of the page.
        source_url: Value stored in the first column of every row. When
            omitted, the module-level ``URL`` is used.

    Returns:
        A list of ``[website, question, answer]`` lists, one per accordion
        item that has both a title and a content element. Items missing
        either one are skipped with a warning.
    """
    website = URL if source_url is None else source_url
    soup = BeautifulSoup(html, 'html.parser')
    data = []
    for item in soup.find_all(class_='elementor-accordion-item'):
        question = item.find(class_='elementor-accordion-title')
        # A CSS selector matches elements carrying both classes in any order
        # and alongside extra classes; class_='a b' would only match when the
        # class attribute is exactly that string.
        answer = item.select_one('.elementor-tab-content.elementor-clearfix')
        if question is None or answer is None:
            logging.warning("Skipping an item due to missing question or answer")
            continue
        question_text = question.get_text(strip=True)
        parts = [node.get_text(strip=True) for node in answer.find_all(['p', 'li'])]
        if parts:
            answer_text = ' '.join(parts)
        else:
            # No <p>/<li> children: the answer text sits directly in the
            # container, so take it from there instead of storing nothing.
            answer_text = answer.get_text(' ', strip=True)
        data.append([website, question_text, answer_text])
    return data

def save_to_csv(data, filename=CSV_FILE):
    """Write the scraped rows to a UTF-8 CSV file, replacing any existing file.

    Args:
        data: Iterable of rows; each row is written as one CSV record after
            the header row.
        filename: Path of the CSV file to create.

    Any exception raised while writing is logged and swallowed, so this
    function never raises to its caller.
    """
    header = ["Website", "Question", "Answer"]
    try:
        # newline='' lets the csv module control line endings itself.
        with open(filename, mode='w', encoding='utf-8', newline='') as handle:
            out = csv.writer(handle)
            out.writerow(header)
            for record in data:
                out.writerow(record)
        logging.info(f"Data successfully saved to {filename}")
    except Exception as e:
        logging.error(f"Error saving to CSV: {e}")

if __name__ == "__main__":
    # Try the plain HTTP fetch first; the headless browser is only started
    # when that yields no rows.
    faq_data = scrape_faq_requests(URL)
    if not faq_data:
        logging.info("Falling back to Selenium scraping")
        faq_data = scrape_faq_selenium(URL)
    if faq_data:
        save_to_csv(faq_data)
    else:
        # Record the empty outcome so a run that produced no CSV is explained
        # in the log instead of ending silently.
        logging.warning("No FAQ entries were scraped; nothing was written to CSV")