# In [1]:
import csv
import logging
import re
import pdfplumber

# Logging configuration: records at INFO level and above are written both to
# a UTF-8 encoded log file and to a console stream handler.
logging.basicConfig(
    handlers=[
        logging.FileHandler(filename="scrape_pdf_faqs_log.log", encoding='utf-8'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by the functions in this file.
logger = logging.getLogger(name=__name__)
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path* as one string.

    Page texts are joined with a newline so the last line of one page is not
    fused with the first line of the next; line-oriented consumers such as
    ``extract_faqs_from_text`` would otherwise see two lines merged into one.

    Args:
        pdf_path: Path to the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The concatenated page text. A page for which ``extract_text()``
        yields nothing contributes an empty line.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page without text (depending on
        # the pdfplumber version); treat that as an empty page instead of
        # failing on string concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)
def _parse_faq_entries(text, source):
    """Parse numbered question/answer pairs out of *text*.

    Returns a list of dicts with the keys "Website", "Question" and "Answer",
    in the order the questions appear in the text.
    """
    # A question is a line such as "1. Question text"; group 1 is the text
    # after the number.
    question_pattern = re.compile(r'^\d+\.\s+(.+)$')

    entries = []
    question = None
    answer_lines = []

    def flush():
        # A question is only recorded once it has at least one answer line.
        if question and answer_lines:
            entries.append({
                "Website": source,
                "Question": question,
                "Answer": "\n".join(answer_lines).strip(),
            })

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        question_match = question_pattern.match(line)
        if question_match:
            flush()
            question = question_match.group(1)
            answer_lines = []
            logger.debug("Found question: %s", question)
            continue

        # Section headers are short lines (fewer than 5 words) ending in a
        # colon, e.g. "Visas:" or "Arrival in Ireland:".
        if line.endswith(':') and len(line.split()) < 5:
            flush()
            # Close the current question as well: text that follows a section
            # header belongs to the new section, not to the previous question.
            # Leaving the question open would emit it a second time with the
            # section's introductory text as its answer.
            question = None
            answer_lines = []
            logger.debug("Detected section: %s", line)
            continue

        # Any other line is part of the current answer; lines that appear
        # before the first question (or right after a header) are ignored.
        if question:
            answer_lines.append(line)

    flush()
    return entries


def extract_faqs_from_text(text, source="PDF Document", output_file="pdf_faqs.csv"):
    """Extract FAQs from the provided text and save them to a CSV file.

    The text is scanned line by line. A line of the form "<number>. <text>"
    starts a new question, the following lines form its answer, and a short
    line ending in ':' is treated as a section header that ends the current
    question.

    Args:
        text: Full document text, with lines separated by newline characters.
        source: Value written to the "Website" column of every row.
        output_file: Path of the CSV file to write. It is only written when
            at least one FAQ was found.

    Returns:
        The list of extracted FAQ rows (dicts with the keys "Website",
        "Question" and "Answer"). An empty list is returned when nothing was
        found or when an error occurred; errors are logged, not raised.
    """
    try:
        logger.info("Starting FAQ extraction from PDF text")
        faq_data = _parse_faq_entries(text, source)

        if not faq_data:
            logger.warning("No FAQs extracted from the text.")
            return faq_data

        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=["Website", "Question", "Answer"])
            writer.writeheader()
            writer.writerows(faq_data)
        logger.info("Saved %d FAQs to '%s'", len(faq_data), output_file)
        return faq_data

    except Exception as e:
        # Best-effort by design: keep the caller running, but record the full
        # traceback so the failure can be diagnosed from the log.
        logger.exception("Error during extraction: %s", e)
        return []




# Script entry point: executed only when the file is run directly.
if __name__ == "__main__":
    # Input document, resolved relative to the current working directory.
    pdf_path = "FAQ2023Version.pdf"

    # Pull the raw text out of the PDF, then parse it using the default
    # source label and output file.
    pdf_text = extract_text_from_pdf(pdf_path=pdf_path)
    extract_faqs_from_text(text=pdf_text)

# Output:
# 2025-03-10 08:57:18,013 - INFO - Starting FAQ extraction from PDF text
# 2025-03-10 08:57:18,014 - INFO - Saved 12 FAQs to 'pdf_faqs.csv'
