In [None]:
import requests
from bs4 import BeautifulSoup
import pdfplumber
import json
import os

# Define the base URL of the website
base_url = "https://info.ncdhhs.gov/dhsr/coneed/certificatesissued.html"
pdf_base_url = "https://info.ncdhhs.gov/dhsr/coneed/"

# Send a GET request to fetch the page content
response = requests.get(base_url)

# Directory to store downloaded PDF files
os.makedirs("pdfs", exist_ok=True)

# JSON data list to store scraped information
certificates_data = []

def download_pdf(url, filename):
    """Downloads a PDF file and saves it locally."""
    pdf_response = requests.get(url)
    with open(filename, 'wb') as file:
        file.write(pdf_response.content)

def extract_pdf_data(pdf_file):
    """Extracts relevant data from a PDF file."""
    with pdfplumber.open(pdf_file) as pdf:
        text_data = []
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                text_data.append(text)
        return text_data

def parse_certificate_data(pdf_text):
    """Parses text data from PDF and extracts certificates information."""
    certificate_list = []
    for page_text in pdf_text:
        # Example of parsing certificate details (customize this based on PDF structure)
        lines = page_text.split('\n')
        for line in lines:
            if 'County' in line:  # Assuming the relevant data starts with "County"
                certificate = {}
                # Extract relevant information by splitting the line (custom logic)
                certificate['line'] = line
                certificate_list.append(certificate)
    return certificate_list

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the <li> elements containing PDF links
    list_items = soup.find_all('li')

    # Collect PDF links and filenames
    pdf_links = [item.find('a')['href'] for item in list_items if item.find('a') and item.find('a')['href'].endswith('.pdf')]

    # Download and process each PDF
    for pdf_link in pdf_links:
        pdf_url = pdf_base_url + pdf_link
        pdf_filename = os.path.join("pdfs", os.path.basename(pdf_link))
        
        # Download the PDF file
        download_pdf(pdf_url, pdf_filename)
        print(f"Downloaded {pdf_filename}")
        
        # Extract text from the PDF
        pdf_text = extract_pdf_data(pdf_filename)
        
        # Parse certificate data from the extracted text
        certificates = parse_certificate_data(pdf_text)
        certificates_data.extend(certificates)

    # Save the extracted data into a JSON file
    with open('certificates_data.json', 'w') as json_file:
        json.dump(certificates_data, json_file, indent=4)

    print("Scraping completed, data saved to certificates_data.json")

else:
    print(f"Failed to load the page, status code: {response.status_code}")


In [2]:
import os
import pdfplumber
import json
import re

# Helper function for printing debug messages
def debug_message(message):
    print(f"[DEBUG] {message}")

# Function to extract tables from a PDF file
def extract_table_from_pdf(pdf_path):
    """
    Extracts tables from the PDF file and returns them in a structured format.
    """
    debug_message(f"Extracting tables from {pdf_path}")
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            all_tables = []
            for page in pdf.pages:
                tables = page.extract_tables()
                if tables:
                    all_tables.extend(tables)
        return all_tables
    except Exception as e:
        debug_message(f"Error extracting tables from {pdf_path}: {e}")
        return []

# Function to extract expanded data like "Physical Location" and "Timetable"
def extract_expanded_data(page_text):
    """
    Extracts expanded data such as 'Physical Location' and 'Timetable' from the text.
    """
    location = None
    timetable = []

    # Regular expression to match the "Physical Location"
    location_match = re.search(r'PHYSICAL LOCATION:\s*(.*?)\n', page_text)
    if location_match:
        location = location_match.group(1).strip()

    # Extracting structured timetable entries from the text
    timetable_start = re.search(r'TIMETABLE:', page_text)
    if timetable_start:
        timetable_text = page_text[timetable_start.end():]
        # Extract timetable entries
        for match in re.finditer(r'(\d+\.\s*(.*?)\s*(__+)?\s*(\w+\s+\d+,\s+\d+))', timetable_text):
            milestone = match.group(2).strip()
            date = match.group(4).strip()
            timetable.append({
                "milestone": milestone,
                "date": date
            })

    return location, timetable

# Function to format table data into structured JSON
def format_table_data(tables, pdf_path):
    """
    Formats extracted table data into a structured JSON format, including expanded data.
    """
    debug_message(f"Formatting data for {pdf_path}")
    
    all_data = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for table in tables:
                header = table[0]  # Assuming the first row is the header
                for row in table[1:]:
                    data_dict = {header[i]: row[i] for i in range(len(header)) if i < len(row)}

                    # Extract text from the current page for expanded data (simplified)
                    expanded_text = pdf.pages[0].extract_text()
                    location, timetable = extract_expanded_data(expanded_text)

                    data_dict["Expanded_Data"] = {
                        "Physical_Location": location,
                        "Timetable": timetable
                    }

                    all_data.append(data_dict)
    except Exception as e:
        debug_message(f"Error processing {pdf_path}: {e}")
    
    return all_data

# Function to process individual PDF and save to JSON
def process_pdf(pdf_path, output_folder):
    """
    Processes a single PDF and saves the extracted data as a JSON file.
    """
    try:
        pdf_filename = os.path.basename(pdf_path)
        debug_message(f"Processing {pdf_filename}")
        
        # Extract table from the PDF
        tables = extract_table_from_pdf(pdf_path)
        
        # Format the table data into JSON format
        formatted_data = format_table_data(tables, pdf_path)
        
        # Generate output file path
        json_output_path = os.path.join(output_folder, f"{pdf_filename.replace('.pdf', '')}.json")
        
        # Save JSON to file
        with open(json_output_path, 'w') as json_file:
            json.dump(formatted_data, json_file, indent=4)
        
        debug_message(f"Saved JSON output to {json_output_path}")
    
    except Exception as e:
        debug_message(f"Error processing {pdf_filename}: {e}")

# Function to loop through the folder and process all PDF files
def process_pdf_folder(pdf_folder, output_folder):
    """
    Loops through all PDFs in the given folder and processes them.
    """
    debug_message(f"Looking for PDF files in {pdf_folder}")
    
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Loop through each file in the folder
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            process_pdf(pdf_path, output_folder)

# Main function
if __name__ == "__main__":
    # Define the input folder and output folder
    pdf_folder = 'pdfs/'  # Folder where your PDFs are stored
    output_folder = 'json_outputs/'  # Output folder for JSON files

    # Start the PDF processing loop
    process_pdf_folder(pdf_folder, output_folder)


[DEBUG] Looking for PDF files in pdfs/
[DEBUG] Processing November2022Certificates.pdf
[DEBUG] Extracting tables from pdfs/November2022Certificates.pdf
[DEBUG] Formatting data for pdfs/November2022Certificates.pdf
[DEBUG] Saved JSON output to json_outputs/November2022Certificates.json
[DEBUG] Processing September2020-CertificatesIssued.pdf
[DEBUG] Extracting tables from pdfs/September2020-CertificatesIssued.pdf
[DEBUG] Formatting data for pdfs/September2020-CertificatesIssued.pdf
[DEBUG] Saved JSON output to json_outputs/September2020-CertificatesIssued.json
[DEBUG] Processing February2022-CertificatesIssued.pdf
[DEBUG] Extracting tables from pdfs/February2022-CertificatesIssued.pdf
[DEBUG] Formatting data for pdfs/February2022-CertificatesIssued.pdf
[DEBUG] Saved JSON output to json_outputs/February2022-CertificatesIssued.json
[DEBUG] Processing Feb2024Certificates.pdf
[DEBUG] Extracting tables from pdfs/Feb2024Certificates.pdf
[DEBUG] Formatting data for pdfs/Feb2024Certificates.pdf
