In [5]:
import fitz  # PyMuPDF
import re
import csv
import pandas as pd

In [6]:
def extract_data_from_text(text):
    """
    Split a section's text into the content found under each known subheading.

    The subheadings looked for are "Additional Information", "Where to Look:",
    "Who to Talk to:" and "Perform Test On:". They are matched literally and
    case-sensitively, and only the first occurrence of each is used. The
    content of a subheading runs from the end of that subheading to the
    nearest of the subsequent subheadings in that list that occurs after it,
    or to the end of the text when none does.

    Parameters:
            text (str): Text of one section.

    Returns:
            dict: Maps each subheading (without its trailing colon) to the
            whitespace-stripped content under it. A subheading that does not
            occur in ``text`` maps to an empty string.
    """
    subheadings = ["Additional Information", "Where to Look:", "Who to Talk to:", "Perform Test On:"]

    # Locate every subheading once; str.find returns -1 for a missing one.
    positions = [text.find(heading) for heading in subheadings]

    data_dict = {}
    for i, heading in enumerate(subheadings):
        key = heading.rstrip(":")  # Remove the colon at the end of the subheading
        position = positions[i]

        if position == -1:
            # Without this guard the -1 would be used as a real offset and
            # produce an arbitrary slice of the text.
            data_dict[key] = ""
            continue

        start = position + len(heading)

        # Only subheadings that are actually present after this one can end
        # its content; a missing one (-1) must not be used as a slice bound,
        # otherwise the last character of the text is cut off.
        following = [p for p in positions[i + 1:] if p >= start]
        end = min(following) if following else len(text)

        data_dict[key] = text[start:end].strip()

    return data_dict

In [7]:
def extract_topic_data(text):
    """
    Find the first topic header line in ``text`` and return its parts.

    A topic header is a line containing
    ``<name>: SP 800-171 Security Family <digits>.<digits>``.

    Parameters:
            text (str): Text to scan line by line.

    Returns:
            dict: Empty when no line matches. Otherwise a dictionary with:
                'full_string'        - the complete matching line
                'category_name'      - the text directly before the colon
                                       (not stripped)
                'standard_reference' - the literal "SP 800-171"
                'version_number'     - the "<digits>.<digits>" value
                'text'               - every line after the matching line,
                                       joined with "\\n"
    """
    # Compiled regex pattern
    pattern = re.compile(r'([^:]+):\s*(SP 800-171)\s*Security Family\s*(\d+\.\d+)')

    # Split once and use the same list for the search and for the remainder.
    # Indexing a str.splitlines() list with a position counted over
    # str.split('\n') goes wrong as soon as the text contains another line
    # boundary such as a form feed or a lone carriage return.
    lines = text.splitlines()

    for index, line in enumerate(lines):
        match = pattern.search(line)
        if match:
            # Return as soon as a match is found
            return {
                'full_string': line,
                'category_name': match.group(1),
                'standard_reference': match.group(2),
                'version_number': match.group(3),
                'text': "\n".join(lines[index + 1:])
            }

    # Return an empty dictionary if no match is found
    return {}


In [13]:
def extract_text_after_pattern(pdf_path):
    """
    Split the pages of a PDF into requirement sections and topic pages.

    The pages read are ``range(start_page, end_page)``; both names are
    module-level variables that must be defined before this function is
    called.

    Every page falls into one of three cases:
      * section page - the page text, cut to start at the first line that
        begins with a ``3.<n>.<n> `` heading, starts with a heading that ends
        in a period; the text is stored as a new entry in ``sections``;
      * topic page - no such heading, but ``extract_topic_data`` finds a topic
        header in the untouched page text; the result is stored in ``topics``;
      * continuation page - neither of the above; the text is appended to the
        section that is currently open.

    If a continuation page is met while no section is open, "Something Weird"
    is printed and the remaining pages are not processed.

    Parameters:
            pdf_path (str): Path to the PDF file.

    Returns:
            tuple: ``(sections, topics)``. ``sections`` maps each section
            heading to ``{"section_heading": ..., "text": ...}``. ``topics``
            maps each topic header line to the dictionary returned by
            ``extract_topic_data`` with an added ``"page_num"`` entry.
    """
    section_heading_pattern = re.compile(r'^3\.\d+\.\d+ .*?')

    sections = {}
    topics = {}

    # Heading of the section that continuation pages are appended to. It is
    # kept across any number of continuation pages; resetting it after a
    # single one would abort the run on a section spanning three pages.
    # A topic page closes the open section.
    open_section_heading = None

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page_num in range(start_page, end_page):
            page = doc.load_page(page_num)
            og_text = page.get_text("text")
            text = og_text

            # Split once so the index of the heading line and the list being
            # sliced always agree.
            lines = og_text.splitlines()
            for index, line in enumerate(lines):
                if section_heading_pattern.search(line):
                    # Drop everything before the heading line and glue the
                    # rest into one string.
                    text = "".join(lines[index:])
                    # Put every bullet on a line of its own.
                    text = text.replace(" •", "\n•")
                    # NOTE(review): presumably a double space marks where two
                    # source lines were joined - confirm against the PDF.
                    text = text.replace("  ", "\n")
                    break

            # No re.M here: the heading only counts at the very start of text.
            match_section = re.search(r'^3\.\d{1,2}\.\d{1,2} .*?\.', text)
            section_heading = match_section.group(0) if match_section else None

            if section_heading is not None:
                sections[section_heading] = {
                    "section_heading": section_heading,
                    "text": text
                }
                open_section_heading = section_heading
                continue

            topic_data = extract_topic_data(og_text)

            if len(topic_data) > 0:
                topic_data["page_num"] = page_num
                topics[topic_data["full_string"]] = topic_data
                open_section_heading = None
            elif open_section_heading is not None:
                sections[open_section_heading]["text"] += text
            else:
                print("Something Weird")
                break

    return sections, topics

In [14]:
# Page range to process. Both values are handed to range(), so start_page is
# included and end_page is excluded.
start_page, end_page = 19, 156

In [15]:
# Run the extraction over the handbook PDF; the page range is taken from the
# module-level start_page / end_page.
sections, topics = extract_text_after_pattern("./nist.hb.162.pdf")

In [17]:
# List every extracted section heading, one per line.
for header in sections:
    print(header)

3.1.1 Limit system access to authorized users, processes acting on behalf of authorized users, or devices (including other systems).
3.1.2 Limit system access to the types of transactions and functions that authorized users are permitted to execute.
3.1.3 Control the flow of CUI in accordance with approved authorizations.
3.1.4 Separate the duties of individuals to reduce the risk of malevolent activity without collusion.
3.1.5 Employ the principle of least privilege, including for specific security functions and privileged accounts.
3.1.6 Use non-privileged accounts or roles when accessing non-security functions.
3.1.7 Prevent non-privileged users from executing privileged functions and audit the execution of such functions.
3.1.8 Limit unsuccessful logon attempts.
3.1.9 Provide privacy and security notices consistent with applicable CUI rules.
3.1.10 Use session lock with pattern-hiding displays to prevent access/viewing of data after period of inactivity.
3.1.11 Terminate (automatic

In [18]:
# Spot-check the extracted text of one section, looked up by its full heading.
print(sections["3.14.7 Identify unauthorized use of the information system."]["text"])

3.14.7 Identify unauthorized use of the information system. Does the company monitor the information system to identify unauthorized access and use? Yes
No
Partially
Does Not Apply
Alternative Approach
Does the company monitor the information for potential misuse? Yes
No
Partially
Does Not Apply
Alternative Approach
Is unauthorized use of the system identified (e.g., log monitoring)? Yes
No
Partially
Does Not Apply
Alternative Approach
Additional Information Information system monitoring includes external and internal monitoring. External monitoring includes the observation of events occurring at the information system boundary (i.e., part of perimeter defense and boundary protection). Internal monitoring includes the observation of events occurring within the information system.
Companies can monitor information systems by observing audit activities in real time or by observing other system aspects such as access patterns, characteristics of access, and other actions. The monitoring o

In [19]:
# `sections` maps heading -> {"section_heading", "text"}, so the frame is
# first built with one column per heading and then transposed to one row per
# section. The heading index is dropped in favour of a plain 0..n-1 index.
sections_df = (
    pd.DataFrame(sections)
    .transpose()
    .reset_index(drop=True)
)

In [20]:
# Write one CSV row per section, without the row index. `index` is a boolean
# option, so pass False explicitly rather than relying on None being falsy.
# "|" replaces the default '"' as the quote character.
# NOTE(review): presumably chosen because the section text contains quotes -
# confirm, and remember to pass quotechar="|" when reading the file back.
sections_df.to_csv("./nist_sections.csv", quotechar="|", index=False)

In [21]:
# `topics` maps topic header line -> per-topic dict, so the frame is first
# built with one column per topic and then transposed to one row per topic.
# The header-line index is dropped in favour of a plain 0..n-1 index.
topics_df = (
    pd.DataFrame(topics)
    .transpose()
    .reset_index(drop=True)
)

In [22]:
# Write one CSV row per topic, without the row index. `index` is a boolean
# option, so pass False explicitly rather than relying on None being falsy.
# "|" replaces the default '"' as the quote character; pass quotechar="|"
# again when reading the file back.
topics_df.to_csv("./nist_topics.csv", quotechar="|", index=False)

In [23]:
# Spot-check the text stored in the row labelled 1 (the second section).
print(sections_df.loc[1, "text"])

3.1.2 Limit system access to the types of transactions and functions that authorized users are permitted to execute. Do you use access control lists to limit access to applications and data based on role and/or identity? Yes
No
Partially
Does Not Apply
Alternative Approach
Does the system allow for the separation of access control rights and enforcement of those rights? Yes
No
Partially
Does Not Apply
Alternative Approach
Additional information: Even authorized users are restricted to those parts of the system that they are explicitly permitted to use. This is based on their “need-to-know” and their role within the company. Where to Look:
• access control policy
• account management procedures
• access enforcement procedures
• security plan
• configuration management plan
• information system design documentation
• information system configuration settings and associated documentation
• list of active system accounts along with the name of the individual associated with each account
• 