In [76]:
# import fitz

# pdf_path = "NYstandards.pdf"  # Replace with your PDF file path

# def extract_bold_sentences(pdf_path):
#     doc = fitz.open(pdf_path)
#     bold_sentences = []
#     current_sentence = []

#     for page in doc:
#         text_instances = page.get_text("dict")  # Extract text in dictionary format

#         for block in text_instances.get("blocks", []):
#             for line in block.get("lines", []):
#                 for span in line.get("spans", []):
#                     font = span["font"]  # Get font name
#                     text = span["text"].strip()  # Extract text and strip spaces

#                     # Check if font contains 'Bold'
#                     if "Bold" in font:
#                         current_sentence.append(text)  # Add to current sentence
#                     else:
#                         if current_sentence:
#                             sentence = " ".join(current_sentence).strip()  # Join and strip spaces
#                             bold_sentences.append(sentence)
#                             current_sentence = []  # Reset for next sentence

#     # Capture any remaining sentence
#     if current_sentence:
#         sentence = " ".join(current_sentence).strip()
#         bold_sentences.append(sentence)

#     return bold_sentences

# # Run extraction
# bold_sentences = extract_bold_sentences(pdf_path)

# # Print results
# for i, sentence in enumerate(bold_sentences):
#     print(f"Bold Sentence {i+1}: {sentence}")


In [77]:
import fitz  # PyMuPDF

# Location of the PDF to scan; passed to extract_bold_sentences() further down in this cell.
pdf_path = "NYstandards.pdf"  # Replace with your actual PDF path

def extract_bold_sentences(pdf_path):
    """Collect runs of bold text from a PDF.

    Every page is read with ``page.get_text("dict")`` and its blocks, lines
    and spans are walked in order. A span counts as bold when its font name
    contains ``"Bold"``. Consecutive bold spans are joined with single spaces
    into one entry; a non-bold span or the end of a block closes the current
    entry.

    Args:
        pdf_path: Path of the PDF file, handed to ``fitz.open``.

    Returns:
        list[str]: The bold runs in reading order. Bold spans that are empty
        after stripping are ignored, so entries carry no doubled or trailing
        spaces and no empty strings are emitted.
    """
    bold_sentences = []
    temp_sentence = ""
    previous_block_bbox = None  # bbox of the block visited just before the current one

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_dict = page.get_text("dict")

            for block in page_dict.get("blocks", []):
                if not isinstance(block, dict):  # Skip invalid blocks
                    continue

                # A block is treated as a table cell when its bbox equals the
                # bbox of the preceding block.
                # NOTE(review): consecutive blocks rarely share an identical
                # bbox, so this branch may never fire — confirm the intended
                # table heuristic.
                current_block_bbox = block.get("bbox")
                in_table = previous_block_bbox == current_block_bbox
                previous_block_bbox = current_block_bbox

                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        text = span.get("text", "").strip()

                        if "Bold" not in span.get("font", ""):
                            # Normal text ends the bold run collected so far.
                            if temp_sentence:
                                bold_sentences.append(temp_sentence)
                                temp_sentence = ""
                            continue

                        if not text:
                            # Whitespace-only bold span: contributes nothing and
                            # would otherwise add stray spaces or "" entries.
                            continue

                        if in_table:
                            # Table text is stored span by span, never merged.
                            if temp_sentence:
                                bold_sentences.append(temp_sentence)
                                temp_sentence = ""
                            bold_sentences.append(text)
                        elif temp_sentence:
                            # Continue a bold run that spans several spans/lines.
                            temp_sentence += " " + text
                        else:
                            temp_sentence = text

                # A bold run never continues across a block boundary.
                if temp_sentence:
                    bold_sentences.append(temp_sentence)
                    temp_sentence = ""

    return bold_sentences

# Run the extraction against the configured PDF
bold_sentences = extract_bold_sentences(pdf_path)

# List every bold run on its own line, numbered from 1
for position, sentence in enumerate(bold_sentences, start=1):
    print(f"{position}. {sentence}")


1. New York State Department of Health  Wadsworth Center  Clinical Laboratory Evaluation Program     
2. TABLE OF CONTENTS
3. DEFINITIONS  Acceptability criteria
4. Alternative assessment
5. Amended report
6. Annual
7. Assistant director(s)
8. Auditing
9. Autorelease
10. Autoverification
11. Blood bank
12. .
13. Category
14. Certificate of qualification (CQ)
15. DEFINITIONS
16. Clinical laboratory
17. Corrected report
18. Corrective action
19. Data integrity
20. Delegate
21. Director (Clinical Laboratory Director)
22. Document control
23. Equipment
24. Function check
25. Health Commerce System (HCS)
26. DEFINITIONS
27. Instrument
28. Laboratory developed test (LDT)
29. Laboratory management
30. Monitoring
31. Nonconformance
32. Patient
33. Performance expectations
34. Performance specification
35. Performance verification
36. Preventive action
37. Process audit
38. DEFINITIONS  Proficiency testing participation
39. Quality indicator (QI)
40. Quality goals
41. Reagents
42. Reference lab

In [None]:
# Substrings used to exclude bold runs from the filtered list
# (the original list named "Laboratory Safety" twice; one entry is enough).
keywords = [
    "Quality Management System",
    "Standard",
    "Guidance",
    "Director Responsibilities",
    "Human Resources",
    "Facility Design",
    "Laboratory Safety",
]
my_list = bold_sentences

# Keep only the entries that contain none of the keywords
# (case-sensitive substring test).
filtered_list = [
    item for item in my_list
    if not any(keyword in item for keyword in keywords)
]
print(filtered_list)


In [78]:
import pdfplumber
import pandas as pd

pdf_path = "NYstandards.pdf"  # Replace with your actual PDF file path

# Build one DataFrame per page on which pdfplumber reports a table.
# NOTE(review): extract_table() yields a single table per page — confirm no
# page holds more than one (extract_tables() would return all of them).
all_tables = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        rows = page.extract_table()
        if not rows:
            continue  # no table detected on this page
        all_tables.append(pd.DataFrame(rows))

# Stack the per-page frames under a fresh 0..n-1 index; fall back to an
# empty frame when no page produced a table.
merged_df = pd.concat(all_tables, ignore_index=True) if all_tables else pd.DataFrame()

# Display the DataFrame
print(merged_df)


                                                     0  \
0                                                        
1                                             Standard   
2    Quality Management System Fundamental Standard...   
3                                                        
4                                             Standard   
..                                                 ...   
568    Oncology – Molecular and Cellular Tumor Markers   
569                         Parentage/Identity Testing   
570                                      Cytopathology   
571                                     Histopathology   
572                                     Trace Elements   

                             1         2       3     4     5  \
0    Quality Management System      None           NaN   NaN   
1                         None  Guidance    None   NaN   NaN   
2                         None              None   NaN   NaN   
3    Quality Management System      None       

In [79]:
# Give the first three positional columns descriptive names.
column_names = {0: "Source", 1: "Description", 2: "Guidance"}
merged_df = merged_df.rename(columns=column_names)

In [81]:
# Preview the first rows of the renamed frame.
# NOTE(review): the saved output under this cell already lists
# Matched_Sentence / Remaining_Text, which are only assigned in the following
# cell — the output looks stale relative to the cell order; re-run the
# notebook top to bottom before relying on it.
merged_df.head()

Unnamed: 0,Source,Description,Guidance,3,4,5,6,7,8,Matched_Sentence,Remaining_Text
0,,Quality Management System,,,,,,,,,
1,Standard,,Guidance,,,,,,,Standard,
2,Quality Management System Fundamental Standard...,,,,,,,,,Quality Management System,Fundamental Standard of\nPractice (QMS FS)\nTh...
3,,Quality Management System,,,,,,,,,
4,Standard,,Guidance,,,,,,,Standard,


In [82]:
# Function to split text based on keyword match
def split_text_based_on_match(text, keyword_list):
    """Split a leading keyword off ``text``.

    The keywords are tried in list order and the first one that ``text``
    starts with wins.

    Args:
        text: Cell value to inspect. Values that are not strings (for example
            the ``None``/``NaN`` cells of an extracted table) are passed
            through unchanged instead of raising.
        keyword_list: Iterable of candidate prefixes. Empty entries are
            skipped.

    Returns:
        tuple: ``(keyword, remainder)`` where ``remainder`` is the text after
        the keyword with surrounding whitespace stripped, or ``("", text)``
        when nothing matches.
    """
    # Missing cells have no prefix to split off; calling .startswith on them
    # would raise AttributeError.
    if not isinstance(text, str):
        return "", text

    for keyword in keyword_list:
        # An empty keyword is a prefix of every string and would end the
        # search before any real keyword is tried.
        if not keyword:
            continue
        if text.startswith(keyword):
            return keyword, text[len(keyword):].strip()

    return "", text  # No match: first element empty, original text kept

# Split every "Source" cell into (matched keyword, remaining text); the
# per-row pd.Series expands into a two-column frame labelled 0 and 1.
split_columns = merged_df["Source"].apply(
    lambda source_text: pd.Series(split_text_based_on_match(source_text, bold_sentences))
)
merged_df["Matched_Sentence"] = split_columns[0]
merged_df["Remaining_Text"] = split_columns[1]

# The full text now lives in the two new columns, so drop the original (optional)
merged_df = merged_df.drop(columns=["Source"])

# Print the modified DataFrame
print(merged_df)

                   Description  Guidance       3     4     5  \
0    Quality Management System      None           NaN   NaN   
1                         None  Guidance    None   NaN   NaN   
2                         None              None   NaN   NaN   
3    Quality Management System      None           NaN   NaN   
4                         None  Guidance    None   NaN   NaN   
..                         ...       ...     ...   ...   ...   
568                       None      None   OC S2  None  None   
569                       None      None  PIT S8  None  None   
570                       None      None  CY S15  None  None   
571                       None      None   HT S2  None  None   
572                       None      None   TE S9  None  None   

                                                     6     7     8  \
0                                                  NaN   NaN   NaN   
1                                                  NaN   NaN   NaN   
2                    

In [83]:
# Write the processed frame to CSV in the working directory;
# index=False leaves the row index out of the file.
merged_df.to_csv("processed_sentences.csv", index=False)