In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install pytesseract

In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install pdf2image

In [None]:
# %pip (unlike !pip) acts on the environment of the running kernel;
# -y answers the confirmation prompt, which a notebook cell cannot do interactively.
%pip uninstall -y numpy

In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# Restart the kernel afterwards if numpy was already imported in this session.
%pip install numpy==1.23.5

In [None]:
# Import necessary libraries
import pytesseract
from pdf2image import convert_from_path
import os
from PIL import Image, ImageOps

In [None]:

# Configure path: prefer the Homebrew binary this notebook was written against;
# otherwise use whatever `tesseract` is on PATH so the notebook also runs on
# machines without Homebrew. If neither is found the original Homebrew path is
# kept, so the setting is never worse than the hard-coded value.
import shutil

_HOMEBREW_TESSERACT = '/opt/homebrew/bin/tesseract'
pytesseract.pytesseract.tesseract_cmd = (
    _HOMEBREW_TESSERACT
    if os.path.exists(_HOMEBREW_TESSERACT)
    else shutil.which('tesseract') or _HOMEBREW_TESSERACT
)

# Function to convert PDF to images
def convert_pdf_to_images(pdf_path, output_folder='temp_images', dpi=300):
    """Render each page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF handed to ``convert_from_path``.
        output_folder: Directory that receives the page images; created if missing.
        dpi: Resolution forwarded to ``convert_from_path``.

    Returns:
        List of the written image paths in page order, named ``page_<n>.png``
        with ``n`` starting at 1. Existing files with the same names are overwritten.
    """
    images = convert_from_path(pdf_path, dpi=dpi)
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(output_folder, exist_ok=True)

    image_paths = []
    for page_number, image in enumerate(images, start=1):
        image_path = os.path.join(output_folder, f'page_{page_number}.png')
        image.save(image_path, 'PNG')
        image_paths.append(image_path)
    return image_paths

# Function to preprocess image (optional for better OCR)
def preprocess_image(image_path):
    """Open an image, convert it to grayscale, and threshold it to black/white.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The result of ``point(..., '1')`` on the grayscale image: every level
        below 128 is mapped to 0 and every other level to 255.
    """
    def _binarize(level):
        # Levels 128 and above become white (255); everything darker becomes black (0).
        return 255 if level >= 128 else 0

    grayscale = ImageOps.grayscale(Image.open(image_path))
    return grayscale.point(_binarize, '1')

# Function to extract text from image using Tesseract
def extract_text_from_image(image):
    """Return the text that ``pytesseract.image_to_string`` reads from ``image``."""
    return pytesseract.image_to_string(image)

# Main function to process PDF and extract text
def extract_text_from_pdf(pdf_path, output_text_folder='extracted_text'):
    """OCR every page of a PDF, save the combined text to a file, and return it.

    The PDF is rendered to page images with ``convert_pdf_to_images`` (default
    folder and dpi), each page is passed through ``preprocess_image`` and
    ``extract_text_from_image``, and the page texts are concatenated with a
    newline after each page.

    Args:
        pdf_path: Path of the PDF to process.
        output_text_folder: Directory for the resulting text file; created if missing.

    Returns:
        The combined text of all pages. The same text is written, UTF-8 encoded,
        to ``<output_text_folder>/<pdf base name without extension>.txt``.
    """
    image_paths = convert_pdf_to_images(pdf_path)

    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(output_text_folder, exist_ok=True)

    # One OCR result per page, each followed by a newline; joined once instead
    # of growing a string with += inside the loop.
    full_text = ''.join(
        extract_text_from_image(preprocess_image(image_path)) + '\n'
        for image_path in image_paths
    )

    # Save the extracted text to a file. splitext swaps only the final extension,
    # so 'x.PDF' becomes 'x.txt' (str.replace('.pdf', ...) left it as 'x.PDF')
    # and a '.pdf' occurring in the middle of the name is left alone.
    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    output_file_path = os.path.join(output_text_folder, f'{stem}.txt')
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(full_text)

    print(f'Text extracted and saved to {output_file_path}')
    return full_text

# Specify the path to your PDF
pdf_path = '/Users/barrettdowns/eucom/data/zerotrust.pdf'  # NOTE(review): absolute, machine-specific path; point this at your own PDF before running
# Runs the OCR pipeline; besides returning the text, this also writes it to a
# .txt file (see extract_text_from_pdf). `extracted_text` is read again by the
# spaCy entity-analysis cell further down.
extracted_text = extract_text_from_pdf(pdf_path)

# Display the extracted text
print(extracted_text)


In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install spacy

In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# NOTE(review): --upgrade replaces the numpy==1.23.5 pin installed in an earlier
# cell with the latest release; confirm that this is intended.
%pip install --upgrade numpy h5py

In [None]:
import numpy
import h5py
# Report the installed numpy and h5py versions.
for module in (numpy, h5py):
    print(f"{module.__name__} version:", module.__version__)


In [None]:
# Run the download with the kernel's own interpreter so the model lands in the
# same environment that `import spacy` uses (a bare `python` may resolve to a
# different interpreter on PATH).
import sys
!"{sys.executable}" -m spacy download en_core_web_sm

In [None]:
import spacy
from collections import Counter

# Load the English NLP model
nlp = spacy.load('en_core_web_sm')

# Run the spaCy pipeline over the OCR output
doc = nlp(extracted_text)

# Keep only entities labelled as people, places, organisations or products
entity_types = {"PERSON", "GPE", "LOC", "ORG", "PRODUCT"}
filtered_entities = []
for ent in doc.ents:
    if ent.label_ in entity_types:
        filtered_entities.append(ent.text)

# Tally how often each entity text occurs
entity_counts = Counter(filtered_entities)

# Print every entity with its count, most frequent first
print("Entity Frequency Analysis:")
for entity, count in entity_counts.most_common():
    print(f"{entity}: {count}")

In [None]:
import matplotlib.pyplot as plt

# Plot the most common entities
top_entities = entity_counts.most_common(10)  # Top 10 entities
if top_entities:
    entities, counts = zip(*top_entities)
    plt.barh(entities, counts)
    plt.xlabel('Frequency')
    plt.title('Top 10 Entities in the Text')
    plt.gca().invert_yaxis()  # Invert the y-axis so the most frequent entity is on top
    plt.show()
else:
    # With no entities, zip(*[]) yields nothing and the two-name unpacking
    # above would raise ValueError; report the situation instead.
    print('No entities found; nothing to plot.')


# For multiple PDFs

In [None]:
# %pip (unlike !pip) acts on the environment of the running kernel.
# Restart the kernel afterwards if numpy was already imported in this session.
%pip uninstall -y numpy
%pip install numpy==1.23.5

In [1]:
import os
import pytesseract
from pdf2image import convert_from_path
import spacy
from collections import Counter
import matplotlib.pyplot as plt

# Ensure Tesseract is correctly configured: prefer the Homebrew binary this
# notebook was written against; otherwise use whatever `tesseract` is on PATH.
# If neither is found the original Homebrew path is kept, so the setting is
# never worse than the hard-coded value.
import shutil

_HOMEBREW_TESSERACT = '/opt/homebrew/bin/tesseract'
pytesseract.pytesseract.tesseract_cmd = (
    _HOMEBREW_TESSERACT
    if os.path.exists(_HOMEBREW_TESSERACT)
    else shutil.which('tesseract') or _HOMEBREW_TESSERACT
)

# Load spaCy language model
nlp = spacy.load('en_core_web_sm')

# Directory containing PDF files
pdf_folder = 'data'

# OCR text of every page in processing order; joined once after the loop
# instead of growing a string with += for every page.
page_texts = []

# Loop through all PDF files in the folder. sorted() makes the processing order
# deterministic (os.listdir returns entries in arbitrary order), and comparing
# the lower-cased name also picks up files ending in '.PDF'.
for filename in sorted(os.listdir(pdf_folder)):
    if filename.lower().endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder, filename)
        print(f'Processing: {filename}')

        # Convert PDF to images
        images = convert_from_path(pdf_path)

        # Extract text from each image page, one newline after every page
        for image in images:
            page_texts.append(pytesseract.image_to_string(image) + '\n')

# Combined text of all pages of all PDFs
all_extracted_text = ''.join(page_texts)

# Run the spaCy pipeline over the text of all PDFs combined
doc = nlp(all_extracted_text)

# Keep only entities labelled as people, places, organisations or products
entity_types = {"PERSON", "GPE", "LOC", "ORG", "PRODUCT"}
filtered_entities = []
for ent in doc.ents:
    if ent.label_ in entity_types:
        filtered_entities.append(ent.text)

# Tally how often each entity text occurs
entity_counts = Counter(filtered_entities)

# Print the ten most frequent entities with their counts
print("Entity Frequency Analysis:")
top_ten = entity_counts.most_common(10)
for entity, count in top_ten:
    print(f"{entity}: {count}")


  from pandas.core import (


Processing: besta.pdf
Processing: zerotrust.pdf
Entity Frequency Analysis:
BESTA: 50
SSA: 38
ORBITS: 24
The MITRE Corporation: 17
DLT: 16
SNARE: 14
Distribution Unlimited: 12
Fig: 11
Proc: 11
N. G. Gordon: 10


In [None]:

# Plot the most common entities
top_entities = entity_counts.most_common(10)  # Top 10 entities
if top_entities:
    entities, counts = zip(*top_entities)
    plt.barh(entities, counts)
    plt.xlabel('Frequency')
    plt.title('Top 10 Entities in the Text')
    plt.gca().invert_yaxis()  # Invert the y-axis so the most frequent entity is on top
    plt.show()
else:
    # With no entities, zip(*[]) yields nothing and the two-name unpacking
    # above would raise ValueError; report the situation instead.
    print('No entities found; nothing to plot.')
