In [2]:
import boto3

from pdf2image import convert_from_path
from pathlib import Path

# textract-response-parser
from trp import Document

import os
import openai
openai.api_key = os.environ["OPENAI_API_KEY"]


In [4]:
# Requies Poppler installed
# Windows: https://github.com/oschwartz10612/poppler-windows/releases (add poppler\poppler-23.08.0\Library\bin to PATH afterwards)

def pdf_to_jpeg(pdf_path, output_folder, pdf_name):
    # Convert PDF to a list of JPEG images
    images = convert_from_path(pdf_path)

    path = Path(output_folder, pdf_name)
    path.mkdir(parents=True, exist_ok=True)

    # Save the JPEG images to the output folder
    for i, image in enumerate(images):
        image.save(f"{path}/page_{i+1}.jpg", "JPEG")

In [5]:
def textract_jpeg_dir(dir_path):
  documents = []

  # Initialize the Textract client
  textract_client = boto3.client('textract')

  file_list = os.listdir(dir_path)
  for file_name in file_list:
    full_file_path = os.path.join(dir_path, file_name)
  
    # Read the PDF file as binary data
    with open(full_file_path, 'rb') as pdf_file:
        jpeg_binary = pdf_file.read()

    # Call the Textract API to analyze the PDF document
    response = textract_client.analyze_document(
        Document={'Bytes': jpeg_binary},
        FeatureTypes=['TABLES', 'FORMS']  # Specify the features you want to extract
    )

    print (response)

    documents.append(Document(response))
    
  
  return documents

In [19]:
# Specify the path to the PDF file you want to analyze
pdf_file_path = './data/misc/The VividCloud Discovery Process Whitepaper.pdf'
pdf_to_jpeg(pdf_file_path, "./data/converted_to_jpeg", "vc-whitepaper")

In [20]:
documets = textract_jpeg_dir("./data/converted_to_jpeg/vc-whitepaper")

{'DocumentMetadata': {'Pages': 1}, 'Blocks': [{'BlockType': 'PAGE', 'Geometry': {'BoundingBox': {'Width': 1.0, 'Height': 1.0, 'Left': 0.0, 'Top': 0.0}, 'Polygon': [{'X': 0.012283341959118843, 'Y': 0.0}, {'X': 1.0, 'Y': 0.012414980679750443}, {'X': 0.9903926253318787, 'Y': 1.0}, {'X': 0.0, 'Y': 0.9903821349143982}]}, 'Id': 'cbdbfcb4-659a-41de-8870-369724b5ad87', 'Relationships': [{'Type': 'CHILD', 'Ids': ['7961a8f3-bb69-47f3-8d58-0f1edb13443a', '88bda37b-be61-4b82-b858-90d11084d976', '76842234-5f07-432f-a760-1ab5a1577cde', 'd75f3694-4875-42fa-80a5-a854dc72dd35', '6e4a6666-1131-4e34-80ec-2f8d7abd4203', '3adeeb2f-edb6-4638-a2b3-0d65a33b6bd3', '9a6f1088-0051-40b1-98e7-96fcee0c2d6c', '567590e5-fdbf-40ff-bb2f-ed1ac76a1172', '8ac8ad45-0599-4af0-91f4-fa8350449986', 'c9d10609-3ea0-4aa5-a5a0-0f30108c6205', 'c71137e9-41c2-42bd-8f6f-0897b6e7e90e', '1e685d48-e827-44ed-aa9d-1bb070083a71']}]}, {'BlockType': 'LINE', 'Confidence': 47.413150787353516, 'Text': 'CLUUD', 'Geometry': {'BoundingBox': {'Width

In [21]:
def get_document_content(document):
  text_lines = []
  # Iterate over elements in the document
  for page in document.pages:
      # Print lines and words
      for line in page.lines:
          text_lines.append(line.text)

  return text_lines

In [22]:
text_raw = ""
for documet in documets:
  lines = get_document_content(documet)
  for line in lines:
      text_raw += "".join(lines)

In [23]:
len(text_raw)

363640

In [26]:
with open("./data/converted_to_jpeg/vc-whitepaper/raw-vc-whitepaper.txt", "w+") as f:
  f.write(text_raw)

In [27]:
def summarize_text(text, num_tokens=50):
  input_prompt = f"Create a summary: {text}"

  response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo-16k",
      messages = [
        {"role": "system", "content": input_prompt}
      ]
  )

  return response

In [34]:
curr = 0
end = len(text_raw)
chunk_len = 16000
summaries = []
while curr < end:
  text_chunk = text_raw[curr:curr+chunk_len]
  print (text_chunk)
  curr += chunk_len

  summary = summarize_text(text_chunk, num_tokens=100)
  summaries.append(summary)

CLUUDsmarter software. engineered here.VividCloud Agile DiscoveryProcesscC<<<<<<<(@@@@@@@(@@@@@@@(@@@@@@@CLUUDsmarter software. engineered here.VividCloud Agile DiscoveryProcesscC<<<<<<<(@@@@@@@(@@@@@@@(@@@@@@@CLUUDsmarter software. engineered here.VividCloud Agile DiscoveryProcesscC<<<<<<<(@@@@@@@(@@@@@@@(@@@@@@@CLUUDsmarter software. engineered here.VividCloud Agile DiscoveryProcesscC<<<<<<<(@@@@@@@(@@@@@@@(@@@@@@@CLUUDsmarter software. engineered here.VividCloud Agile DiscoveryProcesscC<<<<<<<(@@@@@@@(@@@@@@@(@@@@@@@CLUUDsmarter software. engineered here.VividCloud Agile DiscoveryProcesscC<<<<<<<(@@@@@@@(@@@@@@@(@@@@@@@CLUUDsmarter software. engineered here.VividCloud Agile DiscoveryProcesscC<<<<<<<(@@@@@@@(@@@@@@@(@@@@@@@CLUUDsmarter software. engineered here.VividCloud Agile DiscoveryProcesscC<<<<<<<(@@@@@@@(@@@@@@@(@@@@@@@CLUUDsmarter software. engineered here.VividCloud Agile DiscoveryProcesscC<<<<<<<(@@@@@@@(@@@@@@@(@@@@@@@CLUUDsmarter software. engineered here.VividCloud Agile

In [35]:
from pprint import pprint

summary_raw = ""
for summary in summaries:
  summary_raw += summary["choices"][0]["message"]["content"]

In [36]:
with open("./data/converted_to_jpeg/vc-whitepaper/summary-vc-whitepaper.txt", "w+") as f:
  f.write(summary_raw)