In [1]:
!pip install -Uqq --upgrade google-cloud-vision

In [20]:
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS"""
    import json
    import re
    from google.cloud import vision
    from google.cloud import storage

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)
    print(type(gcs_destination))
    bucket = storage_client.get_bucket(bucket_name)
    
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]
    print(output)
    json_string = output.download_as_string()
    response = json.loads(json_string)

    # The actual response for the first page of the input file.
    first_page_response = response['responses'][0]
    annotation = first_page_response['fullTextAnnotation']

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print('Full text:\n')
    print(annotation['text'])

    # List objects with the given prefix.
    

In [21]:
async_detect_document("gs://riscovry_documents/Valid/Free PDF Download Discharge Summary Template.pdf","gs://riscovry_documents/output")

Waiting for the operation to finish.
<class 'google.cloud.vision_v1.types.image_annotator.GcsDestination'>
Output files:
output-1-to-1.json
output/
output/output-1-to-1.json
output/output-1-to-2.json
outputoutput-1-to-1.json
<Blob: riscovry_documents, output-1-to-1.json, 1622885108665450>
Full text:

Provider: Ken Cure, MD Patient: Patient H Sample Provider's Pt ID: 6910828 Sex: Female
Attachment Control Number: XA728302
HOSPITAL DISCHARGE DX
HOSPITAL DISCHARGE PROCEDURES 1. 32650 Thoracoscopy with chest tube placement and pleurodesis. HISTORY OF PRESENT ILLNESS The patient is a very pleasant, 70-year-old female with a history of breast cancer that was originally diagnosed in the early 70's. At that time she had a radical mastectomy with postoperative radiotherapy. In the mid 70's she developed a chest wall recurrence and was treated with further radiation therapy. She then went without evidence of disease for many years until the late 80's when she developed bone metastases with invol