In [95]:
import pandas as pd
import argparse

from google.cloud import vision
from enum import Enum
from PIL import Image, ImageDraw

import fitz  # PyMuPDF
import json
import re
import os

from google.cloud import vision
from google.cloud import storage

## Relevant documentation

[This](https://stackoverflow.com/questions/74659285/google-cloud-vision-with-google-storage) and [this](https://stackoverflow.com/questions/50840511/google-cloud-import-storage-cannot-import-storage) Stack Overflow thread explain what to set up in your Google account. You need a Google developer account, then you create a project, and inside that project you create a bucket. You also have to enable the API for the project, which can be done either through the `gcloud` command-line interface or through the online console. 

The `google.cloud` libraries also need a credential file on your machine. The easiest way to create it is to install the `gcloud` command-line interface on your local system. Then, using `gcloud init`, you can authenticate and select a default project. 

Then run `gcloud auth application-default login` as mentioned [here](https://cloud.google.com/docs/authentication/provide-credentials-adc#local-dev), follow the instructions, and grant access. This command leaves a file on your system that allows the `google.cloud` library to recognize you and your default project. 

## Creating a bucket

A bucket is a storage container for your files, addressed by a URL starting with `gs://`. The asynchronous Google Cloud Vision request used below takes a `gs://` source and writes its result to a `gs://` destination, so the images have to be uploaded to a bucket first. However, this storage is paid, so make sure to delete the bucket after you use it. 


## Function to execute

The function to execute is mentioned on [this](https://cloud.google.com/vision/docs/pdf) tutorial page. 



In [21]:
def pdf_to_tiff(pdf_path, output_folder, dpi=300):
    """Render every page of a PDF to its own TIFF file.

    Each page is written to ``output_folder`` as ``page_<n>.tiff`` (numbering
    starts at 1) and a confirmation line is printed per page.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the TIFF files. It is created
            when it does not exist yet.
        dpi: Rendering resolution in dots per inch.

    Raises:
        ValueError: If ``dpi`` is not a positive number.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    # Create the target directory up front so the first save cannot fail on it
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Calculate the zoom factor based on DPI (standard DPI is 72)
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(len(pdf_document)):
            page = pdf_document.load_page(page_number)

            # Render the page to an image (pixmap) with the specified resolution
            pix = page.get_pixmap(matrix=mat)

            # Convert the pixmap to a PIL Image
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # os.path.join avoids the doubled slash when the folder already
            # ends with a separator
            tiff_path = os.path.join(output_folder, f"page_{page_number + 1}.tiff")
            # Record the rendering resolution in the TIFF metadata as well
            img.save(tiff_path, format="TIFF", dpi=(dpi, dpi))

            print(f"Page {page_number + 1} saved as {tiff_path}")
    finally:
        # Release the PDF even if rendering or saving a page raised
        pdf_document.close()

# Demo run: split the ISTAT table PDF into one TIFF per page.
# The TIFFs are written into the same directory that holds the PDF.
output_folder = '../../data/primary_sources/'
pdf_path = '../../data/primary_sources/istat94_tav3.pdf'
pdf_to_tiff(pdf_path=pdf_path, output_folder=output_folder)


Page 1 saved as ../../data/primary_sources//page_1.tiff
Page 2 saved as ../../data/primary_sources//page_2.tiff
Page 3 saved as ../../data/primary_sources//page_3.tiff
Page 4 saved as ../../data/primary_sources//page_4.tiff
Page 5 saved as ../../data/primary_sources//page_5.tiff
Page 6 saved as ../../data/primary_sources//page_6.tiff
Page 7 saved as ../../data/primary_sources//page_7.tiff
Page 8 saved as ../../data/primary_sources//page_8.tiff
Page 9 saved as ../../data/primary_sources//page_9.tiff
Page 10 saved as ../../data/primary_sources//page_10.tiff
Page 11 saved as ../../data/primary_sources//page_11.tiff
Page 12 saved as ../../data/primary_sources//page_12.tiff
Page 13 saved as ../../data/primary_sources//page_13.tiff
Page 14 saved as ../../data/primary_sources//page_14.tiff
Page 15 saved as ../../data/primary_sources//page_15.tiff
Page 16 saved as ../../data/primary_sources//page_16.tiff
Page 17 saved as ../../data/primary_sources//page_17.tiff
Page 18 saved as ../../data/prim

In [23]:
def async_detect_document(
    gcs_source_uri,
    gcs_destination_uri,
    mime_type="image/tiff",
    batch_size=2,
    timeout=420,
):
    """OCR a PDF/TIFF stored on GCS and print the text of its first page.

    Submits an asynchronous DOCUMENT_TEXT_DETECTION request, waits for it to
    finish, lists the JSON files written under the destination prefix and
    prints the full text of the first response in the first file.

    Relies on ``json``, ``re``, ``vision`` and ``storage`` from the notebook's
    import cell.

    Args:
        gcs_source_uri: ``gs://`` URI of the input file.
        gcs_destination_uri: ``gs://bucket/prefix`` URI under which the JSON
            output is written.
        mime_type: ``"image/tiff"`` (default) or ``"application/pdf"``.
        batch_size: How many pages are grouped into each JSON output file.
        timeout: Seconds to wait for the operation to finish.

    Raises:
        ValueError: If ``gcs_destination_uri`` is not of the form
            ``gs://bucket/prefix``.
        IndexError: If no output file is found after the operation finished.
    """
    # Validate the destination before starting the (slow, billed) operation
    match = re.match(r"gs://([^/]+)/(.+)", gcs_destination_uri)
    if match is None:
        raise ValueError(
            f"gcs_destination_uri must look like gs://bucket/prefix, "
            f"got {gcs_destination_uri!r}"
        )
    bucket_name, prefix = match.groups()

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size
    )

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config, output_config=output_config
    )

    operation = client.async_batch_annotate_files(requests=[async_request])

    print("Waiting for the operation to finish.")
    operation.result(timeout=timeout)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix, filtering out folders.
    candidates = [
        blob
        for blob in bucket.list_blobs(prefix=prefix)
        if not blob.name.endswith("/")
    ]
    # A prefix such as "page2" also matches "page20output-...". Output files
    # have been observed to be named "<prefix>output-<a>-to-<b>.json", so
    # prefer those and fall back to the plain prefix listing otherwise.
    own_outputs = [
        blob for blob in candidates if blob.name.startswith(f"{prefix}output-")
    ]
    blob_list = own_outputs or candidates

    print("Output files:")
    for blob in blob_list:
        print(blob.name)

    if not blob_list:
        raise IndexError(f"no OCR output files found under {gcs_destination_uri}")

    # Process the first output file from GCS; it holds the responses for the
    # first `batch_size` pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_bytes().decode("utf-8")
    response = json.loads(json_string)

    # The actual response for the first page of the input file.
    first_page_response = response["responses"][0]
    # The annotation is looked up defensively so that a response without
    # recognised text prints an empty string instead of raising KeyError.
    annotation = first_page_response.get("fullTextAnnotation", {})

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print("Full text:\n")
    print(annotation.get("text", ""))



In [25]:
# Trial run on a single page: OCR page_2.tiff from the bucket and write the
# JSON result into the same bucket under the "page2" prefix.
async_detect_document('gs://bas_img_bucket/page_2.tiff', 'gs://bas_img_bucket/page2')

Waiting for the operation to finish.
Output files:
page2output-1-to-1.json
Full text:

Tavola 3 segue
COMUNI
Popolazione residente per Comune (a)
POPOLAZIONE RESIDENTE
1861
1871
1881
1901
1911
1921
Segue PROVINCIA DI TORINO 001
1931
1936
1951
1961
1971
1981
1991
071
074
៩៩៩៩៩៩ ៩៩
070
Cavour
7.794
7.449
7.202
6.843
6.512
6.317
6.231
6.244
5.817
5.199
5.043
5.085
5.226
Cercenasco
1.962
1.955
2.080
1.651
1.600
1.590
1.483
1.477
1.471
1.480
1.497
1.561
1.632
072
Ceres
1.880
2.023
2.097
2.203
2.277
1.974
1.832
1.721
1.576
1.280
1.161
1.026
939
073
Ceresole Reale *
320
332
341
272
290
290
277
227
216
237
186
173
167
Cesana Torinese
3.123
2.707
2.499
2.496
2.177
1.803
1.826
1.479
1.196
918
913
909
937
075
Chialamberto
1.707
1.653
1.806
1.510
1.287
1.218
948
950
730
621
483
392
353
076
Chianocco
1.656
1.912
1.980
2.156
2.045
2.005
1.594
1.498
1.427
1.371
1.269
1.493
1.501
077
Chiaverano
•
2.466
2.475
2.540
2.528
2.679
2.368
2.533
1.777
1.699
1.820
2.089
2.217
2.225
•
078 Chieri
12.856
12.248
1

## Parse this into a DataFrame

Now the big question is how to parse this information into a pandas `DataFrame` and later into a `.csv`. As we can see, the city name is usually recognised correctly and is then followed by its numbers. However, the _identifier_ (e.g. 070, 071, 072) is sometimes missing or shows up in the wrong place. My recipe is the following.

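Pre-processing: Remove punctuation and OCR artefacts (e.g. "-", "•", ".", "*"), so that a number such as "7.794" becomes "7794", and drop the lines that end up empty. Then:
- Start looking after the table header, which ends with the string 1991.
- Then look for the first line that contains letters; this is the city name.
- Then collect the next 13 purely numeric lines (after pre-processing) and put them in one row for that city.
- Then look for the next line that contains letters, and again collect the 13 numeric lines that follow it.

Note that the implementation below does not search for 1991 explicitly: every line that contains a letter starts a new row and discards the numbers collected so far, which is how the header lines and stray identifiers get dropped.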

I implement that in the following demonstration and apply it to one of the files. Afterwards, I wrap all of this in a function:
    

In [105]:
# Read the OCR result that was copied from the bucket. The encoding is given
# explicitly because the text contains non-ASCII characters and the platform
# default encoding is not UTF-8 everywhere.
with open('../../data/cloudvision_ocr/page2output-1-to-1.json', encoding='utf-8') as f:
    d = json.load(f)

# Full recognised text of the first response in this output file
text = d["responses"][0]["fullTextAnnotation"]["text"]

In [111]:
def preprocess_text(text):
    """Clean raw OCR text and return its non-empty lines.

    The characters ``•``, ``-``, ``.``, ``*`` and ``⚫`` are deleted, the text
    is split on newlines, every line is stripped of surrounding whitespace,
    and lines that are empty after stripping are dropped.
    """
    cleaned = re.sub(r'[•\-.*⚫]', '', text)
    stripped = (raw_line.strip() for raw_line in cleaned.split('\n'))
    return [entry for entry in stripped if entry]

def parse_to_dataframe(lines, n_values=13):
    """Assemble OCR lines into one DataFrame row per city.

    A line that contains an ASCII letter is taken as a city name and starts a
    new record; any numbers collected for the previous, incomplete record are
    discarded. Numeric lines that follow are collected until ``n_values`` of
    them have been seen, at which point the row is stored. Lines that are
    neither are ignored. A trailing record with fewer than ``n_values``
    numbers is dropped.

    Args:
        lines: Iterable of text lines, normally the output of
            ``preprocess_text``.
        n_values: Number of numeric values that make up one row.

    Returns:
        A DataFrame with the column ``City`` followed by the columns ``"1"``
        to ``str(n_values)``. The values are kept as digit strings. ``City``
        is ``None`` for a row whose numbers were not preceded by a name.

    Raises:
        ValueError: If ``n_values`` is smaller than 1.
    """
    if n_values < 1:
        raise ValueError(f"n_values must be at least 1, got {n_values!r}")

    data = []
    city = None
    city_data = []

    for line in lines:
        if re.search(r'[a-zA-Z]', line):
            # A new name resets the record, so header lines and stray
            # identifiers in front of a city never end up in a row.
            city = line.strip('* ')
            city_data = []
        elif re.fullmatch(r'[0-9]+(?:\.[0-9]+)*', line):
            # ASCII digits only: `\d` would also accept digits from other
            # scripts that OCR noise can produce. Dots are thousands
            # separators and are removed.
            city_data.append(line.replace('.', ''))
            if len(city_data) == n_values:
                data.append([city] + city_data)
                city = None
                city_data = []

    columns = ['City'] + [str(position) for position in range(1, n_values + 1)]
    return pd.DataFrame(data, columns=columns)

In [113]:
# Clean the OCR text into lines, then assemble one row per city
lines = preprocess_text(text)
df = parse_to_dataframe(lines)

# Bare last expression so the notebook renders the resulting table
df

Unnamed: 0,City,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Cavour,7794,7449,7202,6843,6512,6317,6231,6244,5817,5199,5043,5085,5226
1,Cercenasco,1962,1955,2080,1651,1600,1590,1483,1477,1471,1480,1497,1561,1632
2,Ceres,1880,2023,2097,2203,2277,1974,1832,1721,1576,1280,1161,1026,939
3,Ceresole Reale,320,332,341,272,290,290,277,227,216,237,186,173,167
4,Cesana Torinese,3123,2707,2499,2496,2177,1803,1826,1479,1196,918,913,909,937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,Levone,1134,1209,1255,1106,1020,892,731,700,642,576,546,490,445
64,Locana,5637,6438,6486,5980,4924,4729,4415,3914,3580,3012,2405,2186,1983
65,Lombardore,1179,1222,1212,1192,1098,1136,1056,914,889,852,1032,1370,1431
66,Lombriasco,1105,1066,1152,1026,1007,954,964,954,972,848,858,864,937


In [114]:
def convert_json_to_csv(file_path):
    """Convert one OCR output JSON file into a CSV next to it.

    The recognised text of every response in the file is cleaned with
    ``preprocess_text``, parsed with ``parse_to_dataframe`` and written to
    ``<name>_output.csv`` in the same directory as ``file_path``.

    Args:
        file_path: Path of a JSON output file with a top-level ``"responses"``
            list.

    Raises:
        KeyError: If the file has no ``"responses"`` entry.
    """
    # The JSON is UTF-8; do not depend on the platform default encoding
    with open(file_path, encoding="utf-8") as f:
        d = json.load(f)

    # An output file holds one response per page (up to the batch size), so
    # read all of them rather than only the first. A response may lack
    # "fullTextAnnotation" (presumably when no text was found), in which case
    # it contributes no text.
    page_texts = [
        response.get("fullTextAnnotation", {}).get("text", "")
        for response in d["responses"]
    ]
    text = "\n".join(page_texts)

    lines = preprocess_text(text)
    df = parse_to_dataframe(lines)

    # Write to the same directory: "<name>.json" becomes "<name>_output.csv"
    directory, file_name = os.path.split(file_path)
    file_name, file_extension = os.path.splitext(file_name)
    output_file_path = os.path.join(directory, file_name + "_output.csv")
    df.to_csv(output_file_path, index=False)
    
    

In [115]:
# Example: writes page2output-1-to-1_output.csv next to the input JSON file
convert_json_to_csv('../../data/cloudvision_ocr/page2output-1-to-1.json')

## Pipeline

First, I upload all the `.tiff` files to the Google Cloud Storage bucket. Then I OCR each of them in a loop from this notebook. Afterwards, I copy the resulting JSON files from the storage bucket to my hard drive and run `convert_json_to_csv()` on each of them. 


In [120]:
def async_detect_document_short(
    gcs_source_uri,
    gcs_destination_uri,
    mime_type="image/tiff",
    batch_size=2,
    timeout=420,
):
    """OCR a PDF/TIFF stored on GCS and leave the JSON output in the bucket.

    Submits an asynchronous DOCUMENT_TEXT_DETECTION request and blocks until
    it has finished. Nothing is downloaded or parsed here.

    Relies on ``vision`` from the notebook's import cell.

    Args:
        gcs_source_uri: ``gs://`` URI of the input file.
        gcs_destination_uri: ``gs://`` URI (prefix) for the JSON output.
        mime_type: ``"image/tiff"`` (default) or ``"application/pdf"``.
        batch_size: How many pages are grouped into each JSON output file.
        timeout: Seconds to wait for the operation to finish.
    """
    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size
    )

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config, output_config=output_config
    )

    operation = client.async_batch_annotate_files(requests=[async_request])

    print("Waiting for the operation to finish.")
    operation.result(timeout=timeout)
    # A separator keeps the URI from running into the message text
    print(f"Finish Operation {gcs_source_uri}")


In [None]:
# OCR pages 4 through 128, one request per page; each result is written to
# the bucket under the output/page<N> prefix.
for page_no in range(4, 129):
    source_uri = f"gs://bas_img_bucket/page_{page_no}.tiff"
    destination_uri = f"gs://bas_img_bucket/output/page{page_no}"
    async_detect_document_short(source_uri, destination_uri)

Waiting for the operation to finish.
Finish Operationgs://bas_img_bucket/page_4.tiff
Waiting for the operation to finish.
Finish Operationgs://bas_img_bucket/page_5.tiff
Waiting for the operation to finish.
