<a href="https://colab.research.google.com/github/azizmansour1/test/blob/main/copy_of_pdf_to_docx_conversion_using_google_cloud_vision_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install google-cloud-vision



In [2]:
!pip install python-docx



In [3]:
import os

# Path to the service account key file used by the Google Cloud client libraries.
# Replace with the actual path to your service account key file.
SERVICE_ACCOUNT_KEY_PATH = "/content/swissfmiauditor-july16-bdd81c90914e.json"

# Google Cloud project ID that owns the Vision API quota/billing.
GCP_PROJECT_ID = "swissfmiauditor-july16"

# Ensure the GOOGLE_APPLICATION_CREDENTIALS variable is set
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = SERVICE_ACCOUNT_KEY_PATH

# Set the GOOGLE_CLOUD_PROJECT environment variable to your correct project ID
os.environ["GOOGLE_CLOUD_PROJECT"] = GCP_PROJECT_ID

print("Environment variables GOOGLE_APPLICATION_CREDENTIALS and GOOGLE_CLOUD_PROJECT set.")

# Files uploaded to the runtime disappear when it is reset, so fail early and
# visibly here instead of with an obscure error when the client is created.
if not os.path.isfile(SERVICE_ACCOUNT_KEY_PATH):
    print(
        f"Warning: service account key file not found at '{SERVICE_ACCOUNT_KEY_PATH}'. "
        "Upload it before running the conversion cell."
    )

Environment variables GOOGLE_APPLICATION_CREDENTIALS and GOOGLE_CLOUD_PROJECT set.


In [4]:
import os
from google.cloud import vision
from docx import Document
from docx.shared import Inches
import io

def detect_document_text_from_pdf(pdf_file_path):
    """
    Detects document text from a local PDF file using Google Cloud Vision API.

    The synchronous ``batch_annotate_files`` call annotates at most 5 pages per
    request, and only the first 5 pages when no page list is given. To cover the
    whole document, the first request is sent without a page list (which also
    reveals the total page count) and the remaining pages are then requested in
    consecutive windows of up to 5 pages.

    Args:
        pdf_file_path (str): The path to the local PDF file.

    Returns:
        str: The extracted text of every page, in page order, joined with a
            blank line between pages. A page with no detected text (or with a
            page-level API error) contributes an empty string.

    Raises:
        RuntimeError: If the Vision API reports an error for the file as a whole.
    """
    # Per-request page limit of the synchronous file annotation endpoint.
    max_pages_per_request = 5

    # Prefer the explicit service account key file; fall back to the client
    # library's default credential lookup when the variable is not set.
    credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    if credentials_path:
        client = vision.ImageAnnotatorClient.from_service_account_json(credentials_path)
    else:
        client = vision.ImageAnnotatorClient()

    # Read the PDF file content
    with io.open(pdf_file_path, 'rb') as pdf_file:
        content = pdf_file.read()

    # The PDF bytes are sent inline with every request.
    input_config = vision.InputConfig(
        mime_type='application/pdf',
        content=content
    )

    # Configure the features we want to use (document text detection)
    features = [vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)]

    full_text = []

    def _annotate(page_numbers=None):
        # Send one request; an empty page list means "first 5 pages".
        request_fields = {"input_config": input_config, "features": features}
        if page_numbers:
            request_fields["pages"] = page_numbers
        batch_response = client.batch_annotate_files(
            requests=[vision.AnnotateFileRequest(**request_fields)]
        )
        file_response = batch_response.responses[0]
        if file_response.error.message:
            raise RuntimeError(
                f"Vision API failed to process '{pdf_file_path}': {file_response.error.message}"
            )
        return file_response

    def _collect(file_response, first_page_number):
        # Append the text of each page in this window, keeping page order.
        for offset, image_response in enumerate(file_response.responses):
            if image_response.error.message:
                print(
                    f"Warning: page {first_page_number + offset} of '{pdf_file_path}' "
                    f"could not be processed: {image_response.error.message}"
                )
                full_text.append("")
            elif image_response.full_text_annotation:
                full_text.append(image_response.full_text_annotation.text)
            else:
                full_text.append("")  # Append empty string if no text found on a page

    print(f"Sending PDF '{pdf_file_path}' to Google Cloud Vision API for text detection...")

    first_window = _annotate()
    _collect(first_window, 1)

    # Request the rest of the document, one window of pages at a time.
    total_pages = first_window.total_pages
    next_page = max_pages_per_request + 1
    while next_page <= total_pages:
        last_page = min(next_page + max_pages_per_request - 1, total_pages)
        _collect(_annotate(list(range(next_page, last_page + 1))), next_page)
        next_page = last_page + 1

    print("Received response from Google Cloud Vision API.")

    return "\n\n".join(full_text)  # Join text from different pages with double newline

def create_docx_from_text(text_content, docx_file_path):
    """
    Write text into a brand-new DOCX file beneath a fixed level-1 heading.

    The text is split on newline characters and each resulting piece
    (including empty ones) becomes its own paragraph in the document.

    Args:
        text_content (str): The text to place in the document.
        docx_file_path (str): Destination path of the DOCX file.
    """
    docx = Document()
    docx.add_heading('Extracted Text from PDF', level=1)

    # One paragraph per source line keeps the original line breaks visible.
    lines = text_content.split('\n')
    for line in lines:
        docx.add_paragraph(line)

    docx.save(docx_file_path)

    confirmation = f"Successfully created DOCX file: '{docx_file_path}'"
    print(confirmation)

if __name__ == "__main__":
    # --- Configuration ---
    # Directory to search for PDF files
    pdf_directory = "/content"
    # Output directory for DOCX files
    output_directory = "/content"

    # Ensure output directory exists
    os.makedirs(output_directory, exist_ok=True)

    # Find all PDF files in the specified directory. The extension match is
    # case-insensitive (so '.PDF' is picked up), directories are skipped, and
    # the list is sorted so files are processed in a deterministic order.
    pdf_files = sorted(
        f for f in os.listdir(pdf_directory)
        if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(pdf_directory, f))
    )

    if not pdf_files:
        print(f"No PDF files found in '{pdf_directory}'.")
    else:
        print(f"Found {len(pdf_files)} PDF files in '{pdf_directory}': {pdf_files}")
        for pdf_file in pdf_files:
            input_pdf_path = os.path.join(pdf_directory, pdf_file)
            # Create output DOCX file name based on the PDF file name
            output_docx_path = os.path.join(output_directory, os.path.splitext(pdf_file)[0] + ".docx")

            # --- Main execution ---
            # Re-check existence: the file may have been removed since listing.
            if os.path.exists(input_pdf_path):
                try:
                    # Step 1: Extract text from PDF using Google Cloud Vision API
                    extracted_text = detect_document_text_from_pdf(input_pdf_path)
                    print("\n--- Extracted Text ---")
                    print(extracted_text[:500] + "..." if len(extracted_text) > 500 else extracted_text) # Print first 500 chars

                    # Step 2: Create DOCX from the extracted text
                    create_docx_from_text(extracted_text, output_docx_path)

                except Exception as e:
                    # Report the failure and continue with the remaining files.
                    print(f"\nAn error occurred during the conversion process for '{pdf_file}': {e}")
                    print("Please ensure your Google Cloud credentials are set up correctly and the Cloud Vision API is enabled.")
                    print("For authentication, set the GOOGLE_APPLICATION_CREDENTIALS environment variable.")
                    print("Example: export GOOGLE_APPLICATION_CREDENTIALS='/path/to/your-service-account-key.json'")
            else:
                print(f"Error: Input PDF file not found at '{input_pdf_path}'. Please check the path.")

Found 1 PDF files in '/content': ['SCA No.089 - Swiss FM - Labour Supply Works - Head Office-3.pdf']
Sending PDF '/content/SCA No.089 - Swiss FM - Labour Supply Works - Head Office-3.pdf' to Google Cloud Vision API for text detection...
Received response from Google Cloud Vision API.

--- Extracted Text ---
innovo
Page 1 of 11
INNOVO/PROC/HO/SCA/HOU/SFM/089/25
عقد خدمات رقم 089/25/INNOVO/PROC/HO/SCA/HOU/SFM
توفير العمالة اللازمة
بمقر الشركة الرئيسي - كومبوند ميفيدا
أنه في يوم الجمعة الموافق 18 يوليو 2025 تحرر هذا العقد بين كلاً من :
أولاً : - شركة انوفو للبناء (ش.م.م) - سجل تجاري رقم 126987 - مكتب سجل تجاري استثمار القاهرة
رقم التسجيل الضريبي : 573-445-728
والكائن مقرها مبنى رقم B5 ، الدور الثالث ، المبانى الإدارية في مشروع ميفيدا - القاهرة الجديدة – محافظة القاهرة،
ويمثلها في توقيع هذا العقد السيد...
Successfully created DOCX file: '/content/SCA No.089 - Swiss FM - Labour Supply Works - Head Office-3.docx'


In [5]:
import os
from google.cloud import vision
from docx import Document
from docx.shared import Inches
import io

def detect_document_text_from_pdf(pdf_file_path):
    """
    Detects document text from a local PDF file using Google Cloud Vision API.

    The synchronous ``batch_annotate_files`` call annotates at most 5 pages per
    request, and only the first 5 pages when no page list is given. To cover the
    whole document, the first request is sent without a page list (which also
    reveals the total page count) and the remaining pages are then requested in
    consecutive windows of up to 5 pages.

    Args:
        pdf_file_path (str): The path to the local PDF file.

    Returns:
        str: The extracted text of every page, in page order, joined with a
            blank line between pages. A page with no detected text (or with a
            page-level API error) contributes an empty string.

    Raises:
        RuntimeError: If the Vision API reports an error for the file as a whole.
    """
    # Per-request page limit of the synchronous file annotation endpoint.
    max_pages_per_request = 5

    # Prefer the explicit service account key file; fall back to the client
    # library's default credential lookup when the variable is not set.
    credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    if credentials_path:
        client = vision.ImageAnnotatorClient.from_service_account_json(credentials_path)
    else:
        client = vision.ImageAnnotatorClient()

    # Read the PDF file content
    with io.open(pdf_file_path, 'rb') as pdf_file:
        content = pdf_file.read()

    # The PDF bytes are sent inline with every request.
    input_config = vision.InputConfig(
        mime_type='application/pdf',
        content=content
    )

    # Configure the features we want to use (document text detection)
    features = [vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)]

    full_text = []

    def _annotate(page_numbers=None):
        # Send one request; an empty page list means "first 5 pages".
        request_fields = {"input_config": input_config, "features": features}
        if page_numbers:
            request_fields["pages"] = page_numbers
        batch_response = client.batch_annotate_files(
            requests=[vision.AnnotateFileRequest(**request_fields)]
        )
        file_response = batch_response.responses[0]
        if file_response.error.message:
            raise RuntimeError(
                f"Vision API failed to process '{pdf_file_path}': {file_response.error.message}"
            )
        return file_response

    def _collect(file_response, first_page_number):
        # Append the text of each page in this window, keeping page order.
        for offset, image_response in enumerate(file_response.responses):
            if image_response.error.message:
                print(
                    f"Warning: page {first_page_number + offset} of '{pdf_file_path}' "
                    f"could not be processed: {image_response.error.message}"
                )
                full_text.append("")
            elif image_response.full_text_annotation:
                full_text.append(image_response.full_text_annotation.text)
            else:
                full_text.append("")  # Append empty string if no text found on a page

    print(f"Sending PDF '{pdf_file_path}' to Google Cloud Vision API for text detection...")

    first_window = _annotate()
    _collect(first_window, 1)

    # Request the rest of the document, one window of pages at a time.
    total_pages = first_window.total_pages
    next_page = max_pages_per_request + 1
    while next_page <= total_pages:
        last_page = min(next_page + max_pages_per_request - 1, total_pages)
        _collect(_annotate(list(range(next_page, last_page + 1))), next_page)
        next_page = last_page + 1

    print("Received response from Google Cloud Vision API.")

    return "\n\n".join(full_text)  # Join text from different pages with double newline

def create_docx_from_text(text_content, docx_file_path):
    """
    Write text into a brand-new DOCX file beneath a fixed level-1 heading.

    The text is split on newline characters and each resulting piece
    (including empty ones) becomes its own paragraph in the document.

    Args:
        text_content (str): The text to place in the document.
        docx_file_path (str): Destination path of the DOCX file.
    """
    docx = Document()
    docx.add_heading('Extracted Text from PDF', level=1)

    # One paragraph per source line keeps the original line breaks visible.
    lines = text_content.split('\n')
    for line in lines:
        docx.add_paragraph(line)

    docx.save(docx_file_path)

    confirmation = f"Successfully created DOCX file: '{docx_file_path}'"
    print(confirmation)

if __name__ == "__main__":
    # --- Configuration ---
    # Directory to search for PDF files
    pdf_directory = "/content"
    # Output directory for DOCX files
    output_directory = "/content"

    # Ensure output directory exists
    os.makedirs(output_directory, exist_ok=True)

    # Find all PDF files in the specified directory. The extension match is
    # case-insensitive (so '.PDF' is picked up), directories are skipped, and
    # the list is sorted so files are processed in a deterministic order.
    pdf_files = sorted(
        f for f in os.listdir(pdf_directory)
        if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(pdf_directory, f))
    )

    if not pdf_files:
        print(f"No PDF files found in '{pdf_directory}'.")
    else:
        print(f"Found {len(pdf_files)} PDF files in '{pdf_directory}': {pdf_files}")
        for pdf_file in pdf_files:
            input_pdf_path = os.path.join(pdf_directory, pdf_file)
            # Create output DOCX file name based on the PDF file name
            output_docx_path = os.path.join(output_directory, os.path.splitext(pdf_file)[0] + ".docx")

            # --- Main execution ---
            # Re-check existence: the file may have been removed since listing.
            if os.path.exists(input_pdf_path):
                try:
                    # Step 1: Extract text from PDF using Google Cloud Vision API
                    extracted_text = detect_document_text_from_pdf(input_pdf_path)
                    print("\n--- Extracted Text ---")
                    print(extracted_text[:500] + "..." if len(extracted_text) > 500 else extracted_text) # Print first 500 chars

                    # Step 2: Create DOCX from the extracted text
                    create_docx_from_text(extracted_text, output_docx_path)

                except Exception as e:
                    # Report the failure and continue with the remaining files.
                    print(f"\nAn error occurred during the conversion process for '{pdf_file}': {e}")
                    print("Please ensure your Google Cloud credentials are set up correctly and the Cloud Vision API is enabled.")
                    print("For authentication, set the GOOGLE_APPLICATION_CREDENTIALS environment variable.")
                    print("Example: export GOOGLE_APPLICATION_CREDENTIALS='/path/to/your-service-account-key.json'")
            else:
                print(f"Error: Input PDF file not found at '{input_pdf_path}'. Please check the path.")

Found 1 PDF files in '/content': ['SCA No.089 - Swiss FM - Labour Supply Works - Head Office-3.pdf']
Sending PDF '/content/SCA No.089 - Swiss FM - Labour Supply Works - Head Office-3.pdf' to Google Cloud Vision API for text detection...
Received response from Google Cloud Vision API.

--- Extracted Text ---
innovo
Page 1 of 11
INNOVO/PROC/HO/SCA/HOU/SFM/089/25
عقد خدمات رقم 089/25/INNOVO/PROC/HO/SCA/HOU/SFM
توفير العمالة اللازمة
بمقر الشركة الرئيسي - كومبوند ميفيدا
أنه في يوم الجمعة الموافق 18 يوليو 2025 تحرر هذا العقد بين كلاً من :
أولاً : - شركة انوفو للبناء (ش.م.م) - سجل تجاري رقم 126987 - مكتب سجل تجاري استثمار القاهرة
رقم التسجيل الضريبي : 573-445-728
والكائن مقرها مبنى رقم B5 ، الدور الثالث ، المبانى الإدارية في مشروع ميفيدا - القاهرة الجديدة – محافظة القاهرة،
ويمثلها في توقيع هذا العقد السيد...
Successfully created DOCX file: '/content/SCA No.089 - Swiss FM - Labour Supply Works - Head Office-3.docx'


In [6]:
# Colab-only helper module; this import is only expected to resolve inside a
# Google Colab runtime.
from google.colab import auth

# Authenticate using your Google account
# NOTE(review): the conversion cells above build their Vision client from the
# service account key file named in GOOGLE_APPLICATION_CREDENTIALS, so this
# user sign-in does not appear to be needed by them — confirm before relying
# on it.
auth.authenticate_user()

# Only reached when authenticate_user() returns without raising.
print('Authenticated')

AuthorizationError: Failed to fetch user credentials