<a href="https://colab.research.google.com/github/ashik950/Automation/blob/main/ParallelGPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install paddlepaddle-gpu

Collecting paddlepaddle-gpu
  Downloading paddlepaddle_gpu-2.6.1-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting httpx (from paddlepaddle-gpu)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting astor (from paddlepaddle-gpu)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting httpcore==1.* (from httpx->paddlepaddle-gpu)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx->paddlepaddle-gpu)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading paddlepaddle_gpu-2.6.1-cp310-cp310-manylinux1_x86_64.whl (758.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m758.9/758.9 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00

In [None]:
!pip install paddleocr

Collecting paddleocr
  Downloading paddleocr-2.8.1-py3-none-any.whl.metadata (19 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Downloading fire-0.6.0.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.4/88.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading paddleocr-2.8.1-py3-none-any.whl (407 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m407

In [None]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
import os
import time
import multiprocessing
from paddleocr import PaddleOCR
import torch

# Configuration for PaddleOCR. The whole dict is forwarded as keyword arguments
# by PaddleOCR(**config) in initialize_ocr().
# NOTE(review): the ppocr DEBUG Namespace recorded later in this notebook shows
# det_model_dir / rec_model_dir resolved to the default en_PP-OCRv3 / en_PP-OCRv4
# models, so "det_model" and "rec_model" do not appear to select anything —
# presumably the recognised keys are det_model_dir / rec_model_dir; confirm.
# NOTE(review): 'det_char_conf', 'cls_conf_thresh' and "text_layout" should be
# checked against the PaddleOCR argument list — TODO confirm they are recognised.
config = {
    "lang": "en",                              # Language setting (e.g., "en" for English)
    # NOTE(review): this asks PyTorch (not Paddle) whether CUDA is usable; the
    # recorded runs in this notebook logged use_gpu=False — confirm the GPU is picked up.
    "use_gpu": torch.cuda.is_available(),      # Use GPU if available
    "det_model": "ch_ppocr_mobile_v2.0_det",   # Detection model
    "rec_model": "ch_ppocr_mobile_v2.0_rec",   # Recognition model
    'det_db_thresh': 0.3,                      # Lower threshold for binarization
    'det_char_conf': 0.3,                      # Lower minimum confidence level for character recognition
    'cls_conf_thresh': 0.5,
    "use_angle_cls": True,
    "use_space_char": True,
    "text_layout": True,
    "det_limit_type": "min",
    "max_text_length": 5000,                   # Maximum length of extracted text (experimental)
}

def initialize_ocr():
    """Pool initializer: build the PaddleOCR engine for the current process.

    The engine is constructed from the module-level ``config`` dict and stored
    in the module-level name ``ocr``, which ``process_pdf`` reads.
    """
    global ocr
    engine = PaddleOCR(**config)
    ocr = engine

def process_pdf(args):
    """OCR one PDF and write the text of every page to its own ``.txt`` file.

    Shaped for ``Pool.map``: it takes a single tuple and reports its outcome
    through the shared ``results`` list instead of returning or raising.

    Args:
        args: ``(pdf_file, output_folder, results)`` where ``pdf_file`` is the
            path handed to ``ocr.ocr``, ``output_folder`` is the directory that
            receives ``<pdf stem>_page_<n>.txt`` files, and ``results`` is any
            object with ``append`` that collects one status string per file.

    Returns:
        None. Exactly one "Processed ..." or "Error processing ..." string is
        appended to ``results``.
    """
    pdf_file, output_folder, results = args
    start_time = time.time()  # Record the start time
    try:
        pdf_stem = os.path.splitext(os.path.basename(pdf_file))[0]
        # `ocr` is the per-process engine created by initialize_ocr().
        results_from_ocr = ocr.ocr(pdf_file, cls=True)
        for page_number, page in enumerate(results_from_ocr or [], start=1):
            # PaddleOCR can hand back None for a page on which nothing was
            # detected; iterating it used to raise and mark the whole PDF as
            # failed. Treat such a page as empty and still write its file.
            page_text = "".join(f"{line[1][0]}\n" for line in (page or []))

            output_file = os.path.join(output_folder, f"{pdf_stem}_page_{page_number}.txt")
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(page_text)

        total_time = time.time() - start_time
        results.append(f"Processed {pdf_file} in {total_time:.2f} seconds")
    except Exception as e:
        # Best-effort batch job: record the failure and let other files continue.
        results.append(f"Error processing {pdf_file}: {e}")

def process_files(pdf_files, output_folder):
    """OCR a batch of PDFs in parallel and print a per-file status report.

    Args:
        pdf_files: Sequence of PDF paths (``len()`` is taken of it).
        output_folder: Directory passed on to ``process_pdf`` for the page files.

    Returns:
        None. Progress, total wall time and one status line per file are printed.
    """
    print(f"Starting processing of {len(pdf_files)} files in {output_folder}")
    start_time = time.time()  # Record the start time for the whole processing

    collected = []
    if pdf_files:  # nothing to do for an empty batch: skip manager and pool entirely
        # Each worker runs initialize_ocr() and loads its own engine, so never
        # start more workers than there are files to hand out.
        num_processes = min(multiprocessing.cpu_count(), len(pdf_files))
        # The context manager shuts the manager's server process down afterwards;
        # previously it was left running after every call.
        with multiprocessing.Manager() as manager:
            results = manager.list()  # Shared list to store results
            with multiprocessing.Pool(num_processes, initializer=initialize_ocr) as pool:
                pool.map(process_pdf, [(pdf_file, output_folder, results) for pdf_file in pdf_files])
            # Copy out of the proxy while the manager is still alive.
            collected = list(results)

    total_time = time.time() - start_time
    print(f"Finished processing of {len(pdf_files)} files in {output_folder}. Total time taken: {total_time:.2f} seconds")

    # Print the results after all files have been processed
    for result in collected:
        print(result)

# Example usage
if __name__ == "__main__":
    pdf_folder = "/content/sample_data/Input"
    output_folder = "/content/sample_data/Output"

    # Validate the input first: os.listdir() on a missing folder fails with a
    # bare FileNotFoundError, and only after the output folder was already created.
    if not os.path.isdir(pdf_folder):
        raise FileNotFoundError(
            f"Input folder not found: {pdf_folder}. Create it and upload the PDFs before running this cell."
        )
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order is the same on every run.
    pdf_files = sorted(
        os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')
    )
    process_files(pdf_files, output_folder)


FileNotFoundError: [Errno 2] No such file or directory: '/content/sample_data/Input'

In [None]:
import os
import time
import multiprocessing
from paddleocr import PaddleOCR
import torch

# Configuration for PaddleOCR. The whole dict is forwarded as keyword arguments
# by PaddleOCR(**config), both in initialize_ocr() and in the pre-download step.
# NOTE(review): the ppocr DEBUG Namespace recorded later in this notebook shows
# det_model_dir / rec_model_dir resolved to the default en_PP-OCRv3 / en_PP-OCRv4
# models, so "det_model" and "rec_model" do not appear to select anything —
# presumably the recognised keys are det_model_dir / rec_model_dir; confirm.
# NOTE(review): 'det_char_conf', 'cls_conf_thresh' and "text_layout" should be
# checked against the PaddleOCR argument list — TODO confirm they are recognised.
config = {
    "lang": "en",                              # Language setting (e.g., "en" for English)
    # NOTE(review): this asks PyTorch (not Paddle) whether CUDA is usable; the
    # recorded runs in this notebook logged use_gpu=False — confirm the GPU is picked up.
    "use_gpu": torch.cuda.is_available(),      # Use GPU if available
    "det_model": "ch_ppocr_mobile_v2.0_det",   # Detection model
    "rec_model": "ch_ppocr_mobile_v2.0_rec",   # Recognition model
    'det_db_thresh': 0.3,                      # Lower threshold for binarization
    'det_char_conf': 0.3,                      # Lower minimum confidence level for character recognition
    'cls_conf_thresh': 0.5,
    "use_angle_cls": True,
    "use_space_char": True,
    "text_layout": True,
    "det_limit_type": "min",
    "max_text_length": 5000,                   # Maximum length of extracted text (experimental)
}

# Handed to every pool worker through initargs so engine construction is serialized.
lock = multiprocessing.Lock()

def initialize_ocr(lock):
    """Pool initializer: build this process's PaddleOCR engine while holding ``lock``.

    Args:
        lock: Lock object (``acquire``/``release``) held for the duration of
            the engine construction.

    The engine is built from the module-level ``config`` dict and stored in
    the module-level name ``ocr``, which ``process_pdf`` reads.
    """
    global ocr
    lock.acquire()
    try:
        ocr = PaddleOCR(**config)
    finally:
        lock.release()

def process_pdf(args):
    """OCR one PDF and write the text of every page to its own ``.txt`` file.

    Shaped for ``Pool.map``: it takes a single tuple and reports its outcome
    through the shared ``results`` list instead of returning or raising.

    Args:
        args: ``(pdf_file, output_folder, results)`` where ``pdf_file`` is the
            path handed to ``ocr.ocr``, ``output_folder`` is the directory that
            receives ``<pdf stem>_page_<n>.txt`` files, and ``results`` is any
            object with ``append`` that collects one status string per file.

    Returns:
        None. Exactly one "Processed ..." or "Error processing ..." string is
        appended to ``results``.
    """
    pdf_file, output_folder, results = args
    start_time = time.time()  # Record the start time
    try:
        pdf_stem = os.path.splitext(os.path.basename(pdf_file))[0]
        # `ocr` is the per-process engine created by initialize_ocr().
        results_from_ocr = ocr.ocr(pdf_file, cls=True)
        for page_number, page in enumerate(results_from_ocr or [], start=1):
            # PaddleOCR can hand back None for a page on which nothing was
            # detected; iterating it used to raise and mark the whole PDF as
            # failed. Treat such a page as empty and still write its file.
            page_text = "".join(f"{line[1][0]}\n" for line in (page or []))

            output_file = os.path.join(output_folder, f"{pdf_stem}_page_{page_number}.txt")
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(page_text)

        total_time = time.time() - start_time
        results.append(f"Processed {pdf_file} in {total_time:.2f} seconds")
    except Exception as e:
        # Best-effort batch job: record the failure and let other files continue.
        results.append(f"Error processing {pdf_file}: {e}")

def process_files(pdf_files, output_folder):
    """OCR a batch of PDFs in parallel and print a per-file status report.

    Args:
        pdf_files: Sequence of PDF paths (``len()`` is taken of it).
        output_folder: Directory passed on to ``process_pdf`` for the page files.

    Returns:
        None. Progress, total wall time and one status line per file are printed.
    """
    print(f"Starting processing of {len(pdf_files)} files in {output_folder}")
    start_time = time.time()  # Record the start time for the whole processing

    collected = []
    if pdf_files:  # nothing to do for an empty batch: skip manager and pool entirely
        # Each worker runs initialize_ocr(lock) and loads its own engine, so
        # never start more workers than there are files to hand out.
        num_processes = min(multiprocessing.cpu_count(), len(pdf_files))
        # The context manager shuts the manager's server process down afterwards;
        # previously it was left running after every call.
        with multiprocessing.Manager() as manager:
            results = manager.list()  # Shared list to store results
            # The module-level lock is passed to every worker's initializer.
            with multiprocessing.Pool(num_processes, initializer=initialize_ocr, initargs=(lock,)) as pool:
                pool.map(process_pdf, [(pdf_file, output_folder, results) for pdf_file in pdf_files])
            # Copy out of the proxy while the manager is still alive.
            collected = list(results)

    total_time = time.time() - start_time
    print(f"Finished processing of {len(pdf_files)} files in {output_folder}. Total time taken: {total_time:.2f} seconds")

    # Print the results after all files have been processed
    for result in collected:
        print(result)

# Example usage
if __name__ == "__main__":
    pdf_folder = "/content/sample_data/Input"
    output_folder = "/content/sample_data/Output"

    # Validate the input first: os.listdir() on a missing folder fails with a
    # bare FileNotFoundError, and only after the output folder was already created.
    if not os.path.isdir(pdf_folder):
        raise FileNotFoundError(
            f"Input folder not found: {pdf_folder}. Create it and upload the PDFs before running this cell."
        )
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order is the same on every run.
    pdf_files = sorted(
        os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')
    )

    # Pre-download models
    # NOTE(review): this builds a full engine in the parent process only to
    # trigger the download; if use_gpu is True, confirm that initialising the
    # GPU runtime here is safe before the worker pool is started.
    ocr = PaddleOCR(**config)  # Download models before multiprocessing
    process_files(pdf_files, output_folder)


In [None]:
!pip list

In [None]:
# Install PaddlePaddle with GPU support
!pip install paddlepaddle-gpu==2.5.0.post112 -f https://www.paddlepaddle.org.cn/whl/stable.html

# Install PaddleOCR
!pip install paddleocr

# Restart runtime (manually or using the command below)
# After running this cell, please manually restart the runtime or uncomment the next line to automatically restart.
# import os
# os._exit(00)


In [None]:
!pip install paddlehub

Collecting paddlehub
  Downloading paddlehub-2.4.0-py3-none-any.whl.metadata (1.7 kB)
Collecting colorama (from paddlehub)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from paddlehub)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting paddle2onnx>=0.5.1 (from paddlehub)
  Downloading paddle2onnx-1.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Collecting paddlenlp>=2.0.0 (from paddlehub)
  Downloading paddlenlp-2.8.1-py3-none-any.whl.metadata (25 kB)
Collecting rarfile (from paddlehub)
  Downloading rarfile-4.2-py3-none-any.whl.metadata (4.4 kB)
Collecting visualdl>=2.0.0 (from paddlehub)
  Downloading visualdl-2.5.3-py3-none-any.whl.metadata (25 kB)
Collecting gradio (from paddlehub)
  Downloading gradio-4.40.0-py3-none-any.whl.metadata (15 kB)
Collecting gunicorn>=19.10.0 (from paddlehub)
  Downloading gunicorn-22.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting seqeval (from paddlenl

In [None]:
import paddlehub as hub

# Load the ERNIE model
# NOTE(review): a later cell in this notebook records hub.Module(name='ernie')
# failing with EnvironmentMismatchError — confirm the module loads in this environment.
model = hub.Module(name='ernie')

# Define the text data
texts = ["I love programming.", "The weather is terrible today."]

# Define the label map
# NOTE(review): assumes the model emits class ids 0/1 with this meaning — TODO confirm.
label_map = {0: 'negative', 1: 'positive'}

# Get predictions
# NOTE(review): assumes predict() returns one label_map key per input text, in order — TODO confirm.
results = model.predict(texts)

# Print the results
for text, result in zip(texts, results):
    print(f'Text: {text}\nPrediction: {label_map[result]}\n')


In [None]:
!pip install paddlepaddle-gpu==2.0.0

[31mERROR: Could not find a version that satisfies the requirement paddlepaddle-gpu==2.0.0 (from versions: 2.3.0, 2.3.1, 2.3.2, 2.4.0rc0, 2.4.0, 2.4.1, 2.4.2, 2.5.0rc0, 2.5.0rc1, 2.5.0, 2.5.1, 2.5.2, 2.6.0, 2.6.1)[0m[31m
[0m[31mERROR: No matching distribution found for paddlepaddle-gpu==2.0.0[0m[31m
[0m

In [None]:
import os
import time
import multiprocessing
from paddleocr import PaddleOCR
import torch
import paddlehub as hub


# Configuration for PaddleOCR. The whole dict is forwarded as keyword arguments
# by PaddleOCR(**config) in initialize_ocr().
# NOTE(review): the ppocr DEBUG Namespace recorded in this cell's output shows
# det_model_dir / rec_model_dir resolved to the default en_PP-OCRv3 / en_PP-OCRv4
# models, so "det_model" and "rec_model" do not appear to select anything —
# presumably the recognised keys are det_model_dir / rec_model_dir; confirm.
# NOTE(review): 'det_char_conf', 'cls_conf_thresh' and "text_layout" should be
# checked against the PaddleOCR argument list — TODO confirm they are recognised.
config = {
    "lang": "en",                              # Language setting (e.g., "en" for English)
    # NOTE(review): this asks PyTorch (not Paddle) whether CUDA is usable; the
    # recorded run of this cell logged use_gpu=False — confirm the GPU is picked up.
    "use_gpu": torch.cuda.is_available(),      # Use GPU if available
    "det_model": "ch_ppocr_mobile_v2.0_det",   # Detection model
    "rec_model": "ch_ppocr_mobile_v2.0_rec",   # Recognition model
    'det_db_thresh': 0.3,                      # Lower threshold for binarization
    'det_char_conf': 0.3,                      # Lower minimum confidence level for character recognition
    'cls_conf_thresh': 0.5,
    "use_angle_cls": True,
    "use_space_char": True,
    "text_layout": True,
    "det_limit_type": "min",
    "max_text_length": 5000,                   # Maximum length of extracted text (experimental)
}

def initialize_ocr():
    """Pool initializer: build the PaddleOCR engine for the current process.

    The engine is constructed from the module-level ``config`` dict and stored
    in the module-level name ``ocr``, which ``process_pdf`` reads.
    """
    global ocr
    engine = PaddleOCR(**config)
    ocr = engine

def process_pdf(args):
    """OCR one PDF and write the text of every page to its own ``.txt`` file.

    Shaped for ``Pool.map``: it takes a single tuple and reports its outcome
    through the shared ``results`` list instead of returning or raising.

    Args:
        args: ``(pdf_file, output_folder, results)`` where ``pdf_file`` is the
            path handed to ``ocr.ocr``, ``output_folder`` is the directory that
            receives ``<pdf stem>_page_<n>.txt`` files, and ``results`` is any
            object with ``append`` that collects one status string per file.

    Returns:
        None. Exactly one "Processed ..." or "Error processing ..." string is
        appended to ``results``.
    """
    pdf_file, output_folder, results = args
    start_time = time.time()  # Record the start time
    try:
        pdf_stem = os.path.splitext(os.path.basename(pdf_file))[0]
        # `ocr` is the per-process engine created by initialize_ocr().
        results_from_ocr = ocr.ocr(pdf_file, cls=True)
        for page_number, page in enumerate(results_from_ocr or [], start=1):
            # PaddleOCR can hand back None for a page on which nothing was
            # detected; iterating it used to raise and mark the whole PDF as
            # failed. Treat such a page as empty and still write its file.
            page_text = "".join(f"{line[1][0]}\n" for line in (page or []))

            output_file = os.path.join(output_folder, f"{pdf_stem}_page_{page_number}.txt")
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(page_text)

        total_time = time.time() - start_time
        results.append(f"Processed {pdf_file} in {total_time:.2f} seconds")
    except Exception as e:
        # Best-effort batch job: record the failure and let other files continue.
        results.append(f"Error processing {pdf_file}: {e}")

def process_files(pdf_files, output_folder):
    """OCR a batch of PDFs in parallel and print a per-file status report.

    Args:
        pdf_files: Sequence of PDF paths (``len()`` is taken of it).
        output_folder: Directory passed on to ``process_pdf`` for the page files.

    Returns:
        None. Progress, total wall time and one status line per file are printed.
    """
    print(f"Starting processing of {len(pdf_files)} files in {output_folder}")
    start_time = time.time()  # Record the start time for the whole processing

    collected = []
    if pdf_files:  # nothing to do for an empty batch: skip manager and pool entirely
        # Each worker runs initialize_ocr() and loads its own engine, so never
        # start more workers than there are files to hand out.
        num_processes = min(multiprocessing.cpu_count(), len(pdf_files))
        # The context manager shuts the manager's server process down afterwards;
        # previously it was left running after every call.
        with multiprocessing.Manager() as manager:
            results = manager.list()  # Shared list to store results
            with multiprocessing.Pool(num_processes, initializer=initialize_ocr) as pool:
                pool.map(process_pdf, [(pdf_file, output_folder, results) for pdf_file in pdf_files])
            # Copy out of the proxy while the manager is still alive.
            collected = list(results)

    total_time = time.time() - start_time
    print(f"Finished processing of {len(pdf_files)} files in {output_folder}. Total time taken: {total_time:.2f} seconds")

    # Print the results after all files have been processed
    for result in collected:
        print(result)

# Example usage
if __name__ == "__main__":
    pdf_folder = "/content/Input"
    output_folder = "/content/output"

    # Validate the input first: os.listdir() on a missing folder fails with a
    # bare FileNotFoundError, and only after the output folder was already created.
    if not os.path.isdir(pdf_folder):
        raise FileNotFoundError(
            f"Input folder not found: {pdf_folder}. Create it and upload the PDFs before running this cell."
        )
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order is the same on every run.
    pdf_files = sorted(
        os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')
    )
    process_files(pdf_files, output_folder)


Starting processing of 2 files in /content/output
[2024/08/05 10:51:18] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='min', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48

In [None]:
import sys
# Debug aid: show the module search path of the running kernel.
print(sys.path)


['/usr/local/lib/python3.10/dist-packages/paddleocr', '/usr/local/lib/python3.10/dist-packages/paddleocr', '/usr/local/lib/python3.10/dist-packages/paddleocr/ppstructure', '/usr/local/lib/python3.10/dist-packages/paddleocr', '/usr/local/lib/python3.10/dist-packages/paddleocr', '/usr/local/lib/python3.10/dist-packages/paddleocr', '/usr/local/lib/python3.10/dist-packages/paddleocr', '/usr/local/lib/python3.10/dist-packages/paddleocr', '/usr/local/lib/python3.10/dist-packages/paddleocr', '/content', '/env/python', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.10/dist-packages/IPython/extensions', '/usr/local/lib/python3.10/dist-packages/setuptools/_vendor', '/root/.ipython', '/usr/local/lib/python3.10/dist-packages/paddleocr/', '/usr/local/lib/python3.10/dist-packages/paddleocr/tools/infer', '/usr/local/lib/python3.10/dist-packages/paddleocr/tools/

In [None]:
!pip install paddlepaddle

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.1-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Downloading paddlepaddle-2.6.1-cp310-cp310-manylinux1_x86_64.whl (125.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.9/125.9 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: paddlepaddle
Successfully installed paddlepaddle-2.6.1


In [None]:
!pip install paddlepaddle paddleocr



In [None]:
!pip install paddlehub==2.4.0


Collecting paddlehub==2.4.0
  Downloading paddlehub-2.4.0-py3-none-any.whl.metadata (1.7 kB)
Collecting colorama (from paddlehub==2.4.0)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from paddlehub==2.4.0)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting paddle2onnx>=0.5.1 (from paddlehub==2.4.0)
  Downloading paddle2onnx-1.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Collecting paddlenlp>=2.0.0 (from paddlehub==2.4.0)
  Downloading paddlenlp-2.8.1-py3-none-any.whl.metadata (25 kB)
Collecting rarfile (from paddlehub==2.4.0)
  Downloading rarfile-4.2-py3-none-any.whl.metadata (4.4 kB)
Collecting visualdl>=2.0.0 (from paddlehub==2.4.0)
  Downloading visualdl-2.5.3-py3-none-any.whl.metadata (25 kB)
Collecting gradio (from paddlehub==2.4.0)
  Downloading gradio-4.40.0-py3-none-any.whl.metadata (15 kB)
Collecting gunicorn>=19.10.0 (from paddlehub==2.4.0)
  Downloading gunicorn-22.0.0-py3-n

In [None]:
!pip show paddlepaddle paddleocr paddlehub

Name: paddlepaddle
Version: 2.6.1
Summary: Parallel Distributed Deep Learning
Home-page: https://www.paddlepaddle.org.cn/
Author: 
Author-email: Paddle-better@baidu.com
License: Apache Software License
Location: /usr/local/lib/python3.10/dist-packages
Requires: astor, decorator, httpx, numpy, opt-einsum, Pillow, protobuf
Required-by: 
---
Name: paddleocr
Version: 2.8.1
Summary: Awesome OCR toolkits based on PaddlePaddle(8.6M ultra-lightweight pre-trained model, support training and deployment among server, mobile, embedded and IoT devices)
Home-page: https://github.com/PaddlePaddle/PaddleOCR
Author: 
Author-email: PaddlePaddle <Paddle-better@baidu.com>
License: Apache License 2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: beautifulsoup4, cython, fire, fonttools, imgaug, lmdb, numpy, opencv-contrib-python, opencv-python, Pillow, pyclipper, python-docx, pyyaml, rapidfuzz, requests, scikit-image, shapely, tqdm
Required-by: 
---
Name: paddlehub
Version: 2.4.0
Summary: A to

In [None]:
# Install PaddlePaddle (GPU version)
!pip install paddlepaddle-gpu==2.0.0.post101 -f https://www.paddlepaddle.org.cn/whl/stable.html

# Install PaddleHub
!pip install paddlehub==2.0.0

# Verify the installations
# NOTE: the recorded output of this cell shows pip found no paddlepaddle-gpu
# 2.0.0.post101 build (versions offered: 2.3.0 through 2.6.1), so the versions
# printed below are those of the packages that were already importable.
import paddle
import paddlehub as hub

print(paddle.__version__)  # Expected 2.0.0 only if the pin above installs; the recorded run printed 2.6.1
print(hub.__version__)     # Expected 2.0.0; the recorded run printed 2.4.0 — presumably the runtime needs a restart to pick up the reinstall; confirm

# Load the ERNIE model
# NOTE: in the recorded run this call raised EnvironmentMismatchError.
ernie_model = hub.Module(name='ernie')


Looking in links: https://www.paddlepaddle.org.cn/whl/stable.html
[31mERROR: Could not find a version that satisfies the requirement paddlepaddle-gpu==2.0.0.post101 (from versions: 2.3.0, 2.3.1, 2.3.2, 2.4.0rc0, 2.4.0, 2.4.1, 2.4.2, 2.5.0rc0, 2.5.0rc1, 2.5.0, 2.5.1, 2.5.2, 2.6.0, 2.6.1)[0m[31m
[0m[31mERROR: No matching distribution found for paddlepaddle-gpu==2.0.0.post101[0m[31m
[0mCollecting paddlehub==2.0.0
  Downloading paddlehub-2.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting gitpython (from paddlehub==2.0.0)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython->paddlehub==2.0.0)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython->paddlehub==2.0.0)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading paddlehub-2.0.0-py3-none-any.whl (191 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.6/191.6 kB[0m [31

2.6.1
2.4.0


EnvironmentMismatchError: [31mernie[39m cannot be installed because some conditions are not met:
+---------------+----------+-----------------------------------+-----------------------------------+
|[36m     Name      [39m|[36m Version  [39m|[36m   PaddlePaddle Version Required   [39m|[36m    PaddleHub Version Required     [39m|
+---------------+----------+-----------------------------------+-----------------------------------+
|     ernie     |  1.0.0   |                Any                |                Any                |
+---------------+----------+-----------------------------------+-----------------------------------+
|     ernie     |  1.0.1   |                Any                |                Any                |
+---------------+----------+-----------------------------------+-----------------------------------+
|     ernie     |  1.0.2   |                Any                |                Any                |
+---------------+----------+-----------------------------------+-----------------------------------+
|     ernie     |  1.1.0   |              >=1.5.0              |                Any                |
+---------------+----------+-----------------------------------+-----------------------------------+
|     ernie     |  1.2.0   |              >=1.6.2              |              >=1.6.0              |
+---------------+----------+-----------------------------------+-----------------------------------+
|     ernie     |  2.0.0   |              >=2.0.0              |              >=2.0.0              |
+---------------+----------+-----------------------------------+-----------------------------------+
|     ernie     |  2.0.1   |              >=2.0.0              |              >=2.0.0              |
+---------------+----------+-----------------------------------+-----------------------------------+
|     ernie     |  2.0.2   |              >=2.0.0              |              >=2.0.0              |
+---------------+----------+-----------------------------------+-----------------------------------+


In [None]:
!pip list


Package                          Version
-------------------------------- ---------------------
absl-py                          1.4.0
accelerate                       0.32.1
aiofiles                         23.2.1
aiohttp                          3.9.5
aiosignal                        1.3.1
aistudio-sdk                     0.2.5
alabaster                        0.7.16
albucore                         0.0.12
albumentations                   1.4.12
altair                           4.2.2
annotated-types                  0.7.0
anyio                            3.7.1
argon2-cffi                      23.1.0
argon2-cffi-bindings             21.2.0
array_record                     0.5.1
arviz                            0.18.0
asn1crypto                       1.5.1
astor                            0.8.1
astropy                          6.1.2
astropy-iers-data                0.2024.7.29.0.32.7
astunparse                       1.6.3
async-timeout                    4.0.3
atpublic                 

In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.9 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.5/3.5 MB[0m [31m105.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m101.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.9 PyMuPDFb-1.

In [None]:
import os
import time
import multiprocessing
from paddleocr import PaddleOCR
import torch
import paddlehub as hub
import fitz  # PyMuPDF

# Configuration for PaddleOCR. The whole dict is forwarded as keyword arguments
# by PaddleOCR(**config) in initialize_ocr().
# NOTE(review): the ppocr DEBUG Namespace recorded in this cell's output shows
# det_model_dir / rec_model_dir resolved to the default en_PP-OCRv3 / en_PP-OCRv4
# models, so "det_model" and "rec_model" do not appear to select anything —
# presumably the recognised keys are det_model_dir / rec_model_dir; confirm.
# NOTE(review): 'det_char_conf', 'cls_conf_thresh' and "text_layout" should be
# checked against the PaddleOCR argument list — TODO confirm they are recognised.
config = {
    "lang": "en",                              # Language setting (e.g., "en" for English)
    # NOTE(review): this asks PyTorch (not Paddle) whether CUDA is usable; the
    # recorded run of this cell logged use_gpu=False — confirm the GPU is picked up.
    "use_gpu": torch.cuda.is_available(),      # Use GPU if available
    "det_model": "ch_ppocr_mobile_v2.0_det",   # Detection model
    "rec_model": "ch_ppocr_mobile_v2.0_rec",   # Recognition model
    'det_db_thresh': 0.3,                      # Lower threshold for binarization
    'det_char_conf': 0.3,                      # Lower minimum confidence level for character recognition
    'cls_conf_thresh': 0.5,
    "use_angle_cls": True,
    "use_space_char": True,
    "text_layout": True,
    "det_limit_type": "min",
    "max_text_length": 5000,                   # Maximum length of extracted text (experimental)
}

def initialize_ocr():
    """Pool initializer: build the PaddleOCR engine for the current process.

    The engine is constructed from the module-level ``config`` dict and stored
    in the module-level name ``ocr``, which ``process_pdf`` reads.
    """
    global ocr
    engine = PaddleOCR(**config)
    ocr = engine

def process_pdf(args):
    """OCR one PDF and write the text of every page to its own ``.txt`` file.

    Shaped for ``Pool.map``: it takes a single tuple and reports its outcome
    through the shared ``results`` list instead of returning or raising.

    Args:
        args: ``(pdf_file, output_folder, results)`` where ``pdf_file`` is the
            path handed to ``ocr.ocr``, ``output_folder`` is the directory that
            receives ``<pdf stem>_page_<n>.txt`` files, and ``results`` is any
            object with ``append`` that collects one status string per file.

    Returns:
        None. Exactly one "Processed ..." or "Error processing ..." string is
        appended to ``results``.
    """
    pdf_file, output_folder, results = args
    start_time = time.time()  # Record the start time
    try:
        pdf_stem = os.path.splitext(os.path.basename(pdf_file))[0]
        # `ocr` is the per-process engine created by initialize_ocr().
        results_from_ocr = ocr.ocr(pdf_file, cls=True)
        for page_number, page in enumerate(results_from_ocr or [], start=1):
            # PaddleOCR can hand back None for a page on which nothing was
            # detected; iterating it used to raise and mark the whole PDF as
            # failed. Treat such a page as empty and still write its file.
            page_text = "".join(f"{line[1][0]}\n" for line in (page or []))

            output_file = os.path.join(output_folder, f"{pdf_stem}_page_{page_number}.txt")
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(page_text)

        total_time = time.time() - start_time
        results.append(f"Processed {pdf_file} in {total_time:.2f} seconds")
    except Exception as e:
        # Best-effort batch job: record the failure and let other files continue.
        results.append(f"Error processing {pdf_file}: {e}")

def process_files(pdf_files, output_folder):
    """OCR a batch of PDFs in parallel and print a per-file status report.

    Args:
        pdf_files: Sequence of PDF paths (``len()`` is taken of it).
        output_folder: Directory passed on to ``process_pdf`` for the page files.

    Returns:
        None. Progress, total wall time and one status line per file are printed.
    """
    print(f"Starting processing of {len(pdf_files)} files in {output_folder}")
    start_time = time.time()  # Record the start time for the whole processing

    collected = []
    if pdf_files:  # nothing to do for an empty batch: skip manager and pool entirely
        # Each worker runs initialize_ocr() and loads its own engine, so never
        # start more workers than there are files to hand out.
        num_processes = min(multiprocessing.cpu_count(), len(pdf_files))
        # The context manager shuts the manager's server process down afterwards;
        # previously it was left running after every call.
        with multiprocessing.Manager() as manager:
            results = manager.list()  # Shared list to store results
            with multiprocessing.Pool(num_processes, initializer=initialize_ocr) as pool:
                pool.map(process_pdf, [(pdf_file, output_folder, results) for pdf_file in pdf_files])
            # Copy out of the proxy while the manager is still alive.
            collected = list(results)

    total_time = time.time() - start_time
    print(f"Finished processing of {len(pdf_files)} files in {output_folder}. Total time taken: {total_time:.2f} seconds")

    # Print the results after all files have been processed
    for result in collected:
        print(result)

# Example usage
if __name__ == "__main__":
    pdf_folder = "/content/Input"
    output_folder = "/content/output"

    # Validate the input first: os.listdir() on a missing folder fails with a
    # bare FileNotFoundError, and only after the output folder was already created.
    if not os.path.isdir(pdf_folder):
        raise FileNotFoundError(
            f"Input folder not found: {pdf_folder}. Create it and upload the PDFs before running this cell."
        )
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order is the same on every run.
    pdf_files = sorted(
        os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')
    )
    process_files(pdf_files, output_folder)


Starting processing of 2 files in /content/output
[2024/08/05 12:12:54] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='min', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48