In [117]:
import sys
import argparse
import numpy as np
import cv2
import datetime
import time
import pytz
import json

from pipelines.document_structurization import DocumentStructurization
from modules.file_loading import load_document, load_whole_pdf

import pdfplumber
from pathlib import Path
import os
import logging
import pandas as pd
from langchain_core.documents import Document
from langchain_core.load.dump import dumps as lang_dumps


In [4]:
[t for t in pytz.all_timezones if t.startswith('America/Mex')]

['America/Mexico_City']

In [9]:
def load_whole_pdf(pdf_path):

    # initialization
    image_list = []

    # read PDF file (load all pages in the PDF file)
    name = pdf_path.lower()
    if name.endswith('.pdf'):
        with pdfplumber.open(pdf_path) as pdf:
            page_count = len(pdf.pages)
            for page_index in range(page_count):  # traverse all pages
                page = pdf.pages[page_index]  # select the current page
                page_image = page.to_image(resolution=150) # convert the page to image by default (20230815)
                image = cv2.cvtColor(np.array(page_image.original), cv2.COLOR_RGB2BGR)

                image_list.append(image)

            pdf.close()

    return image_list

In [20]:
def whole_pdf_conversion_example(image_list):

    # configure
    configs = dict()
    
    layout_analysis_configs = dict()
    layout_analysis_configs['from_modelscope_flag'] = False
    layout_analysis_configs['model_path'] = '/home/DocXLayout_231012.pth'  # note that: currently the layout analysis model is NOT from modelscope
    configs['layout_analysis_configs'] = layout_analysis_configs
    
    text_detection_configs = dict()
    text_detection_configs['from_modelscope_flag'] = True
    text_detection_configs['model_path'] = 'damo/cv_resnet18_ocr-detection-line-level_damo'
    configs['text_detection_configs'] = text_detection_configs

    text_recognition_configs = dict()
    text_recognition_configs['from_modelscope_flag'] = True
    text_recognition_configs['model_path'] = 'damo/cv_convnextTiny_ocr-recognition-document_damo'  # alternatives: 'damo/cv_convnextTiny_ocr-recognition-scene_damo', 'damo/cv_convnextTiny_ocr-recognition-general_damo', 'damo/cv_convnextTiny_ocr-recognition-handwritten_damo' 
    configs['text_recognition_configs'] = text_recognition_configs

    formula_recognition_configs = dict()
    formula_recognition_configs['from_modelscope_flag'] = False
    formula_recognition_configs['image_resizer_path'] = '/home/LaTeX-OCR_image_resizer.onnx'
    formula_recognition_configs['encoder_path'] = '/home/LaTeX-OCR_encoder.onnx'
    formula_recognition_configs['decoder_path'] = '/home/LaTeX-OCR_decoder.onnx'
    formula_recognition_configs['tokenizer_json'] = '/home/LaTeX-OCR_tokenizer.json'
    configs['formula_recognition_configs'] = formula_recognition_configs

    # initialize
    document_structurizer = DocumentStructurization(configs)

    # run
    final_result = []
    page_index = 0
    for image in image_list:
        result = document_structurizer(image)

        page_info = {'page': page_index, 'information': result}
        final_result.append(page_info)

        page_index = page_index + 1

    if True:
        print (final_result)

    # release
    document_structurizer.release()

    return final_result

#### Manual tryout

In [None]:
#root_string = '/home/camilo/Documents/by_species/' #Conabio
root_string = '/home/camilo/Documents/00-Conabio/by_species/' #Casa
root_path = Path(root_string)

processed_list=['Leptonycteris yerbabuenae', 'Leptonycteris nivalis', 'Melipona beecheii'] 
species_subfolders = os.listdir(root_path)
pending_process_species = list(set(species_subfolders)-set(processed_list))
pending_process_species 

In [11]:
import pdfplumber

my_full_path = '/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf' #casa
#my_full_path = '/home/camilo/Documents/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf' #Conabio
my_full_path

'/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf'

In [50]:
df_agg = result_to_chunks(final_result)

information
text_list


### ALL TOGETHER NOW!

In [5]:
def check_species_processed(root_path, 
                            processed_list=['Leptonycteris yerbabuenae', 'Leptonycteris nivalis', 'Melipona beecheii']):
    species_subfolders = os.listdir(root_path)
    pending_process_species = list(set(species_subfolders)-set(processed_list))
    return pending_process_species 


In [7]:
def get_list_paths(root_path, 
               input_subfolder_sufix="_bibliografía", 
               output_subfolder = "output"
               ):

    path_dics = []
    
    pending_process_species = check_species_processed(root_path)

    for species_folder in pending_process_species:
        species_folder_path = Path(root_path,species_folder)
        file_name_list = os.listdir(Path(root_path,species_folder,species_folder+'_bibliografía'))
        for file_name in file_name_list:
            path_dics.append(
                dict(
                    file_input_path = Path(species_folder_path,str(species_folder_path.name)+input_subfolder_sufix,file_name),
                    folder_output_path = Path(species_folder_path,output_subfolder),
                    species_folder = species_folder,
                    file_name = file_name
                )
            )
    return path_dics

In [24]:
root_path = '/home/camilo/Documents/by_species' #Conabio
root_path = '/home/camilo/Documents/00-Conabio/by_species' #casa

list_paths = get_list_paths(root_path)
list_paths[0].get('file_input_path')

PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf')

In [16]:
def loop_OCR(root_path, max_files = 100): 
    '''
    Expected folder Structure :

    root_folder/
        species A/
            species A_bibiliografía/
                file_A1.pdf
                file_A2.pdf
            output/
                Doc_A1
                Doc_A2
    
    Note: The subfolders under root_folde define the species-name. 
    '''
    list_paths = get_list_paths(root_path)
    final_results = []

    for file_path_dict in list_paths[0:max_files]:
        pdf_path = file_path_dict.get('file_input_path')
        output_folder = file_path_dict.get('folder_output_path')
        if not os.path.exists(output_folder):
            print("Creating output folder in : {output_folder.parent.name}")
            os.mkdir(output_folder)

        image_list = load_whole_pdf(str(pdf_path))

        try:
            final_result = whole_pdf_conversion_example(image_list)
            
            
        
        except:
            final_result = {'error_species':file_path_dict.get('species_folder'), 
                            'error_file': file_path_dict.get('file_input_path')
                            }
            logging.error("Custom_error_msg", exc_info=True)
        
        final_results.append(final_result)
    
    return final_results

In [47]:
def res_to_df_chunks(final_result: list, filter_criteria = ['plain text']) -> Document:
    # 1. Check if well defined
    # 1. to DataFrame and "Explode" for relevant columns
    # 2. Select type of objects, default 'plain text' category
    # 3. Group text into region-polygon-chunks

    # 1
    try: 
        df = pd.DataFrame(final_result)
        cols = ['information','text_list']

        for col in cols:
            print(col)
            df = df.explode(col).reset_index(drop=True)
            df = df.drop(columns=[col]).join(df[col].apply(pd.Series), rsuffix=f".{col}")

        df['content'] = df['content'].apply(lambda x: x[0] if isinstance(x,list) else '')
        df['region_poly'] = df['region_poly'].apply(lambda x: tuple(x)) # lists are mutable 
        df['content_type'] = df['content'].apply(lambda x: type(x).__name__)
        
        # 2
        df = df[df.category_name.apply(lambda x: x in filter_criteria)]

        # 3
        aggr_level = ['page', 'region_poly']
        grouped_df = df.groupby(aggr_level)
        # Concatenate the content of the 'content' column with spaces
        df_agg = df.groupby(aggr_level)['content'].apply(lambda x: ' '.join(x)).reset_index()
    
    except:
        return (final_result)

        
    return df_agg

In [38]:
list_paths[0].keys(), \
list_paths[0].get('folder_output_path').parent.name

(dict_keys(['file_input_path', 'folder_output_path', 'species_folder', 'file_name']),
 'test species_A')

In [23]:
# warning
final_results = loop_OCR(root_path, max_files = 5)

Fix size testing.
training chunk_sizes: [32]
The output will be saved to  /home/camilo/Documents/00-Conabio/01-Tutoriales/AdvancedLiterateMachinery/modules/../DocXLayout/../../exp/ctdet_subfield/default
heads {'hm': 11, 'cls': 4, 'ftype': 3, 'wh': 8, 'hm_sub': 2, 'wh_sub': 8, 'reg': 2, 'reg_sub': 2}
Namespace(task='ctdet_subfield', dataset='huntie', test=False, data_src='default', exp_id='default', vis_corner=0, convert_onnx=0, onnx_path='auto', debug=0, load_model='/home/DocXLayout_231012.pth', resume=False, gpus=[0], num_workers=16, not_cuda_benchmark=False, seed=317, print_iter=0, hide_data_time=False, save_all=False, metric='loss', vis_thresh=0.3, nms_thresh=0.3, corner_thresh=0.3, debugger_theme='white', arch='dlav0subfield_34', head_conv=256, down_ratio=4, input_res=768, input_h=768, input_w=768, lr=0.000125, lr_step=[80], NotFixList='', num_epochs=90, batch_size=32, master_batch_size=32, num_iters=-1, val_intervals=5, trainval=False, negative=False, adamW=False, save_dir='/home/

  checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
2024-09-11 16:58:52.675384: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-11 16:58:52.715764: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-11 16:58:54,612 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo
2024-09-11 16:58:54,613 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo.
2024-09-11 16:58:54,619 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo


Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.




Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    


2024-09-11 16:58:56,648 - modelscope - INFO - loading model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


09/11/2024 16:58:57 - INFO - tensorflow -   Restoring parameters from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


2024-09-11 16:58:57.175007: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:353] MLIR V1 optimization pass is not enabled
2024-09-11 16:59:00,345 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 16:59:00,346 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo.
2024-09-11 16:59:00,347 - modelscope - INFO - initialize model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
  params_pretrained = torch.load(model_path, map_location='cpu')
2024-09-11 16:59:00,601 - modelscope - INFO - cuda is not available, using cpu instead.
2024-09-11 16:59:00,602 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 16:59:00,608 - modelscope - INFO - loading model done


[{'page': 0, 'information': [{'category_index': 0, 'category_name': 'title', 'region_poly': [79, 194, 986, 194, 986, 265, 79, 265], 'text_list': [{'position': [81, 191, 986, 191, 986, 222, 81, 222], 'content': ['Spatial Segregation in Long-Nosed Bats, Leptonycteris nivalis and']}, {'position': [82, 230, 547, 231, 547, 261, 81, 261], 'content': ['Leptonycteris curasoae, in Mexico']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [82, 466, 140, 466, 140, 481, 82, 481], 'text_list': [{'position': [82, 465, 137, 465, 137, 479, 82, 479], 'content': ['1 author:']}]}, {'category_index': 1, 'category_name': 'figure', 'region_poly': [92, 771, 128, 771, 128, 794, 92, 794], 'text_list': []}, {'category_index': 1, 'category_name': 'figure', 'region_poly': [92, 708, 129, 708, 129, 731, 92, 731], 'text_list': []}, {'category_index': 1, 'category_name': 'figure', 'region_poly': [82, 506, 124, 506, 124, 531, 82, 531], 'text_list': []}, {'category_index': 1, 'category_name': 'figure'

  checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
2024-09-11 17:01:33,335 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo
2024-09-11 17:01:33,338 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo.
2024-09-11 17:01:33,352 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo
2024-09-11 17:01:35,390 - modelscope - INFO - loading model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


09/11/2024 17:01:35 - INFO - tensorflow -   Restoring parameters from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


2024-09-11 17:01:42,545 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 17:01:42,547 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo.
2024-09-11 17:01:42,554 - modelscope - INFO - initialize model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
  params_pretrained = torch.load(model_path, map_location='cpu')
2024-09-11 17:01:42,828 - modelscope - INFO - cuda is not available, using cpu instead.
2024-09-11 17:01:42,829 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 17:01:42,835 - modelscope - INFO - loading model done


[{'page': 0, 'information': [{'category_index': 0, 'category_name': 'title', 'region_poly': [85, 986, 231, 986, 231, 1003, 85, 1003], 'text_list': [{'position': [83, 985, 230, 984, 230, 1001, 83, 1002], 'content': ['INTRODUCTION']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [84, 190, 1112, 190, 1112, 258, 84, 258], 'text_list': [{'position': [82, 187, 1106, 191, 1106, 220, 82, 216], 'content': ['Pollination system of the Pilosocereus leucocephalus columnar']}, {'position': [83, 227, 738, 229, 738, 258, 83, 256], 'content': ['cactus (tribe Cereeae) in eastern Mexico']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [83, 389, 159, 389, 159, 406, 83, 406], 'text_list': [{'position': [83, 389, 158, 389, 158, 404, 83, 404], 'content': ['Keywords']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [418, 390, 513, 390, 513, 407, 418, 407], 'text_list': [{'position': [413, 390, 508, 390, 508, 406, 413, 406], 'content': ['ABSTRACT']}]},

2024-09-11 17:05:20,247 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo
2024-09-11 17:05:20,250 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo.
2024-09-11 17:05:20,273 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo
2024-09-11 17:05:22,319 - modelscope - INFO - loading model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


09/11/2024 17:05:22 - INFO - tensorflow -   Restoring parameters from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


2024-09-11 17:05:25,628 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 17:05:25,631 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo.
2024-09-11 17:05:25,637 - modelscope - INFO - initialize model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 17:05:25,871 - modelscope - INFO - cuda is not available, using cpu instead.
2024-09-11 17:05:25,871 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 17:05:25,876 - modelscope - INFO - loading model done


[{'page': 0, 'information': [{'category_index': 0, 'category_name': 'title', 'region_poly': [191, 405, 1053, 405, 1053, 573, 191, 573], 'text_list': [{'position': [193, 403, 1044, 404, 1044, 459, 193, 459], 'content': ['PRODUCCION DEL ACHIOTE']}, {'position': [334, 508, 905, 509, 904, 571, 334, 570], 'content': ['(Bixa Orellana L.)"']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [314, 276, 925, 276, 925, 338, 314, 338], 'text_list': [{'position': [328, 276, 926, 277, 926, 332, 327, 330], 'content': ['“MANUAL PARA LA']}]}, {'category_index': 1, 'category_name': 'figure', 'region_poly': [257, 701, 979, 701, 979, 1468, 257, 1468], 'text_list': []}]}, {'page': 1, 'information': [{'category_index': 0, 'category_name': 'title', 'region_poly': [286, 146, 952, 146, 952, 189, 286, 189], 'text_list': [{'position': [282, 145, 955, 145, 955, 187, 282, 186], 'content': ['UNIVERSIDAD VERACRUZANA']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [222, 954, 102

2024-09-11 17:11:19,778 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo
2024-09-11 17:11:19,780 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo.
2024-09-11 17:11:19,801 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo
2024-09-11 17:11:22,276 - modelscope - INFO - loading model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


09/11/2024 17:11:22 - INFO - tensorflow -   Restoring parameters from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


2024-09-11 17:11:25,401 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 17:11:25,403 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo.
2024-09-11 17:11:25,409 - modelscope - INFO - initialize model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 17:11:25,656 - modelscope - INFO - cuda is not available, using cpu instead.
2024-09-11 17:11:25,657 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 17:11:25,661 - modelscope - INFO - loading model done


[{'page': 0, 'information': [{'category_index': 0, 'category_name': 'title', 'region_poly': [109, 1410, 230, 1410, 230, 1429, 109, 1429], 'text_list': [{'position': [108, 1410, 228, 1410, 228, 1426, 108, 1426], 'content': ['Introduction']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [108, 261, 510, 261, 510, 284, 108, 284], 'text_list': [{'position': [107, 258, 509, 259, 509, 283, 107, 282], 'content': ['ORIGINAL RESEARCH ARTICLE']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [100, 303, 989, 303, 989, 327, 100, 327], 'text_list': [{'position': [108, 299, 985, 302, 985, 327, 108, 325], 'content': ['Stingless bee distribution and richness in El Salvador (Apidae, Meliponinae)']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [100, 272, 988, 272, 988, 327, 100, 327], 'text_list': [{'position': [108, 299, 985, 302, 985, 327, 108, 325], 'content': ['Stingless bee distribution and richness in El Salvador (Apidae, Meliponinae)']}]}

2024-09-11 17:15:35,119 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo
2024-09-11 17:15:35,119 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo.
2024-09-11 17:15:35,125 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo
2024-09-11 17:15:37,308 - modelscope - INFO - loading model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


09/11/2024 17:15:37 - INFO - tensorflow -   Restoring parameters from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


2024-09-11 17:15:41,924 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 17:15:41,927 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo.
2024-09-11 17:15:41,932 - modelscope - INFO - initialize model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 17:15:42,274 - modelscope - INFO - cuda is not available, using cpu instead.
2024-09-11 17:15:42,275 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-11 17:15:42,281 - modelscope - INFO - loading model done


09/11/2024 17:16:00 - ERROR - root -   Custom_error_msg
Traceback (most recent call last):
  File "/tmp/ipykernel_78729/1208262450.py", line 29, in loop_OCR
    final_result = whole_pdf_conversion_example(image_list)
  File "/tmp/ipykernel_78729/2244096519.py", line 36, in whole_pdf_conversion_example
    result = document_structurizer(image)
  File "/home/camilo/Documents/00-Conabio/01-Tutoriales/AdvancedLiterateMachinery/pipelines/document_structurization.py", line 55, in __call__
    det_result = self.text_detection_module(image)
  File "/home/camilo/Documents/00-Conabio/01-Tutoriales/AdvancedLiterateMachinery/modules/text_detection.py", line 55, in __call__
    det_result = self.text_detector(image)
  File "/home/camilo/Documents/00-Conabio/01-Tutoriales/AdvancedLiterateMachinery/env_doc/lib/python3.10/site-packages/modelscope/pipelines/cv/ocr_detection_pipeline.py", line 194, in __call__
    return super().__call__(input, **kwargs)
  File "/home/camilo/Documents/00-Conabio/01-Tuto

In [None]:
final_results

In [35]:
list_paths[0:2]

[{'file_input_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf'),
  'folder_output_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/output'),
  'species_folder': 'test species_A',
  'file_name': 'Arita, 1991.pdf'},
 {'file_input_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Munguia et.al., 2010.pdf'),
  'folder_output_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/output'),
  'species_folder': 'test species_A',
  'file_name': 'Munguia et.al., 2010.pdf'}]

In [48]:
df_agg = res_to_df_chunks(final_results[0])
df_agg.head()

information
text_list


Unnamed: 0,page,region_poly,content
0,0,"(81, 657, 570, 657, 570, 672, 81, 672)",Some of the authors ofthis publication are als...
1,0,"(86, 302, 372, 302, 372, 322, 86, 322)",Article in Joumal of Ma nalogy·No vember 1991
2,0,"(87, 325, 164, 325, 164, 337, 87, 337)",
3,0,"(87, 403, 117, 403, 117, 429, 87, 429)",58 8
4,0,"(120, 524, 383, 524, 383, 543, 120, 543)",Universidad Nacional Autonoma de Mexico


In [38]:
file_path_dict = list_paths[0]
file_path_dict

{'file_input_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf'),
 'folder_output_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/output'),
 'species_folder': 'test species_A',
 'file_name': 'Arita, 1991.pdf'}

In [90]:
df_to_doc(df_agg, file_path_dict)

[Document(page_content='Some of the authors ofthis publication are also working on these related projects:'),
 Document(page_content='Article in Joumal of Ma nalogy·No vember 1991'),
 Document(page_content=''),
 Document(page_content='58 8'),
 Document(page_content='Universidad Nacional Autonoma de Mexico'),
 Document(page_content='89 PUBLICATIONS 3,591CITATIONS'),
 Document(page_content='He TArita'),
 Document(page_content='SEE PROFILE'),
 Document(page_content='ecologia y conservacion de carnivoros View'),
 Document(page_content='READS 386'),
 Document(page_content='Jones (1965) and Findley (1969) found segregation of certain sympatricspecies of vespertilionid bats along altitudinal gradients in Arizona aud New Mexico. Koopiman (1978) and Graliam (1983 I presented several examples of altitudinal segregation of closely related spccies of the Andes o Peru. Start and Marshall (1976) observed segregation by habitat in two species of Macroglossus tlhat occur sympatrically in Malaysia; M. 

In [29]:
len(final_results)

5

In [142]:
def df_to_doc(df_agg, file_path_dict):
    """Converts a Pandas DataFrame to a list of LangChain Documents with enhanced data handling.

    Args:
        df_agg (pandas.DataFrame): The DataFrame containing the data to be converted.
        file_path_dict (dict): A dictionary containing file paths for various purposes.

    Returns:
        list: A list of LangChain Document objects with page_content and metadata.
    """

    documents = []
    for _, row in df_agg.iterrows():
        metadata = {
            "page": row.get("page"),  # Use get() for potential missing values
            "file_name": file_path_dict.get("file_name"),
            "region_poly": row.get("region_poly"),
            "input_file": file_path_dict.get("file_input_path"),
            "species_folder": file_path_dict.get("species_folder"),
            "output_folder": str(Path(file_path_dict.get("folder_output_path"))),  # Convert Path to string for JSON
            "output_file": str(Path(file_path_dict.get("folder_output_path"), file_path_dict.get("file_name").split(".")[0] + ".json"))  # Use get() and string conversion
        }

        # Remove potential None values from metadata
        metadata = {key: value for key, value in metadata.items() if value is not None}

        documents.append(Document(page_content=row.get("content"), metadata=metadata))

    return documents

In [143]:
def df_to_doc(df_agg, file_path_dict):
    """Converts a Pandas DataFrame to a list of LangChain Documents with enhanced data handling.

    Args:
        df_agg (pandas.DataFrame): The DataFrame containing the data to be converted.
        file_path_dict (dict): A dictionary containing file paths for various purposes.

    Returns:
        list: A list of LangChain Document objects with page_content and metadata.
    """

    documents = [
        Document(
            page_content=row.get("content"),
            metadata={
                "page": row.get("page"),  # Use get() for potential missing values
                "file_name": file_path_dict.get("file_name"),
                "region_poly": row.get("region_poly"),
                "input_file": file_path_dict.get("file_input_path"),
                "species_folder": file_path_dict.get("species_folder"),
                "output_folder": str(Path(file_path_dict.get("folder_output_path"))),  # Convert Path to string for JSON
                "output_file": str(Path(file_path_dict.get("folder_output_path"), file_path_dict.get("file_name").split(".")[0] + ".json"))  # Use get() and string conversion
            }
        )
        for _, row in df_agg.iterrows()
    ]

    return documents

In [134]:
file_path_dict.get("folder_output_path")

PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/output')

In [111]:
len(df_to_doc(res, path_dic))

84

In [145]:
dfs = [res_to_df_chunks(df) for df in final_results]


for res, path_dic in  zip(dfs,list_paths):
    output_file_path = Path(path_dic.get('folder_output_path'),
                            path_dic.get('file_name').split(".")[0]+".json")
        
    # if it is a DataFrame, result finished correctly
    if isinstance(res, pd.DataFrame):
         list_of_docs = df_to_doc(res, path_dic)
         lang_dumps(list_of_docs, pretty=True)
    else:
         lang_dumps(res, indent = 4)

information
text_list
information
text_list
information
text_list
information
text_list


In [144]:
list_paths

[{'file_input_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf'),
  'folder_output_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/output'),
  'species_folder': 'test species_A',
  'file_name': 'Arita, 1991.pdf'},
 {'file_input_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Munguia et.al., 2010.pdf'),
  'folder_output_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/output'),
  'species_folder': 'test species_A',
  'file_name': 'Munguia et.al., 2010.pdf'},
 {'file_input_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_B/test species_B_bibliografía/Narciso-Reyes, 2003.pdf'),
  'folder_output_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_B/output'),
  'species_folder': 'test species_B',
  'file_name': 'Narciso-Reyes, 2003.pdf'},
 {'file_input_p

In [115]:
doc = df_to_doc(res, path_dic)

In [116]:
with open(path_)doc

[Document(metadata={'page': 0, 'file_name': 'Arita, 1991.pdf', 'region_poly': (81, 657, 570, 657, 570, 672, 81, 672), 'input_file': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf'), 'species_folder': 'test species_A', 'output_folder': '/home/camilo/Documents/00-Conabio/by_species/test species_A/output', 'output_file': '/home/camilo/Documents/00-Conabio/by_species/test species_A/output/Arita, 1991.json'}, page_content='Some of the authors ofthis publication are also working on these related projects:'),
 Document(metadata={'page': 0, 'file_name': 'Arita, 1991.pdf', 'region_poly': (86, 302, 372, 302, 372, 322, 86, 322), 'input_file': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf'), 'species_folder': 'test species_A', 'output_folder': '/home/camilo/Documents/00-Conabio/by_species/test species_A/output', 'output_file': '/home/camilo/Documents/00-Conabio/b

In [83]:
path_dic

{'file_input_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf'),
 'folder_output_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/output'),
 'species_folder': 'test species_A',
 'file_name': 'Arita, 1991.pdf'}

In [69]:
file_path_dict

{'file_input_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf'),
 'folder_output_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/output'),
 'species_folder': 'test species_A',
 'file_name': 'Arita, 1991.pdf'}

In [88]:
for res, path_dic in  zip(dfs,list_paths):
    print(df_to_doc(res, path_dic)[0].file_name )
    


AttributeError: 'Document' object has no attribute 'file_name'

In [133]:
text = lang_dumps(df_to_doc(res, path_dic))
eval(text)[0].get('kwargs')["page_content"]

'Some of the authors ofthis publication are also working on these related projects:'