In [6]:
import sys
import argparse
import numpy as np
import cv2
import datetime
import time
import pytz
import json

from pipelines.document_structurization import DocumentStructurization
from modules.file_loading import load_document, load_whole_pdf

import pdfplumber
from pathlib import Path
import os
import logging
import pandas as pd
from langchain_core.documents import Document
from langchain_core.load.dump import dumps as lang_dumps


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Show the pytz time-zone names that start with 'America/Mex'.
list(filter(lambda tz_name: tz_name.startswith('America/Mex'), pytz.all_timezones))

['America/Mexico_City']

In [61]:
# Root folder that holds one subfolder per species.
# The default is the location originally labelled "Casa"; the alternative
# labelled "Conabio" is '/home/camilo/Documents/by_species/'.
# Set the BY_SPECIES_ROOT environment variable to switch locations without
# editing this cell (previously the first assignment was silently overwritten).
root_string = os.environ.get('BY_SPECIES_ROOT', '/home/camilo/Documents/00-Conabio/by_species/')

root_path = Path(root_string)

# Species folders to leave out of the pending list.
processed_list = ['Leptonycteris yerbabuenae', 'Leptonycteris nivalis', 'Melipona beecheii']
species_subfolders = os.listdir(root_path)
# Sorted so the list is stable across kernel restarts (set iteration order is not).
pending_process_species = sorted(set(species_subfolders) - set(processed_list))
pending_process_species

['test species_A', 'test species_B']

#### Functions

In [9]:
def check_species_processed(root_path,
                            processed_list=('Leptonycteris yerbabuenae', 'Leptonycteris nivalis', 'Melipona beecheii')):
    """Return the species subfolders of ``root_path`` that still need processing.

    Args:
        root_path: Folder (str or Path) whose direct subfolders are named after species.
        processed_list: Iterable of species folder names to exclude.

    Returns:
        list[str]: Names of the subfolders of ``root_path`` that are not in
        ``processed_list``, sorted alphabetically so the order is reproducible.

    Raises:
        OSError: If ``root_path`` cannot be listed.
    """
    already_processed = set(processed_list)
    return sorted(
        entry
        for entry in os.listdir(root_path)
        # Stray files next to the species folders (e.g. notes, .DS_Store) are
        # not species and would break the per-species folder listing later on.
        if entry not in already_processed and os.path.isdir(os.path.join(root_path, entry))
    )


In [8]:
def get_list_paths(root_path,
               input_subfolder_sufix="_bibliografía",
               output_subfolder = "output"
               ):
    """Build one path dictionary per input file of every pending species.

    For each species folder returned by ``check_species_processed(root_path)``
    the files inside ``<species>/<species><input_subfolder_sufix>/`` are listed.

    Args:
        root_path: Folder containing one subfolder per species.
        input_subfolder_sufix: Suffix appended to the species name to obtain
            the name of the input subfolder.
        output_subfolder: Name of the output subfolder inside each species folder.

    Returns:
        list[dict]: One dict per file with the keys ``file_input_path`` (Path),
        ``folder_output_path`` (Path), ``species_folder`` (str) and
        ``file_name`` (str). Species and file names are visited in sorted
        order so repeated calls yield the same sequence.

    Raises:
        OSError: If a species folder has no input subfolder.
    """
    path_dics = []

    for species_folder in sorted(check_species_processed(root_path)):
        species_folder_path = Path(root_path, species_folder)
        # The suffix parameter is honoured here; it used to be hard-coded, so a
        # custom suffix listed one folder but built paths into another.
        input_folder_path = species_folder_path / (species_folder_path.name + input_subfolder_sufix)
        folder_output_path = species_folder_path / output_subfolder

        for file_name in sorted(os.listdir(input_folder_path)):
            path_dics.append(
                dict(
                    file_input_path=input_folder_path / file_name,
                    folder_output_path=folder_output_path,
                    species_folder=species_folder,
                    file_name=file_name,
                )
            )
    return path_dics

In [10]:
def load_whole_pdf(pdf_path, resolution=150):
    """Render every page of a PDF file into a BGR image array.

    NOTE: this definition shadows the ``load_whole_pdf`` imported from
    ``modules.file_loading`` at the top of the notebook.

    Args:
        pdf_path: Path to the file, as ``str`` or ``pathlib.Path``.
        resolution: Resolution passed to pdfplumber's ``page.to_image``.

    Returns:
        list: One image per page, converted from RGB to BGR with OpenCV.
        The list is empty when the file name does not end in ``.pdf``
        (case-insensitive).
    """
    pdf_path = str(pdf_path)  # accept Path objects too; they have no .lower()
    image_list = []

    if not pdf_path.lower().endswith('.pdf'):
        return image_list

    # The context manager closes the PDF; no explicit close() is needed.
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_image = page.to_image(resolution=resolution)
            # page_image.original is presumably a PIL image in RGB order — TODO confirm
            image_list.append(cv2.cvtColor(np.array(page_image.original), cv2.COLOR_RGB2BGR))

    return image_list

In [11]:
def whole_pdf_conversion_example(image_list, verbose=True):
    """Run the document-structurization pipeline on every page image.

    A ``DocumentStructurization`` instance is built with hard-coded model
    locations (layout analysis and formula recognition from local files under
    ``/home``; text detection and recognition flagged as ModelScope models),
    called once per image, and released afterwards.

    Args:
        image_list: Iterable of page images, in page order.
        verbose: When True (the default, matching the previous behaviour) the
            complete result is printed. Pass False to keep cell output small.

    Returns:
        list[dict]: One ``{'page': <zero-based index>, 'information': <pipeline
        result>}`` entry per input image.
    """
    configs = {
        'layout_analysis_configs': {
            'from_modelscope_flag': False,
            # note that: currently the layout analysis model is NOT from modelscope
            'model_path': '/home/DocXLayout_231012.pth',
        },
        'text_detection_configs': {
            'from_modelscope_flag': True,
            'model_path': 'damo/cv_resnet18_ocr-detection-line-level_damo',
        },
        'text_recognition_configs': {
            'from_modelscope_flag': True,
            # alternatives: 'damo/cv_convnextTiny_ocr-recognition-scene_damo',
            # 'damo/cv_convnextTiny_ocr-recognition-general_damo',
            # 'damo/cv_convnextTiny_ocr-recognition-handwritten_damo'
            'model_path': 'damo/cv_convnextTiny_ocr-recognition-document_damo',
        },
        'formula_recognition_configs': {
            'from_modelscope_flag': False,
            'image_resizer_path': '/home/LaTeX-OCR_image_resizer.onnx',
            'encoder_path': '/home/LaTeX-OCR_encoder.onnx',
            'decoder_path': '/home/LaTeX-OCR_decoder.onnx',
            'tokenizer_json': '/home/LaTeX-OCR_tokenizer.json',
        },
    }

    document_structurizer = DocumentStructurization(configs)
    try:
        final_result = [
            {'page': page_index, 'information': document_structurizer(image)}
            for page_index, image in enumerate(image_list)
        ]
        if verbose:
            print (final_result)
    finally:
        # Release the models even when a page fails, so a bad PDF does not
        # leave them loaded for the rest of the session.
        document_structurizer.release()

    return final_result

In [12]:
def loop_OCR(root_path, max_files = 100):
    '''
    Run OCR on the pending PDF files found under ``root_path``.

    Expected folder structure:

    root_folder/
        species A/
            species A_bibliografía/
                file_A1.pdf
                file_A2.pdf
            output/
                Doc_A1
                Doc_A2

    Note: the subfolders under root_folder define the species name.

    Args:
        root_path: Root folder, passed to ``get_list_paths``.
        max_files: Maximum number of files to process (the first ``max_files``
            entries returned by ``get_list_paths``).

    Returns:
        list: One entry per processed file, in the same order as the path
        list. An entry is the value returned by ``whole_pdf_conversion_example``
        or, when loading or OCR failed, a dict with the keys ``error_species``
        and ``error_file``.
    '''
    list_paths = get_list_paths(root_path)
    final_results = []

    for file_path_dict in list_paths[0:max_files]:
        pdf_path = file_path_dict.get('file_input_path')
        output_folder = file_path_dict.get('folder_output_path')
        if not os.path.exists(output_folder):
            # f-string prefix was missing, so the placeholder was printed literally.
            print(f"Creating output folder in : {output_folder.parent.name}")
            os.makedirs(output_folder, exist_ok=True)

        try:
            # Loading is inside the try block so that an unreadable PDF is
            # recorded as a per-file error instead of aborting the whole run.
            image_list = load_whole_pdf(str(pdf_path))
            final_result = whole_pdf_conversion_example(image_list)
        except Exception:
            # Exception (not a bare except) lets KeyboardInterrupt stop the loop.
            final_result = {'error_species':str(file_path_dict.get('species_folder')),
                            'error_file': str(file_path_dict.get('file_input_path'))
                            }
            logging.error("OCR failed for %s", pdf_path, exc_info=True)

        final_results.append(final_result)

    return final_results

In [13]:
def res_to_df_chunks(final_result: list, filter_criteria = ('plain text',)):
    """Turn the OCR result of one document into one text chunk per layout region.

    Steps:
        1. Load the page records into a DataFrame and explode the nested
           ``information`` and ``text_list`` columns into rows and columns.
        2. Keep only regions whose ``category_name`` is in ``filter_criteria``.
        3. Join the text lines of each (page, region polygon) with spaces.

    Args:
        final_result: List of ``{'page': ..., 'information': [...]}`` records.
            Anything that is not a list (for example the error dict that
            ``loop_OCR`` stores for a failed file) is returned unchanged.
        filter_criteria: Category names to keep.

    Returns:
        pandas.DataFrame with the columns ``page``, ``region_poly`` (tuple) and
        ``content`` (str) on success; otherwise ``final_result`` itself, so
        callers can tell failures apart with ``isinstance(..., pd.DataFrame)``.
    """
    if not isinstance(final_result, list):
        return final_result

    try:
        # 1
        df = pd.DataFrame(final_result)
        for col in ('information', 'text_list'):
            df = df.explode(col).reset_index(drop=True)
            df = df.drop(columns=[col]).join(df[col].apply(pd.Series), rsuffix=f".{col}")

        # 2 - filter first: rows that come from empty pages/regions hold NaN
        # and must be dropped before the polygon is converted to a tuple.
        df = df[df['category_name'].apply(lambda name: name in filter_criteria)].copy()

        # Each text line stores its text as a one-element list; missing or
        # empty entries become an empty string.
        df['content'] = df['content'].apply(lambda x: x[0] if isinstance(x, list) and x else '')
        # Lists are not hashable, so the polygon is made a tuple for groupby.
        df['region_poly'] = df['region_poly'].apply(lambda poly: tuple(poly))

        # 3
        df_agg = (
            df.groupby(['page', 'region_poly'])['content']
            .apply(lambda lines: ' '.join(lines))
            .reset_index()
        )
    except Exception:
        # Best effort: hand the raw result back, but leave a trace of the reason.
        logging.warning("Could not convert OCR result into text chunks", exc_info=True)
        return final_result

    return df_agg

In [14]:
def df_to_doc(df_agg, file_path_dict):
    """Convert the aggregated chunk DataFrame into a list of LangChain Documents.

    Args:
        df_agg (pandas.DataFrame): One row per chunk; the columns ``page``,
            ``region_poly`` and ``content`` are read when present.
        file_path_dict (dict): Path dictionary with the keys ``file_name``,
            ``file_input_path``, ``species_folder`` and ``folder_output_path``.

    Returns:
        list: One ``Document`` per row, with ``page_content`` taken from
        ``content`` and the remaining values stored in ``metadata``. Metadata
        entries whose value is None are omitted.
    """
    file_name = file_path_dict.get("file_name")
    output_folder = Path(file_path_dict.get("folder_output_path"))
    # Path.stem removes only the final extension. Splitting on the first "."
    # truncated names such as "Smith et al. 1991.pdf", so several inputs could
    # map to the same output file.
    output_file = output_folder / (Path(file_name).stem + ".json")

    documents = []
    for _, row in df_agg.iterrows():
        metadata = {
            "page": row.get("page"),  # get() tolerates a missing column
            "file_name": file_name,
            "region_poly": row.get("region_poly"),
            "input_file": str(file_path_dict.get("file_input_path")),
            "species_folder": file_path_dict.get("species_folder"),
            "output_folder": str(output_folder),  # str, so it can be written as JSON
            "output_file": str(output_file),
        }

        # Drop entries without a value
        metadata = {key: value for key, value in metadata.items() if value is not None}

        documents.append(Document(page_content=row.get("content"), metadata=metadata))

    return documents

In [60]:
def df_to_dict(df_agg, file_path_dict):
    """Convert the aggregated chunk DataFrame into plain dictionaries.

    Args:
        df_agg (pandas.DataFrame): One row per chunk; the columns ``page``,
            ``region_poly`` and ``content`` are read when present.
        file_path_dict (dict): Path dictionary with the keys ``file_name``,
            ``file_input_path``, ``species_folder`` and ``folder_output_path``.

    Returns:
        list[dict]: One ``{'page_content': ..., 'metadata': {...}}`` dict per
        row. Metadata entries are kept even when their value is None.
    """
    file_name = file_path_dict.get("file_name")
    output_folder = Path(file_path_dict.get("folder_output_path"))
    # Path.stem removes only the final extension. Splitting on the first "."
    # truncated names such as "Smith et al. 1991.pdf".
    output_file = output_folder / (Path(file_name).stem + ".json")

    documents = [
        dict(
            page_content=row.get("content"),
            metadata={
                "page": row.get("page"),  # get() tolerates a missing column
                "file_name": file_name,
                "region_poly": row.get("region_poly"),
                "input_file": str(file_path_dict.get("file_input_path")),
                "species_folder": file_path_dict.get("species_folder"),
                "output_folder": str(output_folder),  # str, so it can be written as JSON
                "output_file": str(output_file),
            }
        )
        for _, row in df_agg.iterrows()
    ]

    return documents

In [52]:
def new_list_paths(root_path= root_path, pattern='*/output*.json'):
    """Return the paths below ``root_path`` that match the glob ``pattern``.

    The default for ``root_path`` is the value the global ``root_path`` had
    when this cell was executed, not its value at call time.

    NOTE(review): the default pattern matches files named ``output*.json``
    directly inside each first-level folder, not files inside an ``output``
    subfolder — confirm this is the intended location.
    """
    matches = root_path.glob(pattern=pattern)
    return [*matches]

In [87]:
def write_to_json(dfs:list,list_paths):
    """Write one JSON file per successfully processed document.

    ``dfs`` and ``list_paths`` are walked in parallel. For each pair:

    * If the result is a DataFrame it is converted with ``df_to_doc`` and all
      chunks are written as a single JSON array to
      ``<folder_output_path>/<file name without extension>.json``. The file is
      overwritten, so re-running does not duplicate chunks. Previously every
      chunk was appended as a separate JSON object with no separator, which
      ``json.load`` cannot read back.
    * Otherwise the result is appended as one line to ``./log_errors``.
      Values that JSON cannot represent (e.g. Path objects) are written
      through ``str``.

    Args:
        dfs: Results as returned by ``res_to_df_chunks``, one per document.
        list_paths: Path dictionaries with the keys ``folder_output_path`` and
            ``file_name``, in the same order as ``dfs``.
    """
    for res, path_dic in zip(dfs, list_paths):
        output_folder = path_dic.get('folder_output_path')
        # Path.stem removes only the final extension; splitting on the first
        # "." truncated file names that contain dots.
        output_file_path = Path(output_folder,
                                Path(path_dic.get('file_name')).stem + ".json")

        # if it is a DataFrame, result finished correctly
        if isinstance(res, pd.DataFrame):
            records = [chunk.dict() for chunk in df_to_doc(res, path_dic)]
            os.makedirs(output_folder, exist_ok=True)
            with open(output_file_path, "w", encoding="utf-8") as fp:
                json.dump(records, fp)
        else:
            with open('./log_errors', "a", encoding="utf-8") as fp:
                json.dump(res, fp, default=str)
                fp.write("\n")


### ALL TOGETHER NOW!

In [63]:
# Step 1: build the list of path dictionaries, one per pending file
# (all items).
list_paths = get_list_paths(root_path)

# Preview the first entry: the dictionary, its keys, and the name of the
# folder that contains its output folder.
(
    list_paths[0],
    list_paths[0].keys(),
    list_paths[0].get('folder_output_path').parent.name,
)

({'file_input_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf'),
  'folder_output_path': PosixPath('/home/camilo/Documents/00-Conabio/by_species/test species_A/output'),
  'species_folder': 'test species_A',
  'file_name': 'Arita, 1991.pdf'},
 dict_keys(['file_input_path', 'folder_output_path', 'species_folder', 'file_name']),
 'test species_A')

In [21]:
# Step 2: run the OCR over the pending files.
# Relies on: get_list_paths, loop_OCR
# max_files caps how many of the listed files are processed.
final_results = loop_OCR(root_path, max_files=2)

Fix size testing.
training chunk_sizes: [32]
The output will be saved to  /home/camilo/Documents/00-Conabio/01-Tutoriales/AdvancedLiterateMachinery/modules/../DocXLayout/../../exp/ctdet_subfield/default
heads {'hm': 11, 'cls': 4, 'ftype': 3, 'wh': 8, 'hm_sub': 2, 'wh_sub': 8, 'reg': 2, 'reg_sub': 2}
Namespace(task='ctdet_subfield', dataset='huntie', test=False, data_src='default', exp_id='default', vis_corner=0, convert_onnx=0, onnx_path='auto', debug=0, load_model='/home/DocXLayout_231012.pth', resume=False, gpus=[0], num_workers=16, not_cuda_benchmark=False, seed=317, print_iter=0, hide_data_time=False, save_all=False, metric='loss', vis_thresh=0.3, nms_thresh=0.3, corner_thresh=0.3, debugger_theme='white', arch='dlav0subfield_34', head_conv=256, down_ratio=4, input_res=768, input_h=768, input_w=768, lr=0.000125, lr_step=[80], NotFixList='', num_epochs=90, batch_size=32, master_batch_size=32, num_iters=-1, val_intervals=5, trainval=False, negative=False, adamW=False, save_dir='/home/

  checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
2024-09-18 11:41:37.148010: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-18 11:41:37.187472: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-18 11:41:39,220 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo
2024-09-18 11:41:39,220 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo.
2024-09-18 11:41:39,227 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo


Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.




Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    


2024-09-18 11:41:41,047 - modelscope - INFO - loading model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


09/18/2024 11:41:41 - INFO - tensorflow -   Restoring parameters from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


2024-09-18 11:41:41.271127: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:353] MLIR V1 optimization pass is not enabled
2024-09-18 11:41:45,201 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-18 11:41:45,204 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo.
2024-09-18 11:41:45,210 - modelscope - INFO - initialize model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
  params_pretrained = torch.load(model_path, map_location='cpu')
2024-09-18 11:41:45,427 - modelscope - INFO - cuda is not available, using cpu instead.
2024-09-18 11:41:45,427 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-18 11:41:45,432 - modelscope - INFO - loading model done


[{'page': 0, 'information': [{'category_index': 0, 'category_name': 'title', 'region_poly': [79, 194, 986, 194, 986, 265, 79, 265], 'text_list': [{'position': [81, 191, 986, 191, 986, 222, 81, 222], 'content': ['Spatial Segregation in Long-Nosed Bats, Leptonycteris nivalis and']}, {'position': [82, 230, 547, 231, 547, 261, 81, 261], 'content': ['Leptonycteris curasoae, in Mexico']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [82, 466, 140, 466, 140, 481, 82, 481], 'text_list': [{'position': [82, 465, 137, 465, 137, 479, 82, 479], 'content': ['1 author:']}]}, {'category_index': 1, 'category_name': 'figure', 'region_poly': [92, 771, 128, 771, 128, 794, 92, 794], 'text_list': []}, {'category_index': 1, 'category_name': 'figure', 'region_poly': [92, 708, 129, 708, 129, 731, 92, 731], 'text_list': []}, {'category_index': 1, 'category_name': 'figure', 'region_poly': [82, 506, 124, 506, 124, 531, 82, 531], 'text_list': []}, {'category_index': 1, 'category_name': 'figure'

  checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
2024-09-18 11:44:07,794 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo
2024-09-18 11:44:07,795 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo.
2024-09-18 11:44:07,817 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo
2024-09-18 11:44:09,776 - modelscope - INFO - loading model from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


09/18/2024 11:44:09 - INFO - tensorflow -   Restoring parameters from /home/camilo/.cache/modelscope/hub/damo/cv_resnet18_ocr-detection-line-level_damo/tf_ckpts/checkpoint-80000


2024-09-18 11:44:13,156 - modelscope - INFO - initiate model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-18 11:44:13,160 - modelscope - INFO - initiate model from location /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo.
2024-09-18 11:44:13,165 - modelscope - INFO - initialize model from /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
  params_pretrained = torch.load(model_path, map_location='cpu')
2024-09-18 11:44:13,688 - modelscope - INFO - cuda is not available, using cpu instead.
2024-09-18 11:44:13,689 - modelscope - INFO - loading model from dir /home/camilo/.cache/modelscope/hub/damo/cv_convnextTiny_ocr-recognition-document_damo
2024-09-18 11:44:13,695 - modelscope - INFO - loading model done


[{'page': 0, 'information': [{'category_index': 0, 'category_name': 'title', 'region_poly': [85, 986, 231, 986, 231, 1003, 85, 1003], 'text_list': [{'position': [83, 985, 230, 984, 230, 1001, 83, 1002], 'content': ['INTRODUCTION']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [84, 190, 1112, 190, 1112, 258, 84, 258], 'text_list': [{'position': [82, 187, 1106, 191, 1106, 220, 82, 216], 'content': ['Pollination system of the Pilosocereus leucocephalus columnar']}, {'position': [83, 227, 738, 229, 738, 258, 83, 256], 'content': ['cactus (tribe Cereeae) in eastern Mexico']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [83, 389, 159, 389, 159, 406, 83, 406], 'text_list': [{'position': [83, 389, 158, 389, 158, 404, 83, 404], 'content': ['Keywords']}]}, {'category_index': 0, 'category_name': 'title', 'region_poly': [418, 390, 513, 390, 513, 407, 418, 407], 'text_list': [{'position': [413, 390, 508, 390, 508, 406, 413, 406], 'content': ['ABSTRACT']}]},

In [44]:
# Number of results, then the keys and the 'information' entry of the first
# page record of the second result.
(
    len(final_results),
    final_results[1][0].keys(),
    final_results[1][0].get('information'),
)

(2,
 dict_keys(['page', 'information']),
 [{'category_index': 0,
   'category_name': 'title',
   'region_poly': [85, 986, 231, 986, 231, 1003, 85, 1003],
   'text_list': [{'position': [83, 985, 230, 984, 230, 1001, 83, 1002],
     'content': ['INTRODUCTION']}]},
  {'category_index': 0,
   'category_name': 'title',
   'region_poly': [84, 190, 1112, 190, 1112, 258, 84, 258],
   'text_list': [{'position': [82, 187, 1106, 191, 1106, 220, 82, 216],
     'content': ['Pollination system of the Pilosocereus leucocephalus columnar']},
    {'position': [83, 227, 738, 229, 738, 258, 83, 256],
     'content': ['cactus (tribe Cereeae) in eastern Mexico']}]},
  {'category_index': 0,
   'category_name': 'title',
   'region_poly': [83, 389, 159, 389, 159, 406, 83, 406],
   'text_list': [{'position': [83, 389, 158, 389, 158, 404, 83, 404],
     'content': ['Keywords']}]},
  {'category_index': 0,
   'category_name': 'title',
   'region_poly': [418, 390, 513, 390, 513, 407, 418, 407],
   'text_list': [{'

In [88]:
# Chunk the first result on its own, then every result in turn.
df = res_to_df_chunks(final_results[0], filter_criteria=['plain text'])
dfs = [
    res_to_df_chunks(ocr_result, filter_criteria=['plain text'])
    for ocr_result in final_results
]

In [64]:
# First five chunks of the first document.
df.head(5)

Unnamed: 0,page,region_poly,content
0,0,"(81, 657, 570, 657, 570, 672, 81, 672)",Some of the authors ofthis publication are als...
1,0,"(86, 302, 372, 302, 372, 322, 86, 322)",Article in Joumal of Ma nalogy·No vember 1991
2,0,"(87, 325, 164, 325, 164, 337, 87, 337)",
3,0,"(87, 403, 117, 403, 117, 429, 87, 429)",58 8
4,0,"(120, 524, 383, 524, 383, 543, 120, 543)",Universidad Nacional Autonoma de Mexico


In [85]:
# Chunk DataFrames (results) to dictionaries
# uses: get_list_paths, df_to_dict
pending_paths = get_list_paths(root_path)

# One item
file_path_dict = pending_paths[0]
dicts = df_to_dict(df, file_path_dict)

# All items. Results that are not DataFrames (failed files) are skipped.
# NOTE(review): this pairs dfs with the path list by position, which assumes
# get_list_paths returns the files in the same order used when the OCR ran.
all_dicts = [
    df_to_dict(chunks_df, paths)
    for chunks_df, paths in zip(dfs, pending_paths)
    if isinstance(chunks_df, pd.DataFrame)
]

# Show the chunks found on page index 1 of the first document.
for d in dicts:
    if d.get('metadata').get('page')==1:
        print(d.get('page_content'),"------------------------------", d.get('metadata').get('page'))

Jones (1965) and Findley (1969) found segregation of certain sympatricspecies of vespertilionid bats along altitudinal gradients in Arizona aud New Mexico. Koopiman (1978) and Graliam (1983 I presented several examples of altitudinal segregation of closely related spccies of the Andes o Peru. Start and Marshall (1976) observed segregation by habitat in two species of Macroglossus tlhat occur sympatrically in Malaysia; M. mininus forages in coastal zones, feeding nainly or the flowers of Sonneratta sp., wbereas M. sobrinus selects lowers of wild banana plants (Musa sp.) that grow inland. Handley (1967) and Bonaccorso (1979) documented several examples of  segregation of sympatric bats by habital and vertical stratum in Ncotropical forests. Some othe examples of segregation of bat specics along gradients of altitude or habitat were discussed by IHumphrey and Bonaccorso (1979). ------------------------------ 1
Big (Leptonycteris nioalis) and little (L. curasoae) long-noscd bats are sympat

In [86]:
# First chunk dictionary (page_content plus metadata) of the single-item conversion.
dicts[0]

{'page_content': 'Some of the authors ofthis publication are also working on these related projects:',
 'metadata': {'page': 0,
  'file_name': 'Arita, 1991.pdf',
  'region_poly': (81, 657, 570, 657, 570, 672, 81, 672),
  'input_file': '/home/camilo/Documents/00-Conabio/by_species/test species_A/test species_A_bibliografía/Arita, 1991.pdf',
  'species_folder': 'test species_A',
  'output_folder': '/home/camilo/Documents/00-Conabio/by_species/test species_A/output',
  'output_file': '/home/camilo/Documents/00-Conabio/by_species/test species_A/output/Arita, 1991.json'}}

In [None]:
# Write the chunked results to JSON files
# uses: write_to_json
# All items

In [None]:
# Chunk every OCR result using the default filter criteria.
dfs = [res_to_df_chunks(ocr_result) for ocr_result in final_results]

def write_to_json(dfs:list,list_paths):
    """Write one JSON file per successfully processed document.

    NOTE(review): a function with this name is also defined in an earlier
    cell; whichever cell runs last wins. Keep a single definition.

    ``dfs`` and ``list_paths`` are walked in parallel. For each pair:

    * If the result is a DataFrame it is converted with ``df_to_doc`` and all
      chunks are written as a single JSON array to
      ``<folder_output_path>/<file name without extension>.json``. The file is
      overwritten, so re-running does not duplicate chunks. Previously every
      chunk was appended as a separate JSON object with no separator, which
      ``json.load`` cannot read back.
    * Otherwise the result is appended as one line to ``./log_errors``.
      Values that JSON cannot represent (e.g. Path objects) are written
      through ``str``.

    Args:
        dfs: Results as returned by ``res_to_df_chunks``, one per document.
        list_paths: Path dictionaries with the keys ``folder_output_path`` and
            ``file_name``, in the same order as ``dfs``.
    """
    for res, path_dic in zip(dfs, list_paths):
        output_folder = path_dic.get('folder_output_path')
        # Path.stem removes only the final extension; splitting on the first
        # "." truncated file names that contain dots.
        output_file_path = Path(output_folder,
                                Path(path_dic.get('file_name')).stem + ".json")

        # if it is a DataFrame, result finished correctly
        if isinstance(res, pd.DataFrame):
            records = [page.dict() for page in df_to_doc(res, path_dic)]
            os.makedirs(output_folder, exist_ok=True)
            with open(output_file_path, "w", encoding="utf-8") as fp:
                json.dump(records, fp)
        else:
            with open('./log_errors', "a", encoding="utf-8") as fp:
                json.dump(res, fp, default=str)
                fp.write("\n")

In [None]:
# NOTE(review): `path_dic` is only bound at notebook level by the
# `for res, path_dic in zip(dfs, list_paths)` loop cell further down; on a
# fresh kernel run top to bottom this cell raises NameError.
path_dic

In [None]:
# Display the path dictionaries built by get_list_paths in Step 1.
list_paths

In [None]:
# Convert one result into LangChain Documents.
# NOTE(review): `res` and `path_dic` are only bound at notebook level by the
# `for res, path_dic in zip(dfs, list_paths)` loop cell further down, so this
# cell depends on out-of-order execution.
doc = df_to_doc(res, path_dic)

In [None]:
# NOTE(review): same inspection as the earlier `path_dic` cell; it needs the
# `for res, path_dic in zip(dfs, list_paths)` loop cell to have run first.
path_dic

In [None]:
# Display the path dictionary of the first pending file (assigned in the
# dictionary-conversion cell above).
file_path_dict

In [None]:
# Print the source file name of each converted document.
for res, path_dic in zip(dfs, list_paths):
    # Failed results are not DataFrames and cannot be converted by df_to_doc.
    if not isinstance(res, pd.DataFrame):
        continue
    chunk_docs = df_to_doc(res, path_dic)
    if chunk_docs:
        # df_to_doc stores file_name inside the Document metadata; the
        # Document itself has no `file_name` attribute.
        print(chunk_docs[0].metadata.get("file_name"))
    


In [None]:
# Serialize the Documents with LangChain's dumps and read the first chunk's
# text back from the JSON string.
# NOTE(review): `res` and `path_dic` come from the loop cell above.
text = lang_dumps(df_to_doc(res, path_dic))
# json.loads instead of eval: the string is JSON, which eval cannot parse when
# it contains true/false/null, and eval would execute whatever the text holds.
json.loads(text)[0].get('kwargs')["page_content"]