In [1]:
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption, ExcelFormatOption
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TableFormerMode,
    AcceleratorDevice,
    AcceleratorOptions,
    RapidOcrOptions,
)
from docling.datamodel.base_models import InputFormat
from docling.backend import pypdfium2_backend
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling.pipeline.simple_pipeline import SimplePipeline

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def scrape_file(file_path):
    try:
        ocr_options = RapidOcrOptions()
        pipeline_options = PdfPipelineOptions(do_table_structure=True, do_ocr=True)
        pipeline_options.ocr_options = ocr_options
        pipeline_options.ocr_options.lang = ["en"]
        pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=4, device=AcceleratorDevice.CUDA
        )
        pipeline_options.table_structure_options.do_cell_matching = False
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

        pipeline_options.create_legacy_output = True

        converter = DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.IMAGE,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.ASCIIDOC,
                InputFormat.MD,
                InputFormat.XLSX,
            ],
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=pypdfium2_backend.PyPdfiumDocumentBackend,
                ),
                InputFormat.DOCX: WordFormatOption(
                    pipeline_cls=SimplePipeline  
                ),
                InputFormat.XLSX: ExcelFormatOption(
                    pipeline_cls=SimplePipeline
                ),
            }
        )
        doc = converter.convert(source=file_path).document
        print("Processed")
        return doc
    except Exception as e:
        return e

In [25]:
file = r'C:\Users\HariharanAS\Desktop\folder\combined_chart.png'


In [26]:
scrape_file(file)

Downloading detection model, please wait. This may take several minutes depending upon your network connection.
Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Processed


DoclingDocument(schema_name='DoclingDocument', version='1.0.0', name='combined_chart', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=5868393679793775344, filename='combined_chart.png', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/pictures/0')], name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), groups=[], texts=[TextItem(self_ref='#/texts/0', parent=RefItem(cref='#/pictures/0'), children=[], label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=1.6666666666666667, t=1109.6666666666667, r=36.0, b=1083.0, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 2))], orig='25', text='25'), TextItem(self_ref='#/texts/1', parent=RefItem(cref='#/pictures/0'), children=[], label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBo

In [18]:
csv_file = r'C:\Users\HariharanAS\Desktop\folder\idl.csv'

In [19]:
# convert this csv file to a dataframe
import pandas as pd
df = pd.read_csv(csv_file)


In [24]:
df

Unnamed: 0,No,tournament_name,tournament_date,email,name,grade,format,team_name,team_member2,team_member3,Score 1,judge email,judge name,opponent team_name,verdict,judge feedback,motion
0,1,FSDL,2024-05-05,kishore@augli.ai,kishore,11,kesoe,team 44,siri,karna,33,judge23@gmail.com,Ganesh,team 66,1.0,good,r1
1,2,ertr,2024-06-07,kiran@gmail.com,anand,12,kdla,team 44,giri,suman,22,jud@gmail.com,kiranr,tema32,300.0,fine,f3


In [22]:
# save this dataframe to a excel file
df.to_excel(r'C:\Users\HariharanAS\Desktop\folder\idl.xlsx', index=False)
