In [1]:
from models.mistralocr import MistralOCRModel 
from models.gemini import GeminiFlashModel
import docdataset as dd
import prompt_templates as pt
from prompt_templates import TextSchema, TableSchema, TitleSchema, ClassSchema
import docfocus as df

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Call the project's download helper and keep the "train" entry of its result.
dataset = dd.download_dataset()["train"]

## Prompting Proprietary Models

### Mistral OCR/Pixtral

In [3]:
# Instantiate the Mistral backend with its default constructor arguments.
# NOTE(review): the name `model` is rebound to the Gemini backend further down;
# the Mistral cells below read whatever `model` currently points to, so re-run
# this cell before re-running any of them.
model = MistralOCRModel()

In [None]:
# OCR on text regions (Mistral OCR).
# Shuffle into a new name instead of rebinding `dataset`. Assuming `dataset` is
# a Hugging Face `datasets.Dataset`, shuffles compose, so
# `dataset = dataset.shuffle(seed=42)` in every cell made the document order
# depend on how many cells had already been executed.
dataset_shuffled = dataset.shuffle(seed=42)

# No prompt or output schema is set in this cell; the text-template calls are
# kept below, disabled.
# NOTE(review): if this cell runs after one that called set_prompt /
# set_output_schema, `model` presumably still carries those settings - confirm.
# model.set_prompt(pt.get_text_template())
# model.set_output_schema(TextSchema)
df.ocr_dataset(
    dataset_shuffled,
    "../results/ocr_text_mistralocr.jsonl",
    ocr_fn=model.process_doc_image,
    chunk_size=2,
    lang="ukr",
    region_types=["text"],
)

In [None]:
# OCR on whole documents (Mistral OCR): no `region_types` argument is passed.
# Shuffle into a new name instead of rebinding `dataset`. Assuming `dataset` is
# a Hugging Face `datasets.Dataset`, shuffles compose, so
# `dataset = dataset.shuffle(seed=42)` in every cell made the document order
# depend on how many cells had already been executed.
dataset_shuffled = dataset.shuffle(seed=42)

# No prompt or output schema is set in this cell; the text-template calls are
# kept below, disabled.
# NOTE(review): if this cell runs after one that called set_prompt /
# set_output_schema, `model` presumably still carries those settings - confirm.
# model.set_prompt(pt.get_text_template())
# model.set_output_schema(TextSchema)
df.ocr_dataset(
    dataset_shuffled,
    "../results/ocr_whole_doc_mistralocr.jsonl",
    ocr_fn=model.process_doc_image,
    chunk_size=2,
    lang="ukr",
)

In [None]:
# Table extraction on table regions (Mistral OCR).
# Shuffle into a new name instead of rebinding `dataset`. Assuming `dataset` is
# a Hugging Face `datasets.Dataset`, shuffles compose, so re-running a cell
# that rebinds `dataset` re-permutes the documents and the offset below would
# then skip a different set of documents on every execution.
dataset_shuffled = dataset.shuffle(seed=42)

# Index of the first document (in shuffled order) that is processed.
# NOTE(review): 22 looks like a resume offset from an interrupted run -
# confirm, and set it to 0 for a full pass.
start_index = 22
subset = dataset_shuffled.select(range(start_index, len(dataset_shuffled)))

model.set_prompt(pt.get_table_template())
model.set_output_schema(TableSchema)
df.ocr_dataset(
    subset,
    "../results/table_table_mistralocr.jsonl",
    ocr_fn=model.process_doc_image,
    chunk_size=2,
    lang="ukr",
    region_types=["table"],
)

In [None]:
# Table extraction on whole documents (Mistral OCR): no `region_types`
# argument is passed.
# Shuffle into a new name instead of rebinding `dataset`. Assuming `dataset` is
# a Hugging Face `datasets.Dataset`, shuffles compose, so re-running a cell
# that rebinds `dataset` re-permutes the documents and the offset below would
# then skip a different set of documents on every execution.
dataset_shuffled = dataset.shuffle(seed=42)

# Index of the first document (in shuffled order) that is processed.
# NOTE(review): 46 looks like a resume offset from an interrupted run -
# confirm, and set it to 0 for a full pass.
start_index = 46
subset = dataset_shuffled.select(range(start_index, len(dataset_shuffled)))

model.set_prompt(pt.get_table_template())
model.set_output_schema(TableSchema)
df.ocr_dataset(
    subset,
    "../results/table_whole_doc_mistralocr.jsonl",
    ocr_fn=model.process_doc_image,
    chunk_size=2,
    lang="ukr",
)

In [None]:
# Layout analysis on whole documents (Mistral OCR), using the title template
# and TitleSchema.
# Shuffle into a new name instead of rebinding `dataset`. Assuming `dataset` is
# a Hugging Face `datasets.Dataset`, shuffles compose, so
# `dataset = dataset.shuffle(seed=42)` in every cell made the document order
# depend on how many cells had already been executed.
dataset_shuffled = dataset.shuffle(seed=42)

model.set_prompt(pt.get_title_template())
model.set_output_schema(TitleSchema)
df.ocr_dataset(
    dataset_shuffled,
    "../results/layout_whole_doc_mistralocr.jsonl",
    ocr_fn=model.process_doc_image,
    chunk_size=2,
    lang="ukr",
)

In [None]:
# Document classification on whole documents (Mistral OCR), using the class
# template and ClassSchema.
# Shuffle into a new name instead of rebinding `dataset`. Assuming `dataset` is
# a Hugging Face `datasets.Dataset`, shuffles compose, so
# `dataset = dataset.shuffle(seed=42)` in every cell made the document order
# depend on how many cells had already been executed.
dataset_shuffled = dataset.shuffle(seed=42)

model.set_prompt(pt.get_class_template())
model.set_output_schema(ClassSchema)
df.ocr_dataset(
    dataset_shuffled,
    "../results/class_whole_doc_mistralocr.jsonl",
    ocr_fn=model.process_doc_image,
    chunk_size=2,
    lang="ukr",
)

### Gemini 2.0 Flash

In [3]:
# Rebinds `model` to the Gemini backend; every cell that reads `model` after
# this point (including any Mistral cell re-run later) will use this instance.
model = GeminiFlashModel(model_name="gemini-2.0-flash")

In [None]:
# OCR on text regions (Gemini), using the text template.
# Shuffle into a new name instead of rebinding `dataset`. Assuming `dataset` is
# a Hugging Face `datasets.Dataset`, shuffles compose, so
# `dataset = dataset.shuffle(seed=42)` in every cell made the document order
# depend on how many cells had already been executed.
dataset_shuffled = dataset.shuffle(seed=42)

model.set_prompt(pt.get_text_template())
df.ocr_dataset(
    dataset_shuffled,
    "../results/ocr_text_gemini.jsonl",
    ocr_fn=model.process_doc_image,
    chunk_size=2,
    lang="ukr",
    region_types=["text"],
)

In [None]:
# OCR on whole documents (Gemini), using the text template; no `region_types`
# argument is passed.
# Shuffle into a new name instead of rebinding `dataset`. Assuming `dataset` is
# a Hugging Face `datasets.Dataset`, shuffles compose, so
# `dataset = dataset.shuffle(seed=42)` in every cell made the document order
# depend on how many cells had already been executed.
dataset_shuffled = dataset.shuffle(seed=42)

model.set_prompt(pt.get_text_template())
df.ocr_dataset(
    dataset_shuffled,
    "../results/ocr_whole_doc_gemini.jsonl",
    ocr_fn=model.process_doc_image,
    chunk_size=2,
    lang="ukr",
)

In [None]:
# Table extraction on table regions (Gemini), using the table template.
# Shuffle into a new name instead of rebinding `dataset`. Assuming `dataset` is
# a Hugging Face `datasets.Dataset`, shuffles compose, so
# `dataset = dataset.shuffle(seed=42)` in every cell made the document order
# depend on how many cells had already been executed.
dataset_shuffled = dataset.shuffle(seed=42)

model.set_prompt(pt.get_table_template())
df.ocr_dataset(
    dataset_shuffled,
    "../results/table_table_gemini.jsonl",
    ocr_fn=model.process_doc_image,
    chunk_size=2,
    lang="ukr",
    region_types=["table"],
)

In [None]:
# Table extraction on whole documents (Gemini), using the table template; no
# `region_types` argument is passed.
# Shuffle into a new name instead of rebinding `dataset`. Assuming `dataset` is
# a Hugging Face `datasets.Dataset`, shuffles compose, so
# `dataset = dataset.shuffle(seed=42)` in every cell made the document order
# depend on how many cells had already been executed.
dataset_shuffled = dataset.shuffle(seed=42)

model.set_prompt(pt.get_table_template())
df.ocr_dataset(
    dataset_shuffled,
    "../results/table_whole_doc_gemini.jsonl",
    ocr_fn=model.process_doc_image,
    chunk_size=2,
    lang="ukr",
)

In [None]:
# Layout analysis on whole documents (Gemini), using the title template.
# Shuffle into a new name instead of rebinding `dataset`. Assuming `dataset` is
# a Hugging Face `datasets.Dataset`, shuffles compose, so
# `dataset = dataset.shuffle(seed=42)` in every cell made the document order
# depend on how many cells had already been executed.
dataset_shuffled = dataset.shuffle(seed=42)

model.set_prompt(pt.get_title_template())
df.ocr_dataset(
    dataset_shuffled,
    "../results/layout_whole_doc_gemini.jsonl",
    ocr_fn=model.process_doc_image,
    chunk_size=2,
    lang="ukr",
)

In [None]:
# Document classification on whole documents (Gemini), using the class
# template.
# Shuffle into a new name instead of rebinding `dataset`. Assuming `dataset` is
# a Hugging Face `datasets.Dataset`, shuffles compose, so
# `dataset = dataset.shuffle(seed=42)` in every cell made the document order
# depend on how many cells had already been executed.
dataset_shuffled = dataset.shuffle(seed=42)

model.set_prompt(pt.get_class_template())
df.ocr_dataset(
    dataset_shuffled,
    "../results/class_whole_doc_gemini.jsonl",
    ocr_fn=model.process_doc_image,
    chunk_size=2,
    lang="ukr",
)