In [2]:
from glob import glob
import os, random, re

from tqdm import tqdm
from utils.enums import DocumentSegments

from controller.ocr import OcrData
from controller.document_segmentation import DocumentSegmentation
from controller.table_segmentation import TableSegmentation
from controller.prompt_pipeline import PromptPipeline

from learning.table_detection.microsoft_TART import TableDetection
from learning.layout_analysis.pickle_file import SegmenterModel

from model.image_data import ImageData

[nltk_data] Downloading package punkt to
[nltk_data]     /home/carlos_rocha/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/carlos_rocha/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/carlos_rocha/nltk_data...


Quantidade de stopwords: 560


  from .autonotebook import tqdm as notebook_tqdm


# Models

In [None]:
# Relative directories (resolved against the notebook's working directory):
# OCR_DATA_PATH holds the per-document OCR JSON files read by
# extract_document_data, PAGE_IMAGE_PATH holds the page images (*.jpg)
# that annotate() iterates over.
OCR_DATA_PATH = "samples/ocr"
PAGE_IMAGE_PATH = "samples/images"

# Identifier passed to TableDetection below.
# Presumably a Hugging Face model id — TODO confirm how TableDetection loads it.
table_detection_model = "microsoft/table-transformer-structure-recognition-v1.1-all"

# Shared instances, created once and reused by extract_document_data for every page.
# NOTE(review): SegmenterModel receives the page image directory; what it does
# with it is not visible from this notebook.
seg_model = SegmenterModel(PAGE_IMAGE_PATH)
table_detector = TableDetection(table_detection_model)

def extract_document_data(page_path: str, ticker: str, year: int, page: int):
    """Segment one page image and assemble the text that is sent to the LLM prompt.

    The page is loaded, paired with its OCR data and split into segments.
    Segments are then visited in order:

    * TABLE segments are passed to ``TableSegmentation``; when the extracted
      table text is not blank, the table is kept and its text is appended
      under a ``TABELA <n>:`` header. Blank tables are dropped and do not
      consume a table number.
    * IMAGE segments are ignored.
    * Any other segment with at least one text block is appended under a
      ``T<n>:`` marker, one text block per line.

    Args:
        page_path: Path of the page image.
        ticker: Ticker used to build the OCR file name.
        year: Year used to build the OCR file name.
        page: Page number handed to ``OcrData``.

    Returns:
        A tuple ``(page_data, doc_segmentation, table_segments, prompt_data)``
        where ``table_segments`` is the list of non-blank tables and
        ``prompt_data`` is the assembled prompt text (also printed).
    """
    page_data = ImageData(page_path)
    # Only the financial statements ("demonstrativos") are handled for now,
    # so the OCR file name is fixed to that document type.
    ocr_filename = f"{ticker}_demonstrativo_{year}.json"
    # NOTE(review): the meaning of the positional `True` flag is defined by
    # OcrData and is not visible from this notebook.
    ocr_data = OcrData(os.path.join(OCR_DATA_PATH, ocr_filename), page, True)

    doc_segmentation = DocumentSegmentation(page_data, seg_model, ocr_data)

    table_segments = []
    # Collect the pieces and join once at the end instead of growing a string.
    prompt_parts = []
    table_id = 1
    text_id = 1
    for segment in doc_segmentation.segments:
        if segment.seg_type == DocumentSegments.TABLE:
            table = TableSegmentation(page_data, segment, table_detector, table_id)
            # Extract the text once and reuse it for both the blank check and
            # the prompt (assumes get_table_text() returns the same text on
            # every call — confirm in TableSegmentation).
            table_text = table.get_table_text()
            if len(table_text.strip()) > 0:
                table_segments.append(table)
                prompt_parts.append(f"\nTABELA {table_id}:\n" + table_text + "\n")
                table_id += 1
        elif segment.seg_type == DocumentSegments.IMAGE:
            continue
        else:
            if len(segment.texts) <= 0:
                continue

            prompt_parts.append(f"T{text_id}:")
            text_id += 1
            prompt_parts.extend(text_block.text + "\n" for text_block in segment.texts)
            prompt_parts.append("\n")

    prompt_data = "".join(prompt_parts)

    # Echo the assembled prompt text in the notebook output for inspection.
    print(prompt_data)
    return page_data, doc_segmentation, table_segments, prompt_data

In [None]:
def generate_qa_examples(qtd_questions):
    """Build the numbered placeholder lines used as the Q&A example template.

    One line is produced for each number from 1 to ``qtd_questions``
    (inclusive). Only the ``{i}`` marker is substituted with the line number;
    the ``{pergunta}``, ``{resposta}`` and ``{região}`` placeholders are kept
    verbatim in the output.

    Returns an empty string when ``qtd_questions`` is smaller than 1.
    """
    line_template = "    {i} - pergunta: {pergunta} | resposta: {resposta} | região do texto: {região}\n"
    numbered_lines = [
        line_template.replace("{i}", str(number))
        for number in range(1, qtd_questions + 1)
    ]
    return "".join(numbered_lines)

def annotate(
    llm_model_name: str,
    llm_model,
    *,
    dry_run: bool = True,
    prompt_path: str = "prompt/qa_agent_ablation/ablation_3.txt",
    annotations_collection=None,
):
    """Generate question/answer annotations for the page images in PAGE_IMAGE_PATH.

    Page images (``*.jpg``) are visited in random order. Each file must be
    named ``ticker_year_page.jpg``; non-digit characters in the year and page
    parts are stripped before they are converted to ``int``.

    Args:
        llm_model_name: Name stored in the ``"model"`` field of each annotation.
        llm_model: Object whose ``call(prompt_pipeline)`` returns a
            ``(responses, usage)`` pair; ``usage`` may be ``None``.
        dry_run: When True (the default, which is what this function did
            before the flag existed) only the first page of the shuffled list
            is processed: its prompt data is printed and the function returns
            without calling the model or storing anything. Pass False to run
            the full annotation loop.
        prompt_path: Prompt template file, read once per call when
            ``dry_run`` is False.
            NOTE(review): this used to be hard-coded as an absolute path under
            ``/home/carlos/Documentos/Pesquisa/DocumentUnderstanding``; the
            relative default assumes the notebook runs from that project root,
            like the other relative paths in this notebook — confirm.
        annotations_collection: Object exposing ``insert_one(document)`` that
            receives one annotation per page. When omitted, a notebook-level
            global named ``annotations_col`` is used if it exists.

    Raises:
        ValueError: If ``dry_run`` is False and no annotations collection is
            available. Raised before any page is processed so that no model
            call is made whose result could not be stored.
    """
    numbers_pattern = re.compile(r'[^0-9]')  # matches every non-digit character
    page_files = glob(os.path.join(PAGE_IMAGE_PATH, "*.jpg"))
    random.shuffle(page_files)

    qtd_questions = 3
    prompt_str = None
    question_examples = None
    if not dry_run:
        if annotations_collection is None:
            annotations_collection = globals().get("annotations_col")
        if annotations_collection is None:
            raise ValueError(
                "No annotations collection available: pass annotations_collection=... "
                "or define a global `annotations_col` before calling annotate()."
            )

        # The template and the example lines do not depend on the page, so
        # they are built once instead of once per page.
        # (An earlier variant used the last file of
        # sorted(glob("prompt/qa_agent/*.txt")) as the template.)
        with open(prompt_path) as prompt_file:
            prompt_str = prompt_file.read()
        question_examples = generate_qa_examples(qtd_questions)

    for page_path in tqdm(page_files):
        # Image files must be named ticker_year_page.jpg
        page_filename = os.path.basename(page_path).split(".")[0]
        filename_metadatas = page_filename.split("_")
        ticker = filename_metadatas[0]
        year = int(numbers_pattern.sub('', filename_metadatas[1]))
        page = int(numbers_pattern.sub('', filename_metadatas[2]))

        page_data, doc_segmentation, table_segments, prompt_data = extract_document_data(page_path, ticker, year, page)

        if dry_run:
            # Preview mode: show the prompt data of a single page and stop.
            print(prompt_data)
            return

        # Fill the template placeholders used to generate questions and answers.
        data = {
            "{dominio}": "financeiro",
            "{prompt_data}": prompt_data,
            "{qtd_questions}": str(qtd_questions),
            "{question_examples}": question_examples
        }

        prompt_pipe = PromptPipeline(prompt_str)
        prompt_pipe.add_data_to_prompt(data)
        text_blocks = doc_segmentation.filter_segments(DocumentSegments.TEXT)

        # Best effort per page: a failure is reported and the loop moves on
        # to the next page.
        try:
            llm_model_responses, llm_model_usage = llm_model.call(prompt_pipe)
            parsed_responses = prompt_pipe.parse_prompt_response(
                llm_model_responses, text_blocks,
                table_segments
            )
            questions = [resp.to_dict() for resp in parsed_responses]

            width, height = page_data.image.size
            annotation = {
                "ticker": ticker,
                "filename": page_filename,
                "page": page,
                "page_size": {"width": width, "height": height},
                "model": llm_model_name,
                "questions": questions,
                "cost": llm_model_usage.to_dict() if llm_model_usage is not None else None,
                "review_counts": 0,
                "review_costs": [],
            }

            annotations_collection.insert_one(annotation)
        except Exception as e:
            print(e)
            print("Falha no arquivo:", page_filename)

# Annotations Stage

## Gemini

In [None]:
# Gemini run: build the LangChain Gemini chat client, wrap it in the project's
# LanguageModelInterface together with GEMINI_PRICE, and hand it to annotate().
from learning.language_model.base import LanguageModelInterface, GEMINI_PRICE
from langchain_google_genai import ChatGoogleGenerativeAI

# Model identifier; annotate() also uses it for the "model" field of each annotation.
llm_model_name = "gemini-1.5-flash-001"
llm = ChatGoogleGenerativeAI(
    model=llm_model_name,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

# GEMINI_PRICE is presumably the pricing data behind the usage/cost object
# returned by llm_model.call() — confirm in learning.language_model.base.
llm_model = LanguageModelInterface(llm, GEMINI_PRICE)
annotate(llm_model_name, llm_model)

## OpenAI

In [None]:
# OpenAI run: build the LangChain OpenAI chat client, wrap it in the project's
# LanguageModelInterface together with GPT4O_MINI_PRICE, and hand it to annotate().
from learning.language_model.base import LanguageModelInterface, GPT4O_MINI_PRICE
from langchain_openai import ChatOpenAI

# Model identifier; annotate() also uses it for the "model" field of each annotation.
llm_model_name = "gpt-4o-mini"
llm = ChatOpenAI(
    model=llm_model_name,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# GPT4O_MINI_PRICE is presumably the pricing data behind the usage/cost object
# returned by llm_model.call() — confirm in learning.language_model.base.
llm_model = LanguageModelInterface(llm, GPT4O_MINI_PRICE)
annotate(llm_model_name, llm_model)

## Anthropic

In [None]:
# Anthropic run: build the LangChain Anthropic chat client, wrap it in the project's
# LanguageModelInterface together with CLAUDE_3_HAIKU_PRICE, and hand it to annotate().
from learning.language_model.base import LanguageModelInterface, CLAUDE_3_HAIKU_PRICE
from langchain_anthropic import ChatAnthropic

# Model identifier; annotate() also uses it for the "model" field of each annotation.
llm_model_name = "claude-3-haiku-20240307"
# Unlike the other provider cells, this one sets an explicit max_tokens (1024)
# instead of None.
llm = ChatAnthropic(
    model=llm_model_name,
    temperature=0,
    max_tokens=1024,
    timeout=None,
    max_retries=2,
)

# CLAUDE_3_HAIKU_PRICE is presumably the pricing data behind the usage/cost object
# returned by llm_model.call() — confirm in learning.language_model.base.
llm_model = LanguageModelInterface(llm, CLAUDE_3_HAIKU_PRICE)
annotate(llm_model_name, llm_model)

## Llama 3

In [None]:
# Llama 3 run (served through Groq): build the LangChain Groq chat client, wrap it
# in the project's LanguageModelInterface together with LLAMA3_PRICE, and hand it
# to annotate().
from learning.language_model.base import LanguageModelInterface, LLAMA3_PRICE
from langchain_groq import ChatGroq

# Model identifier; annotate() also uses it for the "model" field of each annotation.
llm_model_name = "llama3-70b-8192"
llm = ChatGroq(
    model=llm_model_name,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# LLAMA3_PRICE is presumably the pricing data behind the usage/cost object
# returned by llm_model.call() — confirm in learning.language_model.base.
llm_model = LanguageModelInterface(llm, LLAMA3_PRICE)
annotate(llm_model_name, llm_model)

## Mixtral

In [None]:
# Mixtral run (served through Fireworks): build the LangChain Fireworks chat client,
# wrap it in the project's LanguageModelInterface together with
# MIXTRAL_MOE_8X22B_PRICE, and hand it to annotate().
from learning.language_model.base import LanguageModelInterface, MIXTRAL_MOE_8X22B_PRICE
from langchain_fireworks import ChatFireworks

# Model identifier; annotate() also uses it for the "model" field of each annotation.
llm_model_name = "accounts/fireworks/models/mixtral-8x22b-instruct"
llm = ChatFireworks(
    model=llm_model_name,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# MIXTRAL_MOE_8X22B_PRICE is presumably the pricing data behind the usage/cost object
# returned by llm_model.call() — confirm in learning.language_model.base.
llm_model = LanguageModelInterface(llm, MIXTRAL_MOE_8X22B_PRICE)
annotate(llm_model_name, llm_model)