In [1]:
import re
from pathlib import Path
from langchain.schema import Document
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    CSVLoader,
    UnstructuredHTMLLoader
)


In [2]:
# List every entry found (recursively) under data/raw
from pathlib import Path

raw_path = Path("data/raw")
[entry for entry in raw_path.rglob("*")]


[WindowsPath('data/raw/apple_10k_2022.txt'),
 WindowsPath('data/raw/eu_annex2_sanctions.pdf'),
 WindowsPath('data/raw/fatf_annual_report_2022-2023.pdf'),
 WindowsPath('data/raw/fatf_assessment_methodology_2022.pdf'),
 WindowsPath('data/raw/fatf_effectiveness_compliance_report_2022.pdf'),
 WindowsPath('data/raw/fatf_procedures_mutual_evaluations_2022.pdf'),
 WindowsPath('data/raw/fatf_universal_procedures.pdf'),
 WindowsPath('data/raw/fincen_advisory_corruption_2022.pdf'),
 WindowsPath('data/raw/fincen_advisory_elder_exploitation_2022.pdf'),
 WindowsPath('data/raw/fincen_alert_pig_butchering_2023.pdf'),
 WindowsPath('data/raw/fincen_alert_russian_elites_2022.pdf'),
 WindowsPath('data/raw/fincen_ransomware_advisory.pdf'),
 WindowsPath('data/raw/microsoft_10k_2022.txt'),
 WindowsPath('data/raw/ofac_sdn_list.csv')]

In [4]:
# Load a single PDF so the raw text extraction can be inspected.
pdf_path = next(raw_path.rglob("*.pdf"), None)

if pdf_path is None:
    # No PDF anywhere under raw_path: nothing to inspect.
    print("Nenhum PDF encontrado.")
else:
    loader = PyPDFLoader(str(pdf_path))
    raw_docs = loader.load()
    print(f"Total de páginas: {len(raw_docs)}")
    # Preview the beginning of the first loaded document (capped at 10,000 characters).
    print(raw_docs[0].page_content[:10000])

Total de páginas: 15
1 
Annex II – Sanctions-related commitments   
 
The sequence of implementation of the commitments d etailed in this Annex is 
specified in Annex V (Implementation Plan) to this Joint Comprehensive Plan of 
Action (JCPOA). 
  
 
A.  European Union 1 
 
1.  The EU and EU Member States commit to terminate all  provisions of 
Council Regulation (EU) No 267/2012 (as subsequentl y amended) 
implementing all nuclear-related sanctions or restr ictive measures as 
specified in Sections 1.1-1.10 below, to terminate all provisions of 
Council Decision 2010/413/CFSP (as subsequently ame nded), as 
specified in Sections 1.1-1.10 below, and to termin ate or amend 
national implementing legislation as required, in a ccordance with 
Annex V:  
 
1.1. 
 Financial, banking and insurance measures 2 
 
1.1.1 
 Prohibition and authorisation regimes on financial transfers to and 
from Iran (Article 10 of Council Decision 2010/413/ CFSP; Articles 30, 
30a, 30b and 31 of Council Regulati

In [5]:
# Look at the first few pages to spot repeated patterns (headers/footers).
for page_number, page in enumerate(raw_docs[:3], start=1):
    print(f"\n--- Página {page_number} ---")
    print(page.page_content[:500])



--- Página 1 ---
1 
Annex II – Sanctions-related commitments   
 
The sequence of implementation of the commitments d etailed in this Annex is 
specified in Annex V (Implementation Plan) to this Joint Comprehensive Plan of 
Action (JCPOA). 
  
 
A.  European Union 1 
 
1.  The EU and EU Member States commit to terminate all  provisions of 
Council Regulation (EU) No 267/2012 (as subsequentl y amended) 
implementing all nuclear-related sanctions or restr ictive measures as 
specified in Sections 1.1-1.10 below, t

--- Página 2 ---
2 
 
1.1.7. 
 Sanctions on Government of Iran public-guaranteed b onds (Article 13 
of Council Decision 2010/413/CFSP; Article 34 of Co uncil Regulation 
(EU) No 267/2012); and 
 
1.1.8. 
 Sanctions on associated services 3 for each of the categories above (see 
the references above). 
 
1.2. 
 Oil, gas and petrochemical sectors 
 
1.2.1. 
 Sanctions on the import of oil and gas from Iran ( A rticles 3a, 3c and 
3e of Council Decision 2010/413/CFSP; Articles 

In [6]:
# Prototype of the cleaning rules on one page, before wrapping them in a function.
sample_text = raw_docs[0].page_content
lines = sample_text.splitlines()

clean_lines = []
for line in lines:
    line = line.strip()
    if not line:
        continue
    # Drop lines that consist only of a pagination marker, a copyright notice
    # or a confidentiality banner.
    if re.match(r"^(Page \d+ of \d+|©.*|Confidential.*)$", line, re.IGNORECASE):
        continue
    clean_lines.append(line)

cleaned_text = " ".join(clean_lines)
# Replace non-printable / non-ASCII characters BEFORE collapsing whitespace.
# Each replaced run becomes a space, so doing it afterwards re-introduces
# runs of spaces (e.g. "Annex II – Sanctions" -> "Annex II   Sanctions").
cleaned_text = re.sub(r"[^\x20-\x7E]+", " ", cleaned_text)
cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

print(cleaned_text[:1000])


1 Annex II   Sanctions-related commitments The sequence of implementation of the commitments d etailed in this Annex is specified in Annex V (Implementation Plan) to this Joint Comprehensive Plan of Action (JCPOA). A. European Union 1 1. The EU and EU Member States commit to terminate all provisions of Council Regulation (EU) No 267/2012 (as subsequentl y amended) implementing all nuclear-related sanctions or restr ictive measures as specified in Sections 1.1-1.10 below, to terminate all provisions of Council Decision 2010/413/CFSP (as subsequently ame nded), as specified in Sections 1.1-1.10 below, and to termin ate or amend national implementing legislation as required, in a ccordance with Annex V: 1.1. Financial, banking and insurance measures 2 1.1.1 Prohibition and authorisation regimes on financial transfers to and from Iran (Article 10 of Council Decision 2010/413/ CFSP; Articles 30, 30a, 30b and 31 of Council Regulation (EU) No 267/2 012); 1.1.2. Sanctions on banking activities

In [7]:
def load_documents(directory: str) -> list[Document]:
    """Load every supported file found recursively under ``directory``.

    Each regular file is dispatched on its lower-cased suffix:

    * ``.pdf``            -> ``PyPDFLoader``
    * ``.txt``            -> ``TextLoader`` (UTF-8, falling back to encoding
      auto-detection when the file is not valid UTF-8)
    * ``.csv``            -> ``CSVLoader`` (UTF-8)
    * ``.html`` / ``.htm`` -> ``UnstructuredHTMLLoader``

    Files with any other suffix, and directories, are ignored.

    Args:
        directory: Root directory to scan.

    Returns:
        A flat list with everything the loaders returned, in sorted path
        order. The list is empty when nothing under ``directory`` matches.
    """
    docs = []

    # rglob yields entries in an OS-dependent order; sorting makes the
    # resulting document sequence reproducible across runs and machines.
    for file in sorted(Path(directory).rglob("*")):
        # A directory can carry a suffix too (e.g. "archive.txt/"); never
        # hand those to a file loader.
        if not file.is_file():
            continue

        suffix = file.suffix.lower()
        if suffix == ".pdf":
            loader = PyPDFLoader(str(file))
        elif suffix == ".txt":
            # Pin the encoding like the CSV branch does instead of relying on
            # the platform default (which is not UTF-8 on Windows).
            loader = TextLoader(str(file), encoding="utf-8", autodetect_encoding=True)
        elif suffix == ".csv":
            loader = CSVLoader(file_path=str(file), encoding="utf-8")
        elif suffix in (".html", ".htm"):
            loader = UnstructuredHTMLLoader(str(file))
        else:
            continue

        docs.extend(loader.load())
    return docs


def clean_and_parse(docs: list[Document]) -> list[Document]:
    """Return cleaned copies of ``docs`` with page furniture and noise removed.

    For every input document the text is processed as follows:

    1. Split into lines; blank lines are dropped, as are lines that consist
       solely of a ``Page X of Y`` marker or that start with ``©`` or
       ``Confidential`` (case-insensitive).
    2. The remaining lines are joined with single spaces.
    3. Every run of characters outside printable ASCII (0x20-0x7E) is
       replaced by a space.
    4. Whitespace runs are collapsed to one space and the result is stripped.

    Args:
        docs: Documents to clean. A ``page_content`` of ``None`` is treated
            as an empty string.

    Returns:
        A new list with one cleaned ``Document`` per input, in the same
        order. Each output carries its own shallow copy of the input
        metadata, so mutating one does not affect the other.
    """
    header_footer_pattern = re.compile(
        r"^(Page \d+ of \d+|©.*|Confidential.*)$", re.IGNORECASE
    )

    cleaned_docs = []
    for doc in docs:
        text = doc.page_content or ""

        kept_lines = []
        for raw_line in text.splitlines():
            line = raw_line.strip()
            if line and not header_footer_pattern.match(line):
                kept_lines.append(line)

        cleaned_text = " ".join(kept_lines)
        # Replace non-ASCII characters BEFORE collapsing whitespace: each
        # replaced run becomes a space, so the opposite order leaves runs of
        # spaces behind (e.g. "II – Sanctions" -> "II   Sanctions").
        cleaned_text = re.sub(r"[^\x20-\x7E]+", " ", cleaned_text)
        cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

        cleaned_docs.append(
            # Copy the metadata so input and output documents do not share
            # one mutable dict.
            Document(page_content=cleaned_text, metadata=dict(doc.metadata or {}))
        )

    return cleaned_docs


----------------------------------------------------------------

### Checking embeddings

In [None]:
# 1. Imports
import os
from src.parser import load_documents
from src.embedder import embed_and_store

# 2. Test paths (a separate index directory is used so the main index is not overwritten)
DOCS_DIR = "data/raw"
TEST_INDEX_DIR = "data/index_test"

# 3. Run the pipeline manually (equivalent to build_vector_store)
# NOTE(review): load_documents here is the one imported from src.parser at the
# top of this cell, which shadows the notebook-level definition of the same name.
print(f"=== Carregando documentos de {DOCS_DIR} ===")
documents = load_documents(DOCS_DIR)
print(f"=== {len(documents)} documentos carregados ===") 

=== Carregando documentos de data/raw ===
=== 18047 documentos carregados ===


In [16]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings


In [None]:
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

def inspect_index(
    index_path="data/index",
    preview_chars=300,
    model_name="sentence-transformers/paraphrase-albert-small-v2",
):
    """Load a saved FAISS index and summarise the chunks stored in it.

    Prints the number of stored chunks and the number of FAISS vectors.

    Args:
        index_path: Directory passed to ``FAISS.load_local``.
        preview_chars: Maximum number of characters of each chunk's text to
            include in the ``preview`` column.
        model_name: Name of the HuggingFace embedding model to instantiate.
            NOTE(review): presumably this should be the model the index was
            built with - confirm against the indexing code.

    Returns:
        A ``pandas.DataFrame`` with columns ``doc_id`` (1-based position in
        the docstore iteration order), ``source`` (``metadata["source"]`` or
        ``"unknown"``) and ``preview``. The columns are present even when
        the index holds no documents.
    """
    embedding = HuggingFaceEmbeddings(
        model_name=model_name,
        encode_kwargs={"normalize_embeddings": True},
    )

    # SECURITY: allow_dangerous_deserialization=True lets load_local unpickle
    # the stored docstore, which can execute arbitrary code. Only point this
    # at index directories produced by a trusted process.
    db = FAISS.load_local(index_path, embedding, allow_dangerous_deserialization=True)

    # NOTE(review): `_dict` is a private attribute of the docstore; kept
    # because it is what this notebook relied on - verify it still exists
    # when upgrading langchain.
    docs_summary = [
        {
            "doc_id": position,
            "source": doc.metadata.get("source", "unknown"),
            "preview": doc.page_content[:preview_chars],
        }
        for position, doc in enumerate(db.docstore._dict.values(), start=1)
    ]

    # Explicit columns keep the frame's schema stable for an empty index.
    df = pd.DataFrame(docs_summary, columns=["doc_id", "source", "preview"])

    # Summary counts; the two numbers are expected to agree.
    print("Total de chunks armazenados:", len(df))
    print(" Total de vetores FAISS:", db.index.ntotal)

    return df

In [21]:
df_chunks = inspect_index()
df_chunks.head()  # display the first 5 rows (head's default)


📦 Total de chunks armazenados: 2673
🔢 Total de vetores FAISS: 2673


Unnamed: 0,doc_id,source,preview
0,1,data\raw\eu_annex2_sanctions.pdf,1 \nAnnex II – Sanctions-related commitments ...
1,2,data\raw\eu_annex2_sanctions.pdf,"specified in Sections 1.1-1.10 below, to termi..."
2,3,data\raw\eu_annex2_sanctions.pdf,"30a, 30b and 31 of Council Regulation (EU) No ..."
3,4,data\raw\eu_annex2_sanctions.pdf,1.1.5. \n Sanctions on financial support for t...
4,5,data\raw\eu_annex2_sanctions.pdf,"(iii) any legal person, entity or body having..."
