# Imports

In [None]:
import os
import fitz
from openai import OpenAI
from dotenv import load_dotenv

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment first,
# so the API-key lookup below can see them.
load_dotenv()

# The key is read from OPENAI_API_KEY; the lookup yields None when it is unset.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Model identifier; not used in the visible cells
# (presumably passed to `client` calls elsewhere -- confirm).
model = "gpt-4o-mini"

In [None]:
# Document used by the cells below. This is a hard-coded absolute path on the
# author's machine; point it at a local .txt, .docx or .pdf file before running.
file_path = '/Users/renatoboemer/code/developer/luthor/data/Memo 2  - Crypto assets disposal - FINISHED .docx'

# Data processing

## Load

In [None]:
import os
import fitz
from docx import Document

def read_file(file_path):
    """
    Load a document's text, choosing the reader by file extension.

    Args:
        file_path (str): Path to a .txt, .docx or .pdf file.

    Returns:
        str: Whatever the matching reader (read_txt, read_docx or read_pdf)
            returns for the file.

    Raises:
        FileNotFoundError: If file_path is not an existing regular file.
        ValueError: If the extension is not one of the supported ones.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    file_extension = os.path.splitext(file_path)[1]
    # Compare case-insensitively so '.PDF' and '.pdf' are treated alike.
    suffix = file_extension.lower()

    if suffix == '.txt':
        return read_txt(file_path)
    if suffix == '.docx':
        return read_docx(file_path)
    if suffix == '.pdf':
        return read_pdf(file_path)

    # The message reports the extension exactly as it appears in the path.
    raise ValueError(f"Unsupported file extension: {file_extension}")

In [None]:
def read_txt(file_path):
    """
    Return the full contents of a UTF-8 encoded text file.

    Args:
        file_path (str): Path to the text file.

    Returns:
        str: The decoded file contents.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def read_docx(file_path):
    """
    Return the paragraph text of a .docx file, one paragraph per line.

    Only `document.paragraphs` is read; anything the document does not
    expose through that attribute is not included.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        str: The text of each paragraph, joined with newline characters.
    """
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(para.text for para in paragraphs)


def read_pdf(file_path):
    """
    Extract the text of every page of a PDF.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: The text of each page in document order, joined with newline
            characters.
    """
    # Open through a context manager so the document (and its underlying
    # file handle) is closed even if text extraction raises.
    with fitz.open(file_path) as document:
        # The join consumes the generator while the document is still open.
        return '\n'.join(page.get_text() for page in document)

In [None]:
# Load the configured document and print its first 350 characters as a quick
# sanity check that the reader produced text.
text = read_file(file_path)
print(text[:350])

## Process

In [None]:
import re

def text_segmentation(text):
    """
    Split text into paragraph-like chunks separated by blank lines.

    Args:
        text (str): The full text to be segmented.

    Returns:
        List[str]: The chunks in document order, each stripped of
            surrounding whitespace. Empty or whitespace-only chunks are
            dropped, so empty input yields an empty list.
    """
    # A paragraph break is a newline, optional whitespace, then another newline.
    raw_segments = re.split(r'\n\s*\n', text)

    # Leading/trailing blank lines (and empty input) would otherwise yield ''
    # entries, which carry no content for later steps such as embedding.
    stripped = (segment.strip() for segment in raw_segments)
    return [segment for segment in stripped if segment]

In [None]:
# import numpy as np
# from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# # Load the tokenizer and model for NER
# tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
# model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

# # Create a pipeline for NER
# ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# # Example text for NER
# example = "Hugging Face was founded in 2016 by French entrepreneurs Clément Delangue, Julien Chaumond, and Thomas Wolf in New York City."

# # Run the NER pipeline on the example text
# ner_results = ner_pipeline(example)
# print(ner_results)


In [None]:
# organized_results = {'LOC': [], 'PER': [], 'ORG': [], 'MISC': []}

# current_entity = None
# current_words = []

# for result in ner_results:
#     entity_type = result['entity'].split('-')[1]
#     if result['entity'].startswith('B-'):
#         if current_entity:
#             organized_results[current_entity].append(' '.join(current_words))
#         current_entity = entity_type
#         current_words = [result['word']]
#     elif result['entity'].startswith('I-') and current_entity == entity_type:
#         current_words.append(result['word'])

# # Handle the last entity
# if current_entity:
#     organized_results[current_entity].append(' '.join(current_words))

# # Remove hash symbols from words
# for key, value in organized_results.items():
#     organized_results[key] = [' '.join(word.split('##')) for word in value]

# print(organized_results)

In [None]:
def tokenize_text(text):
    """
    Break text into word tokens.

    A token is a maximal run of word characters (letters, digits,
    underscore); punctuation and whitespace only act as separators.

    Args:
        text (str): The text to be tokenized.

    Returns:
        List[str]: The tokens in the order they appear in the text.
    """
    # Plain regex tokenization; no linguistic model is involved.
    word_pattern = re.compile(r'\b\w+\b')
    return [match.group() for match in word_pattern.finditer(text)]

In [None]:
def clean_special_characters(text):
    """
    Strip every character outside a small allow-list from the text.

    Kept: word characters (letters, digits, underscore), whitespace and the
    punctuation marks , . ! ? ; : ( ) -
    Everything else (quotes, slashes, currency and percent signs, ...) is
    deleted without replacement.

    Args:
        text (str): The text to clean.

    Returns:
        str: The text with all disallowed characters removed.
    """
    # Negated character class: matches exactly the characters to delete.
    disallowed = re.compile(r'[^\w\s,.!?;:()-]')
    return disallowed.sub('', text)

In [None]:
def preserve_structure(text):
    """
    Tag heading-like lines with a markdown '## ' prefix.

    The heuristic is deliberately simple: every line whose first character
    is an ASCII uppercase letter is treated as a heading, so ordinary
    sentences that start with a capital letter are tagged as well.

    Args:
        text (str): The text to process.

    Returns:
        str: The text with '## ' inserted in front of each matching line;
            all other lines are left unchanged.
    """
    # (?m) makes ^ and $ match per line; the lookahead requires a leading A-Z.
    heading_line = re.compile(r'(?m)^(?=[A-Z])(.+)$')
    return heading_line.sub(r'## \1', text)

In [None]:
def preprocess_doc(file_path):
    """
    Read a document and run each preprocessing step on its raw text.

    The steps are independent of one another: segmentation, cleaning,
    tokenization and structure tagging each receive the unmodified text
    returned by read_file, not the output of a previous step.

    Args:
        file_path (str): Path to the document; passed straight to read_file.

    Returns:
        Tuple: A 5-tuple, in this order:
            - Original text (str)
            - Segmented text chunks (List[str])
            - Cleaned text (str)
            - Tokenized words (List[str])
            - Structured text (str)
    """
    raw = read_file(file_path)

    return (
        raw,
        text_segmentation(raw),          # paragraph-level chunks
        clean_special_characters(raw),   # disallowed characters removed
        tokenize_text(raw),              # word tokens
        preserve_structure(raw),         # heading-like lines tagged
    )


In [None]:
# Example usage: run the whole preprocessing pipeline on the configured
# document and print a short preview of each of the five outputs.
original_text, segments, cleaned_text, tokens, structured_text = preprocess_doc(file_path)

# String outputs are truncated to 200 characters; list outputs to their first
# few items. The dashed lines only separate the previews visually.
print("Original Text:", original_text[:200], "...")
print("---" * 25)
print("Segments:", segments[:3])
print("---" * 25)
print("Cleaned Text:", cleaned_text[:200], "...")
print("---" * 25)
print("Tokens:", tokens[:10])
print("---" * 25)
print("Structured Text:", structured_text[:200], "...")

## Embed

In [None]:
from sentence_transformers import SentenceTransformer

def embed_text_chunks(chunks, model_name='all-MiniLM-L6-v2'):
    """
    Compute one embedding per text chunk with a sentence-transformers model.

    The model is instantiated on every call from `model_name`.

    Args:
        chunks (List[str]): The text chunks to embed.
        model_name (str): Name of the sentence-transformers model to load.

    Returns:
        Tuple: A 2-tuple:
            - The `chunks` argument, returned unchanged
            - The embeddings, as the nested list produced by calling
              `.tolist()` on the encoder output (same order as `chunks`)
    """
    encoder = SentenceTransformer(model_name)

    # convert_to_tensor=False is passed explicitly; the result is converted
    # to plain Python lists via .tolist().
    vectors = encoder.encode(chunks, convert_to_tensor=False)

    return chunks, vectors.tolist()

In [None]:
# Re-run preprocessing so this cell does not depend on the example-usage cell
# having been executed first.
original_text, segments, cleaned_text, tokens, structured_text = preprocess_doc(file_path)

# Generate embeddings for the segmented text chunks
chunks, embeddings = embed_text_chunks(segments)

# Display every chunk (numbered from 1) with the first 5 values of its embedding
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
    print(f"Chunk {i+1}:")
    print(chunk)
    print(f"Embedding (first 5 values): {embedding[:5]}\n")

## Create a Search Index

## Query the System