In [1]:
pip install PyPDF2 shiba-model

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
import PyPDF2
from shiba import Shiba, CodepointTokenizer, get_pretrained_from_hub

# Load SHIBA model and tokenizer
shiba_model = Shiba()
shiba_model.load_state_dict(get_pretrained_from_hub())
shiba_model.eval()  # Disable dropout
tokenizer = CodepointTokenizer()

def extract_text_from_pdf(pdf_path):
    """Extract text from a given PDF file."""
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text

def split_text(text, max_length=1800):
    """Split text into chunks within the max length allowed by SHIBA."""
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]

def process_pdf_with_shiba(pdf_path):
    # Extract and clean text from PDF
    text = extract_text_from_pdf(pdf_path)
    # Split text into manageable chunks
    text_chunks = split_text(text)
    all_outputs = []
    encoded_chunks = []

    for i, chunk in enumerate(text_chunks):
        # Encode each chunk
        encoded = tokenizer.encode_batch([chunk])
        print(f"Chunk {i} encoding result:", encoded)  # Debugging statement

        # Access `input_ids` and `attention_mask` and structure inputs for the model
        if 'input_ids' in encoded and 'attention_mask' in encoded:
            input_ids = encoded['input_ids']
            attention_mask = encoded['attention_mask']
            inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
            outputs = shiba_model(**inputs)
            all_outputs.append(outputs)
            encoded_chunks.append(input_ids[0])  # Store input_ids for decoding

    return all_outputs, encoded_chunks

def decode_tokens(encoded_chunks):
    decoded_text = ""
    for tokens in encoded_chunks:
        decoded_text += tokenizer.decode(tokens)
    return decoded_text

# Example usage
pdf_path = "sample.pdf"
outputs, encoded_chunks = process_pdf_with_shiba(pdf_path)
decoded_text = decode_tokens(encoded_chunks)
print("Decoded Text:", decoded_text)


Chunk 0 encoding result: {'input_ids': tensor([[57344,    83,    97,  ...,    32,    68,   111]]), 'attention_mask': tensor([[False, False, False,  ..., False, False, False]])}
Chunk 1 encoding result: {'input_ids': tensor([[57344,   110,   101,  ...,    97,   109,    46]]), 'attention_mask': tensor([[False, False, False,  ..., False, False, False]])}
Decoded Text: Sample PDFThis is a simple PDF ﬁle. Fun fun fun.Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Phasellus facilisis odio sed mi. Curabitur suscipit. Nullam vel nisi. Etiam semper ipsum ut lectus. Proin aliquam, erat eget pharetra commodo, eros mi condimentum quam, sed commodo justo quam ut velit. Integer a erat. Cras laoreet ligula cursus enim. Aenean scelerisque velit et tellus. Vestibulum dictum aliquet sem. Nulla facilisi. Vestibulum accumsan ante vitae elit. Nulla erat dolor, blandit in, rutrum quis, semper pulvinar, enim. Nullam varius congue risus. Vivamus sollicitudin, metus ut interdum eleifend, nisi tellu

In [3]:
pip install fpdf

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import PyPDF2
from fpdf import FPDF
from shiba import Shiba, CodepointTokenizer, get_pretrained_from_hub

# Load SHIBA model and tokenizer
shiba_model = Shiba()
shiba_model.load_state_dict(get_pretrained_from_hub())
shiba_model.eval()  # Disable dropout
tokenizer = CodepointTokenizer()

def extract_text_from_pdf(pdf_path):
    """Extract text from a given PDF file."""
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text

def split_text(text, max_length=1800):
    """Split text into chunks within the max length allowed by SHIBA."""
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]

def process_pdf_with_shiba(pdf_path):
    # Extract and clean text from PDF
    text = extract_text_from_pdf(pdf_path)
    # Split text into manageable chunks
    text_chunks = split_text(text)
    all_outputs = []
    encoded_chunks = []

    for i, chunk in enumerate(text_chunks):
        # Encode each chunk
        encoded = tokenizer.encode_batch([chunk])
        print(f"Chunk {i} encoding result:", encoded)  # Debugging statement

        # Access `input_ids` and `attention_mask` and structure inputs for the model
        if 'input_ids' in encoded and 'attention_mask' in encoded:
            input_ids = encoded['input_ids']
            attention_mask = encoded['attention_mask']
            inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
            outputs = shiba_model(**inputs)
            all_outputs.append(outputs)
            encoded_chunks.append(input_ids[0])  # Store input_ids for decoding

    return all_outputs, encoded_chunks

def decode_tokens(encoded_chunks):
    decoded_text = ""
    for tokens in encoded_chunks:
        decoded_text += tokenizer.decode(tokens)
    return decoded_text

def save_text_to_pdf(text, output_pdf_path):
    """Save the provided text to a PDF file."""
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)

    # Add each line of the text to the PDF, handling line breaks
    for line in text.splitlines():
        pdf.cell(200, 10, txt=line, ln=True)

    pdf.output(output_pdf_path)

# Example usage
pdf_path = "sample.pdf"
output_pdf_path = "output.pdf"
outputs, encoded_chunks = process_pdf_with_shiba(pdf_path)
decoded_text = decode_tokens(encoded_chunks)
print("Decoded Text:", decoded_text)

# Save the decoded text as a PDF
save_text_to_pdf(decoded_text, output_pdf_path)
print(f"Processed text saved as {output_pdf_path}")


  return torch.load(save_location, map_location=torch.device('cpu'))


Chunk 0 encoding result: {'input_ids': tensor([[57344,    83,    97,  ...,    32,    68,   111]]), 'attention_mask': tensor([[False, False, False,  ..., False, False, False]])}
Chunk 1 encoding result: {'input_ids': tensor([[57344,   110,   101,  ...,    97,   109,    46]]), 'attention_mask': tensor([[False, False, False,  ..., False, False, False]])}
Decoded Text: Sample PDFThis is a simple PDF ﬁle. Fun fun fun.Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Phasellus facilisis odio sed mi. Curabitur suscipit. Nullam vel nisi. Etiam semper ipsum ut lectus. Proin aliquam, erat eget pharetra commodo, eros mi condimentum quam, sed commodo justo quam ut velit. Integer a erat. Cras laoreet ligula cursus enim. Aenean scelerisque velit et tellus. Vestibulum dictum aliquet sem. Nulla facilisi. Vestibulum accumsan ante vitae elit. Nulla erat dolor, blandit in, rutrum quis, semper pulvinar, enim. Nullam varius congue risus. Vivamus sollicitudin, metus ut interdum eleifend, nisi tellu

UnicodeEncodeError: 'latin-1' codec can't encode character '\ue000' in position 50: ordinal not in range(256)