In [2]:
#!pip3 install open-webui python-docx

Collecting open-webui
  Downloading open_webui-0.3.21-py3-none-any.whl.metadata (16 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting aiohttp==3.10.5 (from open-webui)
  Downloading aiohttp-3.10.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting alembic==1.13.2 (from open-webui)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting anthropic (from open-webui)
  Downloading anthropic-0.34.2-py3-none-any.whl.metadata (18 kB)
Collecting apscheduler==3.10.4 (from open-webui)
  Downloading APScheduler-3.10.4-py3-none-any.whl.metadata (5.7 kB)
Collecting authlib==1.3.2 (from open-webui)
  Downloading Authlib-1.3.2-py2.py3-none-any.whl.metadata (3.9 kB)
Collecting bcrypt==4.2.0 (from open-webui)
  Downloading bcrypt-4.2.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (9.6 kB)
Collecting black==24.8.0 (from open-webui)
  Downloading black-24.8.0-cp311-cp311-manylinux_2_17_x86_64.ma

In [3]:
from docx import Document

def extract_text_from_paragraphs(paragraphs, max_chunk_length=512):
    """Group consecutive paragraphs into whitespace-word-bounded text chunks.

    Paragraph texts are concatenated (separated by a single space) for as long
    as the running word count stays strictly below ``max_chunk_length``. When
    adding the next paragraph would reach or exceed the limit, the accumulated
    chunk is emitted and a new chunk is started with that paragraph. A single
    paragraph is never split, so a chunk can exceed the limit when one
    paragraph alone is longer than ``max_chunk_length``.

    Args:
        paragraphs: Iterable of objects exposing a ``text`` string attribute.
        max_chunk_length: Word budget per chunk, counted with ``str.split()``.
            Defaults to 512; adjust based on the model's token limit.

    Returns:
        list[str]: Stripped, non-empty chunks in document order.
    """
    text_chunks = []
    current_chunk = ""
    # Running word count of current_chunk; avoids re-splitting the whole
    # accumulated text for every paragraph.
    current_words = 0

    for para in paragraphs:
        para_words = len(para.text.split())
        # Combine paragraphs until reaching the maximum chunk length
        if current_words + para_words < max_chunk_length:
            current_chunk += " " + para.text
            current_words += para_words
        else:
            finished = current_chunk.strip()
            # Nothing may have been accumulated yet (e.g. the very first
            # paragraph is already over the limit); never emit an empty chunk.
            if finished:
                text_chunks.append(finished)
            current_chunk = para.text
            current_words = para_words

    # Flush the tail, ignoring whitespace-only leftovers from blank paragraphs.
    tail = current_chunk.strip()
    if tail:
        text_chunks.append(tail)

    return text_chunks

def chunk_document(doc_path):
    """Split a .docx file into text chunks.

    Paragraph text is chunked by ``extract_text_from_paragraphs``; after that,
    each table in ``doc.tables`` contributes one additional chunk in which
    cells are joined with ' | ' and rows are joined with newlines.

    Args:
        doc_path: Path handed to ``Document`` to open the file.

    Returns:
        list[str]: Paragraph chunks followed by one chunk per non-empty table.
    """
    document = Document(doc_path)

    # Paragraph-derived chunks come first
    chunks = list(extract_text_from_paragraphs(document.paragraphs))

    # Then one chunk per table, rendered row by row
    for table in document.tables:
        rendered_rows = [
            ' | '.join(cell.text for cell in row.cells)
            for row in table.rows
        ]
        # A table without rows produces no chunk
        if rendered_rows:
            chunks.append('\n'.join(rendered_rows))

    return chunks



In [4]:

# Example usage: chunk the sample document
doc_path = '/home/cdsw/data/input_doc.docx'
chunks = chunk_document(doc_path)

# Print each chunk under a 1-based label, followed by a 40-dash rule
for chunk_number, chunk_text in enumerate(chunks, start=1):
    print(f"Chunk {chunk_number}:\n{chunk_text}\n{'-'*40}")



Chunk 1:
Sensitivity: Yes./No. Paragraph numbering begins here and it continuous for the rest of the document. Do not include recommendations in paragraph numbering.  Anticipated negative reactions from those affected by the recommendations should be listed here. Consider sensitivities from a ‘whole-of-Organisation’ and/or whole-of-Organisation perspective, including key Organisation risks: capability, safety, reputation and security. Financial Impacts: Yes./No. Financial implications must be detailed here. You must consult the Chief Financial Officer when there are financial implications. List who has been consulted in the consultation section below. Financial implication must be considered from a whole-of-Organisation and/or a whole-of-Organisation perspective. Systems/Policy/deregulation: Yes./No. List relevant Policy and how it applies. If applicable, detail the managers accountabilities under policy and regulation. State new or amended regulations, application forms or any other l

In [None]:
# Assuming editor is the KAREN model pipeline
# NOTE(review): `editor` and `chunks` are not defined in this cell; they must
# already exist in the kernel (chunks comes from the chunk_document example).
corrections = []
for chunk in chunks:
    # Wrap the chunk in the system/user/assistant prompt markers
    input_text = (
        "<|system|>You are an assistant that corrects spelling and grammatical errors in text.<|end|>\n"
        f"<|user|>Edit the following text for spelling and grammar mistakes: '{chunk}'<|end|>\n"
        "<|assistant|>\n"
    )
    # NOTE(review): if `editor` is a Hugging Face text-generation pipeline,
    # max_length presumably counts prompt tokens as well, which leaves little
    # room for output on long chunks -- confirm, and consider max_new_tokens.
    output = editor(input_text, max_length=512, num_return_sequences=1)
    # Keep only what follows the last assistant marker (drops the echoed prompt)
    corrected_text = output[0]['generated_text'].split("<|assistant|>")[-1].strip()
    corrections.append(corrected_text)

# Output the corrected chunks
for i, corrected_text in enumerate(corrections):
    print(f"Corrected Chunk {i+1}:\n{corrected_text}\n{'-'*40}")

In [6]:
from docx import Document
from transformers import pipeline
from llama_cpp import Llama
from smartdoc_utils import document_postprocessing, process_llm_output
# Absolute path to the local GGUF model file; passed as `model_path` to Llama() inside proofread_and_edit.
model_path = "/home/cdsw/.cache/huggingface/hub/models--TheBloke--Karen_TheEditor_V2_STRICT_Mistral_7B-GGUF/snapshots/6c654aa207a4b673379db7a928b87a01d644676c/karen_theeditor_v2_strict_mistral_7b.Q8_0.gguf"



def _build_proofread_prompt(text):
    """Return the prompt asking the model to proofread ``text``.

    Each instruction and each template item sits on its own line. The segments
    are adjacent string literals, so without explicit newlines they are
    concatenated into one run-on line.
    """
    # NOTE(review): the <|system|>/<|user|>/<|end|> markers are kept as-is;
    # confirm they match the chat template this model was trained with.
    return (
        "<|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>\n"
        "<|user|> ** TASK**  \n"
        f"1. Edit the following text for spelling and grammar mistakes:  '{text}'\n"
        "2. Remove any formatting errors such as extra spaces\n"
        "**IMPORTANT** use the following template to format your response **.\n"
        "1. Edited Text: [ Your corrected text here ]\n"
        "2. Corrections: [ list **each correction made** ]"
        "<|end|>\n"
        "<|assistant|>\n"
    )


def proofread_and_edit(input_file, output_file, correction_file):
    """Proofread every non-blank paragraph of a .docx file with the local LLM.

    For each paragraph in ``doc.paragraphs`` whose text is not blank, the text
    is sent to the model, the response is parsed with ``process_llm_output``
    and the paragraph text is replaced by the edited text. The font name, size,
    bold and italic settings of the original runs are copied onto the new runs
    by position. The original text and the raw model response are appended to
    a separate corrections document.

    Only ``doc.paragraphs`` is iterated; nothing else in the document is
    modified by this function.

    Args:
        input_file: Path of the .docx file to proofread.
        output_file: Path where the edited document is saved.
        correction_file: Path where the corrections log document is saved.

    Returns:
        None. The results are the two files written to disk.
    """
    # Load the document
    doc = Document(input_file)
    corrections_doc = Document()
    corrections_doc.add_heading("Corrections Made", 0)

    # Initialize the model with GPU support
    llm = Llama(
        model_path=model_path,
        n_gpu_layers=-1,  # -1 means use all available GPU layers
        n_ctx=16384,  # adjust based on your GPU memory
    )

    # Process each paragraph
    for para in doc.paragraphs:
        text = para.text
        if not text.strip():
            print("Skipping LLM CALL:")
            continue

        # Snapshot the run objects before the paragraph is cleared so their
        # font settings can be copied back afterwards.
        original_runs = list(para.runs)

        # Use the LLM to proofread and edit
        prompt = _build_proofread_prompt(text)
        edited_text = llm(prompt, max_tokens=400, temperature=0.7, top_p=0.1, top_k=40, repeat_penalty=1.18)
        llm_output_text = edited_text['choices'][0]['text'].strip()
        print(f"PROMPT: \n {prompt} \n\n")
        print(f"LLM RESPONSE: \n {llm_output_text}\n\n")

        # presumably returns (edited_text, corrections) -- verify against smartdoc_utils
        edits, _corrections = process_llm_output(llm_output_text)

        if isinstance(edits, str) and edits.strip():
            # Clear the paragraph and add the edited text
            para.clear()
            para.add_run(edits)

            # Attempt to reapply formatting, pairing new and old runs by position
            for run, original in zip(para.runs, original_runs):
                run.font.name = original.font.name
                run.font.size = original.font.size
                run.font.bold = original.font.bold
                run.font.italic = original.font.italic
                # Add more formatting attributes as needed
        else:
            # Never replace content with nothing: if no edited text could be
            # parsed from the response, leave the paragraph untouched.
            print("No edited text parsed from LLM response; keeping original paragraph.")

        # Log the original text and the raw model response
        corrections_doc.add_paragraph()
        corrections_doc.add_heading(f"Original Text : \n {text}")
        corrections_doc.add_paragraph(f"Corrections: \n{llm_output_text}")

    # Save the edited document
    doc.save(output_file)
    corrections_doc.save(correction_file)
    
    


In [8]:
# Run the proofreading pipeline on the sample document.
input_file, output_file, correction_file = (
    "/home/cdsw/data/simple-word-file-2.docx",
    "/home/cdsw/data/output_doc.docx",
    "/home/cdsw/data/correction_file.docx",
)
proofread_and_edit(
    input_file=input_file,
    output_file=output_file,
    correction_file=correction_file,
)

llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from /home/cdsw/.cache/huggingface/hub/models--TheBloke--Karen_TheEditor_V2_STRICT_Mistral_7B-GGUF/snapshots/6c654aa207a4b673379db7a928b87a01d644676c/karen_theeditor_v2_strict_mistral_7b.Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = fpham_karen_theeditor_v2_strict_mistr...
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32    

llm_load_tensors:        CPU buffer size =  7338.66 MiB
...................................................................................................
llama_new_context_with_model: n_ctx      = 16384
llama_new_context_with_model: n_batch    = 512
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base  = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:        CPU KV buffer size =  2048.00 MiB
llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
llama_new_context_with_model:        CPU  output buffer size =     0.12 MiB
llama_new_context_with_model:        CPU compute buffer size =  1088.01 MiB
llama_new_context_with_model: graph nodes  = 1030
llama_new_context_with_model: graph splits = 1
AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 |

PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'HEADING'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: HEADING
2. Corrections: None needed.


Skipping LLM CALL:



llama_print_timings:        load time =    3671.91 ms
llama_print_timings:      sample time =      15.51 ms /    23 runs   (    0.67 ms per token,  1482.91 tokens per second)
llama_print_timings: prompt eval time =    2033.38 ms /    72 tokens (   28.24 ms per token,    35.41 tokens per second)
llama_print_timings:        eval time =    4555.04 ms /    22 runs   (  207.05 ms per token,     4.83 tokens per second)
llama_print_timings:       total time =    6623.81 ms /    94 tokens
Llama.generate: 57 prefix-match hit, remaining 75 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'SECTION 1 '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: 'SECTION 1'
2. Corrections: None needed.





llama_print_timings:        load time =    3671.91 ms
llama_print_timings:      sample time =      16.04 ms /    24 runs   (    0.67 ms per token,  1496.54 tokens per second)
llama_print_timings: prompt eval time =    2301.27 ms /    75 tokens (   30.68 ms per token,    32.59 tokens per second)
llama_print_timings:        eval time =    4759.36 ms /    23 runs   (  206.93 ms per token,     4.83 tokens per second)
llama_print_timings:       total time =    7096.89 ms /    98 tokens
Llama.generate: 57 prefix-match hit, remaining 72 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'I am trying something bold here. '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: I am trying something bold here.
2. Corrections: None needed.


Skipping LLM CALL:
Skipping LLM CALL:



llama_print_timings:        load time =    3671.91 ms
llama_print_timings:      sample time =      15.94 ms /    24 runs   (    0.66 ms per token,  1505.36 tokens per second)
llama_print_timings: prompt eval time =    2050.35 ms /    72 tokens (   28.48 ms per token,    35.12 tokens per second)
llama_print_timings:        eval time =    4766.07 ms /    23 runs   (  207.22 ms per token,     4.83 tokens per second)
llama_print_timings:       total time =    6852.57 ms /    95 tokens
Llama.generate: 57 prefix-match hit, remaining 70 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'SECTION 2 '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: SECTION 2
2. Corrections: No formatting errors found.


Skipping LLM CALL:



llama_print_timings:        load time =    3671.91 ms
llama_print_timings:      sample time =      12.77 ms /    19 runs   (    0.67 ms per token,  1487.51 tokens per second)
llama_print_timings: prompt eval time =    2082.25 ms /    70 tokens (   29.75 ms per token,    33.62 tokens per second)
llama_print_timings:        eval time =    3744.46 ms /    18 runs   (  208.03 ms per token,     4.81 tokens per second)
llama_print_timings:       total time =    5855.26 ms /    88 tokens
Llama.generate: 57 prefix-match hit, remaining 127 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'Product Details'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: Product Details
2. Corrections: None needed.


Skipping LLM CALL:



llama_print_timings:        load time =    3671.91 ms
llama_print_timings:      sample time =      72.50 ms /   106 runs   (    0.68 ms per token,  1462.15 tokens per second)
llama_print_timings: prompt eval time =    3728.35 ms /   127 tokens (   29.36 ms per token,    34.06 tokens per second)
llama_print_timings:        eval time =   21829.08 ms /   105 runs   (  207.90 ms per token,     4.81 tokens per second)
llama_print_timings:       total time =   25733.52 ms /   232 tokens
Llama.generate: 57 prefix-match hit, remaining 126 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'I saw a smaple product. These products is below expected quality. \n I start to understand what he said is quite     right.  The correct value is 12.5  % and 3   r d position. The time is 10:30 am '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: 'I saw a sample product. These products are below expected quality. I start to understand what he said is quite right. The correct value is 12.5% and the 3rd position. The time is 10:30 am.'
2. Corrections: [Your corrected text here]
- Replace 'smaple' with 'sample'.
- Remove extra space after '%'.
- Add a period at the end


llama_print_timings:        load time =    3671.91 ms
llama_print_timings:      sample time =      50.47 ms /    73 runs   (    0.69 ms per token,  1446.49 tokens per second)
llama_print_timings: prompt eval time =    3590.44 ms /   126 tokens (   28.50 ms per token,    35.09 tokens per second)
llama_print_timings:        eval time =   14920.39 ms /    72 runs   (  207.23 ms per token,     4.83 tokens per second)
llama_print_timings:       total time =   18626.28 ms /   198 tokens
Llama.generate: 57 prefix-match hit, remaining 70 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  '                   The length of the object is 845mm and cost $1m. The sunrise happens at 0750 EST and visibily is 35 km.      The right way to summarize and favor some advisors is still to be found'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: The length of the object is 845mm and costs $1m. The sunrise happens at 0750 EST and visibility is 35 km. The right way to summarize and favor some advisors is still to be found.'
2. Corrections: [No corrections needed]


Skipping LLM CALL:
Skipping LLM CALL:



llama_print_timings:        load time =    3671.91 ms
llama_print_timings:      sample time =      12.59 ms /    19 runs   (    0.66 ms per token,  1509.25 tokens per second)
llama_print_timings: prompt eval time =    2038.83 ms /    70 tokens (   29.13 ms per token,    34.33 tokens per second)
llama_print_timings:        eval time =    3724.97 ms /    18 runs   (  206.94 ms per token,     4.83 tokens per second)
llama_print_timings:       total time =    5792.25 ms /    88 tokens
Llama.generate: 57 prefix-match hit, remaining 131 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'Table Section'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: Table Section
2. Corrections: None needed.





llama_print_timings:        load time =    3671.91 ms
llama_print_timings:      sample time =      79.62 ms /   112 runs   (    0.71 ms per token,  1406.61 tokens per second)
llama_print_timings: prompt eval time =    3859.18 ms /   131 tokens (   29.46 ms per token,    33.95 tokens per second)
llama_print_timings:        eval time =   23150.52 ms /   111 runs   (  208.56 ms per token,     4.79 tokens per second)
llama_print_timings:       total time =   27198.47 ms /   242 tokens


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'Some table to be addd latr here by I. I am trying t  o  make a looooong  setence to see if the LLM is able to capture some of these                       errors such as 2 n    d  or 4 t  h or 6 %s '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: 'Some table to be added later here by I. I am trying t o make a looooong setence to see if the LLM is able to capture some of these errors such as 2 n, d or 4 t h or 6 %s'
2. Corrections: [1] Remove extra spaces between words and numbers; [2] Add missing period after "I"; [3] Replace "looooong" with a more appropriate word 