In [1]:
#!pip3 install -q open-webui python-docx

In [4]:
#imports and set up the local model
from docx import Document
from transformers import pipeline
from llama_cpp import Llama
from copy import deepcopy
from smartdoc_utils import document_postprocessing, process_llm_output
model_path = "/home/cdsw/.cache/huggingface/hub/models--TheBloke--Karen_TheEditor_V2_STRICT_Mistral_7B-GGUF/snapshots/6c654aa207a4b673379db7a928b87a01d644676c/karen_theeditor_v2_strict_mistral_7b.Q8_0.gguf"
# Initialize the model with GPU support
llm = Llama(
    model_path=model_path,
    n_gpu_layers=-1,  # -1 means use all available GPU layers
    n_ctx=16384,  # adjust based on your GPU memory
)


  from .autonotebook import tqdm as notebook_tqdm
llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from /home/cdsw/.cache/huggingface/hub/models--TheBloke--Karen_TheEditor_V2_STRICT_Mistral_7B-GGUF/snapshots/6c654aa207a4b673379db7a928b87a01d644676c/karen_theeditor_v2_strict_mistral_7b.Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = fpham_karen_theeditor_v2_strict_mistr...
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5: 

In [9]:

def fetch_llm_response(text):
    #Use the LLM to proofread and edit
    # prompt = (
    #     "<|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>\n"
    #     "<|user|> ** TASK**  \n" 
    #     f"1. Edit the following text for spelling and grammar mistakes:  '{text}'\n"
    #     f"2. Remove any formatting errors such as extra spaces \n"
    #     f"**IMPORTANT** use the following template to format your response **.\n"
    #     f"1. Edited Text: \n"
    #     f"[ Your corrected text here ]\n"
    #     f"2. Corrections: \n"
    #     f"[ list **each correction made here** ]\n"
    #     "<|end|>\n"
    #     "<|assistant|>\n"
    # )
    prompt = (
        "<|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>\n"
        "<|user|> ** TASK**  \n" 
        f"1. Edit the following text for spelling and grammar mistakes:  '{text}'\n"
        f"2. Remove any formatting errors such as extra spaces"
        f"**IMPORTANT** use the following template to format your response **."
        f"1. Edited Text: "
        f"[ Your corrected text here ]"
        f"2. Corrections: "
        f"[ Make a numbered list of your corrections ]"
        "<|end|>\n"
        "<|assistant|>\n"
    )    
#        edited_text = llm(prompt, max_length=len(text) + 50)[0]['generated_text']
    edited_text=llm(prompt, max_tokens=400, temperature=0.7, top_p=0.1, top_k=40, repeat_penalty=1.18)
    response_text = edited_text['choices'][0]['text'].strip()
    print(f"PROMPT: \n {prompt} \n\n")
    print(f"LLM RESPONSE: \n{response_text}\n\n")      
    
    return response_text
    
# code that Aamir showed with language_tool checks comes here
# Pre-process an input document to handle formatting, 
# language checks, etc. before further processing
#
# Args:
#   input_doc: The original input document 
#   corrections_doc: An empty document to populate with corrections 
#
# Returns: 
#   modified_doc: A modified version of the input document 
#   corrections_doc: Populated with any corrections made during p
def pre_process_document(input_doc, corrections_doc): 
    
    ## REPLACE this with logic of the modified doc
    modified_doc = deepcopy(input_doc)
    
    # logic for pre-processing Fonts, Australian Language Checks, etc. 
    
    return modified_doc, corrections_doc

def process_document_paragraphs(modified_doc, corrections_doc) : 

    #modified_doc = deepcopy(input_doc)
    # Process each paragraph
    for para in modified_doc.paragraphs:
        # Store the original formatting
        original_runs = para.runs.copy()
        
        if not para.text.strip():
            print("Skipping LLM CALL:")
            continue
        else:
            # Get the text content
            text = para.text

        llm_output_text = fetch_llm_response(text)

        # edits, corrections = document_postprocessing(llm_output_text)
        edits, corrections = process_llm_output(llm_output_text)
        print("EDITS : \n", edits)
        print("CORRECTIONS: \n", corrections)
        # Clear the paragraph and add the edited text
        para.clear()
        #para.add_run(edited_text['choices'][0]['text'].strip())
        para.add_run(edits)
        # Attempt to reapply formatting
        new_runs = para.runs
        for i, run in enumerate(new_runs):
            if i < len(original_runs):
                run.font.name = original_runs[i].font.name
                run.font.size = original_runs[i].font.size
                run.font.bold = original_runs[i].font.bold
                run.font.italic = original_runs[i].font.italic
                # Add more formatting attributes as needed
        

        # Let us log the corrections made
        corrections_doc.add_paragraph()
        corrections_doc.add_paragraph(f"Original Text : \n {text}")
        corrections_doc.add_paragraph(f"Edits : \n {edits}")
        corrections_doc.add_paragraph(f"Corrections: \n{llm_output_text}")
                   
    
    return modified_doc, corrections_doc

    
def process_document_tables(modified_doc, corrections_doc) : 
    
    ## REPLACE this with logic of the modified doc
    #modified_doc = deepcopy(input_doc)
    # Iterate through all tables in the document
    for table in modified_doc.tables:
        print("IN Table")        
        printed_cells = set()  # To keep track of cells that have been processed
        for r_index, row in enumerate(table.rows):
            for c_index, cell in enumerate(row.cells):
                cell_id = (r_index, c_index)  # Unique identifier for the cell
                
                # Skip this cell if it is already processed as part of a merged cell
                if cell_id in printed_cells:
                    continue

                # Detect merged cells
                is_merged = False
                for other_cell in row.cells:
                    if other_cell is not cell and other_cell._element is cell._element:
                        is_merged = True
                        break

                # If it's a merged cell, avoid processing duplicates
                if is_merged:
                    # Register this cell's element to skip duplicates
                    for merged_row_index, merged_row in enumerate(table.rows):
                        for merged_cell_index, merged_cell in enumerate(merged_row.cells):
                            if merged_cell._element is cell._element:
                                printed_cells.add((merged_row_index, merged_cell_index))

                # Append '**' to the text of the cell if not already processed
                if cell.text.strip():  # Check if the cell is not empty
#                    cell.text += '*T*B*L'
                    for para in cell.paragraphs:
                        # Add an asterisk (*) to the end of each cell paragraph
                        print(para.text)
                        # Just a small check to see that we processed this
                        #para.add_run('*T')

                        # Store the original formatting
                        original_runs = para.runs.copy()                        
                        # let us call the llm 
                        llm_output_text = fetch_llm_response(para.text)

                        # edits, corrections = document_postprocessing(llm_output_text)
                        edits, corrections = process_llm_output(llm_output_text) 
                        
                        #let us reapply formatting
                        
                        # Clear the paragraph and add the edited text
                        para.clear()
                        #para.add_run(edited_text['choices'][0]['text'].strip())
                        para.add_run(edits)
                        # Attempt to reapply formatting
                        new_runs = para.runs
                        for i, run in enumerate(new_runs):
                            if i < len(original_runs):
                                run.font.name = original_runs[i].font.name
                                run.font.size = original_runs[i].font.size
                                run.font.bold = original_runs[i].font.bold
                                run.font.italic = original_runs[i].font.italic
                                # Add more formatting attributes as needed                                               

                        # Let us log the corrections made
                        corrections_doc.add_paragraph()
                        corrections_doc.add_paragraph(f"Original Text : \n {para.text}")
                        corrections_doc.add_paragraph(f"Corrections: \n{llm_output_text}")
                        
            print()  # Newline after each row
    
    
    return modified_doc, corrections_doc

def proofread_document(input_file, output_file, correction_file):
    # Load the document
    input_doc = Document(input_file)
    corrections_doc = Document()
    corrections_doc.add_heading("Corrections Made", 0)
    modified_doc = deepcopy(input_doc)
    modified_doc, corrections_doc = process_document_paragraphs(modified_doc, corrections_doc)
    modified_doc, corrections_doc = process_document_tables(modified_doc, corrections_doc)
    # Save the edited document
    modified_doc.save(output_file)
    corrections_doc.save(correction_file)
    


In [10]:
#input_file = "/home/cdsw/data/simple-word-file-2.docx"
input_file = "/home/cdsw/data/simple-word-file-with-table.docx"
output_file = "/home/cdsw/data/output_doc.docx"
correction_file = "/home/cdsw/data/correction_file.docx"
proofread_document(input_file, output_file, correction_file)

Llama.generate: 126 prefix-match hit, remaining 1 prompt tokens to eval



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      13.61 ms /    20 runs   (    0.68 ms per token,  1469.51 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (    -nan ms per token,     -nan tokens per second)
llama_print_timings:        eval time =    4504.72 ms /    20 runs   (  225.24 ms per token,     4.44 tokens per second)
llama_print_timings:       total time =    4535.29 ms /    20 tokens
Llama.generate: 57 prefix-match hit, remaining 72 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'HEADING'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: HEADING
2. Corrections: None needed.


INSIDE process_llm_output
Edited Text:
HEADING

Corrections:
None needed.
DONE process_llm_output
EDITS : 
 HEADING
CORRECTIONS: 
 None needed.
Skipping LLM CALL:



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      15.85 ms /    23 runs   (    0.69 ms per token,  1450.83 tokens per second)
llama_print_timings: prompt eval time =    2169.47 ms /    72 tokens (   30.13 ms per token,    33.19 tokens per second)
llama_print_timings:        eval time =    5003.02 ms /    22 runs   (  227.41 ms per token,     4.40 tokens per second)
llama_print_timings:       total time =    7208.47 ms /    94 tokens
Llama.generate: 57 prefix-match hit, remaining 75 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'SECTION 1 '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: 'SECTION 1'
2. Corrections: None needed.


INSIDE process_llm_output
Edited Text:
'SECTION 1'

Corrections:
None needed.
DONE process_llm_output
EDITS : 
 'SECTION 1'
CORRECTIONS: 
 None needed.



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      16.02 ms /    24 runs   (    0.67 ms per token,  1497.94 tokens per second)
llama_print_timings: prompt eval time =    2265.24 ms /    75 tokens (   30.20 ms per token,    33.11 tokens per second)
llama_print_timings:        eval time =    5174.54 ms /    23 runs   (  224.98 ms per token,     4.44 tokens per second)
llama_print_timings:       total time =    7476.74 ms /    98 tokens
Llama.generate: 57 prefix-match hit, remaining 72 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'I am trying something bold here. '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: I am trying something bold here.
2. Corrections: None needed.


INSIDE process_llm_output
Edited Text:
I am trying something bold here.

Corrections:
None needed.
DONE process_llm_output
EDITS : 
 I am trying something bold here.
CORRECTIONS: 
 None needed.
Skipping LLM CALL:
Skipping LLM CALL:



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      15.59 ms /    24 runs   (    0.65 ms per token,  1539.94 tokens per second)
llama_print_timings: prompt eval time =    2160.33 ms /    72 tokens (   30.00 ms per token,    33.33 tokens per second)
llama_print_timings:        eval time =    5155.44 ms /    23 runs   (  224.15 ms per token,     4.46 tokens per second)
llama_print_timings:       total time =    7351.78 ms /    95 tokens
Llama.generate: 57 prefix-match hit, remaining 70 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'SECTION 2 '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: SECTION 2
2. Corrections: No formatting errors found.


INSIDE process_llm_output
Edited Text:
SECTION 2

Corrections:
No formatting errors found.
DONE process_llm_output
EDITS : 
 SECTION 2
CORRECTIONS: 
 No formatting errors found.
Skipping LLM CALL:



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      12.67 ms /    19 runs   (    0.67 ms per token,  1499.96 tokens per second)
llama_print_timings: prompt eval time =    2007.74 ms /    70 tokens (   28.68 ms per token,    34.87 tokens per second)
llama_print_timings:        eval time =    4037.45 ms /    18 runs   (  224.30 ms per token,     4.46 tokens per second)
llama_print_timings:       total time =    6073.88 ms /    88 tokens
Llama.generate: 57 prefix-match hit, remaining 127 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'Product Details'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: Product Details
2. Corrections: None needed.


INSIDE process_llm_output
Edited Text:
Product Details

Corrections:
None needed.
DONE process_llm_output
EDITS : 
 Product Details
CORRECTIONS: 
 None needed.
Skipping LLM CALL:



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      72.05 ms /   106 runs   (    0.68 ms per token,  1471.24 tokens per second)
llama_print_timings: prompt eval time =    3696.96 ms /   127 tokens (   29.11 ms per token,    34.35 tokens per second)
llama_print_timings:        eval time =   23606.03 ms /   105 runs   (  224.82 ms per token,     4.45 tokens per second)
llama_print_timings:       total time =   27473.94 ms /   232 tokens
Llama.generate: 57 prefix-match hit, remaining 126 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'I saw a smaple product. These products is below expected quality. \n I start to understand what he said is quite     right.  The correct value is 12.5  % and 3   r d position. The time is 10:30 am '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: 'I saw a sample product. These products are below expected quality. I start to understand what he said is quite right. The correct value is 12.5% and the 3rd position. The time is 10:30 am.'
2. Corrections: [Your corrected text here]
- Replace 'smaple' with 'sample'.
- Remove extra space after '%'.
- Add a period at the end 


llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      49.48 ms /    73 runs   (    0.68 ms per token,  1475.40 tokens per second)
llama_print_timings: prompt eval time =    3612.94 ms /   126 tokens (   28.67 ms per token,    34.87 tokens per second)
llama_print_timings:        eval time =   16247.22 ms /    72 runs   (  225.66 ms per token,     4.43 tokens per second)
llama_print_timings:       total time =   19975.02 ms /   198 tokens
Llama.generate: 57 prefix-match hit, remaining 70 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  '                   The length of the object is 845mm and cost $1m. The sunrise happens at 0750 EST and visibily is 35 km.      The right way to summarize and favor some advisors is still to be found'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: The length of the object is 845mm and costs $1m. The sunrise happens at 0750 EST and visibility is 35 km. The right way to summarize and favor some advisors is still to be found.'
2. Corrections: [No corrections needed]


INSIDE process_llm_output
Edited Text:
The length of the object is 845mm and costs $1m. The sunrise hap


llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      12.94 ms /    19 runs   (    0.68 ms per token,  1467.86 tokens per second)
llama_print_timings: prompt eval time =    2113.51 ms /    70 tokens (   30.19 ms per token,    33.12 tokens per second)
llama_print_timings:        eval time =    4070.32 ms /    18 runs   (  226.13 ms per token,     4.42 tokens per second)
llama_print_timings:       total time =    6212.60 ms /    88 tokens
Llama.generate: 57 prefix-match hit, remaining 128 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'Table Section'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: Table Section
2. Corrections: None needed.


INSIDE process_llm_output
Edited Text:
Table Section

Corrections:
None needed.
DONE process_llm_output
EDITS : 
 Table Section
CORRECTIONS: 
 None needed.



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      84.64 ms /   121 runs   (    0.70 ms per token,  1429.65 tokens per second)
llama_print_timings: prompt eval time =    3599.23 ms /   128 tokens (   28.12 ms per token,    35.56 tokens per second)
llama_print_timings:        eval time =   27011.23 ms /   120 runs   (  225.09 ms per token,     4.44 tokens per second)
llama_print_timings:       total time =   30808.45 ms /   248 tokens
Llama.generate: 57 prefix-match hit, remaining 72 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'Some table to be addd latr here by I. I am trying t  o  make a long  setence to see if the LLM is able to capture some of these                       errors such as 2    nd  or 4    th or 6.5   % '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: 'Some table to be added later here by I. I am trying t o make a long setence to see if the LLM is able to capture some of these errors such as 2 nd or 4 th or 6.5 %.'
2. Corrections: [1] Remove extra space after "Some table" and before "to be added later here by I." [2] Change "t o make a long setence" to "to create a long se


llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      24.46 ms /    37 runs   (    0.66 ms per token,  1512.74 tokens per second)
llama_print_timings: prompt eval time =    2098.54 ms /    72 tokens (   29.15 ms per token,    34.31 tokens per second)
llama_print_timings:        eval time =    8060.79 ms /    36 runs   (  223.91 ms per token,     4.47 tokens per second)
llama_print_timings:       total time =   10214.24 ms /   108 tokens
Llama.generate: 57 prefix-match hit, remaining 73 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'Spelling Errors'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: 'Spelling Errors'
2. Corrections: 
   - Remove extra space after the colon in "Spelling Errors"


INSIDE process_llm_output
Edited Text:
'Spelling Errors'

Corrections:
- Remove extra space after the colon in "Spelling Errors"
DONE process_llm_output
Grammar Errors



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      26.12 ms /    39 runs   (    0.67 ms per token,  1493.34 tokens per second)
llama_print_timings: prompt eval time =    2373.06 ms /    73 tokens (   32.51 ms per token,    30.76 tokens per second)
llama_print_timings:        eval time =    8553.36 ms /    38 runs   (  225.09 ms per token,     4.44 tokens per second)
llama_print_timings:       total time =   10985.28 ms /   111 tokens
Llama.generate: 57 prefix-match hit, remaining 76 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'Grammar Errors'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: 'Grammar Errors'
2. Corrections: 
   - Remove extra space after the colon in "Grammar Errors".


INSIDE process_llm_output
Edited Text:
'Grammar Errors'

Corrections:
- Remove extra space after the colon in "Grammar Errors".
DONE process_llm_output

This is a mistke in spelling



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      24.22 ms /    36 runs   (    0.67 ms per token,  1486.25 tokens per second)
llama_print_timings: prompt eval time =    2218.40 ms /    76 tokens (   29.19 ms per token,    34.26 tokens per second)
llama_print_timings:        eval time =    7823.37 ms /    35 runs   (  223.52 ms per token,     4.47 tokens per second)
llama_print_timings:       total time =   10097.87 ms /   111 tokens
Llama.generate: 59 prefix-match hit, remaining 73 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'This is a mistke in spelling'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: 'This is a mistake in spelling'
2. Corrections: 
   - Remove extra space after "mistake"


INSIDE process_llm_output
Edited Text:
'This is a mistake in spelling'

Corrections:
- Remove extra space after "mistake"
DONE process_llm_output
This is examples of grammar mistakes



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      30.96 ms /    47 runs   (    0.66 ms per token,  1518.33 tokens per second)
llama_print_timings: prompt eval time =    2413.91 ms /    73 tokens (   33.07 ms per token,    30.24 tokens per second)
llama_print_timings:        eval time =   10352.74 ms /    46 runs   (  225.06 ms per token,     4.44 tokens per second)
llama_print_timings:       total time =   12837.63 ms /   119 tokens
Llama.generate: 57 prefix-match hit, remaining 78 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'This is examples of grammar mistakes'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: This is an example of grammar mistakes.
2. Corrections: 
   - Remove "of" after "examples".
   - Change "is" to "are".


INSIDE process_llm_output
Edited Text:
This is an example of grammar mistakes.

Corrections:
- Remove "of" after "examples".
   - Change "is" to "are".
DONE process_llm_output

The wether was beautifull on our vacation.



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      35.25 ms /    53 runs   (    0.67 ms per token,  1503.42 tokens per second)
llama_print_timings: prompt eval time =    2248.12 ms /    78 tokens (   28.82 ms per token,    34.70 tokens per second)
llama_print_timings:        eval time =   11670.17 ms /    52 runs   (  224.43 ms per token,     4.46 tokens per second)
llama_print_timings:       total time =   13999.66 ms /   130 tokens
Llama.generate: 57 prefix-match hit, remaining 76 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'The wether was beautifull on our vacation.'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: The weather was beautiful on our vacation.
2. Corrections: 
   - Replace "wether" with "weather."
   - Remove extra space after the period at the end of the sentence.


INSIDE process_llm_output
Edited Text:
The weather was beautiful on our vacation.

Corrections:
- Replace "wether" with "weather."
   - Remove extra space after the period at the end of the sentence.
DONE process_llm_output
Me and her went to the store yesterday.



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      29.87 ms /    45 runs   (    0.66 ms per token,  1506.78 tokens per second)
llama_print_timings: prompt eval time =    2303.66 ms /    76 tokens (   30.31 ms per token,    32.99 tokens per second)
llama_print_timings:        eval time =    9860.12 ms /    44 runs   (  224.09 ms per token,     4.46 tokens per second)
llama_print_timings:       total time =   12231.37 ms /   120 tokens
Llama.generate: 57 prefix-match hit, remaining 82 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'Me and her went to the store yesterday.'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: Me and her went to the store yesterday.
2. Corrections: 
   - Remove extra space after "Me"
   - Change "her" to "her."


INSIDE process_llm_output
Edited Text:
Me and her went to the store yesterday.

Corrections:
- Remove extra space after "Me"
   - Change "her" to "her."
DONE process_llm_output

She recieved an accolade for her excelent work.



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      38.16 ms /    57 runs   (    0.67 ms per token,  1493.55 tokens per second)
llama_print_timings: prompt eval time =    2428.45 ms /    82 tokens (   29.62 ms per token,    33.77 tokens per second)
llama_print_timings:        eval time =   12577.08 ms /    56 runs   (  224.59 ms per token,     4.45 tokens per second)
llama_print_timings:       total time =   15091.84 ms /   138 tokens
Llama.generate: 57 prefix-match hit, remaining 79 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'She recieved an accolade for her excelent work.'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: 'She received an accolade for her excellent work.'
2. Corrections: 
   - Replace "recieved" with "received" (spelling error)
   - Remove extra space after "work."


INSIDE process_llm_output
Edited Text:
'She received an accolade for her excellent work.'

Corrections:
- Replace "recieved" with "received" (spelling error)
   - Remove extra space after "work."
DONE process_llm_output
 If I would have known, I would have came earlier.



llama_print_timings:        load time =    3776.62 ms
llama_print_timings:      sample time =      41.26 ms /    61 runs   (    0.68 ms per token,  1478.57 tokens per second)
llama_print_timings: prompt eval time =    2334.78 ms /    79 tokens (   29.55 ms per token,    33.84 tokens per second)
llama_print_timings:        eval time =   13498.09 ms /    60 runs   (  224.97 ms per token,     4.45 tokens per second)
llama_print_timings:       total time =   15927.16 ms /   139 tokens


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  ' If I would have known, I would have came earlier.'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ Make a numbered list of your corrections ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
1. Edited Text: 'If I would have known, I would have come earlier.'
2. Corrections: 
   - Replace "would" with "could" (correct verb tense)
   - Remove extra space after comma (proper punctuation)


INSIDE process_llm_output
Edited Text:
'If I would have known, I would have come earlier.'

Corrections:
- Replace "would" with "could" (correct verb tense)
   - Remove extra space after comma (proper punctuation)
DONE process_llm_output



In [None]:
def process_document(input_file, output_file):
      # Open the input Word document
    doc = Document(input_file)
    
    for element in doc.paragraphs:
        print("IN PURE PARAGRAPH")
        print(element.text)
        element.add_run('*P')
    
    # Iterate through all tables in the document
    for table in doc.tables:
        print("IN Table")        
        printed_cells = set()  # To keep track of cells that have been processed
        for r_index, row in enumerate(table.rows):
            for c_index, cell in enumerate(row.cells):
                cell_id = (r_index, c_index)  # Unique identifier for the cell
                
                # Skip this cell if it is already processed as part of a merged cell
                if cell_id in printed_cells:
                    continue

                # Detect merged cells
                is_merged = False
                for other_cell in row.cells:
                    if other_cell is not cell and other_cell._element is cell._element:
                        is_merged = True
                        break

                # If it's a merged cell, avoid processing duplicates
                if is_merged:
                    # Register this cell's element to skip duplicates
                    for merged_row_index, merged_row in enumerate(table.rows):
                        for merged_cell_index, merged_cell in enumerate(merged_row.cells):
                            if merged_cell._element is cell._element:
                                printed_cells.add((merged_row_index, merged_cell_index))

                # Append '**' to the text of the cell if not already processed
                if cell.text.strip():  # Check if the cell is not empty
#                    cell.text += '*T*B*L'
                    for paragraph in cell.paragraphs:
                        # Add an asterisk (*)   to the end of each cell paragraph
                        print(paragraph.text)
                        paragraph.add_run('*T')

            print()  # Newline after each row

    # Save the modified document to the output file
    doc.save(output_file)


In [2]:
input_file = "/home/cdsw/data/simple-word-file-2.docx"
output_file = "/home/cdsw/data/output_doc.docx"
correction_file = "/home/cdsw/data/correction_file.docx"
proofread_and_edit_paragraphs(input_file, output_file, correction_file)


llama_print_timings:        load time =    3653.13 ms
llama_print_timings:      sample time =      13.54 ms /    20 runs   (    0.68 ms per token,  1477.43 tokens per second)
llama_print_timings: prompt eval time =    3652.78 ms /   125 tokens (   29.22 ms per token,    34.22 tokens per second)
llama_print_timings:        eval time =    4100.43 ms /    19 runs   (  215.81 ms per token,     4.63 tokens per second)
llama_print_timings:       total time =    7784.64 ms /   144 tokens
Llama.generate: 57 prefix-match hit, remaining 70 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'HEADING'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ list **each correction made** ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: HEADING
2. Corrections: None needed.


Skipping LLM CALL:



llama_print_timings:        load time =    3653.13 ms
llama_print_timings:      sample time =      15.20 ms /    23 runs   (    0.66 ms per token,  1512.86 tokens per second)
llama_print_timings: prompt eval time =    2041.48 ms /    70 tokens (   29.16 ms per token,    34.29 tokens per second)
llama_print_timings:        eval time =    4749.94 ms /    22 runs   (  215.91 ms per token,     4.63 tokens per second)
llama_print_timings:       total time =    6826.65 ms /    92 tokens
Llama.generate: 57 prefix-match hit, remaining 73 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'SECTION 1 '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ list **each correction made** ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: 'SECTION 1'
2. Corrections: None needed.





llama_print_timings:        load time =    3653.13 ms
llama_print_timings:      sample time =      16.55 ms /    25 runs   (    0.66 ms per token,  1510.67 tokens per second)
llama_print_timings: prompt eval time =    2205.26 ms /    73 tokens (   30.21 ms per token,    33.10 tokens per second)
llama_print_timings:        eval time =    5223.78 ms /    24 runs   (  217.66 ms per token,     4.59 tokens per second)
llama_print_timings:       total time =    7466.42 ms /    97 tokens
Llama.generate: 57 prefix-match hit, remaining 70 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'I am trying something bold here. '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ list **each correction made** ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: 'I am trying something bold here.'
2. Corrections: None needed.


Skipping LLM CALL:
Skipping LLM CALL:



llama_print_timings:        load time =    3653.13 ms
llama_print_timings:      sample time =      14.44 ms /    22 runs   (    0.66 ms per token,  1523.65 tokens per second)
llama_print_timings: prompt eval time =    2520.20 ms /    70 tokens (   36.00 ms per token,    27.78 tokens per second)
llama_print_timings:        eval time =    4552.72 ms /    21 runs   (  216.80 ms per token,     4.61 tokens per second)
llama_print_timings:       total time =    7105.25 ms /    91 tokens
Llama.generate: 57 prefix-match hit, remaining 68 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'SECTION 2 '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ list **each correction made** ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: SECTION 2
2. Corrections: No corrections needed.


Skipping LLM CALL:



llama_print_timings:        load time =    3653.13 ms
llama_print_timings:      sample time =      14.43 ms /    22 runs   (    0.66 ms per token,  1524.71 tokens per second)
llama_print_timings: prompt eval time =    1988.19 ms /    68 tokens (   29.24 ms per token,    34.20 tokens per second)
llama_print_timings:        eval time =    4535.71 ms /    21 runs   (  215.99 ms per token,     4.63 tokens per second)
llama_print_timings:       total time =    6556.68 ms /    89 tokens
Llama.generate: 57 prefix-match hit, remaining 125 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'Product Details'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ list **each correction made** ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 **.1. Edited Text:** Product Details
**.2. Corrections:** No corrections needed.


Skipping LLM CALL:



llama_print_timings:        load time =    3653.13 ms
llama_print_timings:      sample time =      98.38 ms /   139 runs   (    0.71 ms per token,  1412.83 tokens per second)
llama_print_timings: prompt eval time =    3658.11 ms /   125 tokens (   29.26 ms per token,    34.17 tokens per second)
llama_print_timings:        eval time =   30722.95 ms /   138 runs   (  222.63 ms per token,     4.49 tokens per second)
llama_print_timings:       total time =   34625.03 ms /   263 tokens
Llama.generate: 57 prefix-match hit, remaining 124 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'I saw a smaple product. These products is below expected quality. \n I start to understand what he said is quite     right.  The correct value is 12.5  % and 3   r d position. The time is 10:30 am '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ list **each correction made** ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: 'I saw a sample product. These products are below expected quality. I start to understand what he said is quite right. The correct value is 12.5% and the third position. The time is 10:30 am.'
2. Corrections: [list each correction made]
   - Replaced "smaple" with "sample"
   - Changed "These products is below expected quality." to "T


llama_print_timings:        load time =    3653.13 ms
llama_print_timings:      sample time =      78.28 ms /   113 runs   (    0.69 ms per token,  1443.50 tokens per second)
llama_print_timings: prompt eval time =    3789.71 ms /   124 tokens (   30.56 ms per token,    32.72 tokens per second)
llama_print_timings:        eval time =   24296.99 ms /   112 runs   (  216.94 ms per token,     4.61 tokens per second)
llama_print_timings:       total time =   28271.72 ms /   236 tokens
Llama.generate: 57 prefix-match hit, remaining 68 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  '                   The length of the object is 845mm and cost $1m. The sunrise happens at 0750 EST and visibily is 35 km.      The right way to summarize and favor some advisors is still to be found'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ list **each correction made** ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 **.1. Edited Text:** The length of the object is 845mm and costs $1m. The sunrise happens at 0750 EST and visibility is 35 km. The right way to summarize and favor some advisors is still to be found.'

**2. Corrections:**
- Removed extra space after 'The length of the object'
- Changed 'cost $1m' to '$1m cost'
- Added a period at the end of the sent


llama_print_timings:        load time =    3653.13 ms
llama_print_timings:      sample time =      12.77 ms /    19 runs   (    0.67 ms per token,  1487.75 tokens per second)
llama_print_timings: prompt eval time =    1959.95 ms /    68 tokens (   28.82 ms per token,    34.69 tokens per second)
llama_print_timings:        eval time =    3942.65 ms /    18 runs   (  219.04 ms per token,     4.57 tokens per second)
llama_print_timings:       total time =    5930.68 ms /    86 tokens
Llama.generate: 57 prefix-match hit, remaining 129 prompt tokens to eval


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'Table Section'
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ list **each correction made** ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: Table Section
2. Corrections: None needed.





llama_print_timings:        load time =    3653.13 ms
llama_print_timings:      sample time =      98.86 ms /   140 runs   (    0.71 ms per token,  1416.16 tokens per second)
llama_print_timings: prompt eval time =    3729.61 ms /   129 tokens (   28.91 ms per token,    34.59 tokens per second)
llama_print_timings:        eval time =   30118.07 ms /   139 runs   (  216.68 ms per token,     4.62 tokens per second)
llama_print_timings:       total time =   34083.63 ms /   268 tokens


PROMPT: 
 <|system|>You are an expert editor who corrects spelling, formatting and grammatical errors<|end|>
<|user|> ** TASK**  
1. Edit the following text for spelling and grammar mistakes:  'Some table to be addd latr here by I. I am trying t  o  make a looooong  setence to see if the LLM is able to capture some of these                       errors such as 2 n    d  or 4 t  h or 6 %s '
2. Remove any formatting errors such as extra spaces**IMPORTANT** use the following template to format your response **.1. Edited Text: [ Your corrected text here ]2. Corrections: [ list **each correction made** ]<|end|>
<|assistant|>
 


LLM RESPONSE: 
 1. Edited Text: 'Some table to be added later here by I. I am trying t o make a looooong setence to see if the LLM is able to capture some of these errors such as 2 n, d or 4 t h or 6 %s'
2. Corrections: [list each correction made]
- Replaced 't' with 'to' in "I am trying t o make a looooong setence"
- Removed extra space after 'by I.'
- Changed 'd' 