# Semantic Diff

In [2]:
import os
import asyncio
from openai import AsyncAzureOpenAI

In [20]:
# Azure OpenAI connection settings; ask_llm / get_embeddings read them back
# with os.getenv(). The key and endpoint are empty strings in this copy —
# presumably they have to be filled in before any API call can succeed.
os.environ["AZURE_OPENAI_API_KEY"] = ""
os.environ["OPENAI_API_VERSION"] = "2023-05-15"
os.environ["AZURE_OPENAI_ENDPOINT"] = ""

In [47]:
import asyncio
from collections.abc import Callable
import logging
import os
import random
from datetime import datetime
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff


class APICaller:
    """Fan out calls to an async API function with bounded concurrency.

    Each call goes through ``call_api``, which tenacity retries with
    randomised exponential backoff (1-60 s waits) for up to 6 attempts.
    The duration of every attempt that actually started is appended to
    ``call_timing``.

    Parameters:
        num_concurrent (int): Maximum number of API calls in flight at once.
        api_func (Callable): Coroutine function called as
            ``await api_func(idx, messages)``.
        urls (iterable, optional): Candidate URLs for ``get_url``. Defaults
            to none configured.
    """

    def __init__(self, num_concurrent: int, api_func: Callable, urls=None):
        self.api_func = api_func
        self.num_concurrent = num_concurrent
        # Previously never assigned, so get_url() always raised AttributeError.
        self.urls = list(urls) if urls is not None else []
        # Seconds taken by each attempt (successful or not), in completion order.
        self.call_timing = []

    def get_url(self):
        """Return one of the configured URLs chosen at random.

        Raises:
            ValueError: If no URLs were supplied to the constructor.
        """
        if not self.urls:
            raise ValueError("No URLs configured; pass `urls` when creating APICaller")
        return random.choice(self.urls)

    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    async def call_api(self, idx: int, messages: dict, semaphore: asyncio.Semaphore):
        """Run one API call under the semaphore and record how long it took.

        Parameters:
            idx (int): Position of the request; forwarded to ``api_func``.
            messages: Request payload; forwarded unchanged to ``api_func``.
            semaphore (asyncio.Semaphore): Shared limiter for concurrent calls.

        Returns:
            Whatever ``api_func(idx, messages)`` returns.

        Raises:
            Exception: Any error from ``api_func`` is logged and re-raised so
                that the retry decorator can schedule another attempt.
        """
        # Stays None when the attempt ends (e.g. is cancelled) before a
        # semaphore slot is obtained, so the timing code below cannot fail on
        # an unbound name and hide the real exception.
        start = None
        try:
            async with semaphore:  # throttles the number of concurrent calls
                start = datetime.now()
                result = await self.api_func(idx, messages)
        except Exception as e:
            logging.error("API call failed for %s: %s", messages, e)
            raise
        finally:
            if start is not None:
                duration = (datetime.now() - start).total_seconds()
                self.call_timing.append(duration)
                if idx % 20 == 0:
                    logging.info("Call %s completed in %s seconds", idx, duration)

        return result

    async def send(self, requests):
        """Issue one ``call_api`` task per request and wait for all of them.

        Parameters:
            requests (iterable): Payloads; each one's enumeration index is
                passed as ``idx``.

        Returns:
            list: Results in request order. Because ``return_exceptions=True``
            is used, a failed request contributes its exception object
            instead of raising.
        """
        semaphore = asyncio.Semaphore(self.num_concurrent)
        tasks = [
            asyncio.create_task(self.call_api(idx, item, semaphore))
            for idx, item in enumerate(requests)
        ]
        return await asyncio.gather(*tasks, return_exceptions=True)


async def ask_llm(idx, messages, model="gpt-4-32k", max_tokens=256):
    """Send a chat-completion request to Azure OpenAI.

    Connection settings are read from the AZURE_OPENAI_API_KEY,
    OPENAI_API_VERSION and AZURE_OPENAI_ENDPOINT environment variables.

    Parameters:
        idx: Caller-supplied identifier, echoed back in the result.
        messages: Chat messages passed as ``messages`` to the API.
        model (str): Value passed as ``model`` to the API.
        max_tokens (int): Value passed as ``max_tokens`` to the API.

    Returns:
        tuple: ``(idx, response)`` where ``response`` is the object returned
        by ``client.chat.completions.create``.
    """
    # A client is created per call; the context manager closes its HTTP
    # resources as soon as the request finishes instead of leaving that to
    # garbage collection.
    async with AsyncAzureOpenAI(
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version=os.getenv("OPENAI_API_VERSION"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    ) as client:
        response = await client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
        )

    return (idx, response)


async def get_embeddings(idx, text, model="text-embedding-ada-002", max_tokens=4096):
    """Request embeddings for ``text`` from Azure OpenAI.

    Connection settings are read from the AZURE_OPENAI_API_KEY,
    OPENAI_API_VERSION and AZURE_OPENAI_ENDPOINT environment variables.

    Parameters:
        idx: Caller-supplied identifier, echoed back in the result.
        text: Value passed as ``input`` to the embeddings API.
        model (str): Value passed as ``model`` to the API.
        max_tokens (int): Accepted for signature compatibility; it is not
            forwarded to the API.

    Returns:
        tuple: ``(idx, response)`` where ``response`` is the object returned
        by ``client.embeddings.create``.
    """
    # A client is created per call; the context manager closes its HTTP
    # resources as soon as the request finishes instead of leaving that to
    # garbage collection.
    async with AsyncAzureOpenAI(
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version=os.getenv("OPENAI_API_VERSION"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    ) as client:
        response = await client.embeddings.create(
            model=model,
            input=text,
        )

    return (idx, response)


In [146]:

def get_facts_prompt(text):
    """Build the chat messages asking the model to extract facts from ``text``.

    The model is asked to answer with a JSON list of objects that each have a
    ``fact`` and a ``source_text`` key.

    Parameters:
        text (str): Passage to extract statements/facts from.

    Returns:
        list[dict]: A system message followed by the user message.
    """
    instruction = (
        "Take the following text and output any statements or facts contained in the paragraph "
        "and provide the source text for each statement or fact: "
    )
    # The example must itself be valid JSON (both keys properly quoted),
    # otherwise the model is being shown a malformed template to imitate.
    output_format = (
        'Output the json in the form '
        '[{"fact": "", "source_text": ""}, {"fact": "", "source_text": ""}, etc.]'
    )
    # A blank line separates the instruction from the passage.
    prompt = f"{instruction}\r\n\r\n{text}\r\n{output_format}"
    messages = [
        {"role": "system", "content": "Extract facts and related source text"},
        {"role": "user", "content": prompt},
    ]
    return messages


def get_fact_relevance(text, source_text):
    """Build the chat messages asking whether a statement agrees with a source.

    Parameters:
        text (str): Statement to check.
        source_text (str): Source passage the statement is checked against.

    Returns:
        list[dict]: A system message followed by the user message. The user
        message asks for JSON with ``consistent`` and ``relevant_text`` keys.
    """
    question = (
        "Is the following statement consistent with the source text? "
        "and please output the relevant section of the source text:"
    )
    answer_format = ' Output the result in json: {"consistent": true/false, "relevant_text": ""}'
    # Sections are separated by CRLF; the empty entries produce blank lines.
    user_content = "\r\n".join(
        [
            question,
            "",
            f"Statement: {text}",
            "",
            f"Source Text: {source_text}",
            answer_format,
        ]
    )
    return [
        {"role": "system", "content": "Check fact relevance"},
        {"role": "user", "content": user_content},
    ]


def get_diff_prompt(text1, text2):
    """Build the chat messages asking the model to compare two texts.

    The requested JSON layout names all three result lists explicitly,
    because the rest of this file indexes the parsed response by
    ``consistent``, ``source_text_absent_in_new_text`` and
    ``new_text_absent_in_source_text``.

    Parameters:
        text1 (str): The source text.
        text2 (str): The new text to compare against the source.

    Returns:
        list[dict]: A system message followed by the user message.
    """
    response_schema = (
        '{"consistent": [{"new_text": "", "source_text": ""}, ...], '
        '"source_text_absent_in_new_text": [{"source_text": ""}, ...], '
        '"new_text_absent_in_source_text": [{"new_text": ""}, ...]}'
    )
    prompt = (
        "I am providing two pieces or sections of text. A source text and a new text.\n\n"
        "Please list the statements in the new text that are consistent with statements in the source text.\n\n"
        "Please also provide a list of statements that are in the source text but are not in the new text.\n\n"
        "Please also provide a list of statements that are in the new text but are not in the source text.\n\n"
        f"source Text:\n{text1}\n\n"
        f"new text:\n{text2}\n\n"
        # Verbatim quotes are requested because the highlighting code in this
        # file locates each returned phrase inside the original text.
        "For each item please provide the relevant sections, quoted verbatim from the text they come from. "
        f"Return JSON in the form {response_schema}."
    )

    messages = [
        {"role": "system", "content": "Compare two texts and output differences"},
        {"role": "user", "content": prompt},
    ]
    return messages


In [130]:
# Source passage used throughout this file as the reference ("source") text
# that new_text is compared against.
source_text = """Amendments: consultation on applications for building control
9.  After regulation 15 of the 2010 Regulations insert—

“Consultation in relation to fire safety
15A.—(1) This regulation applies where it is proposed—

(a)to erect, extend or make any structural alteration to a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after completion of the work, or
(b)to change the use of a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after the change of use,
and, in connection with that proposal, an application for building control approval with full plans is given to a relevant authority.

(2) Subject to paragraph (3), where this regulation applies the relevant authority must consult the enforcing authority before determining the application for building control approval with full plans given to the relevant authority.

(3) The duty to consult imposed by paragraph (2) does not apply where the relevant authority is the enforcing authority.

(4) In this regulation “enforcing authority” has the same meaning as in article 25 of the Regulatory Reform (Fire Safety) Order 2005.”."""

In [18]:
# The "new" passage that is compared against source_text. Each line inside
# the literal ends with a space followed by a newline.
new_text = """Regulation 9 prescribes that a consultation between the building control authority and 
the relevant enforcing authorities for the Regulatory Reform (Fire Safety) Order 2005 
(Fire Safety Order) must take place before an application for building control approval 
is determined. The 2022 Act provides for article 45 (consultation with fire and rescue 
authorities) of the Regulatory Reform (Fire Safety) Order 2005 to be revoked to enable 
this requirement to be moved into the Building Regulations."""

In [43]:
# Build one fact-extraction prompt per text.
source_prompt = get_facts_prompt(source_text)
new_prompt = get_facts_prompt(new_text)

# Earlier asyncio.run() variant, kept commented out.
#source_response = asyncio.run(ask_llm(0, source_prompt))
#new_response = asyncio.run(ask_llm(1, new_prompt))

# Top-level await: this only works where the environment supports it
# (presumably a notebook/IPython session). Each result is an (idx, response) tuple.
source_response = await ask_llm(0, source_prompt, max_tokens=4096)
new_response = await ask_llm(1, new_prompt, max_tokens=4096)

In [77]:
import json
# Element [1] of each (idx, response) tuple is the API response; the first
# choice's message content is parsed as JSON. json.loads raises if the model
# did not return valid JSON.
source_facts = json.loads(source_response[1].choices[0].message.content)
new_facts = json.loads(new_response[1].choices[0].message.content)



In [45]:
new_facts

[{'fact': 'Regulation 9 prescribes a consultation between the building control authority and the relevant enforcing authorities for the Regulatory Reform (Fire Safety) Order 2005 must take place before an application for building control approval is determined.',
  'source_text': 'Regulation 9 prescribes that a consultation between the building control authority and the relevant enforcing authorities for the Regulatory Reform (Fire Safety) Order 2005 must take place before an application for building control approval is determined.'},
 {'fact': 'The 2022 Act provides for article 45 of the Regulatory Reform (Fire Safety) Order 2005 to be revoked to enable this requirement to be moved into the Building Regulations.',
  'source_text': 'The 2022 Act provides for article 45 (consultation with fire and rescue authorities) of the Regulatory Reform (Fire Safety) Order 2005 to be revoked to enable this requirement to be moved into the Building Regulations.'}]

In [78]:
# get embeddings and cross compare to identify the same facts?

async def get_fact_embeddings(facts):
    """Attach embeddings to every fact record.

    For each record, the ``fact`` and ``source_text`` strings are embedded in
    a single ``get_embeddings`` call, and the two resulting vectors are stored
    on the record as ``fact_embedding`` and ``source_embedding``. Records are
    processed one after another and modified in place.

    Parameters:
        facts (list[dict]): Records with ``fact`` and ``source_text`` keys.

    Returns:
        list[dict]: The same ``facts`` list that was passed in.
    """
    for record in facts:
        pair = [record['fact'], record['source_text']]
        # The echoed idx is not needed here; only the response is used.
        _, response = await get_embeddings(0, pair)
        record['fact_embedding'] = response.data[0].embedding
        record['source_embedding'] = response.data[1].embedding

    return facts


In [79]:
# Add 'fact_embedding' / 'source_embedding' to every fact record (the lists
# are updated in place and returned).
new_facts = await get_fact_embeddings(new_facts)
source_facts = await get_fact_embeddings(source_facts)

In [94]:
import numpy as np
from scipy.spatial.distance import cdist

def find_best_matches(embeddings1, embeddings2, threshold=0.2):
    """Pair each vector in ``embeddings1`` with its closest vector in ``embeddings2``.

    Closeness is measured by cosine distance; a pair is kept only when its
    cosine similarity (1 - distance) is at least ``threshold``.

    Parameters:
        embeddings1: Sequence of vectors; one result row per vector.
        embeddings2: Sequence of vectors to search for the nearest neighbour.
        threshold (float): Minimum cosine similarity for a pair to be kept.

    Returns:
        tuple: ``(match_list, cosine_distances, best_similarities)`` where
        ``match_list`` holds ``(index_in_embeddings1, index_in_embeddings2)``
        pairs, ``cosine_distances`` is the full distance matrix, and
        ``best_similarities`` is the best similarity for every row.
    """
    distance_matrix = cdist(embeddings1, embeddings2, 'cosine')

    # Nearest column per row, and how similar that nearest neighbour is.
    nearest_columns = np.argmin(distance_matrix, axis=1)
    nearest_similarities = 1 - np.min(distance_matrix, axis=1)

    kept_pairs = [
        (row, column)
        for row, (column, similarity) in enumerate(zip(nearest_columns, nearest_similarities))
        if similarity >= threshold
    ]

    return kept_pairs, distance_matrix, nearest_similarities

In [110]:
# Match every source fact (rows) to its nearest new fact (columns) using the
# 'fact' embeddings, keeping only pairs with cosine similarity >= 0.9.
source_embeddings = [f['fact_embedding'] for f in source_facts]
new_embeddings = [f['fact_embedding'] for f in new_facts]
match_list, dists, best_similarities = find_best_matches(source_embeddings, new_embeddings, threshold=0.9)
# Echo the three results for inspection.
match_list, dists, best_similarities

([(0, 0), (1, 0)],
 array([[0.08493979, 0.10343929],
        [0.08427627, 0.09626686],
        [0.10214423, 0.16540304],
        [0.10193707, 0.19573742],
        [0.1894266 , 0.23007876],
        [0.12860412, 0.12070059]]),
 array([0.91506021, 0.91572373, 0.89785577, 0.89806293, 0.8105734 ,
        0.87929941]))

In [112]:
# Show the first source fact and the first new fact, each followed by the
# source text the model attributed it to.
print(source_facts[0]['fact'], '\r\n',source_facts[0]['source_text'])
    
print(new_facts[0]['fact'], '\r\n',new_facts[0]['source_text'])

The regulation applies where it is proposed to erect, extend or make any structural alteration to a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after completion of the work. 
 15A.—(1) This regulation applies where it is proposed—(a)to erect, extend or make any structural alteration to a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after completion of the work
Regulation 9 prescribes a consultation between the building control authority and the relevant enforcing authorities for the Regulatory Reform (Fire Safety) Order 2005 must take place before an application for building control approval is determined. 
 Regulation 9 prescribes that a consultation between the building control authority and the relevant enforcing authorities for the Regulatory Reform (Fire Safety) Order 2005 must take place before an application for building control approval is determined.


In [114]:
# Print the attributed source text of the first source fact next to that of
# the second new fact.
print(source_facts[0]['source_text'])
    
print(new_facts[1]['source_text'])

15A.—(1) This regulation applies where it is proposed—(a)to erect, extend or make any structural alteration to a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after completion of the work
The 2022 Act provides for article 45 (consultation with fire and rescue authorities) of the Regulatory Reform (Fire Safety) Order 2005 to be revoked to enable this requirement to be moved into the Building Regulations.


In [116]:
[f['source_text'] for f in source_facts]

['15A.—(1) This regulation applies where it is proposed—(a)to erect, extend or make any structural alteration to a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after completion of the work',
 '(b)to change the use of a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after the change of use',
 'and, in connection with that proposal, an application for building control approval with full plans is given to a relevant authority.',
 '(2) Subject to paragraph (3), where this regulation applies the relevant authority must consult the enforcing authority before determining the application for building control approval with full plans given to the relevant authority.',
 '(3) The duty to consult imposed by paragraph (2) does not apply where the relevant authority is the enforcing authority.',
 '(4) In this regulation “enforcing authority” has the same meaning as in article 25 of the Regulatory Reform (Fire Safety) O

In [118]:
[f['source_text'] for f in new_facts]

['Regulation 9 prescribes that a consultation between the building control authority and the relevant enforcing authorities for the Regulatory Reform (Fire Safety) Order 2005 must take place before an application for building control approval is determined.',
 'The 2022 Act provides for article 45 (consultation with fire and rescue authorities) of the Regulatory Reform (Fire Safety) Order 2005 to be revoked to enable this requirement to be moved into the Building Regulations.']

In [None]:
# Re-assigns new_text; the literal appears to be identical to the earlier
# new_text definition in this file.
new_text = """Regulation 9 prescribes that a consultation between the building control authority and 
the relevant enforcing authorities for the Regulatory Reform (Fire Safety) Order 2005 
(Fire Safety Order) must take place before an application for building control approval 
is determined. The 2022 Act provides for article 45 (consultation with fire and rescue 
authorities) of the Regulatory Reform (Fire Safety) Order 2005 to be revoked to enable 
this requirement to be moved into the Building Regulations."""

In [None]:
new_facts[0]['source_text']

In [134]:
# Ask the model whether the second new fact's source sentence is consistent
# with the whole source_text. ask_llm returns (idx, response).
messages = get_fact_relevance(new_facts[1]['source_text'], source_text)
idx, relevant_fact = await ask_llm(0, messages, max_tokens=4096)


In [135]:
json.loads(relevant_fact.choices[0].message.content)

{'consistent': False,
 'relevant_text': 'After regulation 15 of the 2010 Regulations insert—\n\n“Consultation in relation to fire safety\n15A.—(1) This regulation applies where it is proposed—\n\n(a)to erect, extend or make any structural alteration to a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after completion of the work, or\n(b)to change the use of a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after the change of use,\nand, in connection with that proposal, an application for building control approval with full plans is given to a relevant authority.\n\n(2) Subject to paragraph (3), where this regulation applies the relevant authority must consult the enforcing authority before determining the application for building control approval with full plans given to the relevant authority.\n\n(3) The duty to consult imposed by paragraph (2) does not apply where the relevant authority is the enforcing a

In [129]:
# NOTE(review): the captured output below is not the regulation passage
# assigned to source_text earlier in this file — presumably this cell was run
# while source_text held a different value; confirm before relying on it.
source_text

'The 2022 Act provides for article 45 (consultation with fire and rescue authorities) of the Regulatory Reform (Fire Safety) Order 2005 to be revoked to enable this requirement to be moved into the Building Regulations.'

## Just Ask

In [139]:
print(source_text)

Amendments: consultation on applications for building control
9.  After regulation 15 of the 2010 Regulations insert—

“Consultation in relation to fire safety
15A.—(1) This regulation applies where it is proposed—

(a)to erect, extend or make any structural alteration to a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after completion of the work, or
(b)to change the use of a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after the change of use,
and, in connection with that proposal, an application for building control approval with full plans is given to a relevant authority.

(2) Subject to paragraph (3), where this regulation applies the relevant authority must consult the enforcing authority before determining the application for building control approval with full plans given to the relevant authority.

(3) The duty to consult imposed by paragraph (2) does not apply where the relevant authority is t

In [140]:
print(new_text)

Regulation 9 prescribes that a consultation between the building control authority and 
the relevant enforcing authorities for the Regulatory Reform (Fire Safety) Order 2005 
(Fire Safety Order) must take place before an application for building control approval 
is determined. The 2022 Act provides for article 45 (consultation with fire and rescue 
authorities) of the Regulatory Reform (Fire Safety) Order 2005 to be revoked to enable 
this requirement to be moved into the Building Regulations.


In [147]:
# "Just ask" approach: send both texts in one prompt and let the model
# produce the comparison. ask_llm returns (idx, response).
messages = get_diff_prompt(source_text, new_text)
idx, relevant_fact = await ask_llm(0, messages, max_tokens=4096)

In [150]:
# Parse the model's JSON answer; later code indexes it by 'consistent',
# 'source_text_absent_in_new_text' and 'new_text_absent_in_source_text'.
results = json.loads(relevant_fact.choices[0].message.content)

In [197]:
import re
from IPython.display import display, HTML

def highlight_phrases(text, phrases, color='gray'):
    """Wrap every occurrence of the given phrases in a coloured ``<span>``.

    Phrases are matched literally and case-insensitively. Longer phrases are
    tried first so that a phrase is not pre-empted by one of its own prefixes.

    NOTE: a later definition in this file reuses the name
    ``highlight_phrases`` with a different signature and replaces this one
    once it has been executed.

    Parameters:
        text (str): Text to search. It is inserted into the result unescaped.
        phrases (iterable of str): Phrases to highlight; empty strings are ignored.
        color (str): CSS colour used as the span background.

    Returns:
        str: ``text`` wrapped in ``<pre>`` tags with matches wrapped in spans.
    """
    # An empty phrase (or an empty phrase list) would produce a pattern that
    # matches the empty string at every position and litter the output with
    # empty spans, so such input is treated as "nothing to highlight".
    candidates = sorted((phrase for phrase in phrases if phrase), key=len, reverse=True)
    if not candidates:
        return f'<pre>{text}</pre>'

    # Escape phrases so they are matched literally.
    regex_pattern = r'(' + '|'.join(re.escape(phrase) for phrase in candidates) + r')'

    def highlight(match):
        return f'<span style="background-color: {color};">{match.group(0)}</span>'

    highlighted_text = re.sub(regex_pattern, highlight, text, flags=re.IGNORECASE)

    return f'<pre>{highlighted_text}</pre>'

from IPython.display import display, HTML
import re

def highlight_phrases_with_priority(text, primary_phrases, primary_color, secondary_phrases, secondary_color):
    """Highlight two sets of phrases in different colours, primary first.

    Phrases are matched literally and case-insensitively. Where a primary and
    a secondary phrase could both match at the same position, the primary
    phrase wins; within each set, longer phrases are tried first.

    Parameters:
        text (str): Text to search. It is inserted into the result unescaped.
        primary_phrases (iterable of str): Phrases highlighted with ``primary_color``.
        primary_color (str): CSS background colour for primary matches.
        secondary_phrases (iterable of str): Phrases highlighted with ``secondary_color``.
        secondary_color (str): CSS background colour for secondary matches.

    Returns:
        str: ``text`` wrapped in ``<pre>`` tags with matches wrapped in spans.
    """
    def alternation(phrases):
        # Drop empty strings (they would match everywhere) and duplicates,
        # then put longer phrases first.
        unique = dict.fromkeys(phrase for phrase in phrases if phrase)
        ordered = sorted(unique, key=len, reverse=True)
        return '|'.join(re.escape(phrase) for phrase in ordered)

    # One named group per priority level, primary listed first so the regex
    # engine prefers it. The group that matched tells us which colour to use,
    # which stays correct even when the match differs in case from the phrase.
    groups = []
    primary_alternatives = alternation(primary_phrases)
    if primary_alternatives:
        groups.append(f'(?P<primary>{primary_alternatives})')
    secondary_alternatives = alternation(secondary_phrases)
    if secondary_alternatives:
        groups.append(f'(?P<secondary>{secondary_alternatives})')

    if not groups:
        return f'<pre>{text}</pre>'

    pattern = re.compile('|'.join(groups), re.IGNORECASE)

    def highlight(match):
        matched_text = match.group(0)
        color = primary_color if match.lastgroup == 'primary' else secondary_color
        return f'<span style="color: black; background-color: {color};">{matched_text}</span>'

    highlighted_text = pattern.sub(highlight, text)

    # Wrap in <pre> tags to preserve whitespace and formatting
    return f'<pre>{highlighted_text}</pre>'


from IPython.display import display, HTML
import re

def compile_pattern(phrase):
    """Compile a regex that matches ``phrase`` regardless of how its whitespace is laid out.

    Each run of whitespace in ``phrase`` matches any non-empty run of
    whitespace (spaces, tabs, newlines) in the text, and optional whitespace
    is additionally allowed between the characters of a word. Leading and
    trailing whitespace in ``phrase`` is ignored.

    Parameters:
        phrase (str): Phrase to match literally.

    Returns:
        re.Pattern: The compiled pattern (empty for a blank phrase).
    """
    words = phrase.split()
    return re.compile(
        r'\s+'.join(r'\s*'.join(re.escape(char) for char in word) for word in words)
    )


def highlight_phrases(text, phrases, colors):
    """Highlight each phrase in ``text`` with its own background colour.

    Parameters:
        text (str): Text to search. It is inserted into the result unescaped.
        phrases (iterable of str): Phrases to highlight; matched via
            ``compile_pattern``. Blank phrases are ignored.
        colors (iterable of str): CSS colours, paired with ``phrases`` by position.

    Returns:
        str: ``text`` wrapped in ``<pre>`` tags with matches wrapped in spans.
    """
    # Blank phrases compile to an empty pattern that matches everywhere.
    pairs = [(phrase, color) for phrase, color in zip(phrases, colors) if phrase.strip()]
    if not pairs:
        return f'<pre>{text}</pre>'

    # One capturing group per phrase: the index of the group that matched
    # identifies the colour. The per-phrase patterns contain no groups of
    # their own, so group numbers line up with positions in `pairs`.
    combined_pattern = re.compile(
        '|'.join('(' + compile_pattern(phrase).pattern + ')' for phrase, _ in pairs)
    )

    def replace(match):
        color = pairs[match.lastindex - 1][1]
        return f'<span style="background-color: {color}; color: black;">{match.group(0)}</span>'

    # Call sub() on the compiled pattern: re.sub() rejects a `flags` argument
    # when it is given an already-compiled pattern.
    highlighted_text = combined_pattern.sub(replace, text)

    # Wrap in <pre> tags to preserve whitespace and formatting
    return f'<pre>{highlighted_text}</pre>'

# Usage example



In [151]:
results

{'consistent': [{'new_text': 'Regulation 9 prescribes that a consultation between the building control authority and the relevant enforcing authorities for the Regulatory Reform (Fire Safety) Order 2005 (Fire Safety Order) must take place before an application for building control approval is determined.',
   'source_text': 'Subject to paragraph (3), where this regulation applies the relevant authority must consult the enforcing authority before determining the application for building control approval with full plans given to the relevant authority.'}],
 'source_text_absent_in_new_text': [{'source_text': 'This regulation applies where it is proposed—to erect, extend or make any structural alteration to a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after completion of the work, or to change the use of a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after the change of use, and, in connection with that p

In [179]:
def highlight_differences_new_text(text, results):
    """Display ``text`` with the new-text phrases from ``results`` highlighted.

    Phrases listed under ``results['consistent']`` are shown in light green;
    phrases listed under ``results['new_text_absent_in_source_text']`` are
    shown in light coral. Both are read from each entry's ``new_text`` key.

    Parameters:
        text (str): The text to render.
        results (dict): Comparison result containing the two lists above.
    """
    matched_phrases = [entry['new_text'] for entry in results['consistent']]
    only_in_new_phrases = [entry['new_text'] for entry in results['new_text_absent_in_source_text']]

    markup = highlight_phrases_with_priority(
        text,
        matched_phrases,
        'lightgreen',
        only_in_new_phrases,
        'lightcoral',
    )

    display(HTML(markup))
    

def highlight_differences_source_text(text, results):
    """Display ``text`` with the source-text phrases from ``results`` highlighted.

    Phrases listed under ``results['consistent']`` are shown in light green;
    phrases listed under ``results['source_text_absent_in_new_text']`` are
    shown in light coral. Both are read from each entry's ``source_text`` key.

    Parameters:
        text (str): The text to render.
        results (dict): Comparison result containing the two lists above.
    """
    matched_phrases = [entry['source_text'] for entry in results['consistent']]
    only_in_source_phrases = [entry['source_text'] for entry in results['source_text_absent_in_new_text']]

    markup = highlight_phrases_with_priority(
        text,
        matched_phrases,
        'lightgreen',
        only_in_source_phrases,
        'lightcoral',
    )

    display(HTML(markup))

In [219]:
import re

def search_string_with_offsets(text, search_term):
    """Find the first occurrence of ``search_term`` in ``text``, ignoring whitespace layout.

    Each run of whitespace in ``search_term`` matches any non-empty run of
    whitespace in ``text`` (so a phrase quoted on one line is still found
    when the text wraps it across lines), and optional whitespace is
    additionally allowed between the characters of a word.

    Parameters:
        text (str): Text to search.
        search_term (str): Phrase to look for; matched literally apart from whitespace.

    Returns:
        list: ``[start, end]`` offsets of the first match, or ``[]`` when
        there is no match or ``search_term`` is blank.
    """
    words = search_term.split()
    if not words:
        # A blank term would otherwise "match" the empty string at offset 0.
        return []

    # Characters are escaped individually so the term is matched literally.
    normalized_pattern = r'\s+'.join(
        r'\s*'.join(map(re.escape, word)) for word in words
    )

    match = re.search(normalized_pattern, text)
    if match:
        return [match.start(), match.end()]
    return []
    
from IPython.display import display, HTML

def highlight_text_by_offsets(text, offsets, color='yellow'):
    """
    Highlights parts of text based on a list of offsets with the specified color.

    Ranges are processed in order of their start offset. Where ranges overlap,
    the overlapping part is highlighted once only (it belongs to the range
    that starts first), so no character of ``text`` is emitted twice.

    Parameters:
        text (str): The original text. It is inserted into the result unescaped.
        offsets (list of tuples): A list of (start, end) index pairs.
        color (str): A CSS color string.

    Returns:
        str: HTML markup — ``text`` wrapped in ``<pre>`` tags with the given
        ranges wrapped in highlighted ``<span>`` elements.
    """
    highlighted_text_parts = []
    last_end = 0

    for start, end in sorted(offsets, key=lambda x: x[0]):
        # Never step back into text that has already been emitted.
        start = max(start, last_end)
        if end <= start:
            # Empty range, or one fully covered by an earlier range.
            continue
        # Plain text between the previous highlight and this one.
        highlighted_text_parts.append(text[last_end:start])
        highlighted_text_parts.append(
            f'<span style="background-color: {color};">{text[start:end]}</span>'
        )
        last_end = end

    # Any remaining text after the last highlight.
    highlighted_text_parts.append(text[last_end:])

    return '<pre>' + ''.join(highlighted_text_parts) + '</pre>'

# Example Usage

from IPython.display import display, HTML

def highlight_text_by_multiple_offsets(text, offsets1, color1, offsets2, color2):
    """
    Highlights parts of text based on two lists of offsets with different colors.

    All ranges are processed in order of their start offset (ranges from
    ``offsets1`` come first on ties). Where ranges overlap, the overlapping
    part is highlighted once only, in the color of the range that comes
    first, so no character of ``text`` is emitted twice.

    Parameters:
        text (str): The original text. It is inserted into the result unescaped.
        offsets1 (list of tuples): First list of (start, end) index pairs.
        color1 (str): CSS color string for the first list of offsets.
        offsets2 (list of tuples): Second list of (start, end) index pairs.
        color2 (str): CSS color string for the second list of offsets.

    Returns:
        str: HTML markup — ``text`` wrapped in ``<pre>`` tags with the given
        ranges wrapped in highlighted ``<span>`` elements.
    """
    # Tag every range with its color, then order by start offset. sorted() is
    # stable, so ranges from offsets1 precede equal-start ranges from offsets2.
    combined_offsets = [(start, end, color1) for start, end in offsets1]
    combined_offsets += [(start, end, color2) for start, end in offsets2]
    combined_offsets = sorted(combined_offsets, key=lambda x: x[0])

    highlighted_text_parts = []
    last_end = 0

    for start, end, color in combined_offsets:
        # Never step back into text that has already been emitted.
        start = max(start, last_end)
        if end <= start:
            # Empty range, or one fully covered by an earlier range.
            continue
        # Plain text between the previous highlight and this one.
        highlighted_text_parts.append(text[last_end:start])
        highlighted_text_parts.append(
            f'<span style="background-color: {color}; color: black;">{text[start:end]}</span>'
        )
        last_end = end

    # Any remaining text after the last highlight.
    highlighted_text_parts.append(text[last_end:])

    highlighted_text = ''.join(highlighted_text_parts)

    # Wrap in <pre> tags to preserve whitespace and formatting
    return f'<pre>{highlighted_text}</pre>'

def remove_overlapping(ranges1, ranges2):
    """
    Keeps only the ranges of the second list that clash with no range of the first list.

    Two ranges clash when they share at least one position, with both end
    points counted as part of the range (so ranges that merely touch clash).

    Parameters:
        ranges1 (list of tuples): Reference ranges as (start, end) pairs.
        ranges2 (list of tuples): Candidate ranges as (start, end) pairs.

    Returns:
        list of tuples: The candidates from ``ranges2``, in their original
        order and as (start, end) tuples, that clash with nothing in ``ranges1``.
    """
    def clashes(lo, hi):
        # True as soon as any reference range intersects [lo, hi].
        return any(ref_lo <= hi and ref_hi >= lo for ref_lo, ref_hi in ranges1)

    return [(lo, hi) for lo, hi in ranges2 if not clashes(lo, hi)]

# Example Usage



In [221]:
def get_positions(text, phrases):
    """Locate each phrase in ``text`` and collect the offsets of those that were found.

    Parameters:
        text (str): Text to search.
        phrases (iterable of str): Phrases to look up with ``search_string_with_offsets``.

    Returns:
        list: One ``[start, end]`` entry per phrase that was found, in the
        order of ``phrases``; phrases without a match are skipped.
    """
    lookups = (search_string_with_offsets(text, phrase) for phrase in phrases)
    # An empty result means "not found".
    return [span for span in lookups if span]


def get_highlighted_html(results, text, text_key, absent_key='source_text_absent_in_new_text'):
    """Render ``text`` as HTML with consistent and absent phrases highlighted.

    Phrases from ``results['consistent']`` are highlighted in light green and
    phrases from ``results[absent_key]`` in light coral; an absent range that
    overlaps a consistent range is dropped.

    Parameters:
        results (dict): Comparison result with a ``consistent`` list and a
            list under ``absent_key``.
        text (str): The text to render.
        text_key (str): Key read from every entry of both lists to obtain the phrase.
        absent_key (str): Key of the list holding the phrases missing from the other text.

    Returns:
        str: The markup produced by ``highlight_text_by_multiple_offsets``.
    """
    matched_phrases = [entry[text_key] for entry in results['consistent']]
    missing_phrases = [entry[text_key] for entry in results[absent_key]]

    matched_spans = get_positions(text, matched_phrases)
    # Consistent ranges take precedence over absent ones.
    missing_spans = remove_overlapping(matched_spans, get_positions(text, missing_phrases))

    return highlight_text_by_multiple_offsets(
        text, matched_spans, 'lightgreen', missing_spans, 'lightcoral'
    )


# Highlight new_text: phrases the model marked consistent in light green,
# phrases it reported as missing from the source text in light coral.
consistent_phrases = [p['new_text'] for p in results['consistent']]
new_text_absent_in_source_text = [p['new_text'] for p in results['new_text_absent_in_source_text']]

# Locate every phrase inside new_text; phrases that cannot be found are skipped.
consistent_pos = get_positions(new_text, consistent_phrases)
absent_pos = get_positions(new_text, new_text_absent_in_source_text)

# Consistent ranges win: drop any absent range that overlaps one of them.
absent_pos = remove_overlapping(consistent_pos, absent_pos)

#highlighted_text = highlight_text_by_offsets(new_text, absent_pos, color='lightgreen')
highlighted_text = highlight_text_by_multiple_offsets(new_text, consistent_pos, 'lightgreen', absent_pos, 'lightcoral')
display(HTML(highlighted_text))


In [223]:

# Highlight source_text: phrases the model marked consistent in light green,
# phrases it reported as missing from the new text in light coral.
consistent_phrases = [p['source_text'] for p in results['consistent']]
text_absent = [p['source_text'] for p in results['source_text_absent_in_new_text']]

# Locate every phrase inside source_text; phrases that cannot be found are skipped.
consistent_pos = get_positions(source_text, consistent_phrases)
absent_pos = get_positions(source_text, text_absent)

# Consistent ranges win: drop any absent range that overlaps one of them.
absent_pos = remove_overlapping(consistent_pos, absent_pos)

#highlighted_text = highlight_text_by_offsets(new_text, absent_pos, color='lightgreen')
highlighted_text = highlight_text_by_multiple_offsets(source_text, consistent_pos, 'lightgreen', absent_pos, 'lightcoral')
display(HTML(highlighted_text))

In [218]:
consistent_pos, absent_pos,new_text_absent_in_source_text

([[0, 278]],
 [[0, 278], [279, 499]],
 ['Regulation 9 prescribes that a consultation between the building control authority and the relevant enforcing authorities for the Regulatory Reform (Fire Safety) Order 2005 (Fire Safety Order) must take place before an application for building control approval is determined.',
  'The 2022 Act provides for article 45 (consultation with fire and rescue authorities) of the Regulatory Reform (Fire Safety) Order 2005 to be revoked to enable this requirement to be moved into the Building Regulations.'])

In [180]:
# Render source_text with the regex-based highlighter
# (highlight_phrases_with_priority) rather than the offset-based one.
highlight_differences_source_text(source_text, results)


In [175]:
# Example text and phrases to be highlighted

# List of phrases to highlight
# Source-text phrases to highlight: those marked consistent, and those the
# model reported as missing from the new text.
consistent_phrases = [p['source_text'] for p in results['consistent']]
source_phrases_absent_in_new = [p['source_text'] for p in results['source_text_absent_in_new_text']]

# Highlight the phrases in the text (consistent in light green, absent in light coral)
# highlighted_text = highlight_phrases(source_text, phrases)
highlighted_text = highlight_phrases_with_priority(source_text, consistent_phrases, 'lightgreen',  source_phrases_absent_in_new, 'lightcoral')

# Display the highlighted text
display(HTML(highlighted_text))


In [166]:
[p['source_text'] for p in results['source_text_absent_in_new_text']]

['This regulation applies where it is proposed—to erect, extend or make any structural alteration to a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after completion of the work, or to change the use of a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after the change of use, and, in connection with that proposal, an application for building control approval with full plans is given to a relevant authority.',
 'The duty to consult imposed by paragraph (2) does not apply where the relevant authority is the enforcing authority.',
 'In this regulation “enforcing authority” has the same meaning as in article 25 of the Regulatory Reform (Fire Safety) Order 2005.']

In [167]:
[p['source_text'] for p in results['consistent']]

['Subject to paragraph (3), where this regulation applies the relevant authority must consult the enforcing authority before determining the application for building control approval with full plans given to the relevant authority.']