# Semantic diff

In [11]:
import os
from openai import AsyncAzureOpenAI, OpenAI
import json
from IPython.display import display, HTML
from fuzzysearch import find_near_matches

In [12]:
# Credentials / endpoint settings. The Azure values are read back through os.getenv
# when the Azure clients are built further down; OPENAI_API_KEY is presumably picked
# up by the argument-less OpenAI() client — TODO confirm.
# NOTE: these assignments overwrite whatever is already set in the process
# environment, so with the blank values below any real key is replaced by "".
os.environ["OPENAI_API_KEY"] =""
# NOTE(review): `model` is not referenced by any call visible in this notebook;
# the helper functions take their own `model` argument.
model="gpt-4-turbo"
os.environ["AZURE_OPENAI_API_KEY"] = ""
os.environ["OPENAI_API_VERSION"] = "2023-05-15"
os.environ["AZURE_OPENAI_ENDPOINT"] = ""

In [19]:

def ask_openai_llm(idx, messages, model="gpt-4-turbo", max_tokens=16000, temperature=0):
    """
    Send one chat-completion request through the (non-Azure) OpenAI client.

    Parameters:
        idx: Caller-supplied tag, handed back unchanged next to the response.
        messages (list of dict): Chat messages forwarded to the API.
        model (str): Model name forwarded to the API.
        max_tokens (int): Completion token limit forwarded to the API.
        temperature (float): Sampling temperature forwarded to the API.

    Returns:
        tuple: (idx, response), where response is the object returned by
        `chat.completions.create`.
    """
    request_args = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    # The client is built without arguments, so it relies on its own configuration lookup.
    completion = OpenAI().chat.completions.create(**request_args)
    return (idx, completion)

async def ask_llm(idx, messages, model="gpt-4-32k", max_tokens=256, temperature=0):
    """
    Send one chat-completion request to Azure OpenAI and return it tagged with `idx`.

    Parameters:
        idx: Caller-supplied tag, returned unchanged next to the response.
        messages (list of dict): Chat messages forwarded to the API.
        model (str): Model / deployment name forwarded to the API.
        max_tokens (int): Completion token limit forwarded to the API.
        temperature (float): Sampling temperature forwarded to the API.

    Returns:
        tuple: (idx, response), where response is the object returned by
        `client.chat.completions.create`.
    """
    # A fresh client is created per call; using it as an async context manager
    # closes its HTTP connections when the request is done instead of leaving
    # them open until garbage collection.
    async with AsyncAzureOpenAI(
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version=os.getenv("OPENAI_API_VERSION"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    ) as client:
        response = await client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )

    return (idx, response)


async def get_embeddings(idx, text, model="text-embedding-ada-002", max_tokens=4096):
    """
    Request embeddings for `text` from Azure OpenAI and return them tagged with `idx`.

    Parameters:
        idx: Caller-supplied tag, returned unchanged next to the response.
        text: Value forwarded as `input` to the embeddings endpoint.
        model (str): Embedding model / deployment name forwarded to the API.
        max_tokens (int): Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        tuple: (idx, response), where response is the object returned by
        `client.embeddings.create`.
    """
    # A fresh client is created per call; using it as an async context manager
    # closes its HTTP connections when the request is done instead of leaving
    # them open until garbage collection.
    async with AsyncAzureOpenAI(
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version=os.getenv("OPENAI_API_VERSION"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    ) as client:
        response = await client.embeddings.create(
            model=model,
            input=text,
        )

    return (idx, response)


In [4]:
def highlight_text_by_multiple_offsets(text, offsets1, color1, offsets2, color2):
    """
    Highlights parts of text based on two lists of offsets with different colors.

    The text itself is HTML-escaped (``&``, ``<``, ``>``) so that characters in
    the input cannot be interpreted as markup. Offsets are half-open
    ``(start, end)`` slices into the *unescaped* text. When two spans overlap,
    the one that starts first keeps the shared characters and the later span
    only highlights the part that has not been emitted yet, so no text is
    duplicated in the output.

    Parameters:
        text (str): The original text.
        offsets1 (list of pairs): First list of (start, end) indices.
        color1 (str): CSS color string for the first list of offsets.
        offsets2 (list of pairs): Second list of (start, end) indices.
        color2 (str): CSS color string for the second list of offsets.

    Returns:
        str: HTML markup (wrapped in <pre>) with the highlighted spans; pass it
        to IPython's HTML() to display it.
    """
    from html import escape  # local import keeps this cell self-contained

    # Combine both lists with their respective colors
    combined_offsets = [(start, end, color1) for start, end in offsets1]
    combined_offsets += [(start, end, color2) for start, end in offsets2]
    # Stable sort by start so ties keep the offsets1-before-offsets2 order
    combined_offsets.sort(key=lambda item: item[0])

    highlighted_text_parts = []
    last_end = 0

    for start, end, color in combined_offsets:
        # Never step backwards: clip a span that begins inside text already emitted.
        start = max(start, last_end)
        if end <= start:
            # Nothing new left to highlight for this span.
            continue
        # Plain text between the previous span and this one
        highlighted_text_parts.append(escape(text[last_end:start], quote=False))
        # The highlighted piece itself
        piece = escape(text[start:end], quote=False)
        highlighted_text_parts.append(
            f'<span style="background-color: {color}; color: black;">{piece}</span>'
        )
        last_end = end

    # Add any remaining text after the last highlight
    highlighted_text_parts.append(escape(text[last_end:], quote=False))

    highlighted_text = ''.join(highlighted_text_parts)

    # Wrap in <pre> tags to preserve whitespace and formatting
    return f'<pre>{highlighted_text}</pre>'


def remove_overlapping(ranges1, ranges2):
    """
    Removes items from the second list of ranges that overlap with any range in the first list.

    Ranges are treated as half-open ``(start, end)`` intervals, matching how the
    offsets are used for slicing (``text[start:end]``) elsewhere in this
    notebook. Two ranges that merely touch (one ends exactly where the other
    starts) share no character and are therefore NOT considered overlapping.

    Parameters:
        ranges1 (list of pairs): Ranges (start, end) in the first list.
        ranges2 (list of pairs): Ranges (start, end) in the second list.

    Returns:
        list of tuples: Ranges from the second list, in their original order,
        that overlap none of the ranges in the first list.
    """
    return [
        (start2, end2)
        for start2, end2 in ranges2
        # Strict comparisons: half-open intervals overlap only if they share a position.
        if not any(start1 < end2 and start2 < end1 for start1, end1 in ranges1)
    ]


def fuzzy_phrase_search(text, phrase, max_l_dist=5):
    """
    Locate approximate occurrences of `phrase` inside `text`.

    Parameters:
        text (str): The text to search in.
        phrase (str): The phrase to look for.
        max_l_dist (int): Passed through to `find_near_matches` as `max_l_dist`.

    Returns:
        list of [start, end] lists, one per match reported by `find_near_matches`,
        in the order they were reported.
    """
    spans = []
    for hit in find_near_matches(phrase, text, max_l_dist=max_l_dist):
        spans.append([hit.start, hit.end])
    return spans


def get_positions2(text, phrases):
    """
    Find where each phrase occurs (approximately) in `text`.

    Only the first match reported by `fuzzy_phrase_search` is kept for each
    phrase. Phrases that are empty or that have no match are skipped instead of
    raising, so one unmatched phrase does not abort the whole highlight.

    Parameters:
        text (str): The text to search in.
        phrases (iterable of str): Phrases to locate.

    Returns:
        list: One [start, end] entry per phrase that was found, in the order of
        `phrases`.
    """
    positions = []
    for phrase in phrases:
        if not phrase:
            # Nothing to locate (and the underlying search is expected to reject an empty pattern).
            continue
        matches = fuzzy_phrase_search(text, phrase)
        # Check for matches BEFORE indexing: an empty result has no first element.
        if matches:
            positions.append(matches[0])
    return positions


def get_highlighted_html2(results, text, text_key, absent_key='in_source_not_in_new'):
    """
    Build highlighted HTML for `text` from the parsed diff `results`.

    Phrases listed under ``results['consistent']`` are highlighted in
    'lightgreen'; phrases listed under ``results[absent_key]`` are highlighted
    in 'lightcoral' unless their position overlaps a consistent phrase.

    Entries in either list may be dicts (the phrase is read from `text_key`) or
    plain strings (used as the phrase directly). Entries without a usable,
    non-empty string are skipped rather than raising.

    Parameters:
        results (dict): Parsed diff; must contain 'consistent' and `absent_key`.
        text (str): The text to highlight.
        text_key (str): Key holding the phrase in each dict entry
            (e.g. 'source_text' or 'new_text').
        absent_key (str): Key of the list of phrases that are present in `text`
            only. NOTE(review): the default does not match the key names
            requested by the prompt in this notebook ('source_not_in_new' /
            'new_not_in_source'); the calls visible here pass it explicitly.

    Returns:
        str: The HTML produced by `highlight_text_by_multiple_offsets`.

    Raises:
        KeyError: If 'consistent' or `absent_key` is missing from `results`.
    """
    def _phrases(entries):
        # Extract the phrase string from each entry, dropping unusable ones.
        phrases = []
        for entry in entries:
            phrase = entry.get(text_key) if isinstance(entry, dict) else entry
            if isinstance(phrase, str) and phrase:
                phrases.append(phrase)
        return phrases

    consistent_pos = get_positions2(text, _phrases(results['consistent']))
    absent_pos = get_positions2(text, _phrases(results[absent_key]))

    # A phrase already shown as consistent wins over an overlapping "absent" phrase.
    absent_pos = remove_overlapping(consistent_pos, absent_pos)

    return highlight_text_by_multiple_offsets(text, consistent_pos, 'lightgreen', absent_pos, 'lightcoral')


In [5]:

def get_diff_prompt(text1, text2):
    """
    Build the chat messages asking the model to compare a source text with a new text.

    The user message asks for three lists (consistent statements, statements
    only in the source, statements only in the new text) and shows the JSON
    shape to return. The example is valid JSON and spells out the item shape of
    every list, because the highlighting code in this notebook reads
    'new_text' / 'source_text' from each item.

    Parameters:
        text1 (str): The source text.
        text2 (str): The new text.

    Returns:
        list of dict: A system message followed by the user message, in the
        format expected by the chat-completions API.
    """
    prompt = f"""I am providing two pieces or sections of text. A source text and a new text. 
    
    Please list the statements in the new text that are consistent with statements in the source text.

    Please also provide a list of statements that are in the source text but are not in the new text.

    Please also provide a list of statements that are in the new text but are not in the source text.

    source Text: 
    {text1}

    
    new text:
    {text2}

    For each item please provide the relevant sections. Return JSON in the form {{"consistent": [{{"new_text": "", "source_text": ""}}], "new_not_in_source": [{{"new_text": ""}}], "source_not_in_new": [{{"source_text": ""}}]}}.
    """

    messages = [
        {"role": "system", "content": "Compare two texts and output differences"},
        {"role": "user", "content": prompt},
    ]
    return messages


# Compare text

In [6]:
# First example, source side: the regulation text (regulation 13) that is passed
# to get_diff_prompt as the source text.
source_text = """Amendments to relating to fire safety information
13.—(1) In regulation 38 (fire safety information) of the 2010 Regulations, for paragraph (2) substitute—

“(2) The person carrying out the work must give fire safety information to the responsible person no later than—

(a)where the building, proposed building or extension to which the building work relates is not occupied during the building work, the date of completion of the work or the date of occupation of the building or the extension, whichever is the earlier;
(b)in any other case, the date of completion of the work.
(2A) The responsible person must give the person carrying out the work a notice acknowledging receipt of the fire safety information and confirming the information provided is sufficient to enable them to understand, operate and maintain the building (and the fire safety systems in it) after the building work in question.

(2B) Subject to paragraph (2D), the person carrying out the work must give a notice to the relevant authority—

(a)confirming that they have given the fire safety information to the responsible person pursuant to paragraph (2), and
(b)stating that they have received the notice from the responsible person pursuant to paragraph (2A) or where they have not received the notice, stating the steps taken to obtain the notice from the responsible person and the dates they were taken.
(2C) The notification under paragraph (2B) must be given no later than—

(a)where regulation 20 (provisions applicable to self-certification schemes) applies to the work, 30 days after the date referred to in paragraph (2),
(b)in any other case, five days after the date referred to in paragraph (2).
(2D) Paragraphs (2B) and (2C) do not apply where regulation 20A (provisions applicable to third party certification schemes) applies to the work and instead paragraphs (2E) to (2G) apply.

(2E) Where this paragraph applies, the person carrying out the work must notify the third party certifier appointed under regulation 12(6)(c)—

(a)confirming that they have given the fire safety information to the responsible person pursuant to paragraph (2), and
(b)stating that they have received the notice from the responsible person pursuant to paragraph (2A) or where they have not received the notice, stating the steps taken to obtain the notice from the responsible person and the dates they were taken.
(2F) The notification under paragraph (2E) must be given no later than seven days after the date referred to in paragraph (2).

(2G) Within 30 days of receiving the notification under paragraph (2E) the third party certifier appointed under regulation 12(6)(c) must notify the relevant authority confirming receipt of the notification under paragraph (2E).”.

(2) In regulation 17 (completion certificates), in paragraph (1) for “it, a building complies with the relevant provisions” substitute “a building, the relevant provisions have been complied with in relation to the building”.

(3) In regulation 17A (certificate for building occupied before work is completed), in sub-paragraph (1)(c) for “those parts of the building which are to be occupied before completion of the work currently comply with regulation 38 and Part B of Schedule 1” substitute “regulation 38 and Part B of Schedule 1 are currently complied with in relation to those parts of the building which are to be occupied before completion of the work”."""


# First example, new side: a prose description of regulation 13 that is compared
# against the source text above. The hard line breaks are part of the string.
new_text = """Regulation 13 of this instrument strengthens fire safety information handover from the 
person carrying out the work to the responsible person. This will include handing over 
the information earlier in the process; gaining confirmation from the person who 
receives the information that it is sufficient to enable them to understand, operate and 
maintain the building; and giving notice to the relevant authority that the transfer of 
information has taken place. The fire safety information is information relating to the 
design and construction of the building or extension, and the services, fittings and 
equipment provided in or in connection with the building or extension which will assist 
the responsible person to operate and maintain the building or extension with reasonable 
safety."""

### Get the diff phrases from the LLM

In [23]:
# Build the chat messages that ask the model to compare the two texts.
messages = get_diff_prompt(source_text, new_text)
# Top-level `await` works here because the notebook runs cells inside an event loop.
# max_tokens=16000 overrides ask_llm's default of 256.
idx, relevant_fact = await ask_llm(0, messages, max_tokens=16000)
# Disabled alternative: the same request through the non-Azure OpenAI client.
#idx, relevant_fact = ask_openai_llm(0, messages, model="gpt-4-turbo", max_tokens=4000)
# The reply content is parsed as-is, so this raises if the model returns anything but bare JSON.
results = json.loads(relevant_fact.choices[0].message.content)

### Show them highlighted on the original text

In [74]:
# Highlight each text separately: phrases from results['consistent'] in lightgreen,
# phrases found only in that text (source_not_in_new / new_not_in_source) in lightcoral.
source_text_highlighted = get_highlighted_html2(results, source_text, 'source_text', 'source_not_in_new')
new_text_highlighted = get_highlighted_html2(results, new_text, 'new_text', 'new_not_in_source')
# The helper returns an HTML string; wrap it in HTML() so the notebook renders it.
display(HTML("<h1>Highlighted Differences</h1><h2>New Text</h2>"))
display(HTML(new_text_highlighted))
display(HTML("<h2>Source Text</h2>"))
display(HTML(source_text_highlighted))


### Example 2

In [75]:

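# Legislation text on legislation.gov.uk. Note: this link points at regulation 13 (the first example);
# the source text below is regulation 9 of the same instrument.
# https://www.legislation.gov.uk/uksi/2023/911/regulation/13/made
# PDF published with the same instrument (presumably the explanatory memorandum the new text is taken from).
# https://www.legislation.gov.uk/uksi/2023/911/pdfs/uksiem_20230911_en_002.pdf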


# Second example, source side: the regulation text (regulation 9). This rebinds
# `source_text`, replacing the value used by the first example.
source_text = """Amendments: consultation on applications for building control
9.  After regulation 15 of the 2010 Regulations insert—

“Consultation in relation to fire safety
15A.—(1) This regulation applies where it is proposed—

(a)to erect, extend or make any structural alteration to a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after completion of the work, or
(b)to change the use of a building to which the Regulatory Reform (Fire Safety) Order 2005 applies or will apply after the change of use,
and, in connection with that proposal, an application for building control approval with full plans is given to a relevant authority.

(2) Subject to paragraph (3), where this regulation applies the relevant authority must consult the enforcing authority before determining the application for building control approval with full plans given to the relevant authority.

(3) The duty to consult imposed by paragraph (2) does not apply where the relevant authority is the enforcing authority.

(4) In this regulation “enforcing authority” has the same meaning as in article 25 of the Regulatory Reform (Fire Safety) Order 2005.”."""


# Second example, new side: a prose description of regulation 9. This rebinds
# `new_text` as well. The hard line breaks are part of the string.
new_text = """Regulation 9 prescribes that a consultation between the building control authority and 
the relevant enforcing authorities for the Regulatory Reform (Fire Safety) Order 2005 
(Fire Safety Order) must take place before an application for building control approval 
is determined. The 2022 Act provides for article 45 (consultation with fire and rescue 
authorities) of the Regulatory Reform (Fire Safety) Order 2005 to be revoked to enable 
this requirement to be moved into the Building Regulations."""

In [76]:
# Same steps as the first example, applied to the texts defined just above.
messages = get_diff_prompt(source_text, new_text)
# max_tokens=1500 overrides ask_llm's default of 256; top-level `await` relies on
# the notebook's running event loop.
idx, relevant_fact = await ask_llm(0, messages, max_tokens=1500)
# Disabled alternative: the same request through the non-Azure OpenAI client.
#idx, relevant_fact = ask_openai_llm(0, messages, model="gpt-4-turbo", max_tokens=4000)
# Rebinds `results`; raises if the reply content is not bare JSON.
results = json.loads(relevant_fact.choices[0].message.content)

In [79]:
results

{'consistent': [{'new_text': 'Regulation 9 prescribes that a consultation between the building control authority and the relevant enforcing authorities for the Regulatory Reform (Fire Safety) Order 2005 (Fire Safety Order) must take place before an application for building control approval is determined.',
   'source_text': '(2) Subject to paragraph (3), where this regulation applies the relevant authority must consult the enforcing authority before determining the application for building control approval with full plans given to the relevant authority.'}],
 'new_not_in_source': [{'new_text': 'The 2022 Act provides for article 45 (consultation with fire and rescue authorities) of the Regulatory Reform (Fire Safety) Order 2005 to be revoked to enable this requirement to be moved into the Building Regulations.'}],
 'source_not_in_new': [{'source_text': '9.  After regulation 15 of the 2010 Regulations insert—'},
  {'source_text': '“Consultation in relation to fire safety'},
  {'source_te

In [77]:

# Render the second example: consistent phrases in lightgreen, phrases present in
# only one of the two texts in lightcoral.
source_text_highlighted = get_highlighted_html2(results, source_text, 'source_text', 'source_not_in_new')
new_text_highlighted = get_highlighted_html2(results, new_text, 'new_text', 'new_not_in_source')
# The helper returns an HTML string; HTML() makes the notebook render it.
display(HTML("<h1>Highlighted Differences</h1><h2>New Text</h2>"))
display(HTML(new_text_highlighted))
display(HTML("<h2>Source Text</h2>"))
display(HTML(source_text_highlighted))
