In [None]:
# import gc
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [1]:
import json
# split = "validate"

# Load the JSON data
with open(f'static_split_score.json', 'r') as json_file:
    data = json.load(json_file)

In [2]:
import html
import re
import difflib

threshold = 1.0

# Helper function to clean text
def clean_text(text):
    # Remove HTML tags
    text = html.unescape(text)
    text = re.sub(r'<.*?>', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters like \u002
    text = re.sub(r'\\u[0-9A-Fa-f]+', '', text)
    return text.strip()

def token_count(text):
  input_tokens = tokenizer(text)["input_ids"]
  input_token_count = len(input_tokens)
  return input_token_count

def are_sentences_similar(sentence1, sentence2):
    sentence1_no_spaces = ''.join(sentence1.split())
    sentence2_no_spaces = ''.join(sentence2.split())
    similarity_ratio = difflib.SequenceMatcher(None, sentence1_no_spaces, sentence2_no_spaces).ratio()
    return similarity_ratio >= threshold

## Definition Generation

In [11]:
# Common required libraries
!pip install -q einops accelerate langchain bitsandbytes sentencepiece loralib transformers

^C


### Prompt Generation

In [31]:
system_role_prompt = "Act as a Lawyer drafting European Legislative documents to be published on Eur-Lex website."

In [32]:
system_instruction_prompt = "Define the term: {term}, based on the sentences provided between the triple dashes where different sentences are splitted by new line character \n. ---{sentences}---"

In [33]:
system_context_prompt = "Provide a clear and concise definition strictly within 25 to 45 words that accurately conveys the meaning within the context of the sentences."

In [34]:
system_output_prompt = """Give your output in the following JSON format:
{{
  "term": "{term}"
  "definition": "```output text```"
}}

ONLY return the JSON with the keys: [term, definition], do not add ANYTHING, NO INTERPRETATION!"""

In [35]:
llama_template = f"""
<s>[INST]<<SYS>>
{system_role_prompt}\n
{system_instruction_prompt}\n
{system_context_prompt}\n
{system_output_prompt}<</SYS>>
[/INST]
"""

### LLAMA-2

#### References:
1. [Get the LLAMA-2 model](https://levelup.gitconnected.com/text-summarization-llama2-how-to-use-llama2-with-langchain-ad5775c80716)
2. [Insights about Prompting](https://medium.com/@sasika.roledene/unlocking-llm-fundamental-of-prompt-engineering-with-llama-2-ee8649552115)
3. [LLAMA-2 prompting](https://huggingface.co/blog/llama2#how-to-prompt-llama-2)


In [11]:
import os
from dotenv import load_dotenv

dotenv_path = './../../../.env'
load_dotenv(dotenv_path)

True

In [12]:
import dotenv
DOTENVPATH = dotenv.find_dotenv()
CONFIG = dotenv.dotenv_values(DOTENVPATH)

In [14]:
from langchain import HuggingFacePipeline
from transformers import AutoTokenizer
import transformers
import torch
from langchain import PromptTemplate, LLMChain

model = "meta-llama/Llama-2-7b-chat-hf"
hf_auth = os.getenv('HUGGINGFACE_AUTH_KEY')

tokenizer = AutoTokenizer.from_pretrained(model)

pipeline = transformers.pipeline(
    "text-generation", #task
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    do_sample=True,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [15]:
llm = HuggingFacePipeline(pipeline = pipeline,
                          model_kwargs = {'temperature':0.2, 'top_k': 20,'top_p': 0.6, 'repetition_penalty': 1.2, 'max_length': 3000}
                          )
prompt = PromptTemplate(template=llama_template, input_variables=["term", "sentences"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# Iterate through each item in the JSON data
term_list = []

#for item in data:
for item in data:
  term = item["term"]
  celex_id = item["celex_id"]
  record = f"{term}_{celex_id}"
  if record not in term_list:
    if item["existing_record"] == ["NEW TERM"]:
      scores = item["scores"]
      article_scores = {
        key: value["article_score"]
        for key, value in scores.items()
        # if ("Article" in key and key != "Article 2" and key != "Not Article") or (key == "Article 2" and len(scores.items()) > 1)
        if ("Article" in key and key != "Not Article")
      }

      # If any article exist to provide context then only go ahead otherwise go to next term
      if article_scores:
          max_score = max(article_scores.values())
          selected_articles = [key for key, value in article_scores.items() if value == max_score]

          # Get existing sentences for the selected article
          sentences = item["existing_sentences"].get(selected_articles[0], [])

          # Create a list of tuples where each tuple contains a sentence and its score
          sentence_scores = [(sentence, item["scores"][selected_articles[0]]["statement_scores"]) for sentence in sentences]

          # Sort sentences based on statement_scores in descending order
          sorted_sentences = sorted(
              [(sentence, score) for sentence, score in zip(sentences, item["scores"][selected_articles[0]]["statement_scores"]) if score > 0],
              key=lambda x: x[1], reverse=True
          )
          # Extract cleaned sentences and create a single string with each retrieved sentence on a new line
          cleaned_sentences = [clean_text(sentence) for sentence, _ in sorted_sentences]

          # List to store unique sentences
          unique_sentences = []

          # Iterate through the selected_cleaned_sentences
          for sentence in cleaned_sentences:
            # Flag to indicate if the sentence is a duplicate
            is_duplicate = False

            # Check if the sentence is similar to any existing unique sentence
            for unique_sentence in unique_sentences:
                if are_sentences_similar(sentence, unique_sentence):
                    is_duplicate = True
                    break

            # If not a duplicate, add it to the unique_sentences list
            if not is_duplicate:
                unique_sentences.append(sentence)

          # Since the language model is having the max context length restriction
          total_tokens = 0
          selected_cleaned_sentences = []

          for sentence in unique_sentences:
              sentence_tokens = token_count(sentence)
              if total_tokens + sentence_tokens <= 2500:
                  selected_cleaned_sentences.append(sentence)
                  total_tokens += sentence_tokens
              else:
                  break

          # Remove duplicate sentences if any
          sentences_str = "\n".join(selected_cleaned_sentences)

          llama_response = llm_chain.run({"term": term, "sentences": sentences_str})
          print("#########")
          print(f"term: {term}\n")
          print(f"celex_id: {celex_id}\n")
          print(f"response: {llama_response}")
          print("#########")

          try:
            # Define a regular expression pattern to match the JSON-like structure
            # Use regular expression to find the JSON string after [/INST]
            pattern = re.compile(r'\[/INST\]\s*(\{.*?\})', re.DOTALL)
            match = pattern.search(llama_response)

            # If a match is found, extract and parse the JSON-like structure
            if match:
              try:
                json_text = match.group(1)  # Extract the JSON text
                json_text = json_text.replace('\n', '')
                json_data = json.loads(json_text)

                # Store the extracted JSON data in a field
                llama_response_updated = {
                    "term": json_data.get("term", ""),
                    "definition": json_data.get("definition", "")
                  }
              except json.JSONDecodeError as e:
                # Convert the string value to a JSON object
                json_object = json.loads(llama_response)
                # Convert the JSON object to a dictionary
                json_dict = dict(json_object)
                llama_response_updated = json_dict

              # Concatenate the term and definition
              concatenated_text = f"'{llama_response_updated['term']}' means {llama_response_updated['definition'].lower()}"
              item["generated_definition"] = concatenated_text.strip()
              output_file_path = f"{term}_{celex_id}.json"
              with open(output_file_path, 'w') as json_file:
                json.dump(item, json_file, indent=2)

              term_list.append(f"llama_{term}_{celex_id}")
              print(f"term_list = {term_list}")
              JSON_file_path = f"llama2_term_list.json"
              with open(JSON_file_path, 'w') as json_file:
                json.dump(term_list, json_file, indent=4)

              torch.cuda.empty_cache()
            else:
              print("------- Error -------")
              print(f"term: {term}\n")
              print(f"celex_id: {item['celex_id']}\n")
              print(f"response: {llama_response}")
              print(f"error: NO JSON AS AN OUTPUT OBTAINED")

          except Exception as e:
            print("------- Error -------")
            print(f"term: {term}\n")
            print(f"celex_id: {item['celex_id']}\n")
            print(f"response: {llama_response}")
            print(f"error: {e}")
            continue
      else:
        term_list.append(f"llama_{term}_{celex_id}")
        print(f"term_list = {term_list}")
        JSON_file_path = f"llama2_term_list.json"
        with open(JSON_file_path, 'w') as json_file:
          json.dump(term_list, json_file, indent=4)
        torch.cuda.empty_cache()
    else:
      print(f"NOT A NEW TERM\n")
      term_list.append(f"llama_{term}_{celex_id}")
      print(f"term_list = {term_list}")
      JSON_file_path = f"llama2_term_list.json"
      with open(JSON_file_path, 'w') as json_file:
        json.dump(term_list, json_file, indent=4)
      torch.cuda.empty_cache()

#########
term: mains

celex_id: 32019R2014

response: 
<s>[INST]<<SYS>>
Act as a Lawyer drafting European Legislative documents to be published on Eur-Lex website.

Define the term: mains, based on the sentences provided between the triple dashes where different sentences are splitted by new line character 
. ---‘off mode’ means a condition in which the household washing machine or the household washer-dryer is connected to the mains and is not providing any function; the following shall also be considered as off mode:
‘standby mode’ means a condition where the household washing machine or the household washer-dryer is connected to the mains and provides only the following functions, which may persist for an indefinite time:---

Provide a clear and concise definition strictly within 25 to 45 words that accurately conveys the meaning within the context of the sentences.

Give your output in the following JSON format:
{
  "term": "mains"
  "definition": "```output text```"
}

ONLY retur

In [None]:
JSON_file_path = f"drive/My Drive/PhD/LexDrafter_Paper/dataset/llama2/final_{split}_dataset.json"
with open(JSON_file_path, 'w') as json_file:
  json.dump(data, json_file, indent=4)

In [43]:
llama_response_updated = {
                    "term": json_data.get("term", ""),
                    "definition": json_data.get("definition", "")
                  }

In [44]:
llama_response_updated

{'term': 'mains',
 'definition': 'The electrical power supply to a household washing machine or washer-dryer, providing a connection to the grid for energy supply.'}