In [59]:
import json

# Load the scored term records. JSON is read explicitly as UTF-8 so that the
# result does not depend on the platform's default text encoding.
with open('static_split_score.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

## Definition Generation

In [60]:
import html
import re
import difflib

# Minimum difflib similarity ratio at which `are_sentences_similar` treats two
# sentences as duplicates; a ratio of 1.0 is only reached by identical strings
# (compared after all whitespace has been removed).
threshold = 1.0

# Helper function to clean text
def clean_text(text):
    """Strip markup and escape residue from `text` and normalise its whitespace.

    Steps, in order:
      1. decode HTML entities (so escaped markup such as ``&lt;b&gt;`` is
         stripped by the next step as well),
      2. remove anything that looks like an HTML tag,
      3. remove literal ``\\uXXXX``-style escape residue (a backslash, ``u``
         and up to four hex digits),
      4. collapse every run of whitespace into a single space and trim.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    text = html.unescape(text)
    text = re.sub(r'<.*?>', '', text)
    # At most four hex digits are consumed: an unbounded match would also eat
    # ordinary letters that directly follow the escape (e.g. "\u0020beef").
    text = re.sub(r'\\u[0-9A-Fa-f]{1,4}', '', text)
    # Whitespace is normalised last so that the removals above cannot leave
    # doubled spaces behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def token_count(text):
  """Return how many token ids the module-level `tokenizer` produces for `text`."""
  return len(tokenizer(text)["input_ids"])

def are_sentences_similar(sentence1, sentence2):
    """Tell whether two sentences match closely enough to count as duplicates.

    All whitespace is removed from both sentences before they are compared
    with `difflib.SequenceMatcher`; the result is True when the similarity
    ratio reaches the module-level `threshold`.
    """
    compact_first, compact_second = (
        ''.join(sentence.split()) for sentence in (sentence1, sentence2)
    )
    matcher = difflib.SequenceMatcher(None, compact_first, compact_second)
    return matcher.ratio() >= threshold

### Definition Generation Evaluation

### Prompt Generation

In [61]:
# Persona instruction placed at the top of the prompt template.
system_role_prompt = (
    "Act as a Lawyer drafting European Legislative documents "
    "to be published on Eur-Lex website."
)

In [62]:
# Task instruction; `{term}` and `{sentences}` are filled in by the prompt
# template at run time.
system_instruction_prompt = (
    "Define the term: {term}, based on the sentences provided between the triple dashes "
    "where different sentences are splitted by new line character \n. "
    "---{sentences}---"
)

In [63]:
# Length and quality constraints for the generated definition.
system_context_prompt = (
    "Provide a clear and concise definition strictly within 25 to 45 words "
    "that accurately conveys the meaning within the context of the sentences."
)

In [64]:
# Output-format instruction. Braces are doubled so they survive template
# formatting as literal `{` / `}`; only `{term}` is a placeholder.
# The example object must be valid JSON (note the comma after the "term"
# entry): the response parser expects `"term": "...", "definition": "..."`.
# The closing words "NO INTERPRETATION!" are also what the response parser
# anchors on, so they must stay at the end of the prompt.
system_output_prompt = """Give your output in the following JSON format:
{{
  "term": "{term}",
  "definition": "```output text```"
}}

ONLY return the JSON with the keys: [term, definition], do not add ANYTHING, NO INTERPRETATION!"""

In [65]:
# Full prompt: the four sections in order, separated by one blank line, with a
# single newline before the first section and after the last one.
vicuna_template = "\n" + "\n\n".join(
    [
        system_role_prompt,
        system_instruction_prompt,
        system_context_prompt,
        system_output_prompt,
    ]
) + "\n"

### Alpaca Vicuna 7B

#### References:
1. [Alpaca 7B](https://www.youtube.com/watch?v=v6sF8Ed3nTE)

In [66]:
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain

import torch

Source for the latest Vicuna model weights (version overview):
https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md

In [67]:
!pip install protobuf



In [68]:
# Hugging Face model id used for both the tokenizer and the generation pipeline.
model = "lmsys/vicuna-7b-v1.5"

tokenizer = LlamaTokenizer.from_pretrained(model)

# This cell binds the name `pipeline` to the constructed pipeline object, and
# later cells read that name. The factory is therefore called through an alias:
# otherwise re-running this cell would call the pipeline object left over from
# the previous run instead of the `transformers.pipeline` factory.
from transformers import pipeline as build_hf_pipeline

pipeline = build_hf_pipeline(
    "text-generation", #task
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    # NOTE(review): `trust_remote_code=True` allows code shipped with the model
    # repository to run locally — confirm it is actually required for this model.
    trust_remote_code=True,
    device_map="auto",
    do_sample=True,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [69]:
# Wrap the Hugging Face pipeline for LangChain, passing the sampling and
# length settings through `model_kwargs`.
llm = HuggingFacePipeline(
    pipeline=pipeline,
    model_kwargs=dict(
        temperature=0.2,
        top_k=20,
        top_p=0.6,
        repetition_penalty=1.2,
        max_length=3000,
    ),
)

# The template exposes two placeholders: the term and its context sentences.
prompt = PromptTemplate(
    input_variables=["term", "sentences"],
    template=vicuna_template,
)
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
from tqdm import tqdm

# Context sentences are capped at this many tokens because the language model
# has a maximum context length.
MAX_CONTEXT_TOKENS = 2500
# Marker stored in `generated_definition` when no JSON could be extracted.
NO_JSON_MESSAGE = "NO JSON AS AN OUTPUT OBTAINED"
# Checkpoint file listing every "<term>_<celex_id>" record handled so far.
TERM_LIST_PATH = "vicuna_term_list.json"

# Extracts the term/definition pair that follows the words "NO INTERPRETATION!"
# (the closing words of the prompt) — presumably because the model response
# contains the prompt followed by the completion; TODO confirm.
# Compiled once instead of on every loop iteration.
RESPONSE_PATTERN = re.compile(
    r'NO INTERPRETATION![\s\S]*?\{\s*"term"\s*:\s*"(.*?)",\s*"definition"\s*:\s*"(.*?)"\s*\}',
    re.DOTALL,
)

# `term_list` keeps insertion order for the JSON checkpoint; `seen_records`
# mirrors it as a set so the duplicate check does not scan the whole list.
term_list = []
seen_records = set()


def _save_item(item, term, celex_id):
    """Write `item` to `<term>_<celex_id>.json` in the working directory."""
    # Path separators inside a term would otherwise point into a directory
    # that does not exist and make the write fail.
    file_stem = re.sub(r'[\\/]', '_', f"{term}_{celex_id}")
    with open(f"{file_stem}.json", 'w') as json_file:
        json.dump(item, json_file, indent=2)


def _mark_processed(record):
    """Remember `record` as handled, checkpoint the list and free cached GPU memory."""
    if record not in seen_records:
        seen_records.add(record)
        term_list.append(record)
    with open(TERM_LIST_PATH, 'w') as json_file:
        json.dump(term_list, json_file, indent=4)
    torch.cuda.empty_cache()


def _select_context_sentences(item):
    """Pick the context sentences used to define the term of `item`.

    Returns None when no key of `item["scores"]` that contains "Article"
    (other than "Not Article") has an article score above 0. Otherwise returns
    the cleaned, de-duplicated sentences of the best-scoring article, ordered
    by descending statement score and cut off at `MAX_CONTEXT_TOKENS` tokens
    (the list may be empty).
    """
    # Include only articles with a score greater than 0
    article_scores = {
        key: value["article_score"]
        for key, value in item["scores"].items()
        if "Article" in key and key != "Not Article" and value["article_score"] > 0
    }
    if not article_scores:
        return None

    # On ties, `max` keeps the first article in dictionary order.
    best_article = max(article_scores, key=article_scores.get)

    sentences = item["existing_sentences"].get(best_article, [])
    sentence_scores = item["scores"][best_article]["statement_scores"]

    # Keep sentences with a positive statement score, best score first
    scored_sentences = [
        (sentence, score)
        for sentence, score in zip(sentences, sentence_scores)
        if score > 0
    ]
    scored_sentences.sort(key=lambda pair: pair[1], reverse=True)

    # Drop sentences that duplicate an already kept (higher-ranked) sentence
    unique_sentences = []
    for sentence, _ in scored_sentences:
        cleaned = clean_text(sentence)
        if not any(are_sentences_similar(cleaned, kept) for kept in unique_sentences):
            unique_sentences.append(cleaned)

    # Take sentences in ranked order until the token budget would be exceeded
    selected_sentences = []
    total_tokens = 0
    for sentence in unique_sentences:
        sentence_tokens = token_count(sentence)
        if total_tokens + sentence_tokens > MAX_CONTEXT_TOKENS:
            break
        selected_sentences.append(sentence)
        total_tokens += sentence_tokens
    return selected_sentences


for item in tqdm(data):
    term = item["term"]
    celex_id = item["celex_id"]
    record = f"{term}_{celex_id}"
    if record in seen_records:
        continue

    # Only new terms that have no usable generated definition yet are sent to the model
    needs_definition = (
        item["existing_record"] == ["NEW TERM"]
        and item["generated_definition"] in (NO_JSON_MESSAGE, "")
    )
    if not needs_definition:
        print(f"NOT A NEW TERM\n")
        _mark_processed(record)
        continue

    # If no article exists to provide context, go to the next term
    selected_cleaned_sentences = _select_context_sentences(item)
    if selected_cleaned_sentences is None:
        _mark_processed(record)
        continue

    # One retrieved sentence per line
    sentences_str = "\n".join(selected_cleaned_sentences)
    model_response = llm_chain.run({"term": term, "sentences": sentences_str})

    try:
        # Only the first match is used, so a repeated answer further down the
        # response can neither overwrite it nor record the term twice.
        match = RESPONSE_PATTERN.search(model_response)

        if match is not None:
            # Rebuild a JSON object from the captured pieces so that JSON
            # escape sequences inside them are decoded by `json.loads`.
            json_text = '{{"term": "{}", "definition": "{}"}}'.format(match.group(1), match.group(2))
            json_text = json_text.replace('\n', '')
            json_data = json.loads(json_text)

            generated_term = json_data.get("term", "")
            generated_definition = json_data.get("definition", "")

            # Concatenate the term and definition
            concatenated_text = f"'{generated_term}' means {generated_definition.lower()}"
            item["generated_definition"] = concatenated_text.strip()
            _save_item(item, term, celex_id)
            _mark_processed(record)
        else:
            print("------- Error -------")
            print(f"term: {term}\n")
            print(f"celex_id: {item['celex_id']}\n")
            print(f"error: NO JSON AS AN OUTPUT OBTAINED")
            # The item is saved with the marker but not added to `term_list`.
            item["generated_definition"] = NO_JSON_MESSAGE
            _save_item(item, term, celex_id)

    except Exception as e:
        # Best effort: report the failure and carry on with the next item.
        print("------- Error -------")
        print(f"term: {term}\n")
        print(f"celex_id: {item['celex_id']}\n")
        print(f"response: {model_response}")
        print(f"error: {e}")