# Installing Dependecies

In [1]:
#--required dependencies
!pip install wget

!pip install -q trl==0.8.6
!pip install -q transformers accelerate peft
!pip install -q langchain_huggingface  
!pip install -U bitsandbytes


Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=502bcf97eb8d9eab4fdee6926e2a79cb374e695242aa5091640df8e416a8b220
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.5/112.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m450.5/450.5 kB[0m 

# Legal NER System

In [19]:
import re, json

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain_huggingface.llms import HuggingFacePipeline

from IPython.display import clear_output

!huggingface-cli login --token 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

class Legal_Document_NER:
    def __init__(self, Few_shot_mode=False):
        self.Few_shot_mode = Few_shot_mode
        self.pseudo_text   = ''
        self.load_model()
    def load_model(self):
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        if self.Few_shot_mode: model_name = 'mistralai/Mistral-7B-Instruct-v0.1'
        else: model_name = 'YounesMohammed/Legal_NER_2'
        clear_output(wait=False)
        
        print('Loading the model ...\n')
        model =  AutoModelForCausalLM.from_pretrained(
                                                model_name,
                                                quantization_config=quantization_config)
        tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.1')
         #---initializing the pipeline------
        pipe = pipeline("text-generation",
                            model=model,
                            tokenizer=tokenizer,
                            max_new_tokens=300,
                            device_map = "cuda")
        self.model = HuggingFacePipeline(pipeline=pipe)  
        clear_output(wait=False)

    def get_prompt(self, text):
        TEMPLATE = f"""
                <s> [INST]
                You are an expert in Named Entity Recognition (NER) tasked with extracting specific named entities from a given text. 
                Carefully analyze the input text and extract name entities for the following categories: 
                PERSON_NAME, ORGANIZATION_NAME, and AUTHORITY_NAME.
                
                ### Reference Example:
                Input: "Mr. John Smith is the CEO of Coca Cola. The Supreme Court has asked Ms. XXX arrested John Smith."  
                Output:  
                {{
                  "PERSON_NAME": ["John Smith", "XXX"],
                  "ORGANIZATION_NAME": ["Coca Cola"],
                  "AUTHORITY_NAME": ["Supreme Court"]
                }}
                The output must be a valid JSON object in the format:
                 </s>
                [INST]
                 Input: {text}
                [/INST]
                Output:"""
        if not self.Few_shot_mode: TEMPLATE = f"<s> [INST] text: {text} [/INST] name entities:"
            
        return TEMPLATE
        
    def extract_NE(self, prompt):
        output = self.model.invoke(prompt)
        return self.get_json_output(output)
        
    def get_json_output(self, text):
        pattern = r'\{\s*"PERSON_NAME":\s*\[.*?\],\s*"ORGANIZATION_NAME":\s*\[.*?\],\s*"AUTHORITY_NAME":\s*\[.*?\]\s*\}'
        if not self.Few_shot_mode: pattern = r'name entities:\s*(\{.*?\})'; text=text.replace("\'",'\"')
        matches = re.findall(pattern, text)
        if matches:
            return json.loads(matches[-1]) 
            
    def names_spans(self, text, model_output):
        NE_set         = set()
        counter        = {}
        spans          = []
        
        for label, name_entities in model_output.items(): 
           for i,name_entity in enumerate(name_entities):
                pattern = r'\b' + re.escape(name_entity) + r'\b'
                matches = list(re.finditer(pattern, text, re.IGNORECASE))
                if not matches: matches = list(re.finditer(pattern.replace(r'\b',''), text, re.IGNORECASE))
                span = [(match.start(), match.end()) for match in matches]
                if span:
                    span = span[0]
                    spans.append({"start": span[0] ,"end": span[1],"label": label, "text": name_entity})
                    if not (f'{label}_{name_entity}' in NE_set) and not label.startswith('AUTHORITY'): 
                        counter[label] = counter.get(label,0)+1 
                        NE_set.add(f'{label}_{name_entity}')
                        text = self.Pseudonymize(text, span[0],span[1], f"{label}_{counter[label]}")
        return spans
        
    def Pseudonymize(self, text, start_idx, end_idx, name_entity): #-> gets NE from the model --> gets spans  from names spans --> Pseudnymize 
        self.pseudo_text = text[:start_idx] + name_entity + text[end_idx:]
        return self.pseudo_text
        
    def __call__(self, text):
        prompt             = self.get_prompt(text)
        #-----The following are the output of the model----------
        self.name_entities = self.extract_NE(prompt)
        self.spans         = {'spans': self.names_spans(text, self.name_entities)}
        self.pseudo_text   =  self.pseudo_text if self.spans['spans'] else f'Notice: no name entity was identified, the text is unchaged\n\n{text}'
        
        return self.name_entities, self.spans, self.pseudo_text

legal_ner_system = Legal_Document_NER()

In [25]:
text = "Ms. X works in Apple with Mr. Ahmed and Ms. XX"
name_entities, spans, pseudonymized_text = legal_ner_system(text)

print('*'*10,'Name Entities','*'*10)
print(name_entities)            #-> This is a dictgionary of format : {NER_category: [list of name_entities] ...}

print('*'*10,'spans','*'*10)
print(spans)                    #-> This is a Json object of format: {'spans': [{'start': **, 'end':**, 'label':**, 'text':**} ...]}

print('*'*10,'Pseudonymized text','*'*10)
print(pseudonymized_text)       #-> This is a string object of the pseudonymzed text with ordered name_entity category.

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


********** Name Entities **********
{'AUTHORITY_NAME': [], 'ORGANIZATION_NAME': ['Apple'], 'PERSON_NAME': ['X', 'Ahmed', 'XX']}
********** spans **********
{'spans': [{'start': 15, 'end': 20, 'label': 'ORGANIZATION_NAME', 'text': 'Apple'}, {'start': 4, 'end': 5, 'label': 'PERSON_NAME', 'text': 'X'}, {'start': 56, 'end': 61, 'label': 'PERSON_NAME', 'text': 'Ahmed'}, {'start': 78, 'end': 80, 'label': 'PERSON_NAME', 'text': 'XX'}]}
********** Pseudonymized text **********
Ms. PERSON_NAME_1 works in ORGANIZATION_NAME_1 with Mr. PERSON_NAME_2 and Ms. PERSON_NAME_3
