# Tuning legal model  

Adding extract_definitions to identify and extract definitions from the input text. It uses a regular expression pattern to find sentences that may contain definition like structures.

Moded score_sentence to take into account the relevance of the sentence to the defined terms. If a sentence contains a defined term or its definition, the sentence score is increased.

Added a section to print the extracted definitions after loading.

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re 
import pandas as pd 
# Specify the directory where punkt is already downloaded
nltk_data_dir = r"D:/Data/OneDrive/Ccantu/OneDrive - CFTC/Documents/Python Scripts/punkt"

# Add the directory to NLTK's data path
nltk.data.path.append(nltk_data_dir)

def load_local_legal_bert():
    model_path = r"D:/Data/OneDrive/Ccantu/OneDrive - CFTC\Documents/Python Scripts/BERT-Legal"
    print(f"Loading the Legal-BERT model from '{model_path}'...")
    
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path)

    print("Legal-BERT model loaded successfully!")
    return tokenizer, model
# tuning
def extract_definitions(text):
    definition_pattern = r"(?P<term>\w+)\s+(?:is|means)\s+(?P<definition>.*?)[;.]" # looking for definition patterns 
    definitions = {}
    for match in re.finditer(definition_pattern, text, re.IGNORECASE):
        term = match.group("term")
        definition = match.group("definition").strip()
        definitions[term] = definition
    return definitions
# Tuning
def score_sentence(sentence, definitions, tfidf_matrix, sentence_similarities):
    score = 0
    for term, definition in definitions.items():
        if term in sentence or definition in sentence: 
            score += sentence_similarities[0,1] # may need to change assume first senetence is important 
    return score 

def extractive_summarize(text, num_sentences=3):
    sentences = nltk.sent_tokenize(text)
    definitions = extract_definitions(text) 

    # Create TF-IDF matrix
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    
    # Compute sentence similarities
    sentence_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Rank sentences based on similarity scores
    sentence_scores = [score_sentence(sentence, definitions, tfidf_matrix, sentence_similarities) for sentence in sentences]
    ranked_sentences = [sentences[i] for i in np.argsort(sentence_scores)[::-1]]
    
    # Select top sentences
    summary = ' '.join(ranked_sentences[:num_sentences])
    return summary

def process_with_legal_bert(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def search_document(text, search_term):
    sentences = nltk.sent_tokenize(text)
    matches = []
    for sentence in sentences:
        if re.search(search_term, sentence, re.IGNORECASE):
            matches.append(sentence)
    return matches

# Load Legal-BERT
legal_bert_tokenizer, legal_bert_model = load_local_legal_bert()

# Example legal text
# text = """

# Except where otherwise expressly provided in this Covenant or by the terms of the present Treaty, decisions at any meeting of the Assembly or of the Council shall require the agreement of all the Members of the League represented atthe meeting. All matters of procedure at meetings of the Assembly or of the Council, including the appointment of Committees to investigate particular matters, shall be regulated by the Assembly orby the Council and may be decided by a majority of the Members of the League represented at the meeting. Thefirst meeting of the Assembly and the first meeting of the Council shall be summoned by the President of the United States of America.
# """
text = input("Enter the legal text: ")

print("\nOriginal text:")
print(text)

# Extract definitions
definitions = extract_definitions(text)
print("\nDefinitions found:")
for term, definition in definitions.items():
    print(f"{term} means {definition}")

# Process with Legal-BERT
bert_output = process_with_legal_bert(text, legal_bert_tokenizer, legal_bert_model)
print("\nLegal-BERT processing complete. Output shape:", bert_output.shape)

# Generate summary
summary = extractive_summarize(text)
print("\nGenerated Summary:")
print(summary)

# while True:
#     search_term = input("\Enter a search term (or q to exit):")
#     if search_term.lower() == 'q':
#         break 
#     matches = search_document(text, search_term)
#     if matches:
#         print(f"\nFound {len(matches)} match(es) for '{search_term}':")
#         for i, match in enumerate(matches, 1):
#             print(f"{i}. {match}")
#     else:
#         print(f"No matches found for '{search_term}'.")



Loading the Legal-BERT model from 'D:/Data/OneDrive/Ccantu/OneDrive - CFTC\Documents/Python Scripts/BERT-Legal'...
Legal-BERT model loaded successfully!

Original text:
Ladies and Gentlemen: The Division of Clearing and Risk (“Division”) of the Commodity Futures Trading Commission (the “Commission” or “CFTC”) is replacing CFTC Letter 16-26,1 which applied to European Union (“EU”)-based central counterparties (“CCPs”) that are registered with the Commission as derivatives clearing organizations (“DCOs”). This letter addresses the same Commission requirements discussed in CFTC Letter 16-26 for EU-based DCOs and extends the no-action position taken therein to DCOs based in the United Kingdom (“UK”) that are registered with the Commission (together “DCOs/CCPs”): (1) Regulation 39.12(b)(6)’s requirement that, upon a DCO’s acceptance of a swap for clearing, the original swap is extinguished and it is replaced by an equal and opposite swap between the DCO and each clearing member (acting as a

## train  the gpt model on the on extracted Definitions
### Extracted Definitions and CSV Import Merge

In [2]:
import pandas as pd

# Path to the CSV file
csv_path = "D:/Data/OneDrive/Ccantu/OneDrive - CFTC/Documents/Python Scripts/CFT_terms_and_simple.csv"

# Read the CSV file into a DataFrame
df_csv = pd.read_csv(csv_path)

# Extract definitions from the text
definitions = extract_definitions(text)

# Convert the extracted definitions dictionary to a list of tuples
definitions_list = [(term, definition) for term, definition in definitions.items()]

# Create a DataFrame from the list of tuples
df_extracted = pd.DataFrame(definitions_list, columns=['Term', 'Definition'])

# Print the DataFrame containing the extracted definitions
print("\nExtracted Definitions DataFrame:")
print(df_extracted)

# Merge the DataFrames on the 'Term' column
df_combined = pd.merge(df_csv, df_extracted, on='Term', how='outer', suffixes=('_CSV', '_Extracted'))

# Print the combined DataFrame
print("\nCombined DataFrame:")
#print(df_combined)
#df_combined


Extracted Definitions DataFrame:
             Term                                         Definition
0            swap  extinguished and it is replaced by an equal an...
1            that  greater than 100% of the DCO’s initial margin ...
2          member  an FCM, a swap dealer, or a major swap partici...
3        position  no longer needed with respect to these provisi...
4            what                            referred to as “UK EMIR
5            DCOs  equivalent to the UK’s regime for the regulati...
6        Division  extending the no-action position taken in CFTC...
7           which  based in London, for example, has been registe...
8       regulator  the Alberta Securities Commission) and Eurex C...
9         persons  an important part of the Commission’s regulato...
10  determination                        an example of this approach
11   organization  subject to comparable, comprehensive supervisi...
12          party                                                a U


## TRAIN

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import torch
from torch.nn import CrossEntropyLoss

# Move non-empty Definition_Extracted to Simplified Definition
mask = df_combined['Definition_Extracted'].notna() & (df_combined['Definition_Extracted'] != '')
df_combined.loc[mask, 'Simplified Definition'] = df_combined.loc[mask, 'Definition_Extracted']

# Prepare data
def prepare_data(df_combined):
    prompts = [f"Simplify this legal term: {term}" for term in df_combined['Term']]
    completions = df_combined['Simplified Definition'].tolist()
    return [{"prompt": p, "completion": c} for p, c in zip(prompts, completions)]

train_data = prepare_data(df_combined)

# Create a Dataset object
dataset = Dataset.from_list(train_data)

# Load GPT-2 model and tokenizer
base_model = AutoModelForCausalLM.from_pretrained("gpt2", attn_implementation="eager")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Add special tokens if needed
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
base_model.resize_token_embeddings(len(tokenizer))

class GPT2ForSimplification(PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = base_model.transformer
        self.lm_head = base_model.lm_head
        self.loss_fct = CrossEntropyLoss()

        # Tie weights
        self.tie_weights()

    def tie_weights(self):
        self.lm_head.weight = self.transformer.wte.weight

    def forward(self, input_ids, attention_mask=None, labels=None):
        transformer_outputs = self.transformer(input_ids, attention_mask=attention_mask)
        hidden_states = transformer_outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = self.loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        return {"loss": loss, "logits": logits}

# Create the custom model
config = GPT2Config.from_pretrained("gpt2", attn_implementation="eager")
model = GPT2ForSimplification(config)

# Tokenize the data
def tokenize_function(examples):
    prompts = examples["prompt"]
    completions = examples["completion"]
    
    inputs = tokenizer(prompts, padding="max_length", truncation=True, max_length=64)
    targets = tokenizer(completions, padding="max_length", truncation=True, max_length=64)
    
    inputs["labels"] = targets["input_ids"]
    
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

NameError: name 'GPT2Config' is not defined