In [None]:
import streamlit as st
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re
import pandas as pd

# Register the local folder that holds the pre-downloaded punkt data with NLTK.
# The membership check keeps the search path free of duplicates when this code
# is executed again in the same Python process (Streamlit reruns the script on
# every interaction, and notebook cells can be re-run).
nltk_data_dir = r"D:/Data/OneDrive/Ccantu/OneDrive - CFTC/Documents/Python Scripts/punkt"
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Load Legal-BERT model and tokenizer
def load_local_legal_bert():
    """Load the Legal-BERT tokenizer and model from the local model folder.

    Streamlit re-executes the script on every widget interaction, which would
    otherwise reload the model from disk each time. The loading step is
    therefore wrapped in ``st.cache_resource`` when the installed Streamlit
    provides it; on older releases the model is simply loaded on every call,
    as before. The progress messages are written on every call either way.

    Returns:
        tuple: ``(tokenizer, model)`` produced by ``AutoTokenizer`` and
        ``AutoModel`` ``from_pretrained``.
    """
    model_path = r"D:/Data/OneDrive/Ccantu/OneDrive - CFTC/Documents/Python Scripts/BERT-Legal"

    def _load_from_disk(path):
        # Kept free of st.* calls so it can be cached safely.
        return AutoTokenizer.from_pretrained(path), AutoModel.from_pretrained(path)

    # Looked up at call time so the function keeps working on Streamlit
    # versions that predate cache_resource.
    cache_resource = getattr(st, "cache_resource", None)
    if cache_resource is not None:
        _load_from_disk = cache_resource(_load_from_disk)

    st.write(f"Loading the Legal-BERT model from '{model_path}'...")
    tokenizer, model = _load_from_disk(model_path)
    st.write("Legal-BERT model loaded successfully!")
    return tokenizer, model

# Extract definitions from the text
def extract_definitions(text):
    """Collect "<term> is/means <definition>" statements found in ``text``.

    The term is the single word directly before "is" or "means" (matched
    case-insensitively); the definition is everything up to the next ``;``
    or ``.``, with surrounding whitespace stripped. When a term is defined
    more than once, the last definition wins.

    Args:
        text: The text to scan.

    Returns:
        dict: Mapping of term to definition text.
    """
    matcher = re.compile(
        r"(?P<term>\w+)\s+(?:is|means)\s+(?P<definition>.*?)[;.]",
        re.IGNORECASE,
    )
    return {
        found.group("term"): found.group("definition").strip()
        for found in matcher.finditer(text)
    }

# Score sentences based on definitions
def score_sentence(sentence, definitions, tfidf_matrix, sentence_similarities):
    """Score a sentence by how many extracted definitions it touches.

    A definition counts as a hit when its term, or its non-empty definition
    text, occurs verbatim in ``sentence``. Every hit contributes the same
    weight, the value stored at ``sentence_similarities[0, 1]``, so the score
    is effectively "number of hits" scaled by one constant.

    Args:
        sentence: The sentence to score.
        definitions: Mapping of term to definition text.
        tfidf_matrix: Unused; kept so existing callers keep working.
        sentence_similarities: Pairwise sentence similarity matrix.

    Returns:
        ``0`` when nothing matches, otherwise ``hits * weight``.
    """
    hits = sum(
        1
        for term, definition in definitions.items()
        # An empty definition is a substring of every sentence, so it must
        # not count as a match on its own.
        if term in sentence or (definition and definition in sentence)
    )
    if not hits:
        return 0

    try:
        weight = sentence_similarities[0, 1]
    except IndexError:
        # A single-sentence document yields a 1x1 matrix with no [0, 1] entry.
        weight = 1.0
    return hits * weight

# Summarize the legal text
def extractive_summarize(text, num_sentences=3):
    """Build an extractive summary from the highest-scoring sentences.

    The text is split into sentences, each sentence is scored with
    ``score_sentence`` against the definitions extracted from the text, and
    the top ``num_sentences`` sentences are joined with single spaces.
    Sentences with equal scores keep their original document order.

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        str: The summary; an empty string when ``text`` has no sentences.
    """
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < 2:
        # Nothing to rank: empty input would make the vectorizer raise, and a
        # single sentence gives a 1x1 similarity matrix.
        return ' '.join(sentences[:num_sentences])

    definitions = extract_definitions(text)

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # Raised when no sentence contains a token the vectorizer keeps
        # (empty vocabulary); fall back to the leading sentences.
        return ' '.join(sentences[:num_sentences])

    sentence_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
    sentence_scores = [
        score_sentence(sentence, definitions, tfidf_matrix, sentence_similarities)
        for sentence in sentences
    ]

    # Stable descending sort: ties stay in document order instead of being
    # reversed, which matters because many sentences share the same score.
    ranking = np.argsort(-np.asarray(sentence_scores, dtype=float), kind="stable")
    return ' '.join(sentences[i] for i in ranking[:num_sentences])

# Process text with Legal-BERT
def process_with_legal_bert(text, tokenizer, model):
    """Encode ``text`` with the given model and return a pooled vector.

    The text is tokenized (padded, truncated to at most 512 tokens), run
    through the model without gradient tracking, and ``last_hidden_state`` is
    averaged over dimension 1 (presumably the token axis).

    Args:
        text: Input text passed to the tokenizer.
        tokenizer: Callable returning the model's keyword inputs.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        numpy.ndarray: The averaged hidden state with size-1 axes squeezed out.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Search for a term in the document
def search_document(text, search_term):
    """Return the sentences of ``text`` that contain ``search_term``.

    The term is matched literally and case-insensitively. It is escaped
    before being compiled, so characters such as ``(`` or ``.`` typed by the
    user neither raise ``re.error`` nor act as wildcards.

    Args:
        text: The document to search.
        search_term: The literal text to look for.

    Returns:
        list[str]: Matching sentences in document order; empty when the
        search term is empty or only whitespace.
    """
    if not search_term or not search_term.strip():
        # An empty pattern would match every sentence.
        return []

    pattern = re.compile(re.escape(search_term), re.IGNORECASE)
    return [
        sentence
        for sentence in nltk.sent_tokenize(text)
        if pattern.search(sentence)
    ]

# Streamlit App
st.title("Legal Text Analysis with BERT")

# Load Legal-BERT
st.write("Loading the Legal-BERT model...")
legal_bert_tokenizer, legal_bert_model = load_local_legal_bert()

# Input legal text
text = st.text_area("Enter the legal text:", height=200)

if st.button("Analyze Text"):
    if not text.strip():
        # Blank input has no sentences to rank, and the TF-IDF step of the
        # summarizer raises on it, so stop before running the pipeline.
        st.warning("Please enter some legal text before running the analysis.")
    else:
        st.write("**Original Text:**")
        st.write(text)

        # Extract definitions
        st.write("\n**Definitions found:**")
        definitions = extract_definitions(text)
        if not definitions:
            st.write("No definitions found.")
        for term, definition in definitions.items():
            st.write(f"{term} means {definition}")

        # Process with Legal-BERT
        bert_output = process_with_legal_bert(text, legal_bert_tokenizer, legal_bert_model)
        st.write("\n**Legal-BERT processing complete. Output shape:**", bert_output.shape)

        # Generate summary
        summary = extractive_summarize(text)
        st.write("\n**Generated Summary:**")
        st.write(summary)

# Search functionality
search_term = st.text_input("Search for a term in the document:")
if st.button("Search"):
    if not text.strip() or not search_term.strip():
        # Searching needs both a document and a term.
        st.warning("Please enter both the legal text and a search term.")
    else:
        matches = search_document(text, search_term)
        st.write(f"**Sentences containing '{search_term}':**")
        if not matches:
            st.write("No matching sentences found.")
        for match in matches:
            st.write(match)

All together: the complete app, extended with a GPT-2 step that rewrites the summary for laypeople.

In [None]:
import streamlit as st
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re
import pandas as pd

# Register the local folder that holds the pre-downloaded punkt data with NLTK.
# The membership check keeps the search path free of duplicates when this code
# is executed again in the same Python process (Streamlit reruns the script on
# every interaction, and notebook cells can be re-run).
nltk_data_dir = r"D:/Data/OneDrive/Ccantu/OneDrive - CFTC/Documents/Python Scripts/punkt"
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Load Legal-BERT model and tokenizer
def load_local_legal_bert():
    """Load the Legal-BERT tokenizer and model from the local model folder.

    Streamlit re-executes the script on every widget interaction, which would
    otherwise reload the model from disk each time. The loading step is
    therefore wrapped in ``st.cache_resource`` when the installed Streamlit
    provides it; on older releases the model is simply loaded on every call,
    as before. The progress messages are written on every call either way.

    Returns:
        tuple: ``(tokenizer, model)`` produced by ``AutoTokenizer`` and
        ``AutoModel`` ``from_pretrained``.
    """
    model_path = r"D:/Data/OneDrive/Ccantu/OneDrive - CFTC/Documents/Python Scripts/BERT-Legal"

    def _load_from_disk(path):
        # Kept free of st.* calls so it can be cached safely.
        return AutoTokenizer.from_pretrained(path), AutoModel.from_pretrained(path)

    # Looked up at call time so the function keeps working on Streamlit
    # versions that predate cache_resource.
    cache_resource = getattr(st, "cache_resource", None)
    if cache_resource is not None:
        _load_from_disk = cache_resource(_load_from_disk)

    st.write(f"Loading the Legal-BERT model from '{model_path}'...")
    tokenizer, model = _load_from_disk(model_path)
    st.write("Legal-BERT model loaded successfully!")
    return tokenizer, model

# Load Fine-Tuned GPT-2 model and tokenizer
def load_gpt2_model():
    """Load the fine-tuned GPT-2 tokenizer and causal LM from the local folder.

    Streamlit re-executes the script on every widget interaction, which would
    otherwise reload the model from disk each time. The loading step is
    therefore wrapped in ``st.cache_resource`` when the installed Streamlit
    provides it; on older releases the model is simply loaded on every call,
    as before. The progress messages are written on every call either way.

    Returns:
        tuple: ``(gpt2_tokenizer, gpt2_model)`` produced by ``AutoTokenizer``
        and ``AutoModelForCausalLM`` ``from_pretrained``.
    """
    gpt2_model_path = r"D:/Data/OneDrive/Ccantu/OneDrive - CFTC/Documents/Python Scripts/GPT2"

    def _load_gpt2_from_disk(path):
        # Kept free of st.* calls so it can be cached safely.
        return (
            AutoTokenizer.from_pretrained(path),
            AutoModelForCausalLM.from_pretrained(path),
        )

    # Looked up at call time so the function keeps working on Streamlit
    # versions that predate cache_resource.
    cache_resource = getattr(st, "cache_resource", None)
    if cache_resource is not None:
        _load_gpt2_from_disk = cache_resource(_load_gpt2_from_disk)

    st.write(f"Loading the GPT-2 model from '{gpt2_model_path}'...")
    gpt2_tokenizer, gpt2_model = _load_gpt2_from_disk(gpt2_model_path)
    st.write("GPT-2 model loaded successfully!")
    return gpt2_tokenizer, gpt2_model

# Simplify summary for laypeople using GPT-2
def simplify_summary_for_layperson(summary, gpt2_model, gpt2_tokenizer):
    """Generate a plain-language continuation for ``summary`` with GPT-2.

    The summary is wrapped in a fixed instruction prompt, up to 200 further
    tokens are sampled, and only the newly generated tokens are decoded.
    Slicing the prompt off by token count is reliable even when decoding does
    not reproduce the prompt text character for character, which is where a
    string ``replace`` on the decoded output would leave the prompt in place.

    Args:
        summary: The text to simplify.
        gpt2_model: Model exposing ``generate``.
        gpt2_tokenizer: Tokenizer matching ``gpt2_model``.

    Returns:
        str: The generated text with the prompt removed and whitespace stripped.
    """
    new_token_budget = 200
    input_text = f"Simplify this legal text for a layperson: {summary}"

    tokenizer_kwargs = {"return_tensors": "pt"}
    # When the model reports its context size, truncate the prompt so that
    # prompt + generated tokens cannot run past it.
    context_size = getattr(
        getattr(gpt2_model, "config", None), "max_position_embeddings", None
    )
    if isinstance(context_size, int) and context_size > new_token_budget:
        tokenizer_kwargs.update(
            truncation=True, max_length=context_size - new_token_budget
        )

    inputs = gpt2_tokenizer(input_text, **tokenizer_kwargs)
    prompt_length = len(inputs['input_ids'][0])

    outputs = gpt2_model.generate(
        **inputs,
        max_length=prompt_length + new_token_budget,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
        # GPT-2 defines no pad token; name the end-of-sequence token
        # explicitly rather than leaving it to generate()'s fallback.
        pad_token_id=gpt2_tokenizer.eos_token_id,
    )

    generated_tokens = outputs[0][prompt_length:]
    return gpt2_tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Extract definitions from the text
def extract_definitions(text):
    """Collect "<term> is/means <definition>" statements found in ``text``.

    The term is the single word directly before "is" or "means" (matched
    case-insensitively); the definition is everything up to the next ``;``
    or ``.``, with surrounding whitespace stripped. When a term is defined
    more than once, the last definition wins.

    Args:
        text: The text to scan.

    Returns:
        dict: Mapping of term to definition text.
    """
    matcher = re.compile(
        r"(?P<term>\w+)\s+(?:is|means)\s+(?P<definition>.*?)[;.]",
        re.IGNORECASE,
    )
    return {
        found.group("term"): found.group("definition").strip()
        for found in matcher.finditer(text)
    }

# Score sentences based on definitions
def score_sentence(sentence, definitions, tfidf_matrix, sentence_similarities):
    """Score a sentence by how many extracted definitions it touches.

    A definition counts as a hit when its term, or its non-empty definition
    text, occurs verbatim in ``sentence``. Every hit contributes the same
    weight, the value stored at ``sentence_similarities[0, 1]``, so the score
    is effectively "number of hits" scaled by one constant.

    Args:
        sentence: The sentence to score.
        definitions: Mapping of term to definition text.
        tfidf_matrix: Unused; kept so existing callers keep working.
        sentence_similarities: Pairwise sentence similarity matrix.

    Returns:
        ``0`` when nothing matches, otherwise ``hits * weight``.
    """
    hits = sum(
        1
        for term, definition in definitions.items()
        # An empty definition is a substring of every sentence, so it must
        # not count as a match on its own.
        if term in sentence or (definition and definition in sentence)
    )
    if not hits:
        return 0

    try:
        weight = sentence_similarities[0, 1]
    except IndexError:
        # A single-sentence document yields a 1x1 matrix with no [0, 1] entry.
        weight = 1.0
    return hits * weight

# Summarize the legal text
def extractive_summarize(text, num_sentences=3):
    """Build an extractive summary from the highest-scoring sentences.

    The text is split into sentences, each sentence is scored with
    ``score_sentence`` against the definitions extracted from the text, and
    the top ``num_sentences`` sentences are joined with single spaces.
    Sentences with equal scores keep their original document order.

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        str: The summary; an empty string when ``text`` has no sentences.
    """
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < 2:
        # Nothing to rank: empty input would make the vectorizer raise, and a
        # single sentence gives a 1x1 similarity matrix.
        return ' '.join(sentences[:num_sentences])

    definitions = extract_definitions(text)

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # Raised when no sentence contains a token the vectorizer keeps
        # (empty vocabulary); fall back to the leading sentences.
        return ' '.join(sentences[:num_sentences])

    sentence_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
    sentence_scores = [
        score_sentence(sentence, definitions, tfidf_matrix, sentence_similarities)
        for sentence in sentences
    ]

    # Stable descending sort: ties stay in document order instead of being
    # reversed, which matters because many sentences share the same score.
    ranking = np.argsort(-np.asarray(sentence_scores, dtype=float), kind="stable")
    return ' '.join(sentences[i] for i in ranking[:num_sentences])

# Process text with Legal-BERT
def process_with_legal_bert(text, tokenizer, model):
    """Encode ``text`` with the given model and return a pooled vector.

    The text is tokenized (padded, truncated to at most 512 tokens), run
    through the model without gradient tracking, and ``last_hidden_state`` is
    averaged over dimension 1 (presumably the token axis).

    Args:
        text: Input text passed to the tokenizer.
        tokenizer: Callable returning the model's keyword inputs.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        numpy.ndarray: The averaged hidden state with size-1 axes squeezed out.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Search for a term in the document
def search_document(text, search_term):
    """Return the sentences of ``text`` that contain ``search_term``.

    The term is matched literally and case-insensitively. It is escaped
    before being compiled, so characters such as ``(`` or ``.`` typed by the
    user neither raise ``re.error`` nor act as wildcards.

    Args:
        text: The document to search.
        search_term: The literal text to look for.

    Returns:
        list[str]: Matching sentences in document order; empty when the
        search term is empty or only whitespace.
    """
    if not search_term or not search_term.strip():
        # An empty pattern would match every sentence.
        return []

    pattern = re.compile(re.escape(search_term), re.IGNORECASE)
    return [
        sentence
        for sentence in nltk.sent_tokenize(text)
        if pattern.search(sentence)
    ]

# Streamlit App
st.title("Legal Text Analysis with BERT")

# Load models
st.write("Loading models...")
legal_bert_tokenizer, legal_bert_model = load_local_legal_bert()
gpt2_tokenizer, gpt2_model = load_gpt2_model()

# Input legal text
text = st.text_area("Enter the legal text:", height=200)

if st.button("Analyze Text"):
    if not text.strip():
        # Blank input has no sentences to rank, and the TF-IDF step of the
        # summarizer raises on it, so stop before running the pipeline.
        st.warning("Please enter some legal text before running the analysis.")
    else:
        st.write("**Original Text:**")
        st.write(text)

        # Extract definitions
        st.write("\n**Definitions found:**")
        definitions = extract_definitions(text)
        if not definitions:
            st.write("No definitions found.")
        for term, definition in definitions.items():
            st.write(f"{term} means {definition}")

        # Process with Legal-BERT
        bert_output = process_with_legal_bert(text, legal_bert_tokenizer, legal_bert_model)
        st.write("\n**Legal-BERT processing complete. Output shape:**", bert_output.shape)

        # Generate summary
        summary = extractive_summarize(text)
        st.write("\n**Generated Summary:**")
        st.write(summary)

        # Simplify summary for laypeople using GPT-2
        simplified_summary = simplify_summary_for_layperson(summary, gpt2_model, gpt2_tokenizer)
        st.write("\n**Simplified Summary for Laypeople:**")
        st.write(simplified_summary)

# Search functionality
search_term = st.text_input("Search for a term in the document:")
if st.button("Search"):
    if not text.strip() or not search_term.strip():
        # Searching needs both a document and a term.
        st.warning("Please enter both the legal text and a search term.")
    else:
        matches = search_document(text, search_term)
        st.write(f"**Sentences containing '{search_term}':**")
        if not matches:
            st.write("No matching sentences found.")
        for match in matches:
            st.write(match)
