<a href="https://colab.research.google.com/github/aylenemce/470proj2/blob/main/AIFinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Spring AI Final Project CSCI 470 01


---
 Contract Analysis

*   Morgan Pallas pallasmv@g.cofc.edu
*   Aylene McEntire mcentireak@g.cofc.edu
*   Jack Keim keimjm@g.cofc.edu






In [None]:
# For PDF reading
!pip install pdfplumber
import pdfplumber

# For NLP (spaCy and legal-specific transformers)
!pip install spacy transformers
import spacy
nlp = spacy.load("en_core_web_sm")
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForSeq2SeqLM

# If using spaCy with Hugging Face
!pip install spacy-transformers

# If using legal-specific models from Hugging Face
from transformers import AutoModel, AutoTokenizer

# For Hugging Face summarization / NER
from transformers import pipeline

# For display and basic data handling
import pandas as pd
import numpy as np

# For file uploads in Colab
from google.colab import files




In [None]:
!pip install pdfplumber sentence-transformers transformers langchain langchain-community accelerate


Collecting langchain-community
  Downloading langchain_community-0.3.22-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [None]:
# Ask Colab for a file upload; the result is kept in `uploaded`, whose keys
# are read by a later cell to find the PDF to open.
uploaded = files.upload()


Saving 123 coming st lease .pdf to 123 coming st lease .pdf


In [None]:
# `uploaded` is keyed by the uploaded entries; the first key is opened as the PDF.
pdf_filename = list(uploaded)[0]

# Collect one newline-terminated piece per page, skipping pages that yield no text.
page_chunks = []
with pdfplumber.open(pdf_filename) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            continue
        page_chunks.append(text + "\n")
full_text = "".join(page_chunks)

print(full_text)






In [None]:
def extract_text_from_pdf(file):
    """
    Extracts the text of every page of a PDF and returns it as a single string.

    Pages are joined with a newline so the last line of one page is not fused
    with the first line of the next. Pages that yield no text are skipped.

    Args:
        file: Whatever `pdfplumber.open` accepts (it is passed through unchanged).

    Returns:
        The combined page text with leading/trailing whitespace stripped,
        or "" when no page yields any text.
    """
    with pdfplumber.open(file) as pdf:
        # Extraction must happen while the PDF is still open.
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(text for text in page_texts if text).strip()

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sentence_transformers import SentenceTransformer, util
import torch
import re

# --- Named Entity Recognition using a BERT token-classification checkpoint ---
def extract_named_entities(text: str, model_name: str = "dslim/bert-base-NER"):
    """
    Runs named-entity recognition over `text` and returns the grouped entities.

    Args:
        text: The text to analyse. Empty or whitespace-only input returns [].
        model_name: Checkpoint used for BOTH the tokenizer and the
            token-classification weights. A legal-domain NER checkpoint can be
            passed here once one has been fine-tuned.

    Returns:
        A list of {"text": <entity surface form>, "label": <entity group>} dicts,
        in the order the pipeline reports them.
    """
    # Nothing to tag, so skip the (expensive) model load entirely.
    if not text or not text.strip():
        return []

    # The tokenizer and the weights must come from the same checkpoint: token ids
    # produced with one checkpoint's vocabulary do not line up with another
    # checkpoint's embedding table, which yields garbage tags or index errors.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    # Named `ner_pipeline` (not `nlp`) so the module-level spaCy `nlp` is not shadowed.
    # NOTE(review): the whole text is passed in one call; very long contracts may
    # exceed the model's maximum input length - confirm and chunk if needed.
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    return [{"text": e["word"], "label": e["entity_group"]} for e in ner_pipeline(text)]

# --- Clause Detection using Semantic Similarity ---
def detect_clauses_semantically(text: str, threshold: float = 0.5):
    """
    Finds, for each known clause type, the paragraph of `text` that is most
    similar to a short reference description of that clause.

    Args:
        text: Contract text; paragraphs are taken to be separated by blank lines.
        threshold: A clause type is reported only when its best cosine
            similarity is strictly greater than this value.

    Returns:
        A dict mapping clause type -> {"score": float, "text": paragraph}.
        Clause types whose best match does not exceed `threshold` are omitted;
        the dict is empty when `text` contains no non-blank paragraph.
    """
    # Clause reference examples
    clause_templates = {
        "Termination": "This clause explains how and when the contract can be terminated.",
        "Confidentiality": "This clause discusses nondisclosure of information.",
        "Liability": "This clause outlines responsibilities and liabilities.",
        "Indemnification": "This clause defines compensation for harm or loss.",
        "Dispute Resolution": "This clause covers legal methods of resolving disagreements."
    }

    # Split on blank lines (tolerating stray spaces/tabs on them) and drop empty
    # pieces so whitespace-only fragments are never embedded or returned.
    paragraph_texts = [
        piece.strip()
        for piece in re.split(r'\n\s*\n', text or "")
        if piece.strip()
    ]
    if not paragraph_texts:
        return {}

    model = SentenceTransformer("all-MiniLM-L6-v2")
    paragraph_embeddings = model.encode(paragraph_texts, convert_to_tensor=True)
    # Encode all templates in one batch instead of one call per clause type.
    template_embeddings = model.encode(list(clause_templates.values()), convert_to_tensor=True)

    # Row i holds the similarity of template i to every paragraph.
    similarity = util.cos_sim(template_embeddings, paragraph_embeddings)
    best_scores, best_indices = similarity.max(dim=1)

    results = {}
    for clause_type, score, idx in zip(clause_templates, best_scores.tolist(), best_indices.tolist()):
        if score > threshold:  # threshold for match
            results[clause_type] = {
                "score": score,
                "text": paragraph_texts[idx]
            }

    return results





1.  Person Three (Morgan): LLM Explanation and Risk Assessment
*   Build the RAG system to pass contract snippets and get explanations from LLMs

*   Generate layman's-term explanations for clauses, obligations, and risks.

*   Implement risk assessment scoring (use heuristics or model predictions to label clauses as low/med/high risk)

2.   Skills and tools involved

*   OpenAI / LLaMA / Mistral / Hugging Face LLMs


*   LangChain or Haystack (for RAG setup)
*   Risk assessment logic/prompt engineering







In [None]:
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.schema import Document
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# STEP 1: Build documents from detected clauses
def build_documents_from_clauses(clauses_dict):
    """
    Wraps every detected clause in a LangChain Document.

    The clause text becomes the document content; the clause type (the dict key)
    and its score are carried along as metadata so each entry can be stored in
    and retrieved from the vector store.
    """
    documents = []
    for clause_type, details in clauses_dict.items():
        content = details['text']
        metadata = {"clause_type": clause_type, "score": details['score']}
        documents.append(Document(page_content=content, metadata=metadata))
    return documents


# STEP 2: Set up vector store with FAISS
def setup_vector_store(documents):
    """
    Builds a FAISS (Facebook AI Similarity Search) index over `documents`.

    The documents are embedded with the "sentence-transformers/all-MiniLM-L6-v2"
    model and handed to FAISS, which supports similarity-based retrieval over
    the resulting vectors.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    vector_store = FAISS.from_documents(documents, embedding_model)
    return vector_store

# STEP 3: Load LLaMA Model via HuggingFace Pipeline
def load_llama_pipeline(model_id="meta-llama/Llama-2-7b-chat-hf"):
    """
    Loads a causal language model for text generation using HuggingFace Transformers
    and wraps it in a LangChain-compatible interface.

    Args:
        model_id: Hub id or local directory of the checkpoint to load. Defaults to
            the LLaMA 2 7B chat checkpoint; pass a local path if the model has
            already been downloaded.

    Returns:
        A `HuggingFacePipeline` wrapping a "text-generation" pipeline built from
        the loaded tokenizer and model.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Weights are loaded in float16, with device placement delegated to device_map="auto".
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

    hf_pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,      # upper bound on the length of each generated answer
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
        repetition_penalty=1.1,
    )

    return HuggingFacePipeline(pipeline=hf_pipe)

# STEP 4: Create RAG Chain using Retriever + LLaMA LLM
def create_rag_qa_chain(vectorstore):
    """
    Builds the Retrieval-Augmented Generation chain.

    A retriever is obtained from `vectorstore`, the LLM is loaded through
    `load_llama_pipeline()`, and both are handed to LangChain's `RetrievalQA`,
    which looks up relevant documents and passes them to the model as context.
    """
    doc_retriever = vectorstore.as_retriever()
    llama_llm = load_llama_pipeline()
    qa_chain = RetrievalQA.from_chain_type(llm=llama_llm, retriever=doc_retriever)
    return qa_chain

# STEP 5: Generate a layman's explanation
def generate_explanations(qa_chain, clause_type):
    """
    Builds a plain-language question about `clause_type`, runs it through
    `qa_chain`, and returns whatever the chain answers.
    """
    prompt = f"Explain the {clause_type} clause in simple, layman terms with examples if possible."
    explanation = qa_chain.run(prompt)
    return explanation

# STEP 6: Basic risk assessment using keyword heuristics
def assess_clause_risk(clause_text, clause_type):
    """
    Analyzes a clause and assigns a risk level using keyword-based heuristics.
    You can later replace this with a trained classifier or prompt-based scoring.

    Each distinct high-risk keyword found adds 2 points and each distinct
    medium-risk keyword adds 1 point. Keywords are matched case-insensitively
    as whole words (plus simple "s/es/ed/ing" endings), so "reliable" does not
    count as "liable" and "shallow" does not count as "shall". The words of a
    multi-word phrase may be separated by any whitespace, including the line
    breaks that PDF extraction introduces.

    Args:
        clause_text: The clause to score. None or "" scores as "Low".
        clause_type: Currently unused; kept so existing callers keep working.

    Returns:
        "High" for a score of 3 or more, "Medium" for exactly 2, otherwise "Low".
    """
    high_risk_keywords = ["indemnify", "liable", "damages", "terminate without cause", "breach"]
    medium_risk_keywords = ["subject to", "must", "shall", "limited"]

    text_lower = (clause_text or "").lower()

    def keyword_present(keyword):
        # Word boundaries stop matches inside longer words; \s+ between the
        # parts of a phrase tolerates wrapped lines and repeated spaces.
        phrase = r"\s+".join(re.escape(part) for part in keyword.split())
        return re.search(rf"\b{phrase}(?:s|es|ed|ing)?\b", text_lower) is not None

    risk_score = 2 * sum(keyword_present(word) for word in high_risk_keywords)
    risk_score += sum(keyword_present(word) for word in medium_risk_keywords)

    if risk_score >= 3:
        return "High"
    elif risk_score == 2:
        return "Medium"
    else:
        return "Low"


Clause-Based Question Answering System: This algorithm builds a question-answering system that explains contract clauses in plain English. First, each detected clause is converted into a searchable format with metadata, then stored in a FAISS vector database for retrieval. I used the LLaMA 2 language model to generate natural-language explanations based on the most relevant clauses. There is also a basic risk assessment that flags clauses as high, medium, or low risk using simple keyword heuristics. While it's helpful for quick insights, the risk scoring is rule-based and should be reviewed by a human for accuracy.