Full Mistral Summarization Pipeline

In [29]:
import os
import re
import csv
from typing import List, Optional

import pdfplumber
import tiktoken
import ollama


Keyword Extraction Utility

In [30]:
from collections import Counter

def extract_top_keywords(
    text: str,
    top_k: int = 10
) -> List[str]:
    """
    Return the ``top_k`` most frequent content words of ``text``.

    The text is lower-cased and every character other than ``a-z`` or
    whitespace is replaced by a space before counting. Words of three
    characters or fewer, and words in the built-in stopword list (common
    English function words plus generic legal vocabulary), are ignored.
    Words with equal counts keep the order in which they first appear.
    """

    # Words that carry no topical signal in a legal judgment.
    ignored = frozenset((
        "the", "and", "of", "which", "under", "to", "in", "for", "on",
        "is", "are", "was", "were", "by", "that", "this", "it",
        "be", "or", "an", "at", "from", "has", "have", "had",
        "court", "judge", "judgment", "section", "case", "with", "as",
        "petitioner", "respondent", "india", "law", "article",
    ))

    # Lower-case first so the a-z filter below sees every letter.
    normalized = re.sub(r"[^a-z\s]", " ", text.lower())

    counts = Counter()
    for token in normalized.split():
        if len(token) <= 3 or token in ignored:
            continue
        counts[token] += 1

    ranked = counts.most_common(top_k)
    return [token for token, _count in ranked]

import re
from collections import Counter

def extract_candidate_terms(text: str) -> List[str]:
    """
    Collect rule-based keyword candidates from ``text``.

    Two kinds of candidates are gathered:

    * fixed legal patterns: ``Article <n>``, ``Section <n>``, ``IPC <n>``,
      ``Supreme Court`` and ``Constitution``;
    * runs of two to four capitalised words (e.g. ``The Supreme Court``).

    Returns the distinct candidates in order of first occurrence: pattern
    matches first (in the order the patterns are listed), then the
    capitalised phrases. The order is stable across runs, so callers that
    truncate the list always keep the same items.
    """
    patterns = [
        r'Article\s+\d+[A-Z]?',
        r'Section\s+\d+[A-Z]?',
        r'IPC\s+\d+',
        r'Supreme Court',
        r'Constitution',
    ]

    matches = []
    for p in patterns:
        matches.extend(re.findall(p, text))

    # Capitalized multi-word phrases
    caps = re.findall(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})\b', text)

    # dict.fromkeys de-duplicates while keeping insertion order; a plain set
    # would return the terms in an order that varies between interpreter runs.
    return list(dict.fromkeys(matches + caps))

def extract_social_keywords_mistral(text: str, top_k: int = 10) -> List[str]:
    """
    Ask the local ``mistral`` model (via ollama) for social-listening keywords.

    Only the first 4000 characters of ``text`` are included in the prompt.

    Args:
        text: Judgment text to extract keywords from.
        top_k: Maximum number of keywords requested from the model and
            returned to the caller.

    Returns:
        The keywords from the model's reply, whitespace-stripped, with empty
        entries removed, and capped at ``top_k`` items.
    """
    prompt = f"""
You are extracting keywords from a Supreme Court judgment
for SOCIAL LISTENING on Twitter, YouTube, and news platforms.

Select up to {top_k} keywords or short phrases that:
- People are likely to use when discussing this judgment online
- Include case names, articles, sections, institutions, and issues
- Are short, searchable, and commonly used in public discourse

Avoid legal jargon and long sentences.

TEXT:
{text[:4000]}

Return only a comma-separated list of keywords.
"""

    response = ollama.chat(
        model="mistral",
        messages=[{"role": "user", "content": prompt}],
        options={"temperature": 0.1}
    )

    raw = response["message"]["content"]

    # The model is asked for a comma-separated list but may put items on
    # separate lines or leave a trailing comma; split on both separators and
    # drop the empty pieces that result.
    keywords = []
    for piece in re.split(r"[,\n]", raw):
        keyword = piece.strip()
        if keyword:
            keywords.append(keyword)

    # The prompt only *asks* for at most top_k items; enforce it here.
    return keywords[:max(top_k, 0)]

def get_social_listening_keywords(text: str, max_keywords: int = 20) -> List[str]:
    """
    Merge rule-based and LLM-based keywords into one cleaned list.

    Args:
        text: Judgment text to extract keywords from.
        max_keywords: Maximum number of keywords to return (default 20).

    Returns:
        Keywords in the order the extractors produced them (rule-based
        candidates first, then the model's suggestions), with internal
        whitespace collapsed, empty entries and phrases longer than five
        words removed, and case-insensitive duplicates dropped.
    """
    rule_based = extract_candidate_terms(text)
    llm_based = extract_social_keywords_mistral(text)

    cleaned = []
    seen = set()
    for raw in rule_based + llm_based:
        # Collapse every whitespace run (including newlines) to one space
        # *before* de-duplicating, so "Supreme  Court" and "Supreme Court"
        # count as the same keyword.
        keyword = " ".join(raw.split())
        if not keyword or len(keyword.split()) > 5:
            continue

        key = keyword.casefold()
        if key in seen:
            continue
        seen.add(key)
        cleaned.append(keyword)

    # Truncate an ordered list rather than a set, so the cut keeps the
    # earliest candidates instead of an arbitrary subset.
    return cleaned[:max(max_keywords, 0)]



In [31]:
class SemanticPDFExtractor:
    """
    Extract the text of a PDF and reduce it to long, prose-like sentences.

    The pipeline run by :meth:`extract` is: raw text extraction with
    pdfplumber, removal of short or digit-only lines, paragraph
    normalization, truncation at the first "stop section" marker, and
    removal of short or boilerplate sentences.
    """

    def __init__(
        self,
        min_line_length: int = 40,
        min_sentence_words: int = 12,
        stop_sections: Optional[List[str]] = None
    ):
        """
        Args:
            min_line_length: Lines shorter than this many characters (after
                stripping) are discarded by :meth:`clean_lines`.
            min_sentence_words: Sentences with fewer words than this are
                discarded by :meth:`compress_sentences`.
            stop_sections: Marker strings; the text is cut at the first
                occurrence of any of them (matched case-insensitively).
        """
        self.min_line_length = min_line_length
        self.min_sentence_words = min_sentence_words
        self.stop_sections = stop_sections or []

    # ---------- Core extraction ----------
    def extract_raw_text(self, pdf_path: str) -> str:
        """Return the text of every page of ``pdf_path`` joined by newlines.

        Pages for which pdfplumber returns no text are skipped.
        """
        pages = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    pages.append(text)
        return "\n".join(pages)

    # ---------- Cleaning ----------
    def clean_lines(self, text: str) -> str:
        """Drop short and digit-only lines and collapse whitespace runs.

        A line is kept only if its stripped length is at least
        ``min_line_length`` and it does not consist solely of digits.
        """
        lines = text.split("\n")
        cleaned = []

        for line in lines:
            line = line.strip()

            if len(line) < self.min_line_length:
                continue
            if re.fullmatch(r"\d+", line):
                continue

            line = re.sub(r"\s+", " ", line)
            cleaned.append(line)

        return "\n".join(cleaned)

    # ---------- Paragraph normalization ----------
    def normalize_paragraphs(self, text: str) -> str:
        """Re-join wrapped lines while keeping paragraph breaks.

        Hyphenated line breaks are removed, a single newline becomes a
        space, and any run of two or more newlines becomes exactly one
        blank line.
        """
        text = re.sub(r'-\n', '', text)
        # Only a newline with no newline on either side is a soft wrap.
        # Without the look-behind, the last newline of a blank-line run was
        # also replaced, turning "a\n\nb" into "a\n b" and losing the break.
        text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
        text = re.sub(r'\n{2,}', '\n\n', text)
        return text.strip()

    # ---------- Section pruning ----------
    def remove_stop_sections(self, text: str) -> str:
        """Cut ``text`` at the earliest occurrence of any stop-section marker.

        Markers are matched case-insensitively as plain substrings. Empty
        markers are ignored, because an empty string matches at position 0
        and would discard the whole text.
        """
        for section in self.stop_sections:
            if not section:
                continue
            # Search the original text so the match offset is always valid
            # for slicing it (lower-casing can change a string's length).
            match = re.search(re.escape(section), text, flags=re.IGNORECASE)
            if match:
                text = text[:match.start()]
        return text

    # ---------- Sentence compression ----------
    def compress_sentences(self, text: str) -> str:
        """Keep only sentences that are long enough and not boilerplate.

        The text is split after ``.``, ``!`` or ``?`` followed by
        whitespace. A sentence is dropped if it has fewer than
        ``min_sentence_words`` words or starts with one of a few
        boilerplate phrases. The survivors are joined with single spaces.
        """
        sentences = re.split(r'(?<=[.!?])\s+', text)

        kept = [
            s for s in sentences
            if len(s.split()) >= self.min_sentence_words
            and not s.lower().startswith((
                "this paper",
                "in this study",
                "copyright",
                "all rights reserved"
            ))
        ]

        return " ".join(kept)

    # ---------- Full pipeline ----------
    def extract(self, pdf_path: str) -> str:
        """Run the whole extraction pipeline on ``pdf_path``."""
        text = self.extract_raw_text(pdf_path)
        text = self.clean_lines(text)
        text = self.normalize_paragraphs(text)
        text = self.remove_stop_sections(text)
        text = self.compress_sentences(text)
        return text


3. Chunking Utilities

In [32]:
def chunk_text(
    text: str,
    chunk_tokens: int = 3000,
    overlap_tokens: int = 200,
    encoding_name: str = "gpt2"
) -> List[str]:
    """
    Split ``text`` into overlapping chunks measured in tokens.

    Args:
        text: Text to split.
        chunk_tokens: Maximum number of tokens per chunk.
        overlap_tokens: Number of tokens shared between consecutive chunks.
        encoding_name: Name of the tiktoken encoding used to count tokens.

    Returns:
        The decoded chunks in document order; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_tokens`` is not positive or ``overlap_tokens``
            is not smaller than ``chunk_tokens``. Either case would stop the
            window from advancing and loop forever.
    """
    if chunk_tokens <= 0:
        raise ValueError("chunk_tokens must be a positive integer")
    if overlap_tokens >= chunk_tokens:
        raise ValueError("overlap_tokens must be smaller than chunk_tokens")
    if not text:
        return []

    enc = tiktoken.get_encoding(encoding_name)
    tokens = enc.encode(text)

    # Distance the window advances each step; positive thanks to the checks
    # above.
    step = chunk_tokens - overlap_tokens

    chunks = []
    for start in range(0, len(tokens), step):
        end = start + chunk_tokens
        chunks.append(enc.decode(tokens[start:end]))
        # Once a chunk reaches the end of the text, stop: the next window
        # would contain only tokens already covered by this chunk.
        if end >= len(tokens):
            break

    return chunks


4. Mistral Summarization

In [33]:
def summarize_chunk_mistral(chunk: str) -> str:
    """
    Summarize one chunk of a legal judgment with the local ``mistral`` model.

    The chunk is embedded in a fixed instruction prompt and sent through
    ``ollama.chat`` with temperature 0.2 and a 4096-token context window.
    Returns the model's reply with surrounding whitespace stripped.
    """
    prompt = f"""
You are summarizing a portion of a legal judgment.

Summarize the following text clearly and accurately.
Preserve legal reasoning, key findings, and conclusions.
Avoid speculation.

TEXT:
{chunk}

SUMMARY:
"""
    messages = [{"role": "user", "content": prompt}]
    generation_options = {
        "temperature": 0.2,
        "num_ctx": 4096,
    }

    reply = ollama.chat(
        model="mistral",
        messages=messages,
        options=generation_options,
    )

    summary_text = reply["message"]["content"]
    return summary_text.strip()

5. Recursive Reduction

In [34]:
def reduce_summaries(
    summaries: List[str],
    group_size: int = 6
) -> str:
    """
    Repeatedly merge summaries with the ``mistral`` model until one remains.

    In each round the current summaries are split into consecutive groups of
    ``group_size``; every group is combined into a single summary by the
    model. Rounds repeat until only one summary is left.

    Args:
        summaries: Summaries to combine, in document order.
        group_size: Number of summaries merged per model call.

    Returns:
        The final combined summary; the single input unchanged if only one
        summary is given; an empty string if ``summaries`` is empty.

    Raises:
        ValueError: If more than one summary is given and ``group_size`` is
            smaller than 2, since no round could then shrink the list.
    """
    if not summaries:
        return ""
    if len(summaries) == 1:
        return summaries[0]
    if group_size < 2:
        raise ValueError("group_size must be at least 2 to reduce multiple summaries")

    current = list(summaries)

    while len(current) > 1:
        next_round = []

        for i in range(0, len(current), group_size):
            group = "\n\n".join(current[i:i + group_size])

            prompt = f"""
Combine the following summaries into a single,
clear and coherent legal summary.

SUMMARIES:
{group}

FINAL SUMMARY:
"""
            response = ollama.chat(
                model="mistral",
                messages=[{"role": "user", "content": prompt}],
                options={"temperature": 0.2}
            )
            next_round.append(response["message"]["content"].strip())

        current = next_round
        print("Reduction step complete. Remaining summaries:", len(current))

    return current[0]


6. File-Naming Helper

In [35]:
def make_output_name(pdf_path: str, suffix: str, ext: str = "txt") -> str:
    """Build ``<pdf stem>_<suffix>.<ext>`` from the file name of ``pdf_path``.

    The directory part and the final extension of ``pdf_path`` are dropped,
    so the result is a bare file name.
    """
    filename = os.path.basename(pdf_path)
    stem, _old_ext = os.path.splitext(filename)
    return "{}_{}.{}".format(stem, suffix, ext)

def get_judgement_name(pdf_path: str) -> str:
    """Return the file name of ``pdf_path`` without directory or final extension."""
    filename = os.path.basename(pdf_path)
    stem, _ext = os.path.splitext(filename)
    return stem

7. Main Pipeline (Multiple PDFs)

In [None]:
from pathlib import Path

# Results file shared by the pipeline cell, which appends one row per PDF.
csv_output_path = "judgement_keywords_social_listening.csv"

# Column titles, in the same order as the rows appended later.
csv_header = ["Judgement", "Keywords Mistral", "Summary Mistral"]

# Opening in "w" mode starts a fresh file, so running this cell discards any
# rows left over from a previous run.
with open(csv_output_path, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(csv_header)

In [None]:
if __name__ == "__main__":

    # Stricter thresholds than the class defaults, plus markers at which the
    # rest of the document is discarded.
    extractor = SemanticPDFExtractor(
        min_line_length=50,
        min_sentence_words=15,
        stop_sections=[
            "table of contents",
            "disclaimer",
            "references",
            "appendix",
            "copyright"
        ]
    )

    PDF_DIR = Path(r"C:\Users\Arushi\Documents\Python\IIMA\PDFs")  # folder containing your PDFs

    # Sorted so the PDFs are processed (and rows written) in a stable order.
    pdf_paths = sorted(PDF_DIR.glob("*.pdf"))
    pdf_paths = [str(p) for p in pdf_paths]  # convert to strings if needed

    if not pdf_paths:
        print(f"No PDF files found in {PDF_DIR}")

    for pdf_path in pdf_paths:
        print(f"\n=== Processing: {pdf_path} ===")

        # 1. Extract + compress
        compressed_text = extractor.extract(pdf_path)

        compressed_file = make_output_name(pdf_path, "compressed")
        with open(compressed_file, "w", encoding="utf-8") as f:
            f.write(compressed_text)

        # Nothing survived extraction (e.g. an image-only PDF): there is
        # nothing to chunk or summarize, so move on instead of failing in the
        # reduction step and aborting the remaining PDFs.
        if not compressed_text.strip():
            print("No extractable text found; skipping.")
            continue

        # 2. Chunk
        chunks = chunk_text(compressed_text)
        print("Total chunks:", len(chunks))

        # 3. Chunk summaries
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            print(f"Summarizing chunk {i+1}/{len(chunks)}...")
            summary = summarize_chunk_mistral(chunk)
            chunk_summaries.append(summary)

        # Persist the intermediate summaries next to the other per-PDF
        # artifacts; the file name was previously computed but never used.
        chunk_summary_file = make_output_name(pdf_path, "chunk_summaries")
        with open(chunk_summary_file, "w", encoding="utf-8") as f:
            f.write("\n\n".join(chunk_summaries))

        # 4. Final reduction
        final_summary = reduce_summaries(chunk_summaries)

        # 5. Extract keywords
        keywords = get_social_listening_keywords(compressed_text)

        with open(make_output_name(pdf_path, "keywords"), "w", encoding="utf-8") as f:
            f.write("\n".join(keywords))

        judgement_name = get_judgement_name(pdf_path)

        # 6. Append this judgment's row; the header row is written by the
        # cell that defines csv_output_path.
        with open(csv_output_path, mode="a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow([
                judgement_name,
                ", ".join(keywords),   # keywords in one cell
                final_summary          # full Mistral summary in one cell
            ])







=== Processing: C:\Users\Arushi\Documents\Python\IIMA\PDFs\2GScam_judgement.pdf ===

=== Processing: C:\Users\Arushi\Documents\Python\IIMA\PDFs\Aadhaar_right_to_privacy_judgement.pdf ===

=== Processing: C:\Users\Arushi\Documents\Python\IIMA\PDFs\article_370_judgement.pdf ===

=== Processing: C:\Users\Arushi\Documents\Python\IIMA\PDFs\Assam_CAA_judgement.pdf ===

=== Processing: C:\Users\Arushi\Documents\Python\IIMA\PDFs\Bhima_Koregaon_judgement.pdf ===

=== Processing: C:\Users\Arushi\Documents\Python\IIMA\PDFs\Covid_19_judgement.pdf ===

=== Processing: C:\Users\Arushi\Documents\Python\IIMA\PDFs\Hijab-Ban-Judgment.pdf ===

=== Processing: C:\Users\Arushi\Documents\Python\IIMA\PDFs\Ram_janmabhoomi_judgement.pdf ===

=== Processing: C:\Users\Arushi\Documents\Python\IIMA\PDFs\sabrimala_judgement.pdf ===

=== Processing: C:\Users\Arushi\Documents\Python\IIMA\PDFs\Section377_judgement.pdf ===

=== Processing: C:\Users\Arushi\Documents\Python\IIMA\PDFs\Stray_dogs_judgement.pdf ===

=== Pr