In [3]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [15]:
"""
Legal Brief Key Items Extractor - Dynamic Extraction
Automatically identifies key arguments using NLP techniques
"""

# Uncomment if libraries are not installed
# !pip install PyPDF2 pandas openpyxl nltk

import re
import json
import os
from typing import List, Dict, Optional, Tuple
import PyPDF2
import pandas as pd
from collections import Counter, defaultdict
import nltk

# Download required NLTK data
try:
    nltk.download('punkt_tab', quiet=True)
    nltk.download('stopwords', quiet=True)
except:
    pass

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords as nltk_stopwords

print("Libraries imported successfully")
print("- PyPDF2: PDF text extraction")
print("- Pandas: Data organization")
print("- NLTK: Natural language processing")
print("Ready to dynamically extract key items\n")

Libraries imported successfully
- PyPDF2: PDF text extraction
- Pandas: Data organization
- NLTK: Natural language processing
Ready to dynamically extract key items



In [16]:
# Cell 2: Load PDF Document
"""
Extract text from PDF file
"""

PDF_FILE_PATH = "/content/Amicus Brief on Behalf of Mississippi, Alabama, Alaska, Arkansas etc....pdf"

def load_pdf_with_pypdf2(file_path: str) -> Tuple[str, pd.DataFrame]:
    """Extract text and create page-level DataFrame"""
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            total_pages = len(pdf_reader.pages)

            print(f"Loading PDF: {os.path.basename(file_path)}")
            print(f"Total pages: {total_pages}\n")

            full_text = ""
            page_data = []

            for page_num, page in enumerate(pdf_reader.pages, 1):
                page_text = page.extract_text()
                full_text += f"\n--- PAGE {page_num} ---\n{page_text}"

                page_data.append({
                    'page_number': page_num,
                    'text_content': page_text,
                    'word_count': len(page_text.split()),
                    'char_count': len(page_text)
                })

            pages_df = pd.DataFrame(page_data)
            print(f"Successfully extracted {len(full_text):,} characters from {total_pages} pages\n")

            return full_text, pages_df

    except FileNotFoundError:
        print(f"ERROR: File not found: {file_path}")
        return "", pd.DataFrame()
    except Exception as e:
        print(f"ERROR: {str(e)}")
        return "", pd.DataFrame()

document_text, pages_dataframe = load_pdf_with_pypdf2(PDF_FILE_PATH)

if document_text:
    print(f"Document loaded: Alliance for Hippocratic Medicine v. FDA")
    print(f"Pages DataFrame shape: {pages_dataframe.shape}\n")

Loading PDF: Amicus Brief on Behalf of Mississippi, Alabama, Alaska, Arkansas etc....pdf
Total pages: 26

Successfully extracted 44,315 characters from 26 pages

Document loaded: Alliance for Hippocratic Medicine v. FDA
Pages DataFrame shape: (26, 4)



In [6]:
!pip install nltk



In [17]:
# Cell 3: Define Dynamic Extraction Class
"""
Automatically extracts key legal arguments using pattern matching and scoring
"""

class DynamicLegalExtractor:
    """Dynamically extracts key items from legal briefs"""

    def __init__(self, text: str, pages_df: pd.DataFrame):
        self.text = text
        self.pages_df = pages_df
        self.sentences = []
        self.stop_words = set(nltk_stopwords.words('english'))

    def extract_sentences(self) -> List[Dict]:
        """Extract all sentences with metadata"""
        sentences_data = []

        for idx, row in self.pages_df.iterrows():
            page_num = row['page_number']
            page_text = row['text_content']

            # Skip header/footer pages
            if len(page_text) < 100:
                continue

            # Tokenize into sentences
            sentences = sent_tokenize(page_text)

            for sent in sentences:
                if len(sent) > 50:  # Filter very short sentences
                    sentences_data.append({
                        'text': sent.strip(),
                        'page': page_num,
                        'length': len(sent),
                        'word_count': len(sent.split())
                    })

        self.sentences = sentences_data
        return sentences_data

    def score_sentence(self, sentence: str) -> Dict[str, float]:
        """Score a sentence based on legal importance indicators"""

        scores = {
            'legal_citation': 0,
            'modal_strength': 0,
            'argument_indicator': 0,
            'subject_relevance': 0,
            'procedural': 0
        }

        sentence_lower = sentence.lower()

        # Legal citations (statutes, cases, regulations)
        citation_patterns = [
            r'\d+\s+U\.S\.C\.\s+§\s*\d+',  # Federal statutes
            r'\d+\s+C\.F\.R\.\s+§\s*\d+',  # Federal regulations
            r'\d+\s+S\.\s*Ct\.\s+\d+',     # Supreme Court
            r'\d+\s+F\.\s*\d+th\s+\d+',    # Federal courts
            r'v\.\s+[A-Z][\w\s]+,\s+\d+',  # Case names
        ]
        for pattern in citation_patterns:
            if re.search(pattern, sentence):
                scores['legal_citation'] += 2

        # Strong modal verbs indicating legal arguments
        strong_modals = ['violate', 'defy', 'contradict', 'require', 'mandate',
                        'prohibit', 'unlawful', 'invalid', 'unconstitutional']
        scores['modal_strength'] = sum(2 for word in strong_modals if word in sentence_lower)

        # Argument indicators
        argument_phrases = [
            'the fda', 'states have', 'public interest', 'court held',
            'congress', 'administration', 'plaintiffs', 'amici'
        ]
        scores['argument_indicator'] = sum(1.5 for phrase in argument_phrases if phrase in sentence_lower)

        # Key subject matter
        key_subjects = ['mifepristone', 'abortion', 'approval', 'rems',
                       'subpart h', 'dobbs', 'preemption', 'enforcement']
        scores['subject_relevance'] = sum(1 for subj in key_subjects if subj in sentence_lower)

        # Procedural importance
        procedural_terms = ['injunction', 'relief', 'preliminary', 'motion', 'brief']
        scores['procedural'] = sum(0.5 for term in procedural_terms if term in sentence_lower)

        return scores

    def extract_key_items(self, top_n: int = 10) -> pd.DataFrame:
        """Dynamically extract top N key items"""

        print("Extracting sentences from document...")
        sentences_data = self.extract_sentences()
        print(f"Found {len(sentences_data)} sentences\n")

        print("Scoring sentences for legal importance...")
        scored_items = []

        for sent_data in sentences_data:
            scores = self.score_sentence(sent_data['text'])
            total_score = sum(scores.values())

            # Only include sentences with meaningful scores
            if total_score >= 3:
                scored_items.append({
                    'text': sent_data['text'],
                    'page': sent_data['page'],
                    'total_score': total_score,
                    'legal_citation': scores['legal_citation'],
                    'modal_strength': scores['modal_strength'],
                    'argument_indicator': scores['argument_indicator'],
                    'subject_relevance': scores['subject_relevance'],
                    'word_count': sent_data['word_count']
                })

        # Sort by total score
        scored_items.sort(key=lambda x: x['total_score'], reverse=True)

        # Take top N
        top_items = scored_items[:top_n]

        # Categorize automatically
        for i, item in enumerate(top_items, 1):
            item['rank'] = i
            item['category'] = self._categorize_sentence(item['text'])
            item['importance'] = self._determine_importance(item['total_score'])

        print(f"Extracted top {len(top_items)} key items\n")

        return pd.DataFrame(top_items)

    def _categorize_sentence(self, text: str) -> str:
        """Automatically categorize based on content"""
        text_lower = text.lower()

        if 'u.s.c.' in text_lower or 'c.f.r.' in text_lower:
            return 'Legal Violation'
        elif 'dobbs' in text_lower or 'state' in text_lower:
            return 'Constitutional Authority'
        elif 'fda' in text_lower and ('approve' in text_lower or 'action' in text_lower):
            return 'FDA Actions'
        elif 'public interest' in text_lower or 'harm' in text_lower:
            return 'Public Interest'
        elif 'enforce' in text_lower or 'resource' in text_lower:
            return 'State Enforcement'
        else:
            return 'General Legal Argument'

    def _determine_importance(self, score: float) -> str:
        """Determine importance based on score"""
        if score >= 8:
            return 'Critical'
        elif score >= 5:
            return 'High'
        else:
            return 'Medium'

    def export_to_excel(self, df: pd.DataFrame, filename: str = 'dynamic_analysis.xlsx'):
        """Export results to Excel"""
        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name='Key Items', index=False)

            # Summary sheet
            summary = pd.DataFrame({
                'Metric': ['Total Items', 'Critical', 'High', 'Medium',
                          'Avg Score', 'Max Score', 'Total Pages'],
                'Value': [
                    len(df),
                    len(df[df['importance'] == 'Critical']),
                    len(df[df['importance'] == 'High']),
                    len(df[df['importance'] == 'Medium']),
                    round(df['total_score'].mean(), 2),
                    df['total_score'].max(),
                    len(self.pages_df)
                ]
            })
            summary.to_excel(writer, sheet_name='Summary', index=False)

            # Category breakdown
            category_counts = df['category'].value_counts().reset_index()
            category_counts.columns = ['Category', 'Count']
            category_counts.to_excel(writer, sheet_name='Categories', index=False)

        print(f"Exported to {filename}")

print("DynamicLegalExtractor class defined\n")

DynamicLegalExtractor class defined



In [18]:
# Cell 4: Initialize Dynamic Extractor
"""
Create extractor instance
"""

if document_text:
    extractor = DynamicLegalExtractor(document_text, pages_dataframe)
    print("Dynamic extractor initialized")
    print("Ready to extract key items automatically\n")
else:
    extractor = None
    print("Cannot initialize - no document loaded")

Dynamic extractor initialized
Ready to extract key items automatically



In [19]:

"""
Run dynamic extraction algorithm
"""

if extractor:
    print("="*85)
    print("DYNAMIC EXTRACTION - TOP 10 KEY ITEMS")
    print("="*85 + "\n")

    key_items_df = extractor.extract_key_items(top_n=10)

    # Display results
    display_cols = ['rank', 'category', 'importance', 'page', 'total_score', 'text']
    print(key_items_df[display_cols].to_string(index=False, max_colwidth=60))
else:
    key_items_df = pd.DataFrame()

DYNAMIC EXTRACTION - TOP 10 KEY ITEMS

Extracting sentences from document...
Found 253 sentences

Scoring sentences for legal importance...
Extracted top 10 key items

 rank                 category importance  page  total_score                                                         text
    1              FDA Actions       High     2          6.0 i \n  \nTABLE OF CONTENTS  \nPage  \nTABLE OF AUTHORITIES...
    2 Constitutional Authority       High    13          6.0 They defy \nfederal law , flout  the public -interest det...
    3              FDA Actions       High    14          6.0 Amici emphasize that the FDA’s actions defy  both the age...
    4   General Legal Argument       High    14          6.0 “ There is generally \nno public  interest  in the perpet...
    5 Constitutional Authority       High     9          5.5 Last, the FDA ’s actions threaten to \nundermine  the ami...
    6   General Legal Argument       High    11          5.5 Because of the serious \nsafety concern

In [20]:
# Cell 6: Detailed Score Analysis
"""
Show scoring breakdown for each item
"""

if not key_items_df.empty:
    print("\n" + "="*85)
    print("SCORE BREAKDOWN ANALYSIS")
    print("="*85 + "\n")

    score_cols = ['rank', 'legal_citation', 'modal_strength',
                  'argument_indicator', 'subject_relevance', 'total_score']

    print("Scoring Components:")
    print(key_items_df[score_cols].to_string(index=False))

    print("\n" + "-"*85 + "\n")
    print("Score Statistics:")
    print(key_items_df['total_score'].describe())


SCORE BREAKDOWN ANALYSIS

Scoring Components:
 rank  legal_citation  modal_strength  argument_indicator  subject_relevance  total_score
    1               0               0                 4.5                  1          6.0
    2               0               2                 3.0                  1          6.0
    3               0               2                 3.0                  1          6.0
    4               4               2                 0.0                  0          6.0
    5               0               0                 4.5                  1          5.5
    6               0               2                 1.5                  2          5.5
    7               0               2                 1.5                  2          5.5
    8               0               2                 3.0                  0          5.0
    9               4               0                 0.0                  1          5.0
   10               0               2                

In [21]:
# Cell 7: Category Distribution
"""
Analyze automatically assigned categories
"""

if not key_items_df.empty:
    print("\n" + "="*85)
    print("CATEGORY DISTRIBUTION")
    print("="*85 + "\n")

    category_counts = key_items_df['category'].value_counts()
    print("Items by Category:")
    print(category_counts)

    print("\n" + "-"*85 + "\n")

    importance_counts = key_items_df['importance'].value_counts()
    print("Items by Importance:")
    print(importance_counts)


CATEGORY DISTRIBUTION

Items by Category:
category
Constitutional Authority    5
FDA Actions                 3
General Legal Argument      2
Name: count, dtype: int64

-------------------------------------------------------------------------------------

Items by Importance:
importance
High    10
Name: count, dtype: int64


In [22]:
# Cell 8: Page Distribution Analysis
"""
Show which pages contain key arguments
"""

if not key_items_df.empty:
    print("\n" + "="*85)
    print("PAGE DISTRIBUTION")
    print("="*85 + "\n")

    page_dist = key_items_df['page'].value_counts().sort_index()
    print("Key Items per Page:")
    print(page_dist)

    print(f"\nPage range: {key_items_df['page'].min()} - {key_items_df['page'].max()}")


PAGE DISTRIBUTION

Key Items per Page:
page
2     2
3     1
9     2
11    1
13    1
14    2
23    1
Name: count, dtype: int64

Page range: 2 - 23


In [23]:
# Cell 9: Export Results
"""
Save dynamically extracted data
"""

if extractor and not key_items_df.empty:
    print("\n" + "="*85)
    print("EXPORTING RESULTS")
    print("="*85 + "\n")

    extractor.export_to_excel(key_items_df, 'dynamic_legal_analysis.xlsx')

    # Also export to CSV
    key_items_df.to_csv('key_items_dynamic.csv', index=False)
    print("Also exported to: key_items_dynamic.csv")


EXPORTING RESULTS

Exported to dynamic_legal_analysis.xlsx
Also exported to: key_items_dynamic.csv


In [24]:
"""
Final summary with extraction methodology
"""

if not key_items_df.empty:
    print("\n" + "="*85)
    print("EXTRACTION SUMMARY")
    print("="*85 + "\n")

    print("METHODOLOGY:")
    print("  Dynamic NLP-based extraction using:")
    print("  • Legal citation pattern matching")
    print("  • Modal verb strength analysis")
    print("  • Argument indicator detection")
    print("  • Subject relevance scoring")
    print("  • Automatic categorization\n")

    print("RESULTS:")
    print(f"  Total sentences analyzed: {len(extractor.sentences)}")
    print(f"  Key items extracted: {len(key_items_df)}")
    print(f"  Critical items: {len(key_items_df[key_items_df['importance'] == 'Critical'])}")
    print(f"  High priority items: {len(key_items_df[key_items_df['importance'] == 'High'])}")
    print(f"  Average relevance score: {key_items_df['total_score'].mean():.2f}")

    print("\n" + "="*85)
    print("="*85)
    print("\nAll items extracted dynamically from PDF content")
    print("Scoring algorithm can be adjusted based on legal domain needs")
else:
    print("No analysis available")


EXTRACTION SUMMARY

METHODOLOGY:
  Dynamic NLP-based extraction using:
  • Legal citation pattern matching
  • Modal verb strength analysis
  • Argument indicator detection
  • Subject relevance scoring
  • Automatic categorization

RESULTS:
  Total sentences analyzed: 253
  Key items extracted: 10
  Critical items: 0
  High priority items: 10
  Average relevance score: 5.55


All items extracted dynamically from PDF content
Scoring algorithm can be adjusted based on legal domain needs
