# EU Legal Document Summary Quality Analysis

In [2]:
%pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Import necessary libraries
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from rouge import Rouge
from sklearn.metrics import precision_score, recall_score, f1_score
import random
from tqdm.notebook import tqdm

# Set plot style
plt.style.use('ggplot')
sns.set(font_scale=1.2)

# Download NLTK resources if needed
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexanderbenady/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexanderbenady/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Connect to the Database

In [4]:
# Connect to the SQLite database
DB_PATH = "../scraper/data/eurlex.db"
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

In [5]:
# List all tables in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
print("Tables in the database:")
for table in tables:
    print(f"- {table[0]}")

Tables in the database:
- responsible_bodies
- forms
- documents
- authors
- document_authors
- eurovoc_descriptors
- document_eurovoc_descriptors
- subject_matters
- document_subject_matters
- directory_codes
- document_directory_codes
- sqlite_sequence
- document_sections
- document_keywords


In [6]:
# Examine schema of documents table
cursor.execute("PRAGMA table_info(documents);")
columns = cursor.fetchall()
print("\nColumns in the documents table:")
for col in columns:
    print(f"- {col[1]} ({col[2]})")


Columns in the documents table:
- document_id (INTEGER)
- celex_number (TEXT)
- title (TEXT)
- identifier (TEXT)
- eli_uri (TEXT)
- html_url (TEXT)
- pdf_url (TEXT)
- responsible_body_id (INTEGER)
- form_id (INTEGER)
- date_of_document (DATE)
- date_of_effect (DATE)
- date_of_end_validity (DATE)
- content (TEXT)
- content_html (TEXT)
- summary (TEXT)
- summary_word_count (INTEGER)
- total_words (INTEGER)
- compression_ratio (REAL)
- tier (INTEGER)


## Sample Documents from Each Tier

We'll retrieve documents from each tier, randomly sampling 10% of them for our analysis.

In [None]:
# Define word count thresholds for tiers according to the project methodology
def get_tier_from_word_count(word_count):
    if word_count <= 600:
        return 1
    elif word_count <= 2500:
        return 2
    elif word_count <= 20000:
        return 3
    else:
        return 4

# Query to analyze documents by word count
cursor.execute("""
    SELECT 
        content,
        total_words,
        tier
    FROM documents 
    WHERE summary IS NOT NULL
""")

# Process results
word_count_tiers = {1: 0, 2: 0, 3: 0, 4: 0}

In [None]:
print("Document counts by word count tier:")
for tier, count in word_count_tiers.items():
    if tier == 1:
        print(f"Tier 1 (0-600 words): {count} documents")
    elif tier == 2:
        print(f"Tier 2 (601-2,500 words): {count} documents")
    elif tier == 3:
        print(f"Tier 3 (2,501-20,000 words): {count} documents")
    else:
        print(f"Tier 4 (20,000+ words): {count} documents")

In [None]:
# First, check how many tiers are in the database and how many documents are in each tier
cursor.execute("SELECT tier, COUNT(*) FROM documents GROUP BY tier ORDER BY tier;")
tier_counts = cursor.fetchall()

print("Document counts by tier:")
for tier, count in tier_counts:
    print(f"Tier {tier}: {count} documents")

In [18]:
# Function to sample documents from a specific tier
def sample_documents_from_tier(tier, sample_pct=0.1):
    # Get total count for this tier
    cursor.execute("SELECT COUNT(*) FROM documents WHERE tier = ? AND summary IS NOT NULL", (tier,))
    total_count = cursor.fetchone()[0]
    
    # Calculate sample size (10%)
    sample_size = max(int(total_count * sample_pct), 1)  # At least 1 document
    
    # Get all document IDs for this tier
    cursor.execute("SELECT celex_number FROM documents WHERE tier = ? AND summary IS NOT NULL", (tier,))
    all_ids = [row[0] for row in cursor.fetchall()]
    
    # Randomly sample document IDs
    sampled_ids = random.sample(all_ids, min(sample_size, len(all_ids)))
    
    # Fetch the complete documents for the sampled IDs
    placeholders = ', '.join(['?'] * len(sampled_ids))
    query = f"SELECT celex_number, title, content, summary, tier FROM documents WHERE celex_number IN ({placeholders})"
    cursor.execute(query, sampled_ids)
    
    # Convert to DataFrame
    columns = ['celex_number', 'title', 'content', 'summary','tier']
    df = pd.DataFrame(cursor.fetchall(), columns=columns)
    
    print(f"Sampled {len(df)} documents from Tier {tier} (out of {total_count})")
    return df

In [None]:
# Sample documents from each tier
all_samples = []
for tier, _ in tier_counts:
    tier_samples = sample_documents_from_tier(tier)
    all_samples.append(tier_samples)

# Combine all samples into one DataFrame
sampled_docs = pd.concat(all_samples, ignore_index=True)
sampled_docs.head()

In [22]:
# Import the DynamicKeywordExtractor class from the project
import sys
from pathlib import Path

# Add the summarization source directory to path
summarization_src = Path('/Users/alexanderbenady/DataThesis/eu-legal-recommender/summarization/src')
if str(summarization_src) not in sys.path:
    sys.path.append(str(summarization_src))

# Import the keyword extractor
from keyword_extractor import DynamicKeywordExtractor

# Create an instance of the keyword extractor
keyword_extractor = DynamicKeywordExtractor()

# Function to extract keywords for all documents in the DataFrame, without tqdm
def add_extracted_keywords(df):
    """Extract keywords using the project's extractor and add them to the DataFrame."""
    # Create columns for keywords and scores
    df['extracted_keywords'] = None
    df['keyword_scores'] = None
    
    # Process each document
    print(f"Extracting keywords for {len(df)} documents...")
    for i, (idx, row) in enumerate(df.iterrows()):
        # Print progress every 10 documents
        if i % 10 == 0:
            print(f"Processing document {i+1}/{len(df)}...")
            
        content = row['content']
        if isinstance(content, str) and content.strip():
            try:
                # Extract keywords
                keywords = keyword_extractor.extract_keywords(content)
                
                # Split keywords and scores
                keyword_terms = [k for k, _ in keywords]
                keyword_scores = [s for _, s in keywords]
                
                # Store in DataFrame
                df.at[idx, 'extracted_keywords'] = keyword_terms
                df.at[idx, 'keyword_scores'] = keyword_scores
            except Exception as e:
                print(f"Error extracting keywords for document {row['celex_number']}: {e}")
                df.at[idx, 'extracted_keywords'] = []
                df.at[idx, 'keyword_scores'] = []
        else:
            df.at[idx, 'extracted_keywords'] = []
            df.at[idx, 'keyword_scores'] = []
    
    print("Keyword extraction complete!")
    return df

# Run this function on your combined DataFrame
sampled_docs = add_extracted_keywords(sampled_docs)

# Display a few examples to verify
print("\nSample of extracted keywords:")
for i in range(min(3, len(sampled_docs))):
    doc = sampled_docs.iloc[i]
    print(f"\nDocument {i+1} (Tier {doc['tier']}):")
    print(f"Celex: {doc['celex_number']}")
    print(f"Title: {doc['title'][:50]}...")
    if isinstance(doc['extracted_keywords'], list):
        print(f"Keywords ({len(doc['extracted_keywords'])}): {', '.join(doc['extracted_keywords'][:5])}...")
    else:
        print("No keywords extracted")

INFO:keyword_extractor:Initializing KeyBERT with model: distilbert-base-nli-mean-tokens
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: distilbert-base-nli-mean-tokens


Extracting keywords for 416 documents...
Processing document 1/416...
Processing document 11/416...
Processing document 21/416...
Processing document 31/416...
Processing document 41/416...
Processing document 51/416...
Processing document 61/416...
Processing document 71/416...
Processing document 81/416...
Processing document 91/416...
Processing document 101/416...
Processing document 111/416...
Processing document 121/416...
Processing document 131/416...
Processing document 141/416...
Processing document 151/416...
Processing document 161/416...
Processing document 171/416...
Processing document 181/416...
Processing document 191/416...
Processing document 201/416...
Processing document 211/416...
Processing document 221/416...
Processing document 231/416...
Processing document 241/416...
Processing document 251/416...
Processing document 261/416...
Processing document 271/416...
Processing document 281/416...
Processing document 291/416...
Processing document 301/416...
Processin

In [26]:
sampled_docs

Unnamed: 0,celex_number,title,content,summary,tier,extracted_keywords,keyword_scores
0,22023D2270,Decision of the EEA Joint Committee No 127/202...,Official Journalof the European UnionENSeries ...,Decision of the EEA Joint Committee No 127/202...,1,"[eu 2021, 11 2023, 2023of 28, 2021 amending, a...","[0.5504, 0.5604, 0.5605, 0.5646, 0.5712, 0.574..."
1,22023D2272,Decision of the EEA Joint Committee No 129/202...,Official Journalof the European UnionENSeries ...,Annex III (Transport) to the EEA Agreement — T...,1,"[2023 28, 11 2023, 2023decision, agreement 202...","[0.5764, 0.5841, 0.6078, 0.612, 0.6175, 0.619,..."
2,22023D2277,Decision of the EEA Joint Committee No 134/202...,Official Journalof the European UnionENSeries ...,Decision of the EEA Joint Committee No 134/202...,1,"[134 2023of, 2021 oj, 2023of 28, 2023decision,...","[0.5699, 0.58, 0.5876, 0.5941, 0.5945, 0.5955,..."
3,22023D2278,Decision of the EEA Joint Committee No 135/202...,Official Journalof the European UnionENSeries ...,Decision of the EEA Joint Committee No 135/202...,1,"[2023 2294, 2021 oj, 2023of 28, 2021 amending,...","[0.5495, 0.556, 0.5582, 0.5735, 0.5747, 0.5756..."
4,22023D2285,Decision of the EEA Joint Committee No 142/202...,Official Journalof the European UnionENSeries ...,Decision of the EEA Joint Committee No 142/202...,1,"[2023 28, 2023of 28, 105 2022, dec 2023, 2023d...","[0.5761, 0.5766, 0.5785, 0.5902, 0.5923, 0.608..."
...,...,...,...,...,...,...,...
411,32025L0050,Council Directive (EU) 2025/50 of 10 December ...,Official Journalof the European UnionENL serie...,**Revised Summary:** \n\n**Council Directive ...,3,"[esma 2026, 2025 5010, april 2024, 5010 2025, ...","[0.5225, 0.5297, 0.5349, 0.5351, 0.5357, 0.538..."
412,32025R0080,Commission Implementing Regulation (EU) 2025/8...,Official Journalof the European UnionENL serie...,Commission Implementing Regulation (EU) 2025/8...,3,"[2024 amending, 2024gbmp123, january 2025amend...","[0.5281, 0.5295, 0.536, 0.5361, 0.5365, 0.5369..."
413,52024BP2307,Resolution (EU) 2024/2307 of the European Parl...,Official Journalof the European UnionENL serie...,The European Parliament adopted Resolution (EU...,3,"[2024resolution, strategy 2023, published 2022...","[0.5964, 0.5993, 0.5997, 0.6049, 0.6106, 0.611..."
414,52024BP2340,Resolution (EU) 2024/2340 of the European Parl...,Official Journalof the European UnionENL serie...,Resolution (EU) 2024/2340 of the European Parl...,3,"[launched 2024, sysper 2024, 2024 resolution, ...","[0.5827, 0.5864, 0.592, 0.5925, 0.5936, 0.5954..."


## Preprocessing Functions

In [27]:
# Preprocessing functions
def clean_text(text):
    """Clean text by removing special characters and extra whitespace"""
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.strip()  # Remove leading/trailing whitespace
    return text

def tokenize_text(text):
    """Tokenize text into words, removing stopwords"""
    if not isinstance(text, str):
        return []
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    return tokens

def parse_keywords(keywords_str):
    """Parse keywords from the comma-separated string"""
    if not isinstance(keywords_str, str):
        return []
    keywords = [k.strip() for k in keywords_str.split(',')]
    # Split multi-word keywords into individual words
    all_keyword_words = []
    for kw in keywords:
        all_keyword_words.extend(tokenize_text(kw))
    return list(set(all_keyword_words))  # Remove duplicates

## Calculate Metrics

In [28]:
# 1. Compression Ratio
def calculate_compression_ratio(original_text, summary):
    """Calculate compression ratio (summary length / original text length)"""
    if not isinstance(original_text, str) or not isinstance(summary, str):
        return np.nan
    if len(original_text) == 0:
        return np.nan
    return len(summary) / len(original_text)

# 2. ROUGE Scores
rouge = Rouge()
def calculate_rouge_scores(original_text, summary):
    """Calculate ROUGE-1, ROUGE-2, and ROUGE-L scores"""
    if not isinstance(original_text, str) or not isinstance(summary, str):
        return {'rouge-1': {'f': 0}, 'rouge-2': {'f': 0}, 'rouge-l': {'f': 0}}
    
    # Ensure texts are not empty and have at least one sentence
    if len(summary.strip()) == 0 or len(original_text.strip()) == 0:
        return {'rouge-1': {'f': 0}, 'rouge-2': {'f': 0}, 'rouge-l': {'f': 0}}
    
    # Handle very long texts by using the first 10k characters
    # (Rouge can be memory-intensive with very long texts)
    original_text = original_text[:10000]
    
    try:
        scores = rouge.get_scores(summary, original_text)[0]
        return scores
    except Exception as e:
        print(f"Error calculating ROUGE: {e}")
        return {'rouge-1': {'f': 0}, 'rouge-2': {'f': 0}, 'rouge-l': {'f': 0}}

# 3. Keyword Retention
def calculate_keyword_retention(summary, keywords):
    """Calculate keyword retention (% of keywords present in summary)"""
    if not isinstance(summary, str) or not keywords:
        return 0
    
    # Tokenize summary
    summary_tokens = set(tokenize_text(summary))
    
    # Count how many keywords appear in the summary
    keyword_tokens = set(keywords)
    if not keyword_tokens:
        return 0
    
    found_keywords = keyword_tokens.intersection(summary_tokens)
    retention_rate = len(found_keywords) / len(keyword_tokens) if len(keyword_tokens) > 0 else 0
    
    return retention_rate

In [33]:
# Apply preprocessing to the sampled documents
sampled_docs['clean_text'] = sampled_docs['content'].apply(clean_text)
sampled_docs['clean_summary'] = sampled_docs['summary'].apply(clean_text)
sampled_docs['parsed_keywords'] = sampled_docs['extracted_keywords'].apply(parse_keywords)

# Calculate metrics for each document
results = []

for _, doc in sampled_docs.iterrows():
    # Calculate compression ratio
    compression_ratio = calculate_compression_ratio(doc['clean_text'], doc['clean_summary'])
    
    # Calculate ROUGE scores
    rouge_scores = calculate_rouge_scores(doc['clean_text'], doc['clean_summary'])
    
    # Calculate keyword retention
    keyword_retention = calculate_keyword_retention(doc['clean_summary'], doc['parsed_keywords'])
    
    # Store all metrics
    result = {
        'celex_number': doc['celex_number'],
        'tier': doc['tier'],
        'compression_ratio': compression_ratio,
        'rouge_1': rouge_scores['rouge-1']['f'],
        'rouge_2': rouge_scores['rouge-2']['f'],
        'rouge_l': rouge_scores['rouge-l']['f'],
        'keyword_retention': keyword_retention,
        'original_length': len(doc['clean_text']) if isinstance(doc['clean_text'], str) else 0,
        'summary_length': len(doc['clean_summary']) if isinstance(doc['clean_summary'], str) else 0,
        'keyword_count': len(doc['parsed_keywords']) if isinstance(doc['parsed_keywords'], list) else 0
    }
    
    results.append(result)

# Convert results to DataFrame
metrics_df = pd.DataFrame(results)

## Analyze Results

Now, let's analyze the metrics by tier and visualize the results.

In [38]:
# Group by tier and calculate mean metrics
tier_metrics = metrics_df.groupby('tier').agg({
    'compression_ratio': 'mean',
    'rouge_1': 'mean',
    'rouge_2': 'mean',
    'rouge_l': 'mean',
    'keyword_retention': 'mean',
    'original_length': 'mean',
    'summary_length': 'mean',
    'keyword_count': 'mean',
    'celex_number': 'count'  # Count of documents in each tier
}).rename(columns={'celex_number': 'count'})

tier_metrics

Unnamed: 0_level_0,compression_ratio,rouge_1,rouge_2,rouge_l,keyword_retention,original_length,summary_length,keyword_count,count
tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.061725,0.261373,0.137614,0.242573,0.0,26532.12,1128.302857,0.0,175
2,0.285468,0.257003,0.126342,0.23762,0.0,108827.6,1474.507246,0.0,138
3,0.00685,0.235471,0.089414,0.215125,0.0,826136.5,1802.333333,0.0,102
4,0.000704,0.119534,0.035901,0.104956,0.0,1463194.0,1030.0,0.0,1


## Metrics by Tier

In [None]:
# Analyze document length by tier
plt.figure(figsize=(12, 6))
sns.boxplot(x='tier', y='original_length', data=metrics_df)
plt.title('Original Document Length by Tier')
plt.xlabel('Document Tier')
plt.ylabel('Document Length (characters)')
plt.yscale('log')  # Log scale for better visualization
plt.grid(True, alpha=0.3)
plt.show()

plt.figure(figsize=(12, 6))
sns.boxplot(x='tier', y='summary_length', data=metrics_df)
plt.title('Summary Length by Tier')
plt.xlabel('Document Tier')
plt.ylabel('Summary Length (characters)')
plt.grid(True, alpha=0.3)
plt.show()

In [37]:
conn.close()