# EurLex Scraper Metrics Analysis

In [3]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

# Set style for plots.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8' and
# 3.8 removed the old 'seaborn' alias, so prefer the new name when it is
# available and fall back to the legacy one on older installations.
plt.style.use(
    'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
)
sns.set_palette('husl')

ModuleNotFoundError: No module named 'pandas'

In [2]:
def parse_prom_value(line):
    """Parse one line of Prometheus text exposition output.

    Accepted shapes (an optional trailing timestamp is ignored):

        metric_name{label1="value1",label2="value2"} value
        metric_name value

    Args:
        line: A single line of text from a metrics file.

    Returns:
        A ``(metric_name, value)`` tuple with ``value`` as a ``float``, or
        ``(None, None)`` for blank lines, ``#`` comment lines (HELP/TYPE),
        and anything that does not look like a sample. Labels are discarded.
    """
    line = line.strip()
    # Skip blank lines and help/type/comment lines
    if not line or line.startswith('#'):
        return None, None

    # Name may contain ':' (recording rules). The label block is matched
    # greedily up to the last '}' so label values containing '}' are fine.
    # The value is taken as the next whitespace-delimited token and handed
    # to float(), which understands signs, exponents ("1.5e+09"), NaN and
    # +Inf/-Inf -- a digits-only regex would truncate "1.5e+09" to 1.5.
    match = re.match(r'^([\w:]+)(?:\{.*\})?\s+(\S+)', line)
    if not match:
        return None, None

    try:
        value = float(match.group(2))
    except ValueError:
        # Token after the metric name is not numeric: not a sample line
        return None, None
    return match.group(1), value

In [None]:
def _timestamp_from_stem(stem):
    """Return the datetime embedded in a metrics file stem, or None."""
    candidates = []
    parts = stem.split('_')
    if len(parts) >= 3:
        # Original convention: <prefix>_<YYYYMMDD>_<HHMMSS>
        candidates.append(parts[1] + '_' + parts[2])
    # Fallback for prefixes that themselves contain underscores,
    # e.g. "eurlex_scraper_20240101_120000".
    candidates.extend(re.findall(r'\d{8}_\d{6}', stem))

    for candidate in candidates:
        try:
            return datetime.strptime(candidate, '%Y%m%d_%H%M%S')
        except ValueError:
            continue
    return None


def parse_prom_file(file):
    """Parse a Prometheus metrics file and extract key metrics.

    Args:
        file: Path (``pathlib.Path`` or ``str``) of the ``.prom`` file.

    Returns:
        A dict with a ``timestamp`` (taken from a ``YYYYMMDD_HHMMSS`` stamp
        in the file name, or the epoch default when none is found) and one
        numeric entry per tracked metric. A tracked key is matched when it
        occurs as a substring of the lower-cased metric name; when several
        lines match the same key the last one wins.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    path = Path(file)
    metrics = {
        'timestamp': datetime.fromtimestamp(0),  # Default timestamp
        'documents_processed_total': 0,
        'requests_total': 0,
        # NOTE(review): the two keys below are never filled by the mapping
        # further down; they stay at 0 and are kept for compatibility.
        'requests_success': 0,
        'requests_failure': 0,
        'retry_attempts_total': 0,
        'validation_errors_total': 0,
        'storage_size_bytes': 0
    }

    # Try to extract timestamp from filename; keep the default otherwise
    parsed_timestamp = _timestamp_from_stem(path.stem)
    if parsed_timestamp is not None:
        metrics['timestamp'] = parsed_timestamp

    # Checked in this order; the first key contained in the name is used
    tracked_keys = (
        'documents_processed_total',
        'requests_total',
        'retry_attempts_total',
        'validation_errors_total',
        'storage_size_bytes',
    )

    # errors='replace' keeps a stray undecodable byte (e.g. inside a label
    # value) from aborting the whole file.
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            metric, value = parse_prom_value(line.strip())
            if not metric:
                continue

            # Normalize metric names
            normalized_metric = metric.lower()
            for key in tracked_keys:
                if key in normalized_metric:
                    metrics[key] = value
                    break

    return metrics

In [None]:
# Collect every .prom file in the metrics directory, in sorted path order
metrics_dir = Path('../metrics')
metrics_files = sorted(metrics_dir.glob('*.prom'))

# Parse each file; a failing file is reported and skipped
metrics_data = []
for file in metrics_files:
    try:
        metrics = parse_prom_file(file)
    except Exception as e:
        print(f"Error processing file {file}: {e}")
    else:
        metrics_data.append(metrics)

In [None]:
# One row per parsed metrics file
df = pd.DataFrame(metrics_data)

# Keep only sessions that processed more than one document,
# ordered chronologically
processed_more_than_one = df['documents_processed_total'] > 1
df_filtered = df.loc[processed_more_than_one].sort_values('timestamp')

# Report file counts before/after filtering, then descriptive statistics
total_files = len(df)
kept_files = len(df_filtered)
print("Total number of metrics files:", total_files)
print("Number of metrics files after filtering:", kept_files)
print("\nMetrics summary (filtered):")
summary = df_filtered.describe()
print(summary)

In [None]:
# Plotting: one time-series panel per tracked metric on a 2x2 grid
plt.figure(figsize=(15, 10))

# (DataFrame column, panel title / line label, y-axis label)
panels = [
    ('documents_processed_total', 'Documents Processed', 'Count'),
    ('requests_total', 'Total Requests', 'Count'),
    ('retry_attempts_total', 'Retry Attempts', 'Count'),
    ('storage_size_bytes', 'Storage Size', 'Bytes'),
]

for position, (column, title, y_label) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(df_filtered['timestamp'], df_filtered[column], label=title)
    plt.title(title)
    plt.xlabel('Time')
    plt.ylabel(y_label)
    plt.xticks(rotation=45)

plt.tight_layout()

# savefig does not create missing directories, so make sure the report
# folder exists before writing the image.
report_path = Path('../reports/metrics_analysis.png')
report_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(report_path)
plt.show()