# EUR-Lex Scraper Metrics Analysis

This notebook analyzes the metrics collected during the scraping process.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

# Set style for plots
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and is rejected by releases that dropped the old alias.
# Pick whichever name the installed matplotlib actually provides so this cell
# runs on both old and new versions.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')

In [None]:
# Load metrics from JSON files
def load_metrics(files):
    """Read metrics JSON files and return their records in chronological order.

    Args:
        files: Iterable of paths to JSON files. Each file is expected to hold
            one JSON object with a ``timestamp`` key (presumably epoch
            seconds, since it is passed to ``datetime.fromtimestamp`` --
            verify against the scraper that writes these files).

    Returns:
        A list of dicts, one per file, sorted by ascending timestamp, with
        ``timestamp`` converted to a naive local ``datetime``.
    """
    records = []
    for file in files:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(file, 'r', encoding='utf-8') as f:
            records.append(json.load(f))

    # Sort on the raw epoch value *before* converting: glob order is
    # arbitrary, and unsorted rows make the line plots zig-zag back in time.
    records.sort(key=lambda record: record['timestamp'])
    for record in records:
        record['timestamp'] = datetime.fromtimestamp(record['timestamp'])
    return records


metrics_dir = Path('../metrics')
# sorted() keeps the file list deterministic between runs.
metrics_files = sorted(metrics_dir.glob('*.json'))
metrics_data = load_metrics(metrics_files)

# Convert to DataFrame
df = pd.DataFrame(metrics_data)

## Basic Statistics

In [None]:
# Report how many sessions were loaded (one DataFrame row per session),
# then show pandas' summary statistics as the cell output.
session_count = len(df)
print("Total number of scraping sessions:", session_count)
print("\nMetrics summary:")
df.describe()

## Time Series Analysis

In [None]:
# Plot metrics over time: one line per counter column, all sharing the
# session timestamp on the x-axis.
plt.figure(figsize=(12, 6))

counter_labels = [
    ('documents_processed', 'Documents Processed'),
    ('successful_downloads', 'Successful Downloads'),
    ('failed_downloads', 'Failed Downloads'),
]
for column, label in counter_labels:
    plt.plot(df['timestamp'], df[column], label=label)

# Axis text and legend
plt.title('Scraping Performance Over Time')
plt.xlabel('Time')
plt.ylabel('Count')
plt.legend()

# Tilt the tick labels, then let tight_layout make room for them
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Success Rate Analysis

In [None]:
# Success rate: successful downloads as a percentage of documents processed,
# stored as a new 'success_rate' column and plotted per session.
downloaded = df['successful_downloads']
processed = df['documents_processed']
df['success_rate'] = downloaded.div(processed).mul(100)

plt.figure(figsize=(12, 6))
plt.plot(df['timestamp'], df['success_rate'])

# Axis text
plt.title('Scraping Success Rate Over Time')
plt.xlabel('Time')
plt.ylabel('Success Rate (%)')

# Tilt the tick labels, then let tight_layout make room for them
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()