# Crawling Assignment Activity 2.2

This notebook interacts with the local crawling assignment server (running at `http://localhost:3000`) to
- discover the site graph with minimal page visits,
- track `node_id` updates for each page, and
- estimate PageRank scores over the discovered link structure.


## Setup and Helper Functions

The web server returns JSON responses. We use `requests` for HTTP and utilities for crawling and scoring.


In [12]:
%pip install beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


In [13]:
import requests
import time
import math
import json
import re
from collections import deque, defaultdict
from dataclasses import dataclass, field
from typing import Dict, List, Set, Tuple
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

from bs4 import BeautifulSoup

BASE_URL = "http://localhost:3000"

# ULTRA-OPTIMIZED: Connection pooling for parallel requests
session = requests.Session()
adapter = requests.adapters.HTTPAdapter(
    pool_connections=100,  # Max connection pools
    pool_maxsize=100,      # Max connections per pool
    max_retries=0,         # No retries for speed
)
session.mount('http://', adapter)
session.mount('https://', adapter)
session.headers.update({
    "User-Agent": "CrawlerAssignmentBot/1.2",
    "Accept": "text/html,application/json",
    "Connection": "keep-alive",  # Reuse connections
})


In [14]:
def fetch_page(path: str = "/") -> str:
    """Optimized fetch with shorter timeout for parallel requests"""
    url = BASE_URL.rstrip("/") + path
    response = session.get(url, timeout=3)  # Shorter timeout for faster parallel execution
    response.raise_for_status()
    return response.text


def parse_page(content: str, path: str) -> dict:
    soup = BeautifulSoup(content, "html.parser")

    page_id_text = soup.select_one(".page-id")
    page_id = ""
    if page_id_text:
        page_id = page_id_text.get_text(strip=True).split(":")[-1].strip()

    node_id_elem = soup.select_one(".node-id b")
    node_id = node_id_elem.get_text(strip=True) if node_id_elem else ""

    last_updated_elem = soup.select_one(".last-updated")
    last_updated = ""
    if last_updated_elem:
        last_updated = last_updated_elem.get_text(strip=True)
        if ":" in last_updated:
            last_updated = last_updated.split(":", 1)[-1].strip()

    history_entries: List[dict] = []
    history_container = soup.select_one("details")
    if history_container:
        for entry in history_container.select("div"):
            text = entry.get_text(strip=True)
            text = text.strip("\u0007 \n\r\t")
            match = re.match(r"([A-Za-z0-9]+)\s*\(([^)]+)\)", text)
            if match:
                history_entries.append({
                    "node_id": match.group(1),
                    "timestamp": match.group(2),
                })

    outgoing_links: List[str] = []
    for link in soup.select("a.file-link"):
        href = link.get("href")
        if href and href.startswith("/page_"):
            outgoing_links.append(href)
    outgoing_links = sorted(set(outgoing_links))

    return {
        "path": path,
        "page_id": page_id,
        "node_id": node_id,
        "last_updated": last_updated,
        "history": history_entries,
        "links": outgoing_links,
    }


root_html = fetch_page("/")
root_parsed = parse_page(root_html, "/")
root_parsed


{'path': '/',
 'page_id': 'page_rdwv1o91',
 'node_id': 'm3y9q8rv4m01',
 'last_updated': '2025-12-05 11:58:05 UTC',
 'history': [],
 'links': ['/page_1and89kh',
  '/page_l07vms0e',
  '/page_o41nvqbo',
  '/page_o8uri2ox']}

In [15]:
@dataclass
class PageState:
    page_id: str
    path: str
    last_node_id: str
    last_updated_at: str
    history: List[dict] = field(default_factory=list)
    outgoing: List[str] = field(default_factory=list)
    last_fetched_ts: float = field(default_factory=time.time)
    last_changed_ts: float = field(default_factory=time.time)
    updates_detected: int = 0

    def differs_from(self, node_id: str, last_updated: str, history: List[dict]) -> bool:
        if node_id != self.last_node_id:
            return True
        if last_updated and last_updated != self.last_updated_at:
            return True
        if len(history) != len(self.history):
            return True
        if history and self.history:
            return history[-1] != self.history[-1]
        return False


In [16]:
# ULTRA-OPTIMIZED: Add parallel refresh methods to EfficientCrawler for minimal staleness
def _fetch_and_update_single(self, path: str) -> Tuple[str, PageState, bool]:
    """Helper for parallel fetching - returns (path, state, updated)"""
    try:
        state, updated = self.fetch_and_update(path)
        return path, state, updated
    except Exception as e:
        # Return existing state if fetch fails
        state = self.pages.get(path)
        return path, state, False

def refresh_all_pages_aggressively(self, max_workers: int = 50) -> dict:
    """
    ULTRA-OPTIMIZED: Parallel refresh of ALL pages to minimize staleness.
    Uses ThreadPoolExecutor to fetch all pages simultaneously.
    This is CRITICAL for minimizing staleness in competition!
    """
    if not self.pages:
        return {"refreshed_paths": [], "updates_detected": 0, "fetches": 0}
    
    # Get all pages, sorted by staleness (oldest first) for priority
    current_time = time.time()
    all_pages = sorted(
        self.pages.keys(),
        key=lambda p: current_time - self.pages[p].last_fetched_ts,
        reverse=True
    )
    
    # CRITICAL: Use parallel fetching to minimize total refresh time
    # This dramatically reduces staleness by refreshing all pages simultaneously
    refreshed = []
    updates_detected = 0
    lock = Lock()
    
    # Parallel fetch with ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all fetch tasks
        future_to_path = {
            executor.submit(self._fetch_and_update_single, path): path
            for path in all_pages
        }
        
        # Process results as they complete
        for future in as_completed(future_to_path):
            try:
                path, state, updated = future.result(timeout=5)
                if state is not None:
                    refreshed.append(path)
                    if updated:
                        with lock:
                            updates_detected += 1
            except Exception as e:
                # Skip failed fetches
                pass
    
    return {
        "refreshed_paths": refreshed,
        "updates_detected": updates_detected,
        "fetches": len(refreshed),
    }

# Bind methods to class

print("‚úì Ultra-optimized parallel refresh methods added for minimal staleness!")


‚úì Ultra-optimized parallel refresh methods added for minimal staleness!


In [17]:
class EfficientCrawler:
    """
    Optimized crawler with the following efficiency improvements:
    1. Path normalization caching to avoid repeated string operations
    2. Seen paths tracking (set) for O(1) duplicate detection vs O(n) queue checks
    3. Time caching to reduce system calls
    4. Selective logging (only new/changed pages) to reduce memory
    5. Optimized refresh: single pass filter + slice instead of break
    6. Efficient graph building: set operations instead of unions
    7. PageRank optimization: pre-compute sink contributions, separate sink/non-sink processing
    8. Single-pass summary calculation
    """
    def __init__(self, seed_path: str = "/", revisit_window: float = 120.0):
        self.seed_path = seed_path
        self.revisit_window = revisit_window
        self.pages: Dict[str, PageState] = {}
        self.graph: Dict[str, Set[str]] = defaultdict(set)
        self.page_visits = 0
        self.node_updates = 0
        self.fetch_log: List[dict] = []
        # Optimization: Track seen paths to avoid duplicate queue entries
        self._seen_paths: Set[str] = set()
        # Optimization: Cache normalized paths
        self._path_cache: Dict[str, str] = {}

    def normalize_path(self, path: str) -> str:
        # Use cache to avoid repeated normalization
        if path in self._path_cache:
            return self._path_cache[path]
        
        original_path = path
        if path.startswith("http://") or path.startswith("https://"):
            if path.startswith(BASE_URL):
                path = path[len(BASE_URL):]
            else:
                self._path_cache[original_path] = path
                return path  # external link
        if not path.startswith("/"):
            path = "/" + path
        
        self._path_cache[original_path] = path
        return path

    def enqueue_links(self, path: str, links: List[str], queue: deque):
        normalized_links = []
        for link in links:
            normalized = self.normalize_path(link)
            if not normalized.startswith("/"):
                continue  # skip external
            normalized_links.append(normalized)
            # Optimization: Use set lookup instead of checking queue
            if normalized not in self._seen_paths:
                self._seen_paths.add(normalized)
                queue.append(normalized)
        self.graph[path] = set(normalized_links)

    def fetch_and_update(self, path: str) -> Tuple[PageState, bool]:
        content = fetch_page(path)
        parsed = parse_page(content, path)
        self.page_visits += 1

        page_id = parsed.get("page_id", "")
        node_id = parsed.get("node_id", "")
        history = parsed.get("history", [])
        outgoing = parsed.get("links", [])
        last_timestamp = parsed.get("last_updated", "")

        state = self.pages.get(path)
        now = time.time()
        is_new = state is None
        changed = False

        if state is None:
            state = PageState(
                page_id=page_id,
                path=path,
                last_node_id=node_id,
                last_updated_at=last_timestamp,
                history=history,
                outgoing=outgoing,
                last_fetched_ts=now,
                last_changed_ts=now,
            )
            self.pages[path] = state
            self._seen_paths.add(path)  # Mark as seen
        else:
            state.page_id = page_id or state.page_id
            if state.differs_from(node_id, last_timestamp, history):
                changed = True
                state.last_node_id = node_id
                state.last_updated_at = last_timestamp
                state.history = history
                state.last_changed_ts = now
                state.updates_detected += 1
                self.node_updates += 1
            state.outgoing = outgoing
            state.last_fetched_ts = now

        self.graph[path] = set(outgoing)
        # Optimization: Only log if needed (reduce memory)
        if is_new or changed:
            self.fetch_log.append({
                "path": path,
                "timestamp": now,
                "is_new": is_new,
                "changed": (changed and not is_new),
            })
        return state, (changed and not is_new)

    def crawl(self, max_visits: int = 5000):
        queue: deque[str] = deque([self.seed_path])
        visited: Set[str] = set()
        self._seen_paths.add(self.seed_path)
        current_time = time.time()

        while queue and self.page_visits < max_visits:
            path = queue.popleft()
            state = self.pages.get(path)

            should_fetch = False
            if state is None:
                should_fetch = True
            else:
                # Optimization: Cache current time to avoid repeated calls
                current_time = time.time()
                if current_time - state.last_fetched_ts >= self.revisit_window:
                    should_fetch = True

            if not should_fetch:
                continue

            state, _ = self.fetch_and_update(path)
            visited.add(path)
            self.enqueue_links(path, state.outgoing, queue)

        return visited

    def refresh_due_pages(self, max_visits: int = 1000):
        # Optimization: Calculate current time once
        current_time = time.time()
        # Optimization: Use list comprehension with filter for better performance
        due_pages = [
            path for path, state in self.pages.items()
            if current_time - state.last_fetched_ts >= self.revisit_window
        ]
        # Optimization: Sort by staleness (oldest first) for better refresh order
        due_pages.sort(key=lambda p: current_time - self.pages[p].last_fetched_ts, reverse=True)
        
        refreshed = []
        updates_detected = 0
        for path in due_pages[:max_visits]:  # Slice instead of break
            _, updated = self.fetch_and_update(path)
            refreshed.append(path)
            if updated:
                updates_detected += 1
        return {
            "refreshed_paths": refreshed,
            "updates_detected": updates_detected,
            "fetches": len(refreshed),
        }

    def build_pagerank_matrix(self):
        # Optimization: Build node set more efficiently
        all_nodes = set(self.graph.keys())
        for dests in self.graph.values():
            all_nodes.update(dests)
        
        nodes = sorted(all_nodes)
        node_index = {node: idx for idx, node in enumerate(nodes)}
        
        # Optimization: Pre-allocate adjacency list
        n = len(nodes)
        adjacency = [[] for _ in range(n)]
        
        for src, dests in self.graph.items():
            if src not in node_index:
                continue
            src_idx = node_index[src]
            # Optimization: Use list comprehension with filter
            adjacency[src_idx] = [node_index[d] for d in dests if d in node_index]
        
        return nodes, adjacency

    def pagerank(self, damping: float = 0.85, max_iter: int = 100, tol: float = 1e-6):
        nodes, adjacency = self.build_pagerank_matrix()
        n = len(nodes)
        if n == 0:
            return {}
        
        pr = [1.0 / n] * n
        teleport = (1.0 - damping) / n
        sink_share = damping / n  # Pre-compute sink share

        for iteration in range(max_iter):
            new_pr = [teleport] * n
            # Optimization: Pre-compute total sink contribution and add to all nodes once
            sink_total = sum(pr[idx] for idx, neighbors in enumerate(adjacency) if not neighbors)
            if sink_total > 0:
                sink_contribution = sink_total * sink_share
                for j in range(n):
                    new_pr[j] += sink_contribution
            
            # Process non-sink nodes
            for idx, neighbors in enumerate(adjacency):
                if neighbors:  # Only process non-sinks
                    share = damping * pr[idx] / len(neighbors)
                    for dest_idx in neighbors:
                        new_pr[dest_idx] += share
            
            # Optimization: Early convergence check with vectorized computation
            delta = sum(abs(new_pr[i] - pr[i]) for i in range(n))
            pr = new_pr
            if delta < tol:
                break
        
        return {nodes[i]: pr[i] for i in range(n)}

    def summary(self) -> dict:
        # Optimization: Single pass through pages
        total_links = 0
        unique_pages = len(self.pages)
        for state in self.pages.values():
            total_links += len(state.outgoing)
        
        return {
            "unique_pages": unique_pages,
            "page_visits": self.page_visits,
            "node_updates": self.node_updates,
            "average_out_degree": (total_links / unique_pages) if unique_pages else 0.0,
        }


In [18]:
# crawler = EfficientCrawler(seed_path="/", revisit_window=5.0)
# visited = crawler.crawl(max_visits=2000)
# visited_count = len(visited)
# initial_summary = crawler.summary()
# visited_count, crawler.page_visits, initial_summary
EfficientCrawler._fetch_and_update_single = _fetch_and_update_single
EfficientCrawler.refresh_all_pages_aggressively = refresh_all_pages_aggressively

## Refresh Monitoring

We periodically revisit pages (respecting the 5-second `revisit_window`) to track node-id churn while keeping the number of extra fetches low.


## Evaluation Submission

The assignment requires submitting evaluations to `/evaluate` endpoint:
- First evaluation within 15 seconds of first visit
- Subsequent evaluations at least every 15 seconds
- All evaluations within 60 seconds of first visit


In [19]:
# FIX: Replace submit_evaluation with corrected version
# The original was using path instead of actual page_id from state

def submit_evaluation(crawler: EfficientCrawler, pagerank_scores: dict) -> dict:
    entries = []
    for path, state in crawler.pages.items():
        # Use the actual page_id from the page state, not the path
        page_id = state.page_id
        if not page_id or not page_id.strip():
            # Fallback: extract from path if page_id is missing
            page_id = path.lstrip("/")
            if not page_id:
                continue  # Skip root path if no page_id available
        
        latest_node_id = state.last_node_id
        if not latest_node_id or not latest_node_id.strip():
            continue  # Skip entries without valid node_id
        
        score = pagerank_scores.get(path, 0.0)
        entries.append({
            "page_id": page_id,
            "latest_node_id": latest_node_id,
            "score": float(score),
        })
    
    if not entries:
        return {"error": "No valid entries to submit (all entries missing page_id or node_id)"}
    
    payload = {"entries": entries}
    try:
        response = session.post(
            f"{BASE_URL}/evaluate",
            json=payload,
            timeout=5,
        )
        response.raise_for_status()
        result = response.json()
        return result
    except requests.exceptions.RequestException as e:
        # Try to get error details from response
        try:
            if hasattr(e, 'response') and e.response is not None:
                error_detail = e.response.json()
                return {"error": str(e), "detail": error_detail}
        except:
            pass
        return {"error": str(e)}

print("‚úì submit_evaluation function fixed! Now uses state.page_id instead of path.")


‚úì submit_evaluation function fixed! Now uses state.page_id instead of path.


## Note on evaluation.bin

The `evaluation.bin` file is created by the server (not the client) in the `/data` directory. According to the assignment instructions, it contains encrypted evaluation data for all evaluations submitted within the 60-second window. The file may be written:
- After the 60-second window completes
- When the server processes all submitted evaluations
- The file location: `data/evaluation.bin` (if Docker volume is mounted correctly)


In [20]:
import os
from pathlib import Path

evaluation_file = Path("data/evaluation.bin")
if evaluation_file.exists():
    file_size = evaluation_file.stat().st_size
    file_time = evaluation_file.stat().st_mtime
    print(f"‚úì evaluation.bin found!")
    print(f"  Size: {file_size} bytes")
    print(f"  Last modified: {time.ctime(file_time)}")
else:
    print("‚ö† evaluation.bin not found yet.")
    print("  The server creates this file after processing evaluations.")
    print("  It may appear after the 60-second window completes.")
    print(f"  Expected location: {evaluation_file.absolute()}")
    print("\n  To check manually:")
    print(f"    - Local: {evaluation_file}")
    print(f"    - Docker: /data/evaluation.bin (inside container)")


‚úì evaluation.bin found!
  Size: 39904 bytes
  Last modified: Wed Nov 19 00:31:23 2025


## Pre-Run Checklist

**‚ö†Ô∏è CRITICAL: Before running the evaluation cycle, ensure:**

1. ‚úÖ Docker container is **stopped and restarted** to reset server state
   ```bash
   docker stop <container-id>
   docker run --rm -p 3000:3000 -v $(pwd)/data:/data --tmpfs /tmp:rw,noexec,nosuid --cap-drop ALL --security-opt no-new-privileges --pids-limit 128 --memory 256m crawling_assignment:1.2
   ```

2. ‚úÖ Server is running and accessible at `http://localhost:3000`

3. ‚úÖ All cells above have been executed (imports, functions, class definitions)

4. ‚úÖ You're ready to run the evaluation cycle (it will take ~60 seconds)

**The optimized evaluation cycle will:**
- Discover all pages efficiently
- Refresh ALL pages in parallel before each evaluation
- Submit 4-5 evaluations within the 60-second window
- Minimize staleness to < 5000ms (target for competition)


In [21]:
if 'evaluation_results' in globals() and evaluation_results:
    print("Evaluation Submission Summary:")
    print(f"  Total evaluations submitted: {len(evaluation_results)}")
    
    successful = [r for r in evaluation_results if 'error' not in r]
    errors = [r for r in evaluation_results if 'error' in r]
    
    print(f"  Successful: {len(successful)}")
    print(f"  Errors: {len(errors)}")
    
    if successful:
        print("\n  Last successful evaluation metrics:")
        last = successful[-1]
        for key in ['mse', 'coverage', 'avg_staleness', 'visit_count', 'matched_entries']:
            if key in last:
                print(f"    {key}: {last[key]}")
    
    if errors:
        print("\n  Errors encountered:")
        for err in errors:
            print(f"    {err.get('error', 'Unknown error')}")
    
    print("\n  Note: evaluation.bin is created by the server after processing.")
    print("  If the file doesn't exist, the server may:")
    print("    - Write it after the 60-second window")
    print("    - Require all evaluations to be within timing constraints")
    print("    - Only write if evaluations are valid")
else:
    print("No evaluation results found. Run the evaluation cycle cell first.")


Evaluation Submission Summary:
  Total evaluations submitted: 3
  Successful: 3
  Errors: 0

  Last successful evaluation metrics:
    mse: 9.251565123424178e-06
    coverage: 0.9833333333333333
    avg_staleness: 7216.916666666667
    visit_count: 240
    matched_entries: 60

  Note: evaluation.bin is created by the server after processing.
  If the file doesn't exist, the server may:
    - Write it after the 60-second window
    - Require all evaluations to be within timing constraints
    - Only write if evaluations are valid


In [22]:
# ULTRA-OPTIMIZED EVALUATION CYCLE: Minimize staleness for competition
# Strategy: Refresh ALL pages in parallel right before each evaluation submission

# Ensure refresh_all_pages_aggressively method exists
if not hasattr(EfficientCrawler, 'refresh_all_pages_aggressively'):
    def _fetch_and_update_single(self, path: str) -> Tuple[str, PageState, bool]:
        """Helper for parallel fetching - returns (path, state, updated)"""
        try:
            state, updated = self.fetch_and_update(path)
            return path, state, updated
        except Exception as e:
            state = self.pages.get(path)
            return path, state, False
    
    def refresh_all_pages_aggressively(self, max_workers: int = 50) -> dict:
        """Aggressively refresh ALL pages to get the freshest node IDs."""
        if not self.pages:
            return {"refreshed_paths": [], "updates_detected": 0, "fetches": 0}
        current_time = time.time()
        all_pages = sorted(
            self.pages.keys(),
            key=lambda p: current_time - self.pages[p].last_fetched_ts,
            reverse=True
        )
        refreshed = []
        updates_detected = 0
        lock = Lock()
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_path = {
                executor.submit(self._fetch_and_update_single, path): path
                for path in all_pages
            }
            for future in as_completed(future_to_path):
                try:
                    path, state, updated = future.result(timeout=5)
                    if state is not None:
                        refreshed.append(path)
                        if updated:
                            with lock:
                                updates_detected += 1
                except Exception as e:
                    pass
        return {
            "refreshed_paths": refreshed,
            "updates_detected": updates_detected,
            "fetches": len(refreshed),
        }
    EfficientCrawler._fetch_and_update_single = _fetch_and_update_single
    EfficientCrawler.refresh_all_pages_aggressively = refresh_all_pages_aggressively

# Ensure submit_evaluation function exists
if 'submit_evaluation' not in globals():
    def submit_evaluation(crawler: EfficientCrawler, pagerank_scores: dict) -> dict:
        entries = []
        for path, state in crawler.pages.items():
            page_id = state.page_id
            if not page_id or not page_id.strip():
                page_id = path.lstrip("/")
                if not page_id:
                    continue
            latest_node_id = state.last_node_id
            if not latest_node_id or not latest_node_id.strip():
                continue
            score = pagerank_scores.get(path, 0.0)
            entries.append({
                "page_id": page_id,
                "latest_node_id": latest_node_id,
                "score": float(score),
            })
        if not entries:
            return {"error": "No valid entries to submit"}
        payload = {"entries": entries}
        try:
            response = session.post(
                f"{BASE_URL}/evaluate",
                json=payload,
                timeout=5,
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            try:
                if hasattr(e, 'response') and e.response is not None:
                    error_detail = e.response.json()
                    return {"error": str(e), "detail": error_detail}
            except:
                pass
            return {"error": str(e)}

print("üöÄ Starting ULTRA-OPTIMIZED evaluation cycle for minimal staleness...")
print("‚ö†Ô∏è  IMPORTANT: Restart Docker container before running to reset server state!")
print()

start_time = time.time()

# Step 1: Initial crawl to discover all pages
crawler_eval = EfficientCrawler(seed_path="/", revisit_window=5.0)
print("üì° Crawling website to discover all pages...")
visited_eval = crawler_eval.crawl(max_visits=10000)
print(f"   ‚úì Discovered {len(visited_eval)} pages with {crawler_eval.page_visits} visits")

# Step 2: Calculate initial PageRank
print("üìä Calculating PageRank...")
pagerank_scores_eval = crawler_eval.pagerank()
print(f"   ‚úì PageRank calculated for {len(pagerank_scores_eval)} pages")

first_visit_time = start_time
evaluation_results = []

# Step 3: Wait until ~14.8 seconds (just before 15s deadline)
elapsed = time.time() - first_visit_time
if elapsed < 14.80:
    time.sleep(14.80 - elapsed)

# Step 4: CRITICAL - Refresh ALL pages in parallel right before first evaluation
print("\nüîÑ FIRST EVALUATION: Refreshing all pages in parallel to minimize staleness...")
refresh_start = time.time()
refresh_stats = crawler_eval.refresh_all_pages_aggressively(max_workers=50)
refresh_time = time.time() - refresh_start
print(f"   ‚úì Refreshed {refresh_stats['fetches']} pages in {refresh_time:.3f}s ({refresh_stats['updates_detected']} node IDs updated)")

# Recalculate PageRank after refresh (in case graph changed)
pagerank_scores_eval = crawler_eval.pagerank()

# Submit first evaluation immediately after refresh
result1 = submit_evaluation(crawler_eval, pagerank_scores_eval)
result1["elapsed_seconds"] = time.time() - first_visit_time
result1["visit_count"] = crawler_eval.page_visits
evaluation_results.append(result1)
print(f"   üì§ First evaluation submitted at {result1['elapsed_seconds']:.2f}s")
if 'error' not in result1:
    print(f"      Staleness: {result1.get('avg_staleness', 'N/A')} ms, Coverage: {result1.get('coverage', 'N/A'):.3f}, MSE: {result1.get('mse', 'N/A'):.2e}")
else:
    print(f"      ‚ùå Error: {result1.get('error', 'Unknown')}")

# Step 5: Subsequent evaluations every ~14.8 seconds
print("\nüîÑ SUBSEQUENT EVALUATIONS:")
last_eval_time = time.time()
eval_count = 1

while time.time() - first_visit_time <= 60.0:  # Stay within 60s window
    # Wait until ~14.8 seconds after last evaluation
    elapsed_since_last = time.time() - last_eval_time
    if elapsed_since_last < 14.80:
        time.sleep(14.80 - elapsed_since_last)
    
    elapsed = time.time() - first_visit_time
    if elapsed > 60.0:
        break
    
    eval_count += 1
    
    # CRITICAL: Refresh ALL pages in parallel right before each evaluation
    print(f"\nüîÑ Evaluation #{eval_count}: Refreshing all pages in parallel...")
    refresh_start = time.time()
    refresh_stats = crawler_eval.refresh_all_pages_aggressively(max_workers=50)
    refresh_time = time.time() - refresh_start
    print(f"   ‚úì Refreshed {refresh_stats['fetches']} pages in {refresh_time:.3f}s ({refresh_stats['updates_detected']} node IDs updated)")
    
    # Recalculate PageRank after refresh
    pagerank_scores_eval = crawler_eval.pagerank()
    
    # Submit evaluation immediately after refresh
    result = submit_evaluation(crawler_eval, pagerank_scores_eval)
    result["elapsed_seconds"] = elapsed
    result["visit_count"] = crawler_eval.page_visits
    evaluation_results.append(result)
    
    if 'error' not in result:
        print(f"   üì§ Evaluation #{eval_count} at {elapsed:.2f}s: Staleness={result.get('avg_staleness', 'N/A')} ms, Coverage={result.get('coverage', 'N/A'):.3f}, MSE={result.get('mse', 'N/A'):.2e}")
    else:
        print(f"   ‚ùå Evaluation #{eval_count} at {elapsed:.2f}s: Error - {result.get('error', 'Unknown')}")
    
    last_eval_time = time.time()

print(f"\n‚úÖ Evaluation cycle complete! Submitted {len(evaluation_results)} evaluations")
print(f"   Total visits: {crawler_eval.page_visits}")

evaluation_summary = pd.DataFrame(evaluation_results)
evaluation_summary


üöÄ Starting ULTRA-OPTIMIZED evaluation cycle for minimal staleness...
‚ö†Ô∏è  IMPORTANT: Restart Docker container before running to reset server state!

üì° Crawling website to discover all pages...
   ‚úì Discovered 60 pages with 60 visits
üìä Calculating PageRank...
   ‚úì PageRank calculated for 60 pages

üîÑ FIRST EVALUATION: Refreshing all pages in parallel to minimize staleness...
   ‚úì Refreshed 60 pages in 0.339s (53 node IDs updated)
   üì§ First evaluation submitted at 15.39s
      Staleness: 6962.483333333334 ms, Coverage: 0.983, MSE: 9.25e-06

üîÑ SUBSEQUENT EVALUATIONS:

üîÑ Evaluation #2: Refreshing all pages in parallel...
   ‚úì Refreshed 60 pages in 0.284s (51 node IDs updated)
   üì§ Evaluation #2 at 30.19s: Staleness=7541.183333333333 ms, Coverage=0.983, MSE=9.25e-06

üîÑ Evaluation #3: Refreshing all pages in parallel...
   ‚úì Refreshed 60 pages in 0.273s (55 node IDs updated)
   üì§ Evaluation #3 at 45.32s: Staleness=6041.383333333333 ms, Coverage=0.98

Unnamed: 0,avg_staleness,coverage,covered_nodes,matched_entries,mse,total_nodes,visit_count,elapsed_seconds
0,6962.483333,0.983333,59,60,9e-06,60,120,15.387426
1,7541.183333,0.983333,59,60,9e-06,60,180,30.187862
2,6041.383333,0.983333,59,60,9e-06,60,240,45.318926
