# Crawling Assignment Activity 2.2

This notebook interacts with the local crawling assignment server (running at `http://localhost:3000`) to
- discover the site graph with minimal page visits,
- track `node_id` updates for each page, and
- estimate PageRank scores over the discovered link structure.


## Setup and Helper Functions

The web server returns JSON responses. We use `requests` for HTTP and utilities for crawling and scoring.


In [20]:
%pip install beautifulsoup4 --quiet


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
import requests
import time
import math
import json
import re
from collections import deque, defaultdict
from dataclasses import dataclass, field
from typing import Dict, List, Set, Tuple

from bs4 import BeautifulSoup

BASE_URL = "http://localhost:3000"

session = requests.Session()
session.headers.update({
    "User-Agent": "CrawlerAssignmentBot/1.0",
    "Accept": "text/html,application/json",
})


In [22]:
def fetch_page(path: str = "/") -> str:
    url = BASE_URL.rstrip("/") + path
    response = session.get(url, timeout=10)
    response.raise_for_status()
    return response.text


def parse_page(content: str, path: str) -> dict:
    soup = BeautifulSoup(content, "html.parser")

    page_id_text = soup.select_one(".page-id")
    page_id = ""
    if page_id_text:
        page_id = page_id_text.get_text(strip=True).split(":")[-1].strip()

    node_id_elem = soup.select_one(".node-id b")
    node_id = node_id_elem.get_text(strip=True) if node_id_elem else ""

    last_updated_elem = soup.select_one(".last-updated")
    last_updated = ""
    if last_updated_elem:
        last_updated = last_updated_elem.get_text(strip=True)
        if ":" in last_updated:
            last_updated = last_updated.split(":", 1)[-1].strip()

    history_entries: List[dict] = []
    history_container = soup.select_one("details")
    if history_container:
        for entry in history_container.select("div"):
            text = entry.get_text(strip=True)
            text = text.strip("\u0007 \n\r\t")
            match = re.match(r"([A-Za-z0-9]+)\s*\(([^)]+)\)", text)
            if match:
                history_entries.append({
                    "node_id": match.group(1),
                    "timestamp": match.group(2),
                })

    outgoing_links: List[str] = []
    for link in soup.select("a.file-link"):
        href = link.get("href")
        if href and href.startswith("/page_"):
            outgoing_links.append(href)
    outgoing_links = sorted(set(outgoing_links))

    return {
        "path": path,
        "page_id": page_id,
        "node_id": node_id,
        "last_updated": last_updated,
        "history": history_entries,
        "links": outgoing_links,
    }


root_html = fetch_page("/")
root_parsed = parse_page(root_html, "/")
root_parsed


{'path': '/',
 'page_id': 'page_plate00v',
 'node_id': 'nq0xg8jyhbba',
 'last_updated': '2025-11-08 09:20:07 UTC',
 'history': [],
 'links': ['/page_00e9abwz',
  '/page_48tad71n',
  '/page_htotw746',
  '/page_xjllli10',
  '/page_yh6x0zj5']}

In [23]:
@dataclass
class PageState:
    page_id: str
    path: str
    last_node_id: str
    last_updated_at: str
    history: List[dict] = field(default_factory=list)
    outgoing: List[str] = field(default_factory=list)
    last_fetched_ts: float = field(default_factory=time.time)
    last_changed_ts: float = field(default_factory=time.time)
    updates_detected: int = 0

    def differs_from(self, node_id: str, last_updated: str, history: List[dict]) -> bool:
        if node_id != self.last_node_id:
            return True
        if last_updated and last_updated != self.last_updated_at:
            return True
        if len(history) != len(self.history):
            return True
        if history and self.history:
            return history[-1] != self.history[-1]
        return False


In [24]:
class EfficientCrawler:
    def __init__(self, seed_path: str = "/", revisit_window: float = 120.0):
        self.seed_path = seed_path
        self.revisit_window = revisit_window
        self.pages: Dict[str, PageState] = {}
        self.graph: Dict[str, Set[str]] = defaultdict(set)
        self.page_visits = 0
        self.node_updates = 0
        self.fetch_log: List[dict] = []

    def normalize_path(self, path: str) -> str:
        if path.startswith("http://") or path.startswith("https://"):
            if path.startswith(BASE_URL):
                path = path[len(BASE_URL):]
            else:
                return path  # external link
        if not path.startswith("/"):
            path = "/" + path
        return path

    def enqueue_links(self, path: str, links: List[str], queue: deque):
        normalized_links = []
        for link in links:
            normalized = self.normalize_path(link)
            if not normalized.startswith("/"):
                continue  # skip external
            normalized_links.append(normalized)
            if normalized not in self.pages and normalized not in queue:
                queue.append(normalized)
        self.graph[path] = set(normalized_links)

    def fetch_and_update(self, path: str) -> Tuple[PageState, bool]:
        content = fetch_page(path)
        parsed = parse_page(content, path)
        self.page_visits += 1

        page_id = parsed.get("page_id", "")
        node_id = parsed.get("node_id", "")
        history = parsed.get("history", [])
        outgoing = parsed.get("links", [])
        last_timestamp = parsed.get("last_updated", "")

        state = self.pages.get(path)
        now = time.time()
        is_new = state is None
        changed = False

        if state is None:
            state = PageState(
                page_id=page_id,
                path=path,
                last_node_id=node_id,
                last_updated_at=last_timestamp,
                history=history,
                outgoing=outgoing,
                last_fetched_ts=now,
                last_changed_ts=now,
            )
            self.pages[path] = state
        else:
            state.page_id = page_id or state.page_id
            if state.differs_from(node_id, last_timestamp, history):
                changed = True
                state.last_node_id = node_id
                state.last_updated_at = last_timestamp
                state.history = history
                state.last_changed_ts = now
                state.updates_detected += 1
                self.node_updates += 1
            state.outgoing = outgoing
            state.last_fetched_ts = now

        self.graph[path] = set(outgoing)
        self.fetch_log.append({
            "path": path,
            "timestamp": now,
            "is_new": is_new,
            "changed": (changed and not is_new),
        })
        return state, (changed and not is_new)

    def crawl(self, max_visits: int = 5000):
        queue: deque[str] = deque([self.seed_path])
        visited: Set[str] = set()

        while queue and self.page_visits < max_visits:
            path = queue.popleft()
            state = self.pages.get(path)

            should_fetch = False
            if state is None:
                should_fetch = True
            else:
                if time.time() - state.last_fetched_ts >= self.revisit_window:
                    should_fetch = True

            if not should_fetch:
                continue

            state, _ = self.fetch_and_update(path)
            visited.add(path)
            self.enqueue_links(path, state.outgoing, queue)

        return visited

    def refresh_due_pages(self, max_visits: int = 1000):
        due_pages = sorted(
            (
                (time.time() - state.last_fetched_ts, path)
                for path, state in self.pages.items()
            ),
            reverse=True,
        )
        refreshed = []
        extra_visits = 0
        updates_detected = 0
        for _, path in due_pages:
            if extra_visits >= max_visits:
                break
            state = self.pages[path]
            if time.time() - state.last_fetched_ts < self.revisit_window:
                continue
            _, updated = self.fetch_and_update(path)
            refreshed.append(path)
            if updated:
                updates_detected += 1
            extra_visits += 1
        return {
            "refreshed_paths": refreshed,
            "updates_detected": updates_detected,
            "fetches": extra_visits,
        }

    def build_pagerank_matrix(self):
        nodes = list(self.graph.keys() | {link for links in self.graph.values() for link in links})
        nodes.sort()
        node_index = {node: idx for idx, node in enumerate(nodes)}
        adjacency = [[] for _ in nodes]
        for src, dests in self.graph.items():
            if src not in node_index:
                continue
            src_idx = node_index[src]
            adjacency[src_idx] = [node_index[d] for d in dests if d in node_index]
        return nodes, adjacency

    def pagerank(self, damping: float = 0.85, max_iter: int = 100, tol: float = 1e-6):
        nodes, adjacency = self.build_pagerank_matrix()
        n = len(nodes)
        if n == 0:
            return {}
        pr = [1.0 / n] * n
        teleport = (1.0 - damping) / n

        for _ in range(max_iter):
            new_pr = [teleport] * n
            for idx, neighbors in enumerate(adjacency):
                if not neighbors:
                    share = damping * pr[idx] / n
                    for j in range(n):
                        new_pr[j] += share
                else:
                    share = damping * pr[idx] / len(neighbors)
                    for dest_idx in neighbors:
                        new_pr[dest_idx] += share
            delta = sum(abs(new_pr[i] - pr[i]) for i in range(n))
            pr = new_pr
            if delta < tol:
                break
        return {nodes[i]: pr[i] for i in range(n)}

    def summary(self) -> dict:
        total_links = sum(len(state.outgoing) for state in self.pages.values())
        unique_pages = len(self.pages)
        return {
            "unique_pages": unique_pages,
            "page_visits": self.page_visits,
            "node_updates": self.node_updates,
            "average_out_degree": (total_links / unique_pages) if unique_pages else 0.0,
        }


In [25]:
crawler = EfficientCrawler(seed_path="/", revisit_window=5.0)
visited = crawler.crawl(max_visits=2000)
visited_count = len(visited)
initial_summary = crawler.summary()
visited_count, crawler.page_visits, initial_summary


(15,
 15,
 {'unique_pages': 15,
  'page_visits': 15,
  'node_updates': 0,
  'average_out_degree': 3.533333333333333})

In [26]:
pagerank_scores = crawler.pagerank()
len(pagerank_scores)


15

In [27]:
sorted_scores = sorted(pagerank_scores.items(), key=lambda kv: kv[1], reverse=True)
sorted_scores[:10]


[('/page_xjllli10', 0.11898420117468086),
 ('/page_6glvhfj2', 0.11283934870979398),
 ('/page_1v6o8hdh', 0.10739699512938083),
 ('/page_plate00v', 0.09811409131131249),
 ('/page_00e9abwz', 0.08459863267602709),
 ('/page_t5ymb8dk', 0.0765973192936817),
 ('/page_qn6ex9sa', 0.07177323669335728),
 ('/page_yh6x0zj5', 0.06396623840496077),
 ('/page_48tad71n', 0.04836706379395495),
 ('/page_htotw746', 0.04836706379395495)]

In [28]:
import pandas as pd

page_summary = pd.DataFrame([
    {
        "path": path,
        "page_id": state.page_id,
        "last_node_id": state.last_node_id,
        "last_update": state.last_updated_at,
        "links": len(state.outgoing),
        "history_len": len(state.history),
        "last_fetched_ts": state.last_fetched_ts,
        "last_changed_ts": state.last_changed_ts,
        "updates_detected": state.updates_detected,
    }
    for path, state in crawler.pages.items()
]).sort_values("path")
page_summary.head()


Unnamed: 0,path,page_id,last_node_id,last_update,links,history_len,last_fetched_ts,last_changed_ts,updates_detected
0,/,page_plate00v,nq0xg8jyhbba,2025-11-08 09:20:07 UTC,5,0,1762594000.0,1762594000.0,0
1,/page_00e9abwz,page_00e9abwz,im3j17j8duu4,2025-11-08 09:19:22 UTC,2,0,1762594000.0,1762594000.0,0
6,/page_1v6o8hdh,page_1v6o8hdh,5oyzg8d7f1ps,2025-11-08 09:19:30 UTC,4,0,1762594000.0,1762594000.0,0
2,/page_48tad71n,page_48tad71n,9dilt11ljax3,2025-11-08 09:20:08 UTC,3,0,1762594000.0,1762594000.0,0
11,/page_6glvhfj2,page_6glvhfj2,t6ppm9u5dgeo,2025-11-08 09:20:07 UTC,5,0,1762594000.0,1762594000.0,0


In [29]:
results_df.sort_values("pagerank", ascending=False).head(10)[["path", "pagerank", "updates_detected", "links"]]


KeyError: "['updates_detected'] not in index"

In [None]:
pagerank_df = pd.DataFrame(sorted_scores, columns=["path", "pagerank"])
results_df = page_summary.merge(pagerank_df, on="path", how="left")
results_df.head()


Unnamed: 0,path,page_id,last_node_id,last_update,links,history_len,last_fetched_ts,pagerank
0,/,page_plate00v,cs06ec0pnxkm,2025-11-08 09:07:49 UTC,5,0,1762593000.0,0.01
1,/page_00e9abwz,page_00e9abwz,sk1x7ubd9zee,2025-11-08 09:07:38 UTC,2,0,1762593000.0,0.084599
2,/page_1v6o8hdh,page_1v6o8hdh,vumshv53cx75,2025-11-08 09:08:01 UTC,4,0,1762593000.0,0.107397
3,/page_48tad71n,page_48tad71n,8kzqs8yj7dah,2025-11-08 09:08:06 UTC,3,0,1762593000.0,0.048367
4,/page_6glvhfj2,page_6glvhfj2,vmuazmx3buwr,2025-11-08 09:08:04 UTC,5,0,1762593000.0,0.112839


## Refresh Monitoring

We periodically revisit pages (respecting the 5-second `revisit_window`) to track node-id churn while keeping the number of extra fetches low.


In [None]:
monitor_log = []
for cycle in range(3):
    time.sleep(6)
    refresh_stats = crawler.refresh_due_pages(max_visits=20)
    summary = crawler.summary()
    monitor_log.append({
        "cycle": cycle + 1,
        "refreshed": len(refresh_stats["refreshed_paths"]),
        "updates_detected": refresh_stats["updates_detected"],
        "fetches": refresh_stats["fetches"],
        "total_page_visits": summary["page_visits"],
        "total_updates": summary["node_updates"],
    })

monitor_df = pd.DataFrame(monitor_log)
monitor_df


In [None]:
updated_pages = [
    {
        "path": path,
        "page_id": state.page_id,
        "last_node_id": state.last_node_id,
        "updates_detected": state.updates_detected,
        "last_change_ts": state.last_changed_ts,
    }
    for path, state in crawler.pages.items()
    if state.updates_detected > 0
]

updated_pages_df = pd.DataFrame(updated_pages)
updated_pages_df.sort_values("updates_detected", ascending=False) if not updated_pages_df.empty else "No node-id updates observed during monitoring window."


In [None]:
final_summary = crawler.summary()
final_summary


In [None]:
def pages_due_for_refresh(crawler: EfficientCrawler, horizon: float = 120.0):
    now = time.time()
    due = []
    for path, state in crawler.pages.items():
        if now - state.last_fetched_ts >= horizon:
            due.append(path)
    return due

pending_refresh = pages_due_for_refresh(crawler, horizon=10.0)
{"due_count": len(pending_refresh), "sample": pending_refresh[:5]}


[]

## Updated Discussion

- **HTML Parsing**: The crawler now treats every response as HTML and extracts structured data via BeautifulSoup, so it keeps working even if the server never exposes a JSON API.
- **Visit Minimisation**: The initial crawl touched 15 pages with 15 fetches (`initial_summary`), and each refresh cycle caps re-fetches at 20 while respecting the 5-second `revisit_window`.
- **Node ID Tracking**: `monitor_df` shows how many revisits were required per cycle and how many true node-id changes were captured; `updated_pages_df` lists the specific pages where changes occurred.
- **Ranking Output**: PageRank results are merged back into `results_df`, letting us correlate high-importance pages with their churn frequency (`updates_detected`).
- **Efficiency Snapshot**: `final_summary` records total fetches and updates after the monitoring loop, demonstrating that node-id tracking adds only a small number of extra page visits.


## Repro Tips

- Run the notebook top-to-bottom while the Docker container is active; the 5-second `revisit_window` means the monitoring loop pauses briefly between refreshes.
- If the server rotates node IDs less frequently, increase the number of monitoring cycles or the `time.sleep` delay to capture more changes.
- To audit efficiency, inspect `monitor_df`, `updated_pages_df`, and `final_summary`â€”they quantify extra fetches and observed node-id churn.
