# Crawling Assignment Activity 2.2

This notebook interacts with the local crawling assignment server (running at `http://localhost:3000`) to
- discover the site graph with minimal page visits,
- track `node_id` updates for each page, and
- estimate PageRank scores over the discovered link structure.


## Setup and Helper Functions

The web server returns JSON responses. We use `requests` for HTTP and utilities for crawling and scoring.


In [29]:
%pip install beautifulsoup4 --quiet


Note: you may need to restart the kernel to use updated packages.


In [30]:
import requests
import time
import math
import json
import re
from collections import deque, defaultdict
from dataclasses import dataclass, field
from typing import Dict, List, Set, Tuple

from bs4 import BeautifulSoup

BASE_URL = "http://localhost:3000"

session = requests.Session()
session.headers.update({
    "User-Agent": "CrawlerAssignmentBot/1.0",
    "Accept": "text/html,application/json",
})


In [31]:
def fetch_page(path: str = "/") -> str:
    url = BASE_URL.rstrip("/") + path
    response = session.get(url, timeout=10)
    response.raise_for_status()
    return response.text


def parse_page(content: str, path: str) -> dict:
    soup = BeautifulSoup(content, "html.parser")

    page_id_text = soup.select_one(".page-id")
    page_id = ""
    if page_id_text:
        page_id = page_id_text.get_text(strip=True).split(":")[-1].strip()

    node_id_elem = soup.select_one(".node-id b")
    node_id = node_id_elem.get_text(strip=True) if node_id_elem else ""

    last_updated_elem = soup.select_one(".last-updated")
    last_updated = ""
    if last_updated_elem:
        last_updated = last_updated_elem.get_text(strip=True)
        if ":" in last_updated:
            last_updated = last_updated.split(":", 1)[-1].strip()

    history_entries: List[dict] = []
    history_container = soup.select_one("details")
    if history_container:
        for entry in history_container.select("div"):
            text = entry.get_text(strip=True)
            text = text.strip("\u0007 \n\r\t")
            match = re.match(r"([A-Za-z0-9]+)\s*\(([^)]+)\)", text)
            if match:
                history_entries.append({
                    "node_id": match.group(1),
                    "timestamp": match.group(2),
                })

    outgoing_links: List[str] = []
    for link in soup.select("a.file-link"):
        href = link.get("href")
        if href and href.startswith("/page_"):
            outgoing_links.append(href)
    outgoing_links = sorted(set(outgoing_links))

    return {
        "path": path,
        "page_id": page_id,
        "node_id": node_id,
        "last_updated": last_updated,
        "history": history_entries,
        "links": outgoing_links,
    }


root_html = fetch_page("/")
root_parsed = parse_page(root_html, "/")
root_parsed


{'path': '/',
 'page_id': 'page_pjm3eihj',
 'node_id': '9hfmc9s6jcm7',
 'last_updated': '2025-11-15 11:20:09 UTC',
 'history': [],
 'links': ['/page_13q7cjnj',
  '/page_8u5qc2wf',
  '/page_w6ju6bgi',
  '/page_yn8o949j',
  '/page_zvdv4uqs']}

In [32]:
@dataclass
class PageState:
    page_id: str
    path: str
    last_node_id: str
    last_updated_at: str
    history: List[dict] = field(default_factory=list)
    outgoing: List[str] = field(default_factory=list)
    last_fetched_ts: float = field(default_factory=time.time)
    last_changed_ts: float = field(default_factory=time.time)
    updates_detected: int = 0

    def differs_from(self, node_id: str, last_updated: str, history: List[dict]) -> bool:
        if node_id != self.last_node_id:
            return True
        if last_updated and last_updated != self.last_updated_at:
            return True
        if len(history) != len(self.history):
            return True
        if history and self.history:
            return history[-1] != self.history[-1]
        return False


In [33]:
class EfficientCrawler:
    def __init__(self, seed_path: str = "/", revisit_window: float = 120.0):
        self.seed_path = seed_path
        self.revisit_window = revisit_window
        self.pages: Dict[str, PageState] = {}
        self.graph: Dict[str, Set[str]] = defaultdict(set)
        self.page_visits = 0
        self.node_updates = 0
        self.fetch_log: List[dict] = []

    def normalize_path(self, path: str) -> str:
        if path.startswith("http://") or path.startswith("https://"):
            if path.startswith(BASE_URL):
                path = path[len(BASE_URL):]
            else:
                return path  # external link
        if not path.startswith("/"):
            path = "/" + path
        return path

    def enqueue_links(self, path: str, links: List[str], queue: deque):
        normalized_links = []
        for link in links:
            normalized = self.normalize_path(link)
            if not normalized.startswith("/"):
                continue  # skip external
            normalized_links.append(normalized)
            if normalized not in self.pages and normalized not in queue:
                queue.append(normalized)
        self.graph[path] = set(normalized_links)

    def fetch_and_update(self, path: str) -> Tuple[PageState, bool]:
        content = fetch_page(path)
        parsed = parse_page(content, path)
        self.page_visits += 1

        page_id = parsed.get("page_id", "")
        node_id = parsed.get("node_id", "")
        history = parsed.get("history", [])
        outgoing = parsed.get("links", [])
        last_timestamp = parsed.get("last_updated", "")

        state = self.pages.get(path)
        now = time.time()
        is_new = state is None
        changed = False

        if state is None:
            state = PageState(
                page_id=page_id,
                path=path,
                last_node_id=node_id,
                last_updated_at=last_timestamp,
                history=history,
                outgoing=outgoing,
                last_fetched_ts=now,
                last_changed_ts=now,
            )
            self.pages[path] = state
        else:
            state.page_id = page_id or state.page_id
            if state.differs_from(node_id, last_timestamp, history):
                changed = True
                state.last_node_id = node_id
                state.last_updated_at = last_timestamp
                state.history = history
                state.last_changed_ts = now
                state.updates_detected += 1
                self.node_updates += 1
            state.outgoing = outgoing
            state.last_fetched_ts = now

        self.graph[path] = set(outgoing)
        self.fetch_log.append({
            "path": path,
            "timestamp": now,
            "is_new": is_new,
            "changed": (changed and not is_new),
        })
        return state, (changed and not is_new)

    def crawl(self, max_visits: int = 5000):
        queue: deque[str] = deque([self.seed_path])
        visited: Set[str] = set()

        while queue and self.page_visits < max_visits:
            path = queue.popleft()
            state = self.pages.get(path)

            should_fetch = False
            if state is None:
                should_fetch = True
            else:
                if time.time() - state.last_fetched_ts >= self.revisit_window:
                    should_fetch = True

            if not should_fetch:
                continue

            state, _ = self.fetch_and_update(path)
            visited.add(path)
            self.enqueue_links(path, state.outgoing, queue)

        return visited

    def refresh_due_pages(self, max_visits: int = 1000):
        due_pages = sorted(
            (
                (time.time() - state.last_fetched_ts, path)
                for path, state in self.pages.items()
            ),
            reverse=True,
        )
        refreshed = []
        extra_visits = 0
        updates_detected = 0
        for _, path in due_pages:
            if extra_visits >= max_visits:
                break
            state = self.pages[path]
            if time.time() - state.last_fetched_ts < self.revisit_window:
                continue
            _, updated = self.fetch_and_update(path)
            refreshed.append(path)
            if updated:
                updates_detected += 1
            extra_visits += 1
        return {
            "refreshed_paths": refreshed,
            "updates_detected": updates_detected,
            "fetches": extra_visits,
        }

    def build_pagerank_matrix(self):
        nodes = list(self.graph.keys() | {link for links in self.graph.values() for link in links})
        nodes.sort()
        node_index = {node: idx for idx, node in enumerate(nodes)}
        adjacency = [[] for _ in nodes]
        for src, dests in self.graph.items():
            if src not in node_index:
                continue
            src_idx = node_index[src]
            adjacency[src_idx] = [node_index[d] for d in dests if d in node_index]
        return nodes, adjacency

    def pagerank(self, damping: float = 0.85, max_iter: int = 100, tol: float = 1e-6):
        nodes, adjacency = self.build_pagerank_matrix()
        n = len(nodes)
        if n == 0:
            return {}
        pr = [1.0 / n] * n
        teleport = (1.0 - damping) / n

        for _ in range(max_iter):
            new_pr = [teleport] * n
            for idx, neighbors in enumerate(adjacency):
                if not neighbors:
                    share = damping * pr[idx] / n
                    for j in range(n):
                        new_pr[j] += share
                else:
                    share = damping * pr[idx] / len(neighbors)
                    for dest_idx in neighbors:
                        new_pr[dest_idx] += share
            delta = sum(abs(new_pr[i] - pr[i]) for i in range(n))
            pr = new_pr
            if delta < tol:
                break
        return {nodes[i]: pr[i] for i in range(n)}

    def summary(self) -> dict:
        total_links = sum(len(state.outgoing) for state in self.pages.values())
        unique_pages = len(self.pages)
        return {
            "unique_pages": unique_pages,
            "page_visits": self.page_visits,
            "node_updates": self.node_updates,
            "average_out_degree": (total_links / unique_pages) if unique_pages else 0.0,
        }


In [34]:
crawler = EfficientCrawler(seed_path="/", revisit_window=5.0)
visited = crawler.crawl(max_visits=2000)
visited_count = len(visited)
initial_summary = crawler.summary()
visited_count, crawler.page_visits, initial_summary


(17,
 17,
 {'unique_pages': 17,
  'page_visits': 17,
  'node_updates': 0,
  'average_out_degree': 4.0})

In [35]:
pagerank_scores = crawler.pagerank()
len(pagerank_scores)


17

In [36]:
sorted_scores = sorted(pagerank_scores.items(), key=lambda kv: kv[1], reverse=True)
sorted_scores[:10]


[('/page_6hji76ni', 0.08995015560558127),
 ('/page_8u5qc2wf', 0.08022155514698967),
 ('/page_w6ju6bgi', 0.07489186809789804),
 ('/page_4u9kqiu9', 0.07170398198120716),
 ('/page_oumbm1ua', 0.06885662863600116),
 ('/page_zvdv4uqs', 0.067602057849244),
 ('/page_yn8o949j', 0.06737563261179391),
 ('/page_7w8neqxl', 0.06049691328827038),
 ('/page_pjm3eihj', 0.05987092421000326),
 ('/page_4bfggquc', 0.056975868362335445)]

In [37]:
import pandas as pd

page_summary = pd.DataFrame([
    {
        "path": path,
        "page_id": state.page_id,
        "last_node_id": state.last_node_id,
        "last_update": state.last_updated_at,
        "links": len(state.outgoing),
        "history_len": len(state.history),
        "last_fetched_ts": state.last_fetched_ts,
        "last_changed_ts": state.last_changed_ts,
        "updates_detected": state.updates_detected,
    }
    for path, state in crawler.pages.items()
]).sort_values("path")
page_summary.head()


Unnamed: 0,path,page_id,last_node_id,last_update,links,history_len,last_fetched_ts,last_changed_ts,updates_detected
0,/,page_pjm3eihj,9hfmc9s6jcm7,2025-11-15 11:20:09 UTC,5,0,1763206000.0,1763206000.0,0
1,/page_13q7cjnj,page_13q7cjnj,lqmtlwgh07pg,2025-11-15 11:20:10 UTC,4,0,1763206000.0,1763206000.0,0
6,/page_2kghj8tn,page_2kghj8tn,wvy6w4r87xhz,2025-11-15 11:20:05 UTC,5,0,1763206000.0,1763206000.0,0
10,/page_491uh7s9,page_491uh7s9,ctp21ilgds1a,2025-11-15 11:20:05 UTC,4,0,1763206000.0,1763206000.0,0
7,/page_4bfggquc,page_4bfggquc,3b8kjaw0pn22,2025-11-15 11:19:27 UTC,5,0,1763206000.0,1763206000.0,0


In [38]:
if 'results_df' in globals() and not results_df.empty:
    results_df.sort_values("pagerank", ascending=False).head(10)[["path", "pagerank", "updates_detected", "links"]]
else:
    print("results_df not yet created. Run the cells above to create page_summary and merge with pagerank_df.")


In [39]:
pagerank_df = pd.DataFrame(sorted_scores, columns=["path", "pagerank"])
results_df = page_summary.merge(pagerank_df, on="path", how="left")
results_df.head()


Unnamed: 0,path,page_id,last_node_id,last_update,links,history_len,last_fetched_ts,last_changed_ts,updates_detected,pagerank
0,/,page_pjm3eihj,9hfmc9s6jcm7,2025-11-15 11:20:09 UTC,5,0,1763206000.0,1763206000.0,0,0.008824
1,/page_13q7cjnj,page_13q7cjnj,lqmtlwgh07pg,2025-11-15 11:20:10 UTC,4,0,1763206000.0,1763206000.0,0,0.054893
2,/page_2kghj8tn,page_2kghj8tn,wvy6w4r87xhz,2025-11-15 11:20:05 UTC,5,0,1763206000.0,1763206000.0,0,0.044733
3,/page_491uh7s9,page_491uh7s9,ctp21ilgds1a,2025-11-15 11:20:05 UTC,4,0,1763206000.0,1763206000.0,0,0.052511
4,/page_4bfggquc,page_4bfggquc,3b8kjaw0pn22,2025-11-15 11:19:27 UTC,5,0,1763206000.0,1763206000.0,0,0.056976


## Refresh Monitoring

We periodically revisit pages (respecting the 5-second `revisit_window`) to track node-id churn while keeping the number of extra fetches low.


## Evaluation Submission

The assignment requires submitting evaluations to `/evaluate` endpoint:
- First evaluation within 15 seconds of first visit
- Subsequent evaluations at least every 15 seconds
- All evaluations within 60 seconds of first visit


In [40]:
def submit_evaluation(crawler: EfficientCrawler, pagerank_scores: dict) -> dict:
    entries = []
    for path, state in crawler.pages.items():
        page_id = path.lstrip("/")
        if not page_id:
            page_id = "/"
        latest_node_id = state.last_node_id
        score = pagerank_scores.get(path, 0.0)
        entries.append({
            "page_id": page_id,
            "latest_node_id": latest_node_id,
            "score": float(score),
        })
    
    payload = {"entries": entries}
    try:
        response = session.post(
            f"{BASE_URL}/evaluate",
            json=payload,
            timeout=5,
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}


first_visit_time = None
evaluation_results = []

def run_evaluation_cycle(crawler: EfficientCrawler, pagerank_scores: dict, start_time: float):
    global first_visit_time, evaluation_results
    if first_visit_time is None:
        first_visit_time = start_time
    
    elapsed = time.time() - first_visit_time
    if elapsed > 60.0:
        return False
    
    result = submit_evaluation(crawler, pagerank_scores)
    result["elapsed_seconds"] = elapsed
    result["visit_count"] = crawler.page_visits
    evaluation_results.append(result)
    return elapsed < 60.0


print("Evaluation submission function ready. Use run_evaluation_cycle() to submit evaluations.")


Evaluation submission function ready. Use run_evaluation_cycle() to submit evaluations.


In [41]:
start_time = time.time()

crawler_eval = EfficientCrawler(seed_path="/", revisit_window=5.0)
visited_eval = crawler_eval.crawl(max_visits=2000)

pagerank_scores_eval = crawler_eval.pagerank()

first_visit_time = start_time
evaluation_results = []

elapsed = time.time() - first_visit_time
if elapsed < 15.0:
    time.sleep(15.0 - elapsed)

result1 = submit_evaluation(crawler_eval, pagerank_scores_eval)
result1["elapsed_seconds"] = time.time() - first_visit_time
result1["visit_count"] = crawler_eval.page_visits
evaluation_results.append(result1)
print(f"First evaluation: {result1}")

last_eval_time = time.time()
while time.time() - first_visit_time < 60.0:
    time.sleep(15.0)
    elapsed = time.time() - first_visit_time
    if elapsed >= 60.0:
        break
    
    pagerank_scores_eval = crawler_eval.pagerank()
    result = submit_evaluation(crawler_eval, pagerank_scores_eval)
    result["elapsed_seconds"] = elapsed
    result["visit_count"] = crawler_eval.page_visits
    evaluation_results.append(result)
    print(f"Evaluation at {elapsed:.2f}s: MSE={result.get('mse', 'N/A')}, Coverage={result.get('coverage', 'N/A')}, Staleness={result.get('avg_staleness', 'N/A')}")

evaluation_summary = pd.DataFrame(evaluation_results)
evaluation_summary


First evaluation: {'coverage': 0.5, 'covered_nodes': 8, 'matched_entries': 8, 'mse': 5.158994034641991e-07, 'total_nodes': 16, 'elapsed_seconds': 15.047605991363525, 'visit_count': 17}
Evaluation at 30.05s: MSE=4.250965766518326e-07, Coverage=0.25, Staleness=N/A
Evaluation at 45.09s: MSE=2.2128615632037359e-07, Coverage=0.125, Staleness=N/A


Unnamed: 0,coverage,covered_nodes,matched_entries,mse,total_nodes,elapsed_seconds,visit_count
0,0.5,8,8,5.158994e-07,16,15.047606,17
1,0.25,4,4,4.250966e-07,16,30.04885,17
2,0.125,2,2,2.212862e-07,16,45.09412,17


In [42]:
monitor_log = []
for cycle in range(3):
    time.sleep(6)
    refresh_stats = crawler.refresh_due_pages(max_visits=20)
    summary = crawler.summary()
    monitor_log.append({
        "cycle": cycle + 1,
        "refreshed": len(refresh_stats["refreshed_paths"]),
        "updates_detected": refresh_stats["updates_detected"],
        "fetches": refresh_stats["fetches"],
        "total_page_visits": summary["page_visits"],
        "total_updates": summary["node_updates"],
    })

monitor_df = pd.DataFrame(monitor_log)
monitor_df


Unnamed: 0,cycle,refreshed,updates_detected,fetches,total_page_visits,total_updates
0,1,17,17,17,34,17
1,2,17,4,17,51,21
2,3,17,4,17,68,25


In [43]:
updated_pages = [
    {
        "path": path,
        "page_id": state.page_id,
        "last_node_id": state.last_node_id,
        "updates_detected": state.updates_detected,
        "last_change_ts": state.last_changed_ts,
    }
    for path, state in crawler.pages.items()
    if state.updates_detected > 0
]

updated_pages_df = pd.DataFrame(updated_pages)
updated_pages_df.sort_values("updates_detected", ascending=False) if not updated_pages_df.empty else "No node-id updates observed during monitoring window."


Unnamed: 0,path,page_id,last_node_id,updates_detected,last_change_ts
6,/page_2kghj8tn,page_2kghj8tn,cawjv4koi6c2,3,1763206000.0
13,/page_sbi0db9d,page_sbi0db9d,ncsy2jnnopk4,2,1763206000.0
5,/page_zvdv4uqs,page_zvdv4uqs,kuwvlzcoakg2,2,1763206000.0
8,/page_6hji76ni,page_6hji76ni,8icwopnhewl2,2,1763206000.0
10,/page_491uh7s9,page_491uh7s9,2awa1yw2k2kt,2,1763206000.0
11,/page_4u9kqiu9,page_4u9kqiu9,svlvlyhzn5d6,2,1763206000.0
15,/page_7w8neqxl,page_7w8neqxl,i4tj8uluw7x7,2,1763206000.0
0,/,page_pjm3eihj,6vorsjkyvjgb,1,1763206000.0
1,/page_13q7cjnj,page_13q7cjnj,b8iu9p8ia5k9,1,1763206000.0
7,/page_4bfggquc,page_4bfggquc,d38oire0nlzq,1,1763206000.0


In [44]:
final_summary = crawler.summary()
final_summary


{'unique_pages': 17,
 'page_visits': 68,
 'node_updates': 25,
 'average_out_degree': 4.0}

In [45]:
def pages_due_for_refresh(crawler: EfficientCrawler, horizon: float = 120.0):
    now = time.time()
    due = []
    for path, state in crawler.pages.items():
        if now - state.last_fetched_ts >= horizon:
            due.append(path)
    return due

pending_refresh = pages_due_for_refresh(crawler, horizon=10.0)
{"due_count": len(pending_refresh), "sample": pending_refresh[:5]}


{'due_count': 0, 'sample': []}