Building a Domain-Specific Search Engine with Crawling and Link Analysis

In [None]:
# Topic label for this search engine.
# NOTE(review): no code visible in this notebook reads DOMAIN — confirm whether it is still needed.
DOMAIN = "machine learning"

In [None]:
# Starting points for the crawl; this list is passed to crawl() as its
# initial frontier.
seed_urls = [
    "https://openai.com",
    "https://deepmind.com",
    "https://arxiv.org",
    "https://kaggle.com",
    "https://towardsdatascience.com",
    "https://machinelearningmastery.com",
    "https://distill.pub",
    "https://ai.google",
    "https://huggingface.co",
    "https://paperswithcode.com",
    "https://developer.nvidia.com/deep-learning",
    "https://colah.github.io",
    "https://ruder.io",
    "https://jalammar.github.io"
]

In [None]:
import pickle
import re
import time
from collections import defaultdict
from collections import deque
from urllib.parse import urlparse, urljoin

import networkx as nx
import requests
from bs4 import BeautifulSoup

def normalize_url(url):
    """Return a canonical form of ``url`` suitable for de-duplication.

    The query string and fragment are dropped, the host is lower-cased
    (host names are case-insensitive) and, for URLs that have a host, an
    empty path is replaced by "/". Without the last two steps
    ``https://example.com`` and ``https://Example.com/`` would be treated as
    two different pages and show up twice in the index and the link graph.

    Args:
        url: URL string to normalize.

    Returns:
        ``scheme://netloc/path`` as a string.

    Raises:
        ValueError: propagated from ``urlparse`` for malformed URLs
            (for example an unbalanced IPv6 bracket).
    """
    parsed = urlparse(url)
    # Lower-case only host[:port]; any userinfo before "@" is case-sensitive.
    userinfo, at, hostport = parsed.netloc.rpartition("@")
    netloc = userinfo + at + hostport.lower()
    # Only URLs with a host get the root-path fix; output for scheme-less
    # input is left exactly as it was before.
    path = parsed.path or ("/" if netloc else "")
    return parsed.scheme + "://" + netloc + path

def get_domain(url):
    """Return the network-location component (``netloc``) of ``url``.

    This is the host as written in the URL, including any port or userinfo;
    it is an empty string when the URL has no ``//host`` part.
    """
    parts = urlparse(url)
    return parts.netloc

def crawl(start_urls, max_pages=100, max_visits_per_domain=20, delay=1.5, timeout=10):
    """Breadth-first crawl from ``start_urls``, building a word index and a link map.

    Args:
        start_urls: Iterable of seed URL strings; blank entries are ignored.
        max_pages: Stop once this many pages have been fetched and indexed.
        max_visits_per_domain: Maximum number of indexed pages per network
            location (as returned by ``get_domain``).
        delay: Minimum number of seconds between two requests to the same domain.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        A ``(inverted_index, web_connection)`` tuple. ``inverted_index`` maps
        each lower-cased 3-20 character word to the set of page URLs whose
        text contains it. ``web_connection`` maps each crawled page URL to the
        list of normalized absolute http(s) links found on that page
        (duplicates are kept). Pages are keyed by the URL that was requested.

    Errors while fetching or parsing a page are printed and the page is
    skipped; the crawl itself never raises because of a single bad page.
    """
    visited = set()
    inverted_index = defaultdict(set)
    web_connection = {}
    domain_visits = defaultdict(int)
    last_request_time = {}
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; AcademicCrawler/1.0)'}

    # Normalize and de-duplicate the seeds up front so that `seen` holds the
    # same canonical form used for discovered links. Otherwise a link pointing
    # back at a seed that is still queued would be enqueued and fetched twice.
    to_visit = deque()
    seen = set()
    for raw_seed in start_urls:
        raw_seed = raw_seed.strip()
        if not raw_seed:
            continue
        try:
            seed = normalize_url(raw_seed)
        except ValueError as e:
            print(f"Skipping invalid seed URL {raw_seed}: {str(e)[:100]}")
            continue
        if seed not in seen:
            seen.add(seed)
            to_visit.append(seed)

    print(f"Starting crawl with {len(to_visit)} seed URLs")

    while to_visit and len(visited) < max_pages:
        # deque.popleft() is O(1); list.pop(0) shifts the whole frontier.
        url = to_visit.popleft()

        try:
            domain = get_domain(url)

            # Check domain limits
            if domain_visits[domain] >= max_visits_per_domain:
                continue

            # Respect crawl delay
            previous_request = last_request_time.get(domain)
            if previous_request is not None:
                remaining = delay - (time.time() - previous_request)
                if remaining > 0:
                    time.sleep(remaining)

            # Fetch page
            try:
                response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
            finally:
                # Record the attempt even when it fails, so repeated requests
                # to a failing host are still spaced out by `delay`.
                last_request_time[domain] = time.time()

            # Do not index error pages (403 bot walls, 404 pages, ...) that
            # happen to be served as HTML.
            if not response.ok:
                print(f"Skipping {url}: HTTP {response.status_code}")
                continue

            # Check content type
            content_type = response.headers.get('Content-Type', '').lower()
            if 'text/html' not in content_type:
                continue

            # Parse HTML
            soup = BeautifulSoup(response.text, 'html.parser')
            visited.add(url)
            domain_visits[domain] += 1

            # Extract text (remove JavaScript/CSS)
            for tag in soup(["script", "style"]):
                tag.extract()
            text = soup.get_text()
            words = re.findall(r'\b\w{3,20}\b', text.lower())  # Only words of 3-20 chars

            # Update inverted index
            for word in set(words):
                inverted_index[word].add(url)

            # Extract links. Relative hrefs must be resolved against the URL
            # the content actually came from, which differs from `url` when
            # the request was redirected.
            base_url = response.url
            links = []
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if href.startswith(('javascript:', 'mailto:')):
                    continue

                try:
                    absolute_url = normalize_url(urljoin(base_url, href))
                    parsed = urlparse(absolute_url)
                except ValueError:
                    # A single malformed href must not discard the whole
                    # page's link list.
                    continue

                # Validate URL
                if parsed.scheme in ('http', 'https') and parsed.netloc:
                    links.append(absolute_url)
                    # Every visited URL is also in `seen`, so one check suffices.
                    if absolute_url not in seen:
                        seen.add(absolute_url)
                        to_visit.append(absolute_url)

            # Update web graph
            web_connection[url] = links
            print(f"Crawled: {url} (Links: {len(links)}, Words: {len(words)})")

        except Exception as e:
            # Deliberately broad: the crawl is best-effort and one bad page
            # (network error, parser failure) must not abort the whole run.
            print(f"Error crawling {url}: {str(e)[:100]}")

    print(f"\nCrawl completed! Pages: {len(visited)}, Domains: {len(domain_visits)}, Terms: {len(inverted_index)}")
    return dict(inverted_index), web_connection

# Run crawler
# NOTE: this performs live HTTP requests, so results depend on the state of the
# web at run time; timeout=15 overrides crawl()'s default of 10 seconds.
inverted_index, web_connection = crawl(
    seed_urls,
    max_pages=100,
    max_visits_per_domain=20,
    delay=1.5,
    timeout=15
)

# Save results
# Both structures are pickled so that later cells can reload them from disk.
with open('inverted_index.pkl', 'wb') as f:
    pickle.dump(inverted_index, f)

with open('web_connection.pkl', 'wb') as f:
    pickle.dump(web_connection, f)

Starting crawl with 14 seed URLs
Crawled: https://openai.com (Links: 0, Words: 5)
Crawled: https://deepmind.com (Links: 199, Words: 1155)
Crawled: https://arxiv.org (Links: 260, Words: 695)
Crawled: https://kaggle.com (Links: 0, Words: 8)
Crawled: https://towardsdatascience.com (Links: 107, Words: 723)
Crawled: https://machinelearningmastery.com (Links: 0, Words: 7)
Crawled: https://distill.pub (Links: 62, Words: 945)
Crawled: https://ai.google (Links: 260, Words: 2137)
Crawled: https://huggingface.co (Links: 80, Words: 500)
Crawled: https://paperswithcode.com (Links: 103, Words: 530)
Crawled: https://developer.nvidia.com/deep-learning (Links: 22, Words: 704)
Crawled: https://colah.github.io (Links: 65, Words: 501)
Crawled: https://ruder.io (Links: 5, Words: 89)
Crawled: https://jalammar.github.io (Links: 250, Words: 3924)
Crawled: https://deepmind.com/models/ (Links: 186, Words: 950)
Crawled: https://deepmind.com/models/gemini/pro/ (Links: 192, Words: 2450)
Crawled: https://deepmind.c

In [None]:
def build_graph(web_connection):
    """Turn the crawl's page -> links mapping into a directed graph.

    Every key of ``web_connection`` (a crawled page) becomes a node. An edge
    ``source -> target`` is added only when ``target`` is itself a key, so
    links to pages that were never crawled do not appear in the graph.
    """
    graph = nx.DiGraph()
    crawled = web_connection.keys()
    for source in web_connection:
        graph.add_node(source)
        for target in web_connection[source]:
            if target not in crawled:
                # Link leaves the crawled set; keep it out of the graph.
                continue
            graph.add_edge(source, target)
    return graph

# Load data
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickle files that this notebook wrote itself.
with open('web_connection.pkl', 'rb') as f:
    web_connection = pickle.load(f)

# Build the directed link graph and report its size.
web_graph = build_graph(web_connection)
print(f"Web graph: Nodes={web_graph.number_of_nodes()}, Edges={web_graph.number_of_edges()}")

Web graph: Nodes=100, Edges=1449


In [None]:
import numpy as np

def pagerank(graph, damping=0.85, max_iter=100, tol=1e-6):
    """Compute PageRank scores for ``graph`` by power iteration.

    Args:
        graph: Directed graph exposing ``nodes()`` and ``successors(node)``
            (for example a ``networkx.DiGraph``).
        damping: Probability of following an out-link rather than jumping to
            a uniformly random node.
        max_iter: Maximum number of power-iteration steps.
        tol: Stop when the L1 distance between successive score vectors
            drops below this value.

    Returns:
        Dict mapping each node to its score. Scores sum to 1. An empty graph
        yields an empty dict.
    """
    nodes = list(graph.nodes())
    n = len(nodes)
    if n == 0:
        # Nothing to rank; also avoids dividing by zero below.
        return {}
    node_index = {node: i for i, node in enumerate(nodes)}

    # Column-stochastic transition matrix: entry [j, i] is the probability of
    # moving from node i to node j by following one of i's out-links.
    transition = np.zeros((n, n))
    out_degrees = np.zeros(n)
    for i, node in enumerate(nodes):
        successors = list(graph.successors(node))
        out_degree = len(successors)
        out_degrees[i] = out_degree
        for succ in successors:
            if succ in node_index:
                transition[node_index[succ], i] = 1 / out_degree

    # Nodes without out-links hand their score to every node uniformly, so
    # that no probability mass leaks out of the system.
    dangling = out_degrees == 0
    teleport = (1 - damping) / n

    pr = np.full(n, 1.0 / n)
    for _ in range(max_iter):
        dangling_mass = pr[dangling].sum()
        new_pr = damping * (transition @ pr + dangling_mass / n) + teleport

        delta = np.abs(new_pr - pr).sum()
        # Keep the newest iterate before testing convergence so the returned
        # vector is the converged one rather than the one before it.
        pr = new_pr
        if delta < tol:
            break

    return {node: pr[i] for i, node in enumerate(nodes)}

def hits(graph, max_iter=50, tol=1e-6):
    """Compute HITS authority and hub scores for ``graph``.

    Args:
        graph: Directed graph exposing ``nodes()`` and ``successors(node)``
            (for example a ``networkx.DiGraph``).
        max_iter: Maximum number of update rounds.
        tol: Stop when both score vectors change by less than this value
            (Euclidean norm) between rounds.

    Returns:
        ``(authority_scores, hub_scores)``: two dicts keyed by node. Each
        vector is scaled to unit Euclidean norm, unless it is all zeros
        (for example when the graph has no edges).
    """
    nodes = list(graph.nodes())
    n = len(nodes)
    # Dict lookup instead of `x in nodes` / `nodes.index(x)`, which scan the
    # whole list for every edge.
    node_index = {node: i for i, node in enumerate(nodes)}
    auth = np.ones(n)
    hub = np.ones(n)

    # adj_matrix[i, j] == 1 when node i links to node j.
    adj_matrix = np.zeros((n, n))
    for i, node in enumerate(nodes):
        for successor in graph.successors(node):
            j = node_index.get(successor)
            if j is not None:
                adj_matrix[i, j] = 1

    for _ in range(max_iter):
        # Authority: sum of hub scores of pages linking in.
        new_auth = adj_matrix.T @ hub
        # Hub: sum of the (fresh) authority scores of pages linked to.
        new_hub = adj_matrix @ new_auth

        # Normalize; an all-zero vector is left untouched.
        auth_norm = np.linalg.norm(new_auth)
        hub_norm = np.linalg.norm(new_hub)
        if auth_norm > 0:
            new_auth /= auth_norm
        if hub_norm > 0:
            new_hub /= hub_norm

        auth_diff = np.linalg.norm(new_auth - auth)
        hub_diff = np.linalg.norm(new_hub - hub)

        # Keep the newest iterate before testing convergence so the returned
        # vectors are the converged ones.
        auth, hub = new_auth, new_hub
        if auth_diff < tol and hub_diff < tol:
            break

    return (
        {node: auth[i] for i, node in enumerate(nodes)},
        {node: hub[i] for i, node in enumerate(nodes)}
    )

# Compute scores
# Both algorithms run on the same link graph; hub_scores is computed here but
# only pagerank_scores is printed in this cell.
pagerank_scores = pagerank(web_graph)
auth_scores, hub_scores = hits(web_graph)

# Show the five highest-scoring pages by PageRank.
print("PageRank top 5:")
for url, score in sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"{score:.4f}: {url}")

PageRank top 5:
0.0531: https://policies.google.com/terms
0.0502: https://policies.google.com/privacy
0.0468: https://about.google/
0.0407: https://www.cornell.edu/
0.0358: https://www.linkedin.com/company/googledeepmind/


In [None]:
from collections import defaultdict

def search(query, inverted_index, rank_scores, top_k=10):
    """Return the ``top_k`` pages matching ``query``, ordered by link-analysis score.

    A page matches when it contains at least one query word. Its score is its
    ``rank_scores`` value counted once per distinct query word it contains,
    so pages matching more of the query rank higher.

    Args:
        query: Free-text query; tokenized into lower-cased 3-20 character words.
        inverted_index: Mapping from word to an iterable of page URLs.
        rank_scores: Mapping from page URL to a numeric score; missing URLs
            count as 0.
        top_k: Maximum number of results to return.

    Returns:
        List of ``(url, score)`` tuples, best first. Ties are broken by URL so
        the ordering is the same on every run. Empty when the query contains
        no indexable word.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so a word
    # repeated in the query does not count a page's score twice.
    terms = dict.fromkeys(re.findall(r'\b\w{3,20}\b', query.lower()))
    if not terms:
        return []

    page_scores = defaultdict(float)
    for term in terms:
        for url in inverted_index.get(term, ()):
            page_scores[url] += rank_scores.get(url, 0)

    # Sort by descending score, then by URL: posting lists are sets, so
    # without the secondary key equal-score pages come out in arbitrary order.
    ranked_results = sorted(page_scores.items(), key=lambda item: (-item[1], item[0]))

    return ranked_results[:top_k]

# Load data
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickle files that this notebook wrote itself.
with open('inverted_index.pkl', 'rb') as f:
    inverted_index = pickle.load(f)

# Sample queries
queries = [
    "neural networks",
    "transformer models",
    "reinforcement learning",
    "natural language processing"
]

# Run every query twice, once ranked by PageRank and once by HITS authority,
# so the two orderings can be compared side by side.
print("\nSearch Results:")
for query in queries:
    print(f"\nQuery: '{query}' (PageRank ranking)")
    results = search(query, inverted_index, pagerank_scores)
    for i, (url, score) in enumerate(results):
        print(f"{i+1}. {score:.4f}: {url}")

    print(f"\nQuery: '{query}' (HITS authority ranking)")
    results = search(query, inverted_index, auth_scores)
    for i, (url, score) in enumerate(results):
        print(f"{i+1}. {score:.4f}: {url}")


Search Results:

Query: 'neural networks' (PageRank ranking)
1. 0.0716: https://github.com/google-deepmind
2. 0.0447: https://arxiv.org
3. 0.0407: https://www.cornell.edu/
4. 0.0381: https://developer.nvidia.com/deep-learning
5. 0.0271: https://info.arxiv.org/help/policies/privacy_policy.html
6. 0.0188: https://arxiv.org/
7. 0.0080: https://arxiv.org/list/cond-mat/new
8. 0.0080: https://arxiv.org/list/cond-mat/recent
9. 0.0079: https://arxiv.org/list/astro-ph/new
10. 0.0079: https://arxiv.org/list/astro-ph/recent

Query: 'neural networks' (HITS authority ranking)
1. 0.3280: https://github.com/google-deepmind
2. 0.0411: https://cloud.google.com/vertex-ai
3. 0.0000: https://arxiv.org/
4. 0.0000: https://info.arxiv.org/help/policies/privacy_policy.html
5. 0.0000: https://arxiv.org
6. 0.0000: https://www.cornell.edu/
7. 0.0000: https://arxiv.org/list/astro-ph/new
8. 0.0000: https://arxiv.org/list/astro-ph/recent
9. 0.0000: https://arxiv.org/list/cond-mat/new
10. 0.0000: https://arxiv.org/