In [1]:
!pip install beautifulsoup4 flask nest-asyncio pyngrok

Collecting pyngrok
  Downloading pyngrok-7.4.1-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.4.1-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.4.1


In [2]:
import os

# Make directory (you can change the name if you want)
site_dir = '/content/static_site/'
os.makedirs(site_dir, exist_ok=True)

# Create sample HTML files
html1 = """<html><head><title>Python Tutorial</title></head>
<body><h1>Welcome to Python</h1><p>Python is a programming language.</p></body></html>"""

html2 = """<html><head><title>Search Engine</title></head>
<body><h1>Building Search</h1><p>This is a simple search engine demo.</p></body></html>"""

with open(os.path.join(site_dir, 'page1.html'), 'w') as f:
    f.write(html1)
with open(os.path.join(site_dir, 'page2.html'), 'w') as f:
    f.write(html2)

print("Sample HTML files created in:", site_dir)
print("Contents:", os.listdir(site_dir))

Sample HTML files created in: /content/static_site/
Contents: ['page1.html', 'page2.html']


In [3]:
import re
from bs4 import BeautifulSoup

STOPWORDS = {'the', 'is', 'and', 'a', 'to', 'of', 'it'}

def tokenize(text):
    tokens = re.findall(r'\b\w+\b', text.lower())
    return [t for t in tokens if t not in STOPWORDS]

def extract_from_html(path):
    with open(path, encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')
    title = soup.title.string if soup.title else ''
    body = soup.get_text(separator=' ')
    tokens = tokenize(title + ' ' + body)
    return {'title': title, 'tokens': tokens, 'path': path}

# Process all HTML files
docs = {}
for i, fname in enumerate(sorted(os.listdir('static_site'))):
    if fname.endswith('.html'):
        info = extract_from_html(os.path.join('static_site', fname))
        docs[f'doc_{i}'] = info

# Show what was extracted
for doc_id, info in docs.items():
    print(f"{doc_id}: title='{info['title']}', tokens={info['tokens']}")

doc_0: title='Python Tutorial', tokens=['python', 'tutorial', 'python', 'tutorial', 'welcome', 'python', 'python', 'programming', 'language']
doc_1: title='Search Engine', tokens=['search', 'engine', 'search', 'engine', 'building', 'search', 'this', 'simple', 'search', 'engine', 'demo']


In [4]:
from collections import defaultdict
inv_index = defaultdict(lambda: defaultdict(int)) # term -> doc_id -> freq

for doc_id, info in docs.items():
    for term in info['tokens']:
        inv_index[term][doc_id] += 1

# Convert to normal dict for easier view
inv_index = {term: dict(postings) for term, postings in inv_index.items()}
print("Inverted Index:")
for term, postings in inv_index.items():
    print(f" {term}: {postings}")

Inverted Index:
 python: {'doc_0': 4}
 tutorial: {'doc_0': 2}
 welcome: {'doc_0': 1}
 programming: {'doc_0': 1}
 language: {'doc_0': 1}
 search: {'doc_1': 4}
 engine: {'doc_1': 3}
 building: {'doc_1': 1}
 this: {'doc_1': 1}
 simple: {'doc_1': 1}
 demo: {'doc_1': 1}


In [5]:
import math
from collections import defaultdict

N = len(docs) # total docs

# Precompute IDF
idf = {}
for term, postings in inv_index.items():
    df = len(postings)
    idf[term] = math.log((N + 1) / (df + 1)) + 1 # smoothing

def search(query, top_k=3):
    q_terms = [t for t in re.findall(r'\b\w+\b', query.lower()) if t not in STOPWORDS]
    scores = defaultdict(float)
    for term in q_terms:
        if term in inv_index:
            for doc_id, tf in inv_index[term].items():
                tf_weight = 1 + math.log(tf) # log TF
                scores[doc_id] += tf_weight * idf.get(term, 0)
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    results = []
    for doc_id, score in ranked[:top_k]:
        results.append({
            'doc_id': doc_id,
            'title': docs[doc_id]['title'],
            'score': round(score, 3),
            'path': docs[doc_id]['path']
        })
    return results

# Try queries
print("Query: 'python'")
print(search("python"))
print("Query: 'search engine'")
print(search("search engine"))

Query: 'python'
[{'doc_id': 'doc_0', 'title': 'Python Tutorial', 'score': 3.354, 'path': 'static_site/page1.html'}]
Query: 'search engine'
[{'doc_id': 'doc_1', 'title': 'Search Engine', 'score': 6.303, 'path': 'static_site/page2.html'}]


In [6]:
!pip install gradio



In [7]:
import gradio as gr
import re

# reuse existing `search` function, `docs`, and `inv_index` from earlier cells

def format_results(query):
    results = search(query, top_k=5)
    if not results:
        return f"No results for '{query}'"
    out = []
    q_terms = [t for t in re.findall(r'\b\w+\b', query.lower()) if t not in STOPWORDS]
    for r in results:
        # simple highlighted snippet from file content
        snippet = ""
        try:
            with open(r['path'], encoding='utf-8', errors='ignore') as f:
                text = f.read().replace('\n', ' ')
                for term in set(q_terms):
                    pattern = re.compile(rf'(?i)\b({re.escape(term)})\b')
                    text = pattern.sub(r'**\1**', text)
                snippet = text[:250] + '...'
        except:
            snippet = ''
        out.append({
            "Title": r['title'] or r['doc_id'],
            "Score": r['score'],
            "Snippet": snippet,
            "Path": r['path']
        })
    # Build a readable string
    display = ""
    for item in out:
        display += f"**{item['Title']}** (score: {item['Score']})\n"
        display += f"{item['Snippet']}\n"
        display += f"Path: {item['Path']}\n\n"
    return display

with gr.Blocks() as demo:
    gr.Markdown("## Simple Static-Site Search Engine")
    query_in = gr.Textbox(label="Query", placeholder="Type something like 'python' or 'search engine'")
    output = gr.Markdown()
    query_in.submit(lambda q: format_results(q), inputs=query_in, outputs=output)
    # optional button
    search_btn = gr.Button("Search")
    search_btn.click(lambda q: format_results(q), inputs=query_in, outputs=output)

demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2fe2afa063fbf37454.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [8]:
import requests
from bs4 import BeautifulSoup
import time
import re
from urllib.parse import urljoin, urlparse
from collections import deque

# Simple breadth-first crawler with depth limit and politeness
def crawl_site(seed_url, max_pages=20, delay=1.0):
    visited = set()
    docs = {}
    queue = deque([seed_url])
    count = 0

    domain = urlparse(seed_url).netloc

    while queue and count < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        try:
            resp = requests.get(url, timeout=5, headers={"User-Agent": "SimpleSearchBot/1.0"})
            if resp.status_code != 200 or 'text/html' not in resp.headers.get('Content-Type',''):
                continue
            soup = BeautifulSoup(resp.text, 'html.parser')
            title = soup.title.string.strip() if soup.title and soup.title.string else ''
            body = soup.get_text(separator=' ')
            tokens = re.findall(r'\b\w+\b', (title + ' ' + body).lower())
            # basic stopword filtering
            STOPWORDS = {'the','and','is','in','to','of','a','an'}
            tokens = [t for t in tokens if t not in STOPWORDS]

            doc_id = f'doc_live_{count}'
            docs[doc_id] = {
                'title': title,
                'tokens': tokens,
                'url': url
            }
            count += 1
            visited.add(url)

            # extract same-domain links
            for a in soup.find_all('a', href=True):
                href = urljoin(url, a['href'])
                parsed = urlparse(href)
                if parsed.netloc == domain:
                    normalized = parsed.scheme + '://' + parsed.netloc + parsed.path
                    if normalized not in visited:
                        queue.append(normalized)
        except Exception as e:
            # skip failures
            pass
        time.sleep(delay) # politeness

    return docs

# Example usage: crawl first 10 pages of example.com (replace with your target)
live_docs = crawl_site("https://www.kkwagh.edu.in", max_pages=10, delay=1.0)
print(f"Crawled {len(live_docs)} pages.")
for k,v in live_docs.items():
    print(k, v['title'], v['url'])

Crawled 10 pages.
doc_live_0 KK Wagh https://www.kkwagh.edu.in
doc_live_1 KK Wagh https://www.kkwagh.edu.in/
doc_live_2 Distinguish Faculty https://www.kkwagh.edu.in/faculty
doc_live_3 KK Wagh https://www.kkwagh.edu.in/cpage.aspx
doc_live_4 Contact Us https://www.kkwagh.edu.in/contact-us
doc_live_5 Tender https://www.kkwagh.edu.in/tenders
doc_live_6 Careers https://www.kkwagh.edu.in/careers
doc_live_7 Admissions https://www.kkwagh.edu.in/admissions
doc_live_8 Overview https://www.kkwagh.edu.in/overview
doc_live_9 Our Legacy https://www.kkwagh.edu.in/our-legacy


In [9]:
import math
import re
from collections import defaultdict, Counter

# Use live_docs from your crawl
docs = live_docs # each value has 'title', 'tokens', 'url'

# Build inverted index
inv_index = defaultdict(lambda: defaultdict(int))
for doc_id, info in docs.items():
    for term in info['tokens']:
        inv_index[term][doc_id] += 1
# normalize to regular dict
inv_index = {term: dict(postings) for term, postings in inv_index.items()}

# Compute IDF
N = len(docs)
idf = {}
for term, postings in inv_index.items():
    df = len(postings)
    idf[term] = math.log((N + 1) / (df + 1)) + 1 # smoothed

STOPWORDS = {'the','and','is','in','to','of','a','an'}

def search_live(query, top_k=5):
    q_terms = [t for t in re.findall(r'\b\w+\b', query.lower()) if t not in STOPWORDS]
    scores = defaultdict(float)
    for term in q_terms:
        if term not in inv_index:
            continue
        for doc_id, tf in inv_index[term].items():
            tf_weight = 1 + math.log(tf)
            scores[doc_id] += tf_weight * idf.get(term, 0)
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    results = []
    for doc_id, score in ranked[:top_k]:
        info = docs[doc_id]
        # simple snippet: first occurrence context
        snippet = ""
        try:
            # we don't have raw HTML stored here; rebuild minimal snippet from tokens
            snippet = " ".join(info['tokens'][:30]) + "..."
        except:
            snippet = ""
        results.append({
            'doc_id': doc_id,
            'title': info.get('title') or doc_id,
            'score': round(score, 3),
            'url': info.get('url'),
            'snippet': snippet
        })
    return results

# Demo queries
print("Search for 'admissions':", search_live("admissions"))
print("Search for 'faculty':", search_live("faculty"))
print("Search for 'contact':", search_live("contact"))


Search for 'admissions': [{'doc_id': 'doc_live_7', 'title': 'Admissions', 'score': 2.946, 'url': 'https://www.kkwagh.edu.in/admissions', 'snippet': 'admissions admissions distinguished faculty alumni contact us tender careers admissions erp login home about k k wagh overview our legacy milestones our leadership chairman s profile chairman s message board...'}, {'doc_id': 'doc_live_0', 'title': 'KK Wagh', 'score': 2.609, 'url': 'https://www.kkwagh.edu.in', 'snippet': 'kk wagh kk wagh distinguished faculty alumni contact us tender careers admissions erp login home about k k wagh overview our legacy milestones our leadership chairman s profile chairman s...'}, {'doc_id': 'doc_live_1', 'title': 'KK Wagh', 'score': 2.609, 'url': 'https://www.kkwagh.edu.in/', 'snippet': 'kk wagh kk wagh distinguished faculty alumni contact us tender careers admissions erp login home about k k wagh overview our legacy milestones our leadership chairman s profile chairman s...'}, {'doc_id': 'doc_live_2', 'titl

In [10]:
import gradio as gr
import re
import math
from collections import defaultdict

# Assume crawl + search_live + STOPWORDS already defined above

def gradio_search_live_highlight(query, top_k):
    results = search_live(query, top_k=top_k)
    q_terms = [t for t in re.findall(r'\b\w+\b', query.lower()) if t not in STOPWORDS]

    if not results:
        return f"<p>No results for '<mark>{query}</mark>'</p>"

    html = ""
    for r in results:
        # Highlight terms in title
        title_html = r['title']
        for term in q_terms:
            title_html = re.sub(
                rf"(?i)({re.escape(term)})",
                r"<mark>\1</mark>",
                title_html
            )

        html += f"<h3><a href='{r['url']}' target='_blank'>{title_html}</a></h3>"
        html += f"<p><strong>Score:</strong> {r['score']}</p>"

        # Highlight terms in snippet
        snippet = r['snippet']
        for term in q_terms:
            snippet = re.sub(
                rf"(?i)({re.escape(term)})",
                r"<mark>\1</mark>",
                snippet
            )
        html += f"<p>{snippet}</p><hr/>"

    return html

with gr.Blocks() as demo:
    gr.Markdown("## Live-Site Search Engine with Highlighted Terms")
    with gr.Row():
        query_in = gr.Textbox(label="Search Query", placeholder="e.g., admissions, faculty")
        topk = gr.Slider(1, 10, value=5, step=1, label="Top-K Results")
    output = gr.HTML() # <-- use HTML so <mark> works
    query_in.submit(gradio_search_live_highlight, inputs=[query_in, topk], outputs=output)
    gr.Button("Search").click(gradio_search_live_highlight, inputs=[query_in, topk], outputs=output)
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cd81bf3d0b11671cd7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


