In [None]:
import os
import scrapy
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse
import uuid

class Crawler(scrapy.Spider):
    """Spider that saves the raw body of each visited page to disk.

    Starting from ``seed_url`` it follows ``<a href>`` links on the same host
    until ``max_pages`` pages have been saved, never following links from
    pages that sit ``max_depth`` hops away from the seed.

    Args:
        seed_url: Absolute URL where the crawl starts.
        max_pages: Maximum number of pages to save (coerced with ``int``).
        max_depth: Maximum link depth to follow (coerced with ``int``).
            ``0`` means "no limit", mirroring Scrapy's ``DEPTH_LIMIT``.
        save_dir: Directory for the saved pages; defaults to
            ``'downloaded_pages'``.
    """

    name = "web_crawler"

    def __init__(self, seed_url, max_pages, max_depth, save_dir=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [seed_url]
        # Use the bare hostname: ``netloc`` may carry a port or credentials,
        # which are not valid ``allowed_domains`` entries.
        parsed_seed = urlparse(seed_url)
        self.allowed_domains = [parsed_seed.hostname or parsed_seed.netloc]
        self.max_pages = int(max_pages)
        # Scrapy reads ``custom_settings`` from the class before the spider is
        # instantiated, so assigning it here never applied DEPTH_LIMIT. The
        # depth limit is enforced explicitly in ``parse`` instead.
        self.max_depth = int(max_depth)
        self.pages_crawled = 0
        self.save_dir = save_dir or 'downloaded_pages'

    def parse(self, response):
        """Save ``response`` to disk and yield follow-up requests for its links.

        Stops saving and following once ``max_pages`` pages have been written,
        and stops following links once the response is at ``max_depth``.
        """
        # Requests scheduled before the page limit was hit still arrive here.
        if self.pages_crawled >= self.max_pages:
            return

        os.makedirs(self.save_dir, exist_ok=True)
        filename = f"{str(uuid.uuid4()).upper()}.html"
        filepath = os.path.join(self.save_dir, filename)
        with open(filepath, "wb") as f:
            f.write(response.body)
        print(f"Saved {filename} to folder with {response.url}")
        self.pages_crawled += 1

        if self.pages_crawled >= self.max_pages:
            return

        # The depth middleware records each response's depth in ``meta``;
        # links found on a page at the limit would exceed it, so skip them.
        if self.max_depth and response.meta.get("depth", 0) >= self.max_depth:
            return

        for href in response.css("a::attr(href)").getall():
            if href and not href.startswith(("mailto:", "#")):
                yield response.follow(href, self.parse)
                

# Launch the crawl from inside the notebook: seed the spider with the books
# demo site, cap it at 15 saved pages / depth 3, and store pages in
# 'downloaded_pages'.
# NOTE(review): start() presumably blocks until the crawl finishes, and the
# underlying reactor may not be restartable in the same kernel — confirm
# before re-running this cell without a kernel restart.
crawl_process = CrawlerProcess()
crawl_process.crawl(Crawler, seed_url='https://books.toscrape.com', max_pages=15, max_depth=3, save_dir='downloaded_pages')
crawl_process.start()


In [None]:
from bs4 import BeautifulSoup
import json, re, os

def build_inverted_index(directory: str, output_json: str):
    """Build a positional inverted index over the HTML files in ``directory``.

    Every ``*.html`` file is parsed, its visible text is lower-cased and split
    into ``\\w+`` tokens, and for each token the list of token positions per
    document is recorded. The document id is the file name without extension.

    The index is written to ``output_json`` as valid JSON of the form
    ``{"token": [["DOC_ID", [pos, pos, ...]], ...], ...}`` with one posting
    per line.

    Args:
        directory: Folder containing the downloaded ``.html`` pages.
        output_json: Path of the JSON file to write.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be
            read/written.
    """
    inverted_index = {}

    # Sort so the index is built in a reproducible order across runs/platforms.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.html'):
            continue
        doc_id = os.path.splitext(filename)[0]
        file_path = os.path.join(directory, filename)
        # Explicit encoding: the platform default may not be UTF-8.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            html = f.read()

        soup = BeautifulSoup(html, 'lxml')
        text = soup.get_text(' ', strip=True)

        # Collect positions per token for this document first, then append one
        # posting per token; dicts keep first-occurrence order.
        doc_positions = {}
        for position, token in enumerate(re.findall(r"\b\w+\b", text.lower())):
            doc_positions.setdefault(token, []).append(position)
        for token, positions in doc_positions.items():
            inverted_index.setdefault(token, []).append((doc_id, positions))

    # Write to the requested path (previously hard-coded to 'index.json').
    with open(output_json, 'w', encoding='utf-8') as f:
        f.write('{\n')
        items = list(inverted_index.items())
        for i, (token, entries) in enumerate(items):
            # json.dumps quotes/escapes keys and doc ids so the file parses.
            f.write(f'  {json.dumps(token, ensure_ascii=False)}: [\n')
            for j, (doc_id, positions) in enumerate(entries):
                line = "        " + json.dumps([doc_id, positions], ensure_ascii=False)
                if j < len(entries) - 1:
                    line += ","
                f.write(line + "\n")

            if i < len(items) - 1:
                f.write('  ],\n')
            else:
                f.write('  ]\n')
        f.write('}\n')

    print(f"inverted index saved to {output_json}")

# Index the pages stored in 'downloaded_pages', passing 'index.json' as the output path.
build_inverted_index('downloaded_pages', 'index.json')



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def build_text(folder_name):
    """Load the visible text of every HTML file in ``folder_name``.

    ``<script>`` and ``<style>`` elements are removed before text extraction
    and runs of whitespace are collapsed to single spaces.

    Args:
        folder_name: Folder to read, resolved relative to the current working
            directory.

    Returns:
        dict mapping document id (file name without extension) to its text,
        in sorted file-name order.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be read.
    """
    documents = {}
    path = os.path.join(os.getcwd(), folder_name)
    # Sort so document order (and therefore tie order in ranking) is stable.
    for filename in sorted(os.listdir(path)):
        if not filename.lower().endswith('.html'):
            continue
        doc_id = os.path.splitext(filename)[0]
        file_path = os.path.join(path, filename)
        # Pages are saved as raw bytes and may not be valid UTF-8; drop
        # undecodable bytes instead of aborting the whole load.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup(['script', 'style']):
            tag.decompose()
        text = soup.get_text(separator=' ')
        text = ' '.join(text.split())
        documents[doc_id] = text

    return documents

def search(query, documents):
    """Rank ``documents`` against ``query`` by TF-IDF cosine similarity.

    Args:
        query: Free-text query string.
        documents: Mapping of document id to document text.

    Returns:
        List of dicts sorted by descending cosine score, each with keys
        ``"doc_id"``, ``"cosine"`` (float) and ``"tfidf_str"`` (a
        ``"term: weight"`` listing of the document's TF-IDF weight for every
        query term). Returns an empty list when ``documents`` is empty.

    Raises:
        ValueError: Propagated from the vectorizer if the corpus yields an
            empty vocabulary (e.g. every document is blank).
    """
    # Nothing to rank; avoids fitting a vectorizer on an empty corpus.
    if not documents:
        return []

    doc_ids = list(documents.keys())
    corpus = [documents[doc_id] for doc_id in doc_ids]

    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    tfidf_matrix = vectorizer.fit_transform(corpus)

    query_vector = vectorizer.transform([query])

    cosine_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Tokenize the query exactly like the vectorizer does, so punctuation
    # ("fiction," / "math,") does not prevent vocabulary matches.
    analyzer = vectorizer.build_analyzer()
    query_terms = list(dict.fromkeys(analyzer(query)))
    # Resolve each term's column once; None marks out-of-vocabulary terms.
    term_columns = {term: vectorizer.vocabulary_.get(term) for term in query_terms}

    results = []

    for idx, doc_id in enumerate(doc_ids):
        # Index the sparse matrix directly instead of densifying each row.
        tfidf_terms = {
            term: (float(tfidf_matrix[idx, column]) if column is not None else 0.0)
            for term, column in term_columns.items()
        }
        term_str = ", ".join([f"{term}: {value:.4f}" for term, value in tfidf_terms.items()])
        results.append({
            "doc_id": doc_id,
            "cosine": float(cosine_scores[idx]),
            "tfidf_str": term_str
        })

    results.sort(key=lambda x: x["cosine"], reverse=True)

    return results


# Example run: rank the downloaded pages against a free-text query.
query = 'love science fiction, math, and true crime books'
documents = build_text('downloaded_pages')
result = search(query, documents)

# Show the six highest-scoring documents with their per-term weights string.
for r in result[:6]:
    print(f"Document {r['doc_id']}: ")
    print(f"   Cosine Similarity Score: {r['cosine']:.4f}")
    print(f"   TF-IDF Weights: {r['tfidf_str']}\n")

