In [None]:
import os
import scrapy
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse
import uuid

class Crawler(scrapy.Spider):
    """Spider that saves the raw body of every crawled page as an ``.html`` file.

    Starting from ``seed_url`` it follows in-page links on the seed's host until
    ``max_pages`` pages have been saved, going at most ``max_depth`` link hops
    away from the seed.
    """
    name = "web_crawler"

    def __init__(self, seed_url, max_pages, max_depth, save_dir=None, *args, **kwargs):
        """Configure the crawl.

        Args:
            seed_url: URL the crawl starts from; its host is the only allowed domain.
            max_pages: Maximum number of pages to save (coerced with ``int``).
            max_depth: Maximum number of link hops from the seed (coerced with
                ``int``). ``0`` means "no depth limit", mirroring Scrapy's
                ``DEPTH_LIMIT`` convention.
            save_dir: Output folder; defaults to ``'downloaded_pages'``.
        """
        super().__init__(*args, **kwargs)
        self.start_urls = [seed_url]
        parsed_seed = urlparse(seed_url)
        # allowed_domains expects bare host names; `netloc` would keep any
        # ":port" suffix, so prefer `hostname` and fall back to `netloc`.
        self.allowed_domains = [parsed_seed.hostname or parsed_seed.netloc]
        self.max_pages = int(max_pages)
        # Depth is enforced in parse(): Scrapy reads `custom_settings` from the
        # class before the spider is instantiated, so assigning
        # {'DEPTH_LIMIT': ...} to the instance here would have no effect.
        self.max_depth = int(max_depth)
        self.pages_crawled = 0
        self.save_dir = save_dir or 'downloaded_pages'

    def parse(self, response):
        """Save ``response`` to disk and yield follow-up requests for its links.

        Raises:
            CloseSpider: once ``max_pages`` pages have been saved, so the
                requests that are already scheduled are not downloaded needlessly.
        """
        # Local import keeps this class self-contained with respect to the
        # notebook's import cell.
        from scrapy.exceptions import CloseSpider

        # Responses that were already in flight when the limit was hit are dropped.
        if self.pages_crawled >= self.max_pages:
            return

        os.makedirs(self.save_dir, exist_ok=True)
        filename = f"{str(uuid.uuid4()).upper()}.html"
        filepath = os.path.join(self.save_dir, filename)
        with open(filepath, "wb") as f:
            f.write(response.body)
        print(f"Saved {filename} to folder with {response.url}")
        self.pages_crawled += 1

        if self.pages_crawled >= self.max_pages:
            raise CloseSpider("max_pages reached")

        # 'depth' is maintained in response.meta by Scrapy's DepthMiddleware
        # (0 for the seed page); default to 0 if it is missing.
        current_depth = response.meta.get("depth", 0)
        if self.max_depth and current_depth >= self.max_depth:
            return

        for href in response.css("a::attr(href)").getall():
            # Skip empty hrefs, e-mail links and same-page anchors.
            if href and not href.startswith(("mailto:", "#")):
                yield response.follow(href, self.parse)
                

# Launch the crawl: start at the seed URL and store the fetched pages in
# 'downloaded_pages' for the indexing and search cells below.
crawl_process = CrawlerProcess()
crawl_process.crawl(
    Crawler,
    seed_url='https://books.toscrape.com',
    max_pages=15,
    max_depth=3,
    save_dir='downloaded_pages',
)
crawl_process.start()


In [None]:
from bs4 import BeautifulSoup
import json, re, os

def build_inverted_index(directory: str, output_json: str):
    """Build a positional inverted index from saved HTML pages and write it as JSON.

    Every ``*.html`` file in ``directory`` is parsed with BeautifulSoup, its text
    is lower-cased and split into word tokens, and each token is mapped to a list
    of ``[doc_id, [positions, ...]]`` postings. ``doc_id`` is the file name
    without its extension; positions are 0-based token offsets in that document.

    Args:
        directory: Folder containing the downloaded ``.html`` files.
        output_json: Path of the JSON file to write.

    Returns:
        None. The index is written to ``output_json`` (one posting per line).
    """
    inverted_index = {}

    # Sorted so posting order, and therefore the output file, is reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.html'):
            continue
        doc_id = os.path.splitext(filename)[0]
        file_path = os.path.join(directory, filename)
        # Explicit UTF-8 instead of the platform default; undecodable bytes
        # are dropped rather than aborting the whole build.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            html = f.read()

        text = BeautifulSoup(html, 'lxml').get_text(' ', strip=True)
        tokens = re.findall(r"\b\w+\b", text.lower())

        # Collect this document's positions per token first, then add exactly
        # one posting per (token, document) pair to the global index.
        positions_by_token = {}
        for position, token in enumerate(tokens):
            positions_by_token.setdefault(token, []).append(position)
        for token, positions in positions_by_token.items():
            inverted_index.setdefault(token, []).append((doc_id, positions))

    items = list(inverted_index.items())
    last_index = len(items) - 1
    # Write to the path the caller asked for (not a hard-coded file name).
    with open(output_json, 'w', encoding='utf-8') as f:
        f.write('{\n')
        for i, (token, entries) in enumerate(items):
            # json.dumps guarantees a correctly quoted/escaped JSON key.
            f.write(f'  {json.dumps(token, ensure_ascii=False)}: [\n')
            f.write(',\n'.join(f'    {json.dumps(entry)}' for entry in entries))
            f.write('\n')
            f.write('  ],\n' if i < last_index else '  ]\n')
        f.write('}\n')

    print(f"inverted index saved to {output_json}")

# Index the crawled pages and persist the result next to the notebook.
build_inverted_index(directory='downloaded_pages', output_json='index.json')



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def build_text(folder_name):
    """Extract whitespace-normalised text from every HTML file in a folder.

    Args:
        folder_name: Folder containing ``.html`` files (extension matched
            case-insensitively); resolved against the current working directory.

    Returns:
        dict mapping each file's name without extension to its text, with
        ``<script>``/``<style>`` elements removed and runs of whitespace
        collapsed to single spaces.
    """
    documents = {}
    path = os.path.join(os.getcwd(), folder_name)
    # Sorted so the document order does not depend on the file system.
    for filename in sorted(os.listdir(path)):
        if not filename.lower().endswith('.html'):
            continue
        doc_id = os.path.splitext(filename)[0]
        file_path = os.path.join(path, filename)
        # Pages are saved as raw bytes and may not be valid UTF-8; drop
        # undecodable bytes (as build_inverted_index does) instead of raising
        # UnicodeDecodeError on the first such page.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'html.parser')
        # Remove script and style elements so their contents are not indexed as text.
        for tag in soup(['script', 'style']):
            tag.decompose()
        raw_text = soup.get_text(separator=' ')
        documents[doc_id] = ' '.join(raw_text.split())

    return documents

def search(query, documents):
    """Score every document against ``query`` with TF-IDF cosine similarity.

    Args:
        query: Free-text query string.
        documents: Mapping of document id to document text.

    Returns:
        dict with two keys:
            ``'tfidf'``: ``{doc_id: {term: weight}}`` containing only the terms
                with a positive TF-IDF weight in that document.
            ``'cosine similarity'``: ``{doc_id: score}`` between the query
                vector and each document vector.
        Both inner mappings are empty when ``documents`` is empty.
    """
    # Nothing to fit the vectorizer on: report "no results" instead of
    # handing an empty corpus to the vectorizer.
    if not documents:
        return {'tfidf': {}, 'cosine similarity': {}}

    doc_ids = list(documents.keys())
    corpus = [documents[doc_id] for doc_id in doc_ids]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # The query is projected into the vocabulary learned from the corpus.
    query_vector = vectorizer.transform([query])

    cosine_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    feature_names = vectorizer.get_feature_names_out()
    tfidf = {}
    for i, doc_id in enumerate(doc_ids):
        row_vector = tfidf_matrix[i].toarray()[0]
        # Visit only the non-zero columns (ascending index order) rather than
        # looping over the whole vocabulary in Python for every document.
        tfidf[doc_id] = {feature_names[j]: row_vector[j]
                         for j in row_vector.nonzero()[0] if row_vector[j] > 0}
    return {'tfidf': tfidf, 'cosine similarity': dict(zip(doc_ids, cosine_scores))}

# Extract the text of every saved page, then score each page against the query.
documents = build_text('downloaded_pages')
query = 'I love science fiction books'
results = search(query, documents)
tfidf, cosine_sim = results['tfidf'], results['cosine similarity']

# Report the pages from most to least similar.
print(f'Cosine Similarity based on query: "{query}"\n')
ranked = sorted(cosine_sim.items(), key=lambda pair: pair[1], reverse=True)
for doc, score in ranked:
    print(f'{doc}: {score: .4f}')

