$\textbf{\underline{Part 1}}$

In [None]:
import os
import scrapy
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse
import uuid

class Crawler(scrapy.Spider):
    name = "web_crawler"
    
    def __init__(self, seed_url, max_pages, max_depth, save_dir=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [seed_url]
        self.allowed_domains = [urlparse(seed_url).netloc]
        self.max_pages = int(max_pages)
        self.pages_crawled = 0
        self.save_dir = save_dir or 'downloaded_pages'
        self.custom_settings = {
            'DEPTH_LIMIT': int(max_depth),
        }

    def parse(self, response):
        if self.pages_crawled >= self.max_pages:
            return

        os.makedirs(self.save_dir, exist_ok=True)
        filename = f"{str(uuid.uuid4()).upper()}.html"
        filepath = os.path.join(self.save_dir, filename)
        with open(filepath, "wb") as f:
            f.write(response.body)
        print(f"Saved {filename} to folder with {response.url}")
        self.pages_crawled += 1

        if self.pages_crawled >= self.max_pages:
            return
        
        for href in response.css("a::attr(href)").getall():
            if href and not href.startswith("mailto:") and not href.startswith("#"):
                yield response.follow(href, self.parse)
                

crawl_process = CrawlerProcess()
crawl_process.crawl(Crawler, seed_url='https://books.toscrape.com', max_pages=15, max_depth=3, save_dir='downloaded_pages')
crawl_process.start()


$\textbf{\underline{Part 2.1}}$

In [5]:
from bs4 import BeautifulSoup
import json, re, os

def build_inverted_index(directory: str, output_json: str):
    inverted_index = {}
    documents = []
    document_ids = []

    for filename in os.listdir(directory):
        if not filename.endswith('.html'):
            continue
        doc_id = os.path.splitext(filename)[0]
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', errors='ignore') as f:
            html = f.read()

        soup = BeautifulSoup(html, 'lxml')
        text = soup.get_text(' ', strip=True)

        documents.append(text)
        document_ids.append(doc_id)

        tokens = re.findall(r"\b\w+\b", text.lower())
        for position, token in enumerate(tokens):
            if token not in inverted_index:
                inverted_index[token] = []
            if inverted_index[token] and inverted_index[token][-1][0] == doc_id:
                inverted_index[token][-1][1].append(position)
            else:
                inverted_index[token].append((doc_id, [position]))

    with open('index.json', 'w', encoding='utf-8') as f:
        f.write('index = {\n')
        items = list(inverted_index.items())
        for i, (token, entries) in enumerate(items):
            f.write(f'  "{token}": [\n')
            for j, (doc_id, positions) in enumerate(entries):
                doc_str = f"{doc_id}"
                pos_str = "[" + ", ".join(map(str, positions)) + "]"
                line = f"        ({doc_str}, {pos_str})"
                if j < len(entries) - 1:
                    line += ","
                f.write(line + "\n")

            if i < len(items) - 1:
                f.write('  ],\n')
            else:
                f.write('  ]\n')
        f.write('}\n')

    print(f"inverted index saved to {output_json}")

build_inverted_index('downloaded_pages', 'index.json')



inverted index saved to index.json


$\textbf{\underline{Part 2.2}}$

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def build_text(folder_name):
    documents = {}
    path = os.path.join(os.getcwd(), folder_name)
    for filename in os.listdir(path):
        if not filename.lower().endswith('.html'):
            continue
        doc_id = os.path.splitext(filename)[0]
        file_path = os.path.join(path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup(['script', 'style']):
            tag.decompose()
        text = soup.get_text(separator=' ')
        text = ' '.join(text.split())
        documents[doc_id] = text

    return documents

def search(query, documents):
    doc_ids = list(documents.keys())
    corpus = [documents[doc_id] for doc_id in doc_ids]

    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    tfidf_matrix = vectorizer.fit_transform(corpus)
    query_vector = vectorizer.transform([query])
    cosine_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    results = []

    for idx, doc_id in enumerate(doc_ids):
        row = tfidf_matrix[idx].toarray()[0]

        tfidf_terms = {}
        for token in query.lower().split():
            if token in vectorizer.vocabulary_:
                term_index = vectorizer.vocabulary_[token]
                tfidf_weight = float(row[term_index])
            else:
                tfidf_weight = 0.0
            tfidf_terms[token] = tfidf_weight
        term_str = ", ".join([f"{term}: {value:.4f}" for term, value in tfidf_terms.items()])     
        results.append({
            'doc_id': doc_id,
            'cosine': float(cosine_scores[idx]),
            'tfidf_str': term_str
        })

    results.sort(key=lambda x: x["cosine"], reverse=True)

    return results


query = 'love science fiction, math, and true crime books'
documents = build_text('downloaded_pages')
result = search(query, documents)

for r in result[:6]:
    print(f'Document {r['doc_id']}: ')
    print(f'   Cosine Similarity Score: {r['cosine']:.4f}')
    print(f'   TF-IDF Weights: {r['tfidf_str']}\n')



Document 69F2AF4B-666C-47A0-AE3E-E7199D484D9C: 
   Cosine Similarity Score: 0.5537
   TF-IDF Weights: love: 0.0547, science: 0.1409, fiction,: 0.0000, math,: 0.0000, and: 0.2189, true: 0.0000, crime: 0.3009, books: 0.2189

Document 046DD200-AA15-44BC-8C14-BDED598B0CEF: 
   Cosine Similarity Score: 0.4464
   TF-IDF Weights: love: 0.0550, science: 0.1417, fiction,: 0.0000, math,: 0.0000, and: 0.2201, true: 0.0000, crime: 0.0756, books: 0.2201

Document CE075531-3A01-4F81-BCF9-9B4C98E3DD67: 
   Cosine Similarity Score: 0.4431
   TF-IDF Weights: love: 0.0546, science: 0.1407, fiction,: 0.0000, math,: 0.0000, and: 0.2185, true: 0.0000, crime: 0.0751, books: 0.2185

Document F082154D-2E0D-4854-A66A-AF50AD4B108C: 
   Cosine Similarity Score: 0.4378
   TF-IDF Weights: love: 0.0540, science: 0.1390, fiction,: 0.0000, math,: 0.0000, and: 0.2159, true: 0.0000, crime: 0.0742, books: 0.2159

Document DB449E17-F4AC-47AB-B714-33C0541FDFEB: 
   Cosine Similarity Score: 0.4354
   TF-IDF Weights: love: 

$\textbf{\underline{Part 3}}$

In [4]:
import threading
from flask import Flask, request, send_file, jsonify
import pandas as pd
import io, os

import requests

top_k = 4
documents = build_text('downloaded_pages')

app = Flask(__name__)


@app.route("/process", methods=["GET"])
def processor():
    try:
        filename = request.args.get("file")
        if not filename:
            return jsonify({"error": "Missing 'file' parameter"}), 400

        csv_path = filename
        if not os.path.exists(csv_path):
            return jsonify({"error": f"File not found: {csv_path}"}), 404

        try:
            df = pd.read_csv(csv_path, dtype=str)
        except Exception as error:
            return jsonify({"error": f"CSV read error: {str(error)}"}), 400
        required = {"query_id", "query_text"}
        if not required.issubset(df.columns):
            return jsonify({"error": "CSV must contain query_id and query_text"}), 400

        output_rows = []

        for _, row in df.iterrows():

            qid = row.get("query_id", "").strip()
            qtext = row.get("query_text", "").strip()
            if not qid:
                return jsonify({"error": "query_id cannot be empty"}), 400
            if not qtext:
                return jsonify({"error": f"Empty query_text for query_id {qid}"}), 400
            try:
                ranked = search(qtext, documents)
            except Exception as e:
                return jsonify({
                    "error": f"Search failed for query_id {qid}: {str(e)}"
                }), 500
            for rank, item in enumerate(ranked[:top_k], start=1):
                output_rows.append({
                    "query_id": qid,
                    "rank": rank,
                    "document_id": item["doc_id"]
                })
        out_df = pd.DataFrame(output_rows)

        out_buffer = io.StringIO()
        out_df.to_csv(out_buffer, index=False)
        out_buffer.seek(0)

        output_name = filename.replace(".csv", "_results.csv")

        return send_file(
            io.BytesIO(out_buffer.getvalue().encode("utf-8")),
            mimetype="text/csv",
            as_attachment=True,
            download_name=output_name
        )

    except Exception as fatal:
        return jsonify({"error": f"Unexpected server error: {str(fatal)}"}), 500


def run_app():
    app.run(port=5000)
thread = threading.Thread(target=run_app)
thread.start()

url = 'http://127.0.0.1:5000/process?file=queries.csv'
response = requests.get(url)

if not response.ok:
    print('Error: ', response.text)
else:
    with open("results.csv", "wb") as f:
        f.write(response.content)

    print("Output saved to results.csv")

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [01/Dec/2025 11:52:02] "GET /process?file=queries.csv HTTP/1.1" 200 -


Output saved to results.csv


$\textbf{\underline{Abstract}}$

$\text{This project includes three main parts: a web crawler, an inverted index builder and searching algorithm, and a processing algorithm for Top-K ranked retrieval. The development process}$
$\text{involved an incremental approach, where each part of the project was developed and completed before moving on to the next part. Additionally, initial testing was also done before moving}$
$\text{to other parts of the project to ensure a previously developed function works correctly. Once all parts were completed, more internal testing (not reflected on github repository) was done}$
$\text{to ensure all parts worked correctly, especially with large inputs. Throughout the development process, dedicated days to debugging and understanding a part of the project were allocated}$
$\text{to ensure a more smoother and efficient development.}$

$\text{If more time were to be dedicaated to this project, the next steps in development would be adding query spelling-correction and suggestions, vector embedding representation, and distributed}$
$\text{crawling. This would improve search accuracy by correcting typos and offering alternative terms, enable retrieval of contextually similar documents, and enhance scalability through large-scale,}$ 
$\text{continuous updates to the web crawler.}$

$\textbf{\underline{Overview}}$

$\text{The solution outline is as follows:}$

$\text{• Web Crawler - Used a scrapy crawler with CrawlerProcess; starts at a base URL and recursively follows links up to a specified depth and amount of .html files need} \\ 
\text{}\\
\text{• Indexer - Used BeautifulSoup to extract the text from the folder, tokenized each term within the folder, and built an positional inverted index as a .json file. The search} \\
\text{algorithm calculates the cosine similarity and the TF-IDF weights of each term in comparison to the corpus for a specified query. } \\
\text{} \\
\text{• Processor - A Flask based processor with Top-K ranked retrieval that validates query text and query IDs within the queries.csv file and outputs a results.csv file containing} \\ 
\text{query ID, rank, and document ID.}$ 

$\text{Based on this solution outline, the following literatures were used to gain information about possible implementation methods:}$ 

$\text{• [Scrapy Documentation](https://docs.scrapy.org/en/latest/topics/api.html): Used to learn about a scrapy crawler through CrawlerProcess. Provided useful} \\
\text{information on relevant custom settings and functions that can be used tio create a web crawler.} \\
\text{} \\
\text{• [Flask Documentation](https://flask.palletsprojects.com/en/stable/api/): Used to learn about creating a Flask application and setting up a GET request for the project's processor} \\
\text{algorithm. It also provided information on using send file to save a csv file into a user's directory.}
\text{} \\
\text{• [Sklearn Documentation](https://scikit-learn.org/stable/api/sklearn.html): Used to learn more about calculating TFIDF weights and cosine similarity scores.}$

$\text{The system will use Scrapy based web crawler to gather html documents and store them in a folder. This folder will later be retrieved by a function that builds an} \\$
$\text{posotional inverted index and stores the index in a json file. A query will be given to a search function that calculate its cosine similarity to each document and } \\ $
$\text{TFIDF weights, relative to the corpus. These calculation will then be used to display Top-K ranked retrieved documents after each query ID and query text is processed} \\ $
$\text{and error checked.}$

$\textbf{\underline{Design}}$

$\text{Based on a specified depth and max pages, the system will be able to scrap through a specific website and gather a set of html documents into a folder. The greater the depth specified, the}$
$\text{deeper the crawler will go through each link found in each html document. Based on these results, the folder of html files is used to create a positional inverted index with document IDs,}$
$\text{terms and positions in which each term exists in a document. Furthermore, the folder of html files is converted into a corpus with a python dictionary and used to build a search algorithm. This}$
$\text{search algorithm accepts a query and calculates its cosine similarity and TFIDF weights of each term within the query, relative to each document. These results are used within a processing algorithm}$
$\text{that provides error checking and query validation, and Top K ranked retrieval. This processing function uses the search algoirhtm on a queries.csv file, which contains query text and query IDs and}$
$\text{provides the Top K ranked retrieval results through a results.csv file containing document IDs, query IDs, and ranks for each document.}$

$\text{The following diagram was created to showcase the interactions between each project component:}$
![Interactions Diagram](images/interactions.png)