In [3]:
import json
import os

from flask import Flask, request, jsonify
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import QueryParser

# Flask application object; the route handlers in this file register on it.
app = Flask(__name__)

# On-disk location of the Whoosh index.
INDEX_DIR = "whoosh_index"

# Whoosh schema: every field is stored, so search hits can return its value.
schema = Schema(
    source=ID(stored=True),
    topic=TEXT(stored=True),
    content=TEXT(stored=True),
    title=TEXT(stored=True),
)

# Load aggregated data.
# The file is read as UTF-8 explicitly so decoding does not depend on the
# platform's default locale encoding. A missing file is not an error: the
# service then starts with no documents.
DATA_FILE = "aggregated_data.json"
try:
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        all_data = json.load(f)
except FileNotFoundError:
    all_data = []

# Initialize Whoosh index.
# Reuse the index in INDEX_DIR when one is present; otherwise build it from
# all_data. The check is for an actual index, not merely the directory, so a
# directory that exists but holds no index is populated instead of making
# open_dir() fail.
# NOTE: an existing index is reused as-is; changes to the data file are not
# picked up until INDEX_DIR is removed.
if exists_in(INDEX_DIR):
    ix = open_dir(INDEX_DIR)
else:
    os.makedirs(INDEX_DIR, exist_ok=True)
    ix = create_in(INDEX_DIR, schema)
    # The writer context manager commits on success and cancels on error,
    # so a failure while adding documents does not leave the writer open.
    with ix.writer() as writer:
        for doc in all_data:
            writer.add_document(
                title=doc.get("title", ""),
                content=doc.get("content", ""),
                topic=doc.get("topic", ""),
                source=doc.get("source", ""),
            )

@app.route("/search", methods=["GET"])
def search():
    """Run a query against the "content" field of the index.

    The query text is taken from the ``q`` request argument. A missing or
    empty query yields a 400 response with an error object; otherwise the
    response is a JSON list of at most 10 hits, each carrying the stored
    title, topic and source plus a highlighted snippet of the content.
    """
    user_query = request.args.get("q", "")
    if not user_query:
        return jsonify({"error": "No query provided"}), 400

    parsed_query = QueryParser("content", ix.schema).parse(user_query)
    with ix.searcher() as searcher:
        # Hits must be consumed while the searcher is still open.
        hits = [
            {
                "title": hit["title"],
                "topic": hit["topic"],
                "source": hit["source"],
                "snippet": hit.highlights("content"),
            }
            for hit in searcher.search(parsed_query, limit=10)
        ]
    return jsonify(hits)

@app.route("/topics", methods=["GET"])
def get_topics():
    """Return all unique topics from the data.

    Documents without a "topic" key are skipped instead of raising KeyError.
    The topics are sorted (by their string form) so the response order is
    stable rather than depending on set iteration order.

    Returns:
        A JSON array of the distinct topic values found in ``all_data``.
    """
    unique_topics = {doc["topic"] for doc in all_data if "topic" in doc}
    return jsonify(sorted(unique_topics, key=str))

@app.route("/documents", methods=["GET"])
def get_documents():
    """Retrieve documents by topic.

    The topic is read from the ``topic`` request argument and compared for
    exact equality against each document's "topic" value. Documents that
    have no "topic" key never match (rather than raising KeyError).

    Returns:
        A JSON array of the matching documents, or a 400 response with an
        error object when no topic is provided.
    """
    topic = request.args.get("topic", "")
    if not topic:
        return jsonify({"error": "No topic provided"}), 400

    filtered_docs = [doc for doc in all_data if doc.get("topic") == topic]
    return jsonify(filtered_docs)

if __name__ == "__main__":
    # Start Flask's built-in development server with debug mode enabled.
    # use_reloader=False turns off the auto-reloader; presumably needed
    # because this cell runs inside a notebook kernel — TODO confirm.
    app.run(debug=True, use_reloader=False)



 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [27/Jan/2025 21:58:25] "GET / HTTP/1.1" 404 -
127.0.0.1 - - [27/Jan/2025 23:46:53] "GET / HTTP/1.1" 404 -
