In [None]:
import json
import numpy as np
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import sys

# Make the project root (one level up from notebooks/) importable.
# The path is derived from the current working directory, so the notebook
# must be started from inside notebooks/ for this to point at the root.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guarded so that re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
from utils.data_prep import traverse_directory, traverse_directory_to_single_json

# Source tree of processed wikidata dumps and the destination for its JSON form.
input_directory = "../data/processed/wikidata"
output_directory = "../data/processed/wikidata_json"
# The single-file export sits next to the output directory, with a .json suffix.
combined_json_path = f"{output_directory}.json"

# First call writes into output_directory; second call writes one combined file.
# NOTE(review): exact output layout is decided inside utils.data_prep — confirm there.
traverse_directory(input_directory, output_directory)
traverse_directory_to_single_json(input_directory, combined_json_path)

Scanning directory: ../data/processed/wikidata
Scanning directory: ../data/processed/wikidata\AA
Scanning directory: ../data/processed/wikidata\AB
Scanning directory: ../data/processed/wikidata\AC
Scanning directory: ../data/processed/wikidata\AD
Scanning directory: ../data/processed/wikidata\AE
Scanning directory: ../data/processed/wikidata\AF
Scanning directory: ../data/processed/wikidata\AG
Scanning directory: ../data/processed/wikidata\AH
Scanning directory: ../data/processed/wikidata\AI
Scanning directory: ../data/processed/wikidata\AJ
Scanning directory: ../data/processed/wikidata\AK
Scanning directory: ../data/processed/wikidata\AL
Scanning directory: ../data/processed/wikidata\AM
Scanning directory: ../data/processed/wikidata\AN
Scanning directory: ../data/processed/wikidata\AO
Scanning directory: ../data/processed/wikidata\AP
Scanning directory: ../data/processed/wikidata\AQ
Scanning directory: ../data/processed/wikidata\AR
Scanning directory: ../data/processed/wikidata\AS
Sca

In [2]:
# Build the article embeddings and their metadata once and cache them on disk.
# Row i of the saved embedding matrix corresponds to entry i of the metadata list,
# so both files must always be produced together.
metadata_file = "../data/embeddings/article_metadata.json"
embeddings_file = "../data/embeddings/article_embeddings.npy"

# Regenerate when EITHER cache file is missing; a lone surviving file (e.g. after
# an interrupted run) is not a usable cache.
if not (os.path.isfile(metadata_file) and os.path.isfile(embeddings_file)):
    # Directory containing one sub-directory of .json files per shard
    json_dir = "../data/processed/wikidata_json"

    # Article texts and their metadata, kept index-aligned
    articles = []
    article_metadata = []

    for entry in os.listdir(json_dir):
        subdir = os.path.join(json_dir, entry)
        if not os.path.isdir(subdir):
            # Stray files next to the shard directories would make os.listdir raise.
            continue
        for filename in os.listdir(subdir):
            if not filename.endswith(".json"):
                continue
            filepath = os.path.join(subdir, filename)
            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON: {filename}")
                continue

            for item in data:
                articles.append(item.get("content", ""))
                article_metadata.append({
                    "title": item.get("title", "Untitled"),
                    "url": item.get("url", ""),
                    "source_file": filename
                })
        print(f"Appended directory to JSON: {subdir}")

    # NOTE(review): no word-count filter is applied in this cell; the ">=100 words"
    # claim presumably reflects upstream preprocessing — confirm.
    print(f"Loaded {len(articles)} articles with ≥100 words.")

    # Embed with SentenceTransformer. No device is forced: the library picks a GPU
    # when one is available and falls back to CPU otherwise.
    batch_size = 32
    model = SentenceTransformer("all-MiniLM-L12-v2")
    embeddings = []
    for start in tqdm(range(0, len(articles), batch_size)):
        batch = articles[start:start + batch_size]
        embeddings.extend(model.encode(batch))

    # Save embeddings and metadata
    os.makedirs("../data/embeddings", exist_ok=True)
    np.save(embeddings_file, np.asarray(embeddings))

    with open(metadata_file, "w", encoding="utf-8") as f:
        json.dump(article_metadata, f, ensure_ascii=False, indent=2)

    print("Embeddings and metadata saved.")
else:
    print("Embeddings and metadata already exist.")

Embeddings and metadata already exist.


In [None]:
from utils.faiss_index import create_faiss_index, query_faiss

# Both artefacts live side by side in the embeddings directory.
embeddings_dir = "../data/embeddings"
faiss_path = f"{embeddings_dir}/faiss_index.index"
embedding_path = f"{embeddings_dir}/article_embeddings.npy"

# Build the index file at faiss_path from the saved embedding matrix.
# NOTE(review): the recorded output suggests the helper skips the build when the
# index file already exists — confirm in utils.faiss_index.
create_faiss_index(faiss_path, embedding_path)

FAISS index already exists.


In [4]:
# Free-text question to search for.
query = "Who invented science?"

# Encode the query with the same checkpoint that produced the article embeddings,
# so that both live in the same vector space.
model = SentenceTransformer("all-MiniLM-L12-v2")
query_batch = [query]
query_embedding = model.encode(query_batch)

In [5]:
# Number of nearest neighbours requested from the FAISS index.
TOP_K = 10
indices = query_faiss(faiss_path, query_embedding, TOP_K)

# Load article metadata (entry i describes row i of the embedding matrix)
with open("../data/embeddings/article_metadata.json", encoding='utf-8') as f:
    metadata = json.load(f)

# Retrieve top-k articles for the single query (row 0 of the result).
# FAISS fills missing neighbours with -1; indexing metadata with -1 would silently
# return the LAST article, so negative ids are skipped.
# NOTE(review): assumes query_faiss returns the raw FAISS label array — confirm.
results = [metadata[i] for i in indices[0] if i >= 0]

In [33]:
# Rerank the retrieved candidates by exact cosine similarity to the query.
# No explicit normalisation is needed: cosine_similarity normalises internally.
query_vec = query_embedding[0].reshape(1, -1)
article_embeddings = np.load("../data/embeddings/article_embeddings.npy")

# Skip negative ids: FAISS uses -1 as padding when it finds fewer neighbours than
# requested, and a negative index would silently select a row from the end.
# NOTE(review): assumes `indices` holds raw FAISS labels — confirm in query_faiss.
candidate_ids = [int(i) for i in indices[0] if i >= 0]

if candidate_ids:
    # One row per candidate, flattened the same way the query vector is.
    candidate_vecs = article_embeddings[candidate_ids].reshape(len(candidate_ids), -1)
    # Score all candidates in a single call instead of once per sort key.
    scores = cosine_similarity(query_vec, candidate_vecs)[0]
    # Descending by score; the stable sort keeps the retrieval order on ties.
    ranking = np.argsort(-scores, kind="stable")
    top_articles = [
        metadata[candidate_ids[pos]] | {"vec": candidate_vecs[pos]}
        for pos in ranking
    ]
else:
    # Nothing retrieved: keep the downstream cells working with an empty list.
    top_articles = []

In [34]:
import pprint

# Keep only the five best reranked articles and expose just title and URL
# (the embedding vector attached during reranking is dropped here).
shortlist = top_articles[:5]
final_results = [
    {"title": entry["title"], "url": entry.get("url", "N/A")}
    for entry in shortlist
]

# Pretty-print the shortlist
pprint.pprint(final_results)

[{'title': 'Scientist', 'url': 'https://en.wikipedia.org/wiki?curid=26997'},
 {'title': 'History of science',
  'url': 'https://en.wikipedia.org/wiki?curid=14400'},
 {'title': 'History of scientific method',
  'url': 'https://en.wikipedia.org/wiki?curid=3143150'},
 {'title': 'Christianity and science',
  'url': 'https://en.wikipedia.org/wiki?curid=42657576'},
 {'title': 'Science in the ancient world',
  'url': 'https://en.wikipedia.org/wiki?curid=1673699'}]
