## Stanza‑level Similarity Search  
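This notebook finds the stanzas of the Ibn Arabi poems that are most similar to a free-text query, using Sentence-Transformers embeddings precomputed by `make_embeddings.py`.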


In [2]:
# Input paths (relative, so they resolve against the kernel's working directory).
# NPZ_PATH   : NumPy .npz archive; its "embeddings" array is read in the load cell.
# POEMS_JSON : UTF-8 JSON file that the load cell iterates as
#              {poem title: [stanza, ...]}.
NPZ_PATH   = "Ibn_Arabi_poems_embeddings.npz"
POEMS_JSON = "Ibn_Arabi_poems.json"

# libs (install once per env)
import json, numpy as np, textwrap, pandas as pd
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ----- load poems ---------------------------------------------------
# `poems` is iterated below as {poem title: [stanza, ...]}.  The context
# manager closes the file handle as soon as parsing is done instead of
# leaving it to the garbage collector.
with open(POEMS_JSON, encoding="utf-8") as poems_file:
    poems = json.load(poems_file)

# Flat list for stanza lookup: one (title, stanza_idx, stanza_text) tuple per
# stanza, in the iteration order of the JSON file.  Row i of `embeddings` is
# looked up at position i of this list.
# NOTE(review): this presumes make_embeddings.py walked the poems in the same
# order; only the row count is verified below -- confirm against that script.
stanza_lookup = [
    (title, i, s)
    for title, stanzas in poems.items()
    for i, s in enumerate(stanzas)
]

# ----- load embeddings ---------------------------------------------
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code from an untrusted file.  Only the "embeddings" array is
# read here; if the archive holds no object arrays this flag can be dropped.
data = np.load(NPZ_PATH, allow_pickle=True)
embeddings = data["embeddings"]         # (N, dim)  float32
assert embeddings.shape[0] == len(stanza_lookup), (
    f"{NPZ_PATH} has {embeddings.shape[0]} embedding rows but {POEMS_JSON} "
    f"has {len(stanza_lookup)} stanzas; regenerate the embeddings"
)

# ----- load the same model used in make_embeddings.py ---------------
# Queries must be encoded with the model that produced `embeddings`, otherwise
# the dot products computed at query time are not comparable.
model = SentenceTransformer("all-mpnet-base-v2")
print("Loaded", embeddings.shape[0], "stanza embeddings.")

Loaded 516 stanza embeddings.


In [40]:
# Render full cell contents in DataFrames: no column-width truncation of long strings.
pd.options.display.max_colwidth = None

def top_k_stanzas(query: str, k: int = 5, show_poem: bool = False) -> pd.DataFrame:
    """
    Rank every stanza against a free-text query and return the best matches.

    The query is encoded with the module-level ``model`` (unit-normalised) and
    scored against each row of the module-level ``embeddings`` by dot product.
    NOTE(review): the score is a cosine similarity only if the stored
    embeddings are unit-normalised as well -- confirm in make_embeddings.py.

    Parameters
    ----------
    query : str
        Text to search for; passed to the encoder unchanged.
    k     : int
        Maximum number of matches to return.  ``0`` yields an empty frame;
        values larger than the number of stanzas return every stanza.
    show_poem : bool
        If True, include the 'full_poem' column (all stanzas of the matching
        poem joined with newlines); otherwise omit it.

    Returns
    -------
    pandas.DataFrame
        One row per match, best first, with columns ``rank`` (1-based),
        ``similarity`` (string formatted to three decimals), ``title``,
        ``stanza_idx``, ``stanza_text`` (newlines shown as " | ") and,
        optionally, ``full_poem``.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    q_vec = model.encode(query, normalize_embeddings=True)
    sims  = embeddings @ q_vec
    # Reverse the ascending order first, then take the head.  Slicing with
    # `[-k:]` would return *every* stanza for k == 0 because `[-0:]` is the
    # whole array.
    best  = sims.argsort()[::-1][:k]

    rows = []
    for rank, idx in enumerate(best, 1):
        score = float(sims[idx])
        title, stanza_idx, stanza_txt = stanza_lookup[idx]
        # Keep each stanza on a single table line while still marking line breaks.
        stanza_txt = stanza_txt.replace("\n", " | ")
        row = dict(
            rank        = rank,
            similarity  = f"{score:.3f}",
            title       = title,
            stanza_idx  = stanza_idx,
            stanza_text = stanza_txt,
        )
        if show_poem:
            row["full_poem"] = "\n".join(poems[title])
        rows.append(row)

    cols = ["rank", "similarity", "title", "stanza_idx", "stanza_text"]
    if show_poem:
        cols.append("full_poem")
    # Passing `columns=` keeps the schema even when there are no rows, where
    # `pd.DataFrame(rows)[cols]` would raise KeyError.
    return pd.DataFrame(rows, columns=cols)


In [41]:
# Example query: a multi-line poem passed verbatim (line breaks and
# indentation included) to the encoder via top_k_stanzas.
query ="""         As we
              embrace resist
      the future the present the past
      we work we struggle we begin we fail
to understand to find to unbraid to accept to question
       the grief the grief the grief the grief
           we shift we wield we bury
              into light as ash
              across our faces
"""
# Show the five best-matching stanzas, without the full-poem column.
top_k_stanzas(query, k=5, show_poem=False)

Unnamed: 0,rank,similarity,title,stanza_idx,stanza_text
0,1,0.608,Artemisia and Moringa,0,Patience and solace are gone— | gone with those | who live within | my heart’s dark core
1,2,0.601,Who Forever,1,These are their ruins | tears in memory | of those who melt | the soul forever
2,3,0.586,No New Moon Risen,7,So tell of a man | left torn apart | Cast down in sorrow | before the ruin
3,4,0.575,In the Ruins of My Body,2,"She dies longing, dissolved in desire, | stricken with what struck me | Her mate she mourns and blames time’s | arrow buried in both our hearts"
4,5,0.569,"Gentle Now, Doves",2,"back, in the morning | echo the longing | of the lovesick and lost | Spirits moan"
