In [36]:
import pandas as pd

# Path to the exported tab-separated root deck (.txt): one card per line,
# written as "front<TAB>back".
# NOTE(review): absolute, machine-specific path — consider making this
# relative to a configurable data directory so the notebook runs elsewhere.
txt_path = "/Users/anjanapro/Desktop/GRE-roots-text.txt"

# Read the deck, keeping only stripped, non-empty, non-comment lines.
# - "utf-8-sig" decodes plain UTF-8 identically but also drops a leading BOM,
#   which would otherwise hide the "#" of a comment on the first line.
# - The comment test runs on the stripped text, so "#" lines that carry
#   leading whitespace are skipped as well.
with open(txt_path, 'r', encoding='utf-8-sig') as f:
    lines = [
        stripped
        for stripped in (raw.strip() for raw in f)
        if stripped and not stripped.startswith("#")
    ]

# Split each line at its first tab into (front, back); lines without a tab
# are reported and left out.
parsed = []
for line in lines:
    front, tab, back = line.partition('\t')
    if tab:
        parsed.append((front.strip(), back.strip()))
    else:
        print(f"Skipping malformed line: {line}")

# One row per card.
df = pd.DataFrame(parsed, columns=["Front", "Back"])

# Print every row, but lift the row limit only for this one print instead of
# changing the global pandas display option for the rest of the session.
with pd.option_context("display.max_rows", None):
    print(df)


                 Front                                               Back
0        ben- or bene-  good  examples:  beneficial = helpfulbenevolen...
1        mal- or male-  bad  examples malicious = intending hamrmalevo...
2           anthropic-  having to do with humankind  examples:  anthro...
3       cise- or cide-  strike, cut, or kill  examples: incision = sur...
4        gen- or gene-  origin, kind, or type  examples: generate = to...
5    morph- or morpho-  form or shape  examples: morphology = study of...
6        vol- or voli-  will or intention  examples: voluntary = by ch...
7                ante-  before  examples: antecedent = something befor...
8                anti-  against  examples: antidote = remedy against p...
9              circum-  moving around something examples: circumnaviga...
10              hyper-  over, above  A Greek root that means excessive...
11              trans-  across  examples: transport = carry acrosstran...
12               -able  for adjectives

In [37]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model identified by this name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the text of the "Back" column (the definitions) in row order,
# showing a progress bar while the batches are processed.
back_texts = df["Back"].tolist()
embeddings = model.encode(back_texts, show_progress_bar=True)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [39]:
import numpy as np

# Store the embeddings on the DataFrame, one entry of `embeddings` per row.
df["Embedding"] = [vector for vector in embeddings]


In [40]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all definition embeddings; entry [i][j]
# compares the embedding of row i with that of row j.
similarity_matrix = cosine_similarity(embeddings)

# Minimum similarity for two roots to be linked by an edge.
threshold = 0.85

# Read the "Front" column once instead of building a row Series through
# df.iloc[...] for every pair inside the O(n^2) loop below.
fronts = df["Front"].tolist()

# Collect (front_i, front_j, similarity) for every unordered pair i < j whose
# similarity reaches the threshold.
edges = []
for i in range(len(fronts)):
    row = similarity_matrix[i]
    for j in range(i + 1, len(fronts)):
        sim = row[j]
        if sim >= threshold:
            edges.append((fronts[i], fronts[j], sim))

In [45]:
# Root whose most similar definitions are listed below.
query_front = "de-"

# Positional row of the first card with this front. similarity_matrix and
# df.iloc are both positional, so look up a position rather than an index
# label (the two only coincide for a default 0..n-1 index).
matches = np.flatnonzero((df["Front"] == query_front).to_numpy())
if matches.size == 0:
    raise IndexError(f"No card with Front == {query_front!r} in df")
i = int(matches[0])

similarities = similarity_matrix[i]

# Rank every row by descending similarity, then remove the query row by
# identity. Slicing off rank 0 would drop the wrong row whenever another card
# ties with the query's self-similarity (e.g. an identical definition).
ranked = np.argsort(similarities)[::-1]
top_indices = ranked[ranked != i][:5]

# Print the query card next to each of its (up to) five nearest neighbours.
query_card = df.iloc[i]
for j in top_indices:
    neighbour_card = df.iloc[j]
    print(f"{query_card['Front']} ↔ {neighbour_card['Front']}")
    print(f"Similarity: {similarities[j]:.2f}")
    print(f"Definition A: {query_card['Back']}")
    print(f"Definition B: {neighbour_card['Back']}")
    print("-" * 60)

de- ↔ dis-/di-/dif-
Similarity: 0.40
Definition A: down, away, reverse, completely  examples: devalue = reduce in valuedefame = take aware someone's reputation
Definition B: apart, not  examples: dissonance = lack of harmonydivert = to turn away from a path
------------------------------------------------------------
de- ↔ pro-
Similarity: 0.39
Definition A: down, away, reverse, completely  examples: devalue = reduce in valuedefame = take aware someone's reputation
Definition B: forward, for  examples: propel = to push forwardproponent = one who argues in favor of something
------------------------------------------------------------
de- ↔ ben- or bene-
Similarity: 0.37
Definition A: down, away, reverse, completely  examples: devalue = reduce in valuedefame = take aware someone's reputation
Definition B: good  examples:  beneficial = helpfulbenevolent = kindbenign = harmlesl
------------------------------------------------------------
de- ↔ dict
Similarity: 0.35
Definition A: down, awa