In [1]:
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import defaultdict
from gensim.models.poincare import PoincareModel


def prepare_custom_data(
    tsv_path,
    hypernym_col="Hypernym",
    hyponym_col="Hyponym",
    subset_size=10000,
    test_size=0.2,
    seed=42
):
    """Load hypernym/hyponym pairs from a TSV file and split them.

    The file is read as tab-separated with its first line treated as a
    header that is replaced by ``[hypernym_col, hyponym_col]``. Each row
    becomes a ``(hyponym, hypernym)`` tuple, i.e. (child, ancestor).

    Args:
        tsv_path: Path of the tab-separated edge list.
        hypernym_col: Name given to the first column (the ancestor).
        hyponym_col: Name given to the second column (the child).
        subset_size: If not ``None`` and smaller than the number of usable
            edges, a random sample of this many edges is used instead.
        test_size: Forwarded to ``train_test_split``.
        seed: Seed for both the subsampling and the train/test split.

    Returns:
        ``(train_relations, test_relations, combined_relations)`` where the
        last element is ``train_relations + test_relations``.
    """
    df = pd.read_csv(tsv_path, sep="\t", header=0, names=[hypernym_col, hyponym_col])

    # Walk the two columns directly instead of df.iterrows(): iterrows builds
    # a Series per row (slow) and coerces each row to a single dtype before
    # the values are stringified.
    relations_list = []
    for raw_hypernym, raw_hyponym in zip(df[hypernym_col], df[hyponym_col]):
        hypernym = str(raw_hypernym).strip()
        hyponym = str(raw_hyponym).strip()
        # Missing cells stringify to "nan"; drop those and blank cells.
        if hypernym in ("", "nan") or hyponym in ("", "nan"):
            continue
        relations_list.append((hyponym, hypernym))
    # NOTE: duplicate rows are kept as-is; no de-duplication happens here.

    print(f"Total edges (before subsampling): {len(relations_list)}")

    if subset_size is not None and len(relations_list) > subset_size:
        # A private generator gives the same sample as seeding the module
        # RNG would, without resetting global random state for the caller.
        relations_list = random.Random(seed).sample(relations_list, subset_size)
        print(f"Using a SUBSET of {subset_size} edges for faster testing.")

    train_relations, test_relations = train_test_split(
        relations_list, test_size=test_size, random_state=seed
    )

    print(f"Train relations: {len(train_relations)}")
    print(f"Test relations : {len(test_relations)}")

    combined_relations = train_relations + test_relations
    return train_relations, test_relations, combined_relations


def remove_edges(relations, removal_probability=0.1, seed=42):
    """Return the edges of *relations* that survive random deletion.

    Each ``(child, ancestor)`` pair is dropped independently with
    probability ``removal_probability``; surviving pairs keep their
    original order.

    Args:
        relations: Iterable of 2-item ``(child, ancestor)`` pairs.
        removal_probability: Chance of dropping each edge. ``0.0`` keeps
            every edge and ``1.0`` drops every edge.
        seed: Seed of the private random generator, so the same seed always
            removes the same edges.

    Returns:
        A new list of ``(child, ancestor)`` tuples.
    """
    # A private generator yields the same draw sequence as seeding the
    # module-level RNG, but leaves the caller's global random state alone.
    rng = random.Random(seed)
    # random() is in [0, 1), so "draw < p" is the removal event; keeping on
    # ">=" guarantees that p == 0.0 can never drop an edge.
    return [
        (child, ancestor)
        for child, ancestor in relations
        if rng.random() >= removal_probability
    ]


def _mean_or_zero(values):
    """Return the arithmetic mean of *values* as a float, or 0.0 if empty."""
    return float(np.mean(values)) if values else 0.0


def _sampled_target_ranks(
    kv,
    edges,
    vocab_list,
    known_targets,
    n_negatives,
    seed,
    max_attempts=10000,
):
    """Rank each edge's true target against randomly sampled negatives.

    For every ``(source, target)`` edge whose endpoints are both in the
    vocabulary, up to ``n_negatives`` nodes are drawn (with replacement) from
    ``vocab_list``. A draw is rejected when it is the source, the target, or
    any known target of the source. The rank is 1 plus the number of
    negatives whose distance to the source is less than or equal to the
    target's distance, so ties count against the target.

    Args:
        kv: Trained keyed vectors exposing ``distance(a, b)``.
        edges: Iterable of ``(source, target)`` pairs to score.
        vocab_list: Vocabulary in a stable order (sampling pool).
        known_targets: Mapping of source -> set of targets to exclude.
        n_negatives: Number of negatives wanted per edge.
        seed: Seed of the NumPy generator used for sampling.
        max_attempts: Cap on draws per edge, so rejection sampling always
            terminates.

    Returns:
        List of integer ranks, one per scored edge. Edges with an endpoint
        outside the vocabulary, or for which no negative could be drawn, are
        skipped.
    """
    rng = np.random.default_rng(seed)
    vocab = set(vocab_list)
    vocab_size = len(vocab_list)
    ranks = []
    for source, target in edges:
        if source not in vocab or target not in vocab:
            continue
        excluded = known_targets.get(source, ())
        negatives = []
        attempts = 0
        while len(negatives) < n_negatives and attempts < max_attempts:
            # Draw an index rather than rng.choice(list): choice would turn
            # the whole list into an array on every single draw.
            candidate = vocab_list[rng.integers(vocab_size)]
            attempts += 1
            if candidate == source or candidate == target or candidate in excluded:
                continue
            negatives.append(candidate)
        if not negatives:
            # Ranking against nothing is vacuous; do not count the edge.
            continue
        target_distance = kv.distance(source, target)
        not_farther = sum(
            1 for node in negatives if kv.distance(source, node) <= target_distance
        )
        ranks.append(not_farther + 1)
    return ranks


def train_and_evaluate_poincare(
    train_relations,
    test_relations,
    combined_relations,
    embedding_dim=5,
    epochs=300,
    n_negatives_strict=500,
):
    """Train a Poincare embedding and score it with sampled ranking metrics.

    A ``PoincareModel`` is trained on ``train_relations``. Two evaluations
    follow, both ranking the true ancestor of each edge against sampled
    negatives that are not known ancestors of the same child in
    ``combined_relations``:

    * reconstruction: over ``train_relations`` with ``n_negatives_strict``
      negatives per edge (sampling seed 42);
    * link prediction: over ``test_relations`` with 50 negatives per edge
      (sampling seed 1234).

    Args:
        train_relations: ``(child, ancestor)`` pairs used for training.
        test_relations: Held-out ``(child, ancestor)`` pairs.
        combined_relations: All known pairs; used only to filter negatives.
        embedding_dim: Dimensionality of the embedding.
        epochs: Number of training epochs.
        n_negatives_strict: Negatives per edge for the reconstruction scores.

    Returns:
        Dict with keys ``reconstruction_mean_rank_strict``,
        ``reconstruction_map_strict``, ``lp_mean_rank``, ``lp_map``,
        ``lp_precision_10`` and ``lp_recall_10``. The ``*_map`` values are
        the mean reciprocal rank of the single true target. Precision and
        recall at 10 are both the fraction of scored test edges whose target
        ranks in the top 10, so they are always equal. Every metric is 0.0
        when no edge could be scored.
    """
    model = PoincareModel(
        train_data=train_relations,
        size=embedding_dim,
        negative=10,
        burn_in=10,
    )
    model.train(epochs=epochs)
    kv = model.kv

    # Keep the model's own key order. Going through a set would make the
    # seeded sampling depend on per-process string hashing and therefore
    # differ from one interpreter run to the next.
    vocab_list = list(kv.index_to_key)

    known_targets = defaultdict(set)
    for child, ancestor in combined_relations:
        known_targets[child].add(ancestor)

    lp_negatives = 50
    top_k = 10

    # Each rank list is computed once and shared by all metrics derived from
    # it; the metrics therefore see exactly the same sampled negatives.
    recon_ranks = _sampled_target_ranks(
        kv, train_relations, vocab_list, known_targets,
        n_negatives=n_negatives_strict, seed=42,
    )
    lp_ranks = _sampled_target_ranks(
        kv, test_relations, vocab_list, known_targets,
        n_negatives=lp_negatives, seed=1234,
    )

    if lp_ranks:
        lp_hits_at_k = sum(1 for rank in lp_ranks if rank <= top_k) / len(lp_ranks)
    else:
        lp_hits_at_k = 0.0

    return {
        "reconstruction_mean_rank_strict": _mean_or_zero(recon_ranks),
        "reconstruction_map_strict": _mean_or_zero([1.0 / r for r in recon_ranks]),
        "lp_mean_rank": _mean_or_zero(lp_ranks),
        "lp_map": _mean_or_zero([1.0 / r for r in lp_ranks]),
        "lp_precision_10": lp_hits_at_k,
        "lp_recall_10": lp_hits_at_k,
    }


def run_multiple_experiments(
    address,
    dim_list,
    removal_prob_list,
    tsv_path,
    subset_size=10000,
    test_size=0.2,
    epochs=300,
    n_negatives_strict=500,
    seed=42
):
    """Run the train/evaluate pipeline over a grid and save the results.

    The data is loaded and split once. For every combination of an
    embedding dimension from ``dim_list`` and a removal probability from
    ``removal_prob_list``, edges are randomly removed from the training
    split, a model is trained and evaluated, and one result row is recorded.

    Args:
        address: Path of the Excel file the result table is written to.
        dim_list: Embedding dimensions to try.
        removal_prob_list: Training-edge removal probabilities to try.
        tsv_path: Path of the tab-separated edge list.
        subset_size: Forwarded to ``prepare_custom_data``.
        test_size: Forwarded to ``prepare_custom_data``.
        epochs: Training epochs per run.
        n_negatives_strict: Negatives per edge for the reconstruction scores.
        seed: Seed used for data preparation and for edge removal.

    Returns:
        A DataFrame with one row per (dimension, removal probability) run.
    """
    clean_train, held_out, every_edge = prepare_custom_data(
        tsv_path=tsv_path,
        hypernym_col="Hypernym",
        hyponym_col="Hyponym",
        subset_size=subset_size,
        test_size=test_size,
        seed=seed,
    )

    # Settings that are identical for every row of the result table.
    shared_settings = {
        "subset_size": subset_size,
        "test_size": test_size,
        "epochs": epochs,
        "n_negatives_strict": n_negatives_strict,
    }

    rows = []
    for embedding_dim in dim_list:
        for removal_prob in removal_prob_list:
            print("========================================================")
            print(f"RUNNING: dim={embedding_dim}, removal_prob={removal_prob:.2f}")
            print("========================================================")

            thinned_train = remove_edges(
                clean_train, removal_probability=removal_prob, seed=seed
            )
            removed_edges = len(clean_train) - len(thinned_train)
            print(f"Removed {removed_edges} edges from training set.")

            metrics = train_and_evaluate_poincare(
                train_relations=thinned_train,
                test_relations=held_out,
                combined_relations=every_edge,
                embedding_dim=embedding_dim,
                epochs=epochs,
                n_negatives_strict=n_negatives_strict,
            )

            # Key order fixes the column order: run settings, then metrics.
            rows.append(
                {
                    "embedding_dim": embedding_dim,
                    "edge_removal_probability": removal_prob,
                    **shared_settings,
                    **metrics,
                }
            )

    df = pd.DataFrame(rows)
    df.to_excel(address, index=False)
    print(f"\nAll experiment results saved to: {address}")

    return df



if __name__ == "__main__":
    # Input edge list and output workbook.
    tsv_path = r"Y:\Data Science Readings\Geometry of Information\hebrew\hebewnet.tsv"
    excel_path = "Y:\\Data Science Readings\\Geometry of Information\\hebrew\\hebrewnet_experiment.xlsx"

    # Grid swept by the experiment: every dimension is paired with every
    # removal probability.
    dimension_list = [5, 10, 20, 50, 100, 200]
    edge_removal_probability_list = [0.0, 0.1, 0.3]

    final_df = run_multiple_experiments(
        excel_path,
        dimension_list,
        edge_removal_probability_list,
        tsv_path,
        subset_size=10000,
        test_size=0.2,
        epochs=100,
        n_negatives_strict=20,
        seed=42,
    )

    print("\nFinal DataFrame of all runs:\n")
    print(final_df)


Total edges (before subsampling): 196
Train relations: 156
Test relations : 40
RUNNING: dim=5, removal_prob=0.00
Removed 0 edges from training set.
RUNNING: dim=5, removal_prob=0.10
Removed 19 edges from training set.
RUNNING: dim=5, removal_prob=0.30
Removed 54 edges from training set.
RUNNING: dim=10, removal_prob=0.00
Removed 0 edges from training set.
RUNNING: dim=10, removal_prob=0.10
Removed 19 edges from training set.
RUNNING: dim=10, removal_prob=0.30
Removed 54 edges from training set.
RUNNING: dim=20, removal_prob=0.00
Removed 0 edges from training set.
RUNNING: dim=20, removal_prob=0.10
Removed 19 edges from training set.
RUNNING: dim=20, removal_prob=0.30
Removed 54 edges from training set.
RUNNING: dim=50, removal_prob=0.00
Removed 0 edges from training set.
RUNNING: dim=50, removal_prob=0.10
Removed 19 edges from training set.
RUNNING: dim=50, removal_prob=0.30
Removed 54 edges from training set.
RUNNING: dim=100, removal_prob=0.00
Removed 0 edges from training set.
RUNNI