In [1]:
import os
import random
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split

from gensim.models.poincare import PoincareModel



def load_facebook_edges(
    filepath,      
    subset_size=None, 
    test_size=0.2,
    seed=42
):

    random.seed(seed)

    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Edge list file not found: {filepath}")

    print(f"[INFO] Loading edges from: {filepath} ...")
    edges = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split()
            if len(parts) != 2:
                continue
            
            u, v = parts[0], parts[1]


            edges.append((u, v))

    print(f"[INFO] Total edges loaded: {len(edges)}")

    
    if subset_size is not None and len(edges) > subset_size:
        edges = random.sample(edges, subset_size)
        print(f"[INFO] Using a SUBSET of {subset_size} edges.")

    
    train_relations, test_relations = train_test_split(
        edges, test_size=test_size, random_state=seed
    )
    print(f"[INFO] Train relations: {len(train_relations)}")
    print(f"[INFO] Test relations : {len(test_relations)}")

    combined_relations = train_relations + test_relations
    return train_relations, test_relations, combined_relations



def train_and_evaluate_poincare(
    train_relations,
    test_relations,
    combined_relations,
    embedding_dim=10,
    epochs=50,
    n_negatives_strict=500,
):

    print(f"[INFO] Training Poincaré (dim={embedding_dim}, epochs={epochs}) ...")
    model = PoincareModel(
        train_data=train_relations,
        size=embedding_dim,
        negative=10,  
        burn_in=10    
    )
    model.train(epochs=epochs)

    
    u_to_all_neighbors = defaultdict(set)
    for (u, v) in combined_relations:
        u_to_all_neighbors[u].add(v)

    vocab_nodes = set(model.kv.index_to_key)
    vocab_list = list(vocab_nodes)
    all_edges_set = set(combined_relations)

    
    recon_mr_strict = reconstruction_mean_rank_strict_sampled(
        model, train_relations, u_to_all_neighbors, vocab_list, n_negatives_strict
    )
    recon_map_strict = reconstruction_map_strict_sampled(
        model, train_relations, u_to_all_neighbors, vocab_list, n_negatives_strict
    )

    
    lp_mr = link_prediction_mean_rank(model, test_relations, vocab_list, all_edges_set)
    lp_map_ = link_prediction_map(model, test_relations, vocab_list, all_edges_set)
    lp_p10 = precision_at_k(model, test_relations, vocab_list, all_edges_set, k=10)
    lp_r10 = recall_at_k(model, test_relations, vocab_list, all_edges_set, k=10)

    return {
        "recon_mean_rank_strict": recon_mr_strict,
        "recon_map_strict": recon_map_strict,
        "lp_mean_rank": lp_mr,
        "lp_map": lp_map_,
        "lp_precision_10": lp_p10,
        "lp_recall_10": lp_r10,
    }




def reconstruction_mean_rank_strict_sampled(model, edges, u_to_all_neighbors, vocab_list, n_negatives=500, seed=42):
    rng = np.random.default_rng(seed)
    ranks = []
    for (u, v) in edges:
        if (u not in model.kv) or (v not in model.kv):
            continue

        neighbors = u_to_all_neighbors[u]
        neg_candidates = []
        attempts = 0
        while len(neg_candidates) < n_negatives and attempts < 10000:
            candidate = rng.choice(vocab_list)
            if candidate not in (u, v) and candidate not in neighbors:
                neg_candidates.append(candidate)
            attempts += 1

        candidates = neg_candidates + [v]
        dists = [(c, model.kv.distance(u, c)) for c in candidates]
        sorted_nodes = [x[0] for x in sorted(dists, key=lambda x: x[1])]

        try:
            rank = sorted_nodes.index(v) + 1
            ranks.append(rank)
        except ValueError:
            pass

    return float(np.mean(ranks)) if ranks else 0.0


def reconstruction_map_strict_sampled(model, edges, u_to_all_neighbors, vocab_list, n_negatives=500, seed=42):
    rng = np.random.default_rng(seed)
    reciprocal_ranks = []
    for (u, v) in edges:
        if (u not in model.kv) or (v not in model.kv):
            continue

        neighbors = u_to_all_neighbors[u]
        neg_candidates = []
        attempts = 0
        while len(neg_candidates) < n_negatives and attempts < 10000:
            candidate = rng.choice(vocab_list)
            if candidate not in (u, v) and (candidate not in neighbors):
                neg_candidates.append(candidate)
            attempts += 1

        candidates = neg_candidates + [v]
        dists = [(c, model.kv.distance(u, c)) for c in candidates]
        sorted_nodes = [x[0] for x in sorted(dists, key=lambda x: x[1])]
        try:
            r = sorted_nodes.index(v) + 1
            reciprocal_ranks.append(1.0 / r)
        except ValueError:
            pass

    return float(np.mean(reciprocal_ranks)) if reciprocal_ranks else 0.0


def link_prediction_mean_rank(model, edges, vocab_list, all_edges_set, num_negatives=50):
    rng = np.random.default_rng(1234)
    ranks = []
    for (source, target) in edges:
        if (source not in model.kv) or (target not in model.kv):
            continue

        neg_candidates = []
        attempts = 0
        while len(neg_candidates) < num_negatives and attempts < 10000:
            candidate = rng.choice(vocab_list)
            if (candidate != target) and ((source, candidate) not in all_edges_set):
                neg_candidates.append(candidate)
            attempts += 1

        if not neg_candidates:
            continue

        candidates = neg_candidates + [target]
        dists = [model.kv.distance(source, c) for c in candidates]
        sorted_candidates = [c for _, c in sorted(zip(dists, candidates), key=lambda x: x[0])]
        rank = sorted_candidates.index(target) + 1
        ranks.append(rank)

    return float(np.mean(ranks)) if ranks else 0.0


def link_prediction_map(model, edges, vocab_list, all_edges_set, num_negatives=50):
    rng = np.random.default_rng(1234)
    reciprocal_ranks = []
    for (source, target) in edges:
        if (source not in model.kv) or (target not in model.kv):
            continue

        neg_candidates = []
        attempts = 0
        while len(neg_candidates) < num_negatives and attempts < 10000:
            candidate = rng.choice(vocab_list)
            if (candidate != target) and ((source, candidate) not in all_edges_set):
                neg_candidates.append(candidate)
            attempts += 1

        if not neg_candidates:
            continue

        candidates = neg_candidates + [target]
        dists = [model.kv.distance(source, c) for c in candidates]
        sorted_candidates = [c for _, c in sorted(zip(dists, candidates), key=lambda x: x[0])]
        rank = sorted_candidates.index(target) + 1
        reciprocal_ranks.append(1.0 / rank)

    return float(np.mean(reciprocal_ranks)) if reciprocal_ranks else 0.0


def precision_at_k(model, edges, vocab_list, all_edges_set, k=10, num_negatives=50):
    rng = np.random.default_rng(1234)
    hits = 0
    count = 0
    for (source, target) in edges:
        if (source not in model.kv) or (target not in model.kv):
            continue

        neg_candidates = []
        attempts = 0
        while len(neg_candidates) < num_negatives and attempts < 10000:
            candidate = rng.choice(vocab_list)
            if (candidate != target) and ((source, candidate) not in all_edges_set):
                neg_candidates.append(candidate)
            attempts += 1

        if not neg_candidates:
            continue

        candidates = neg_candidates + [target]
        dists = [model.kv.distance(source, c) for c in candidates]
        sorted_candidates = [c for _, c in sorted(zip(dists, candidates), key=lambda x: x[0])]
        top_k_nodes = sorted_candidates[:k]

        if target in top_k_nodes:
            hits += 1
        count += 1

    return hits / count if count else 0.0


def recall_at_k(model, edges, vocab_list, all_edges_set, k=10, num_negatives=50):
    
    return precision_at_k(model, edges, vocab_list, all_edges_set, k, num_negatives)


def run_facebook_pipeline(
    edge_file="facebook_edges.txt",
    subset_size=10000,
    test_size=0.2,
    embedding_dim=10,
    epochs=50,
    n_negatives_strict=500,
    excel_output="facebook_poincare_results.xlsx"
):

    train_rel, test_rel, combined_rel = load_facebook_edges(
        filepath=edge_file,
        subset_size=subset_size,
        test_size=test_size,
        seed=42
    )

    results = train_and_evaluate_poincare(
        train_relations=train_rel,
        test_relations=test_rel,
        combined_relations=combined_rel,
        embedding_dim=embedding_dim,
        epochs=epochs,
        n_negatives_strict=n_negatives_strict
    )

    # Save in a single-row DataFrame
    df = pd.DataFrame([{
        "edge_file": edge_file,
        "subset_size": subset_size,
        "test_size": test_size,
        "embedding_dim": embedding_dim,
        "epochs": epochs,
        "n_negatives_strict": n_negatives_strict,
        **results
    }])
    df.to_excel(excel_output, index=False)
    print(f"\n[INFO] Results saved to: {excel_output}")
    return df



def run_multiple_experiments(
    edge_file="facebook_edges.txt",
    subset_sizes=[1000, 5000, 10000],      
    dims=[5, 10, 20],                     
    test_size=0.2,
    epochs=50,
    n_negatives_strict=100,
    excel_output="facebook_experiments.xlsx"
):

    all_results = []

    for s_size in subset_sizes:
        for dim in dims:
            print("========================================================")
            print(f"RUNNING EXPERIMENT: subset_size={s_size}, dim={dim}")
            print("========================================================")

            
            train_rel, test_rel, combined_rel = load_facebook_edges(
                filepath=edge_file,
                subset_size=s_size,
                test_size=test_size,
                seed=42
            )

            
            results = train_and_evaluate_poincare(
                train_relations=train_rel,
                test_relations=test_rel,
                combined_relations=combined_rel,
                embedding_dim=dim,
                epochs=epochs,
                n_negatives_strict=n_negatives_strict
            )

           
            row_info = {
                "subset_size": s_size,
                "embedding_dim": dim,
                "test_size": test_size,
                "epochs": epochs,
                "n_negatives_strict": n_negatives_strict
            }
            row_info.update(results)
            all_results.append(row_info)

    df = pd.DataFrame(all_results)
    df.to_excel(excel_output, index=False)
    print(f"\n[INFO] All experiment results saved to '{excel_output}'")

    return df



import os
if __name__ == "__main__":

    desktop_path = os.path.expanduser("~/Desktop/experiment_results_facebook.xlsx") 
    
    # MULTIPLE EXPERIMENTS:
    final_multi = run_multiple_experiments(
        edge_file = os.path.expanduser("~/Desktop/facebook_combined.txt"),
        subset_sizes=[5000, 10000, 15000],  
        dims=[5, 10,20,50,100,200] ,             
        test_size=0.2,
        epochs=100,                        
        n_negatives_strict=50,
        excel_output="facebook_experiments.xlsx"
    )
    print("\nAll experiment runs:\n", final_multi)


RUNNING EXPERIMENT: subset_size=5000, dim=5
[INFO] Loading edges from: /Users/alineduthilleul/Desktop/facebook_combined.txt ...
[INFO] Total edges loaded: 88234
[INFO] Using a SUBSET of 5000 edges.
[INFO] Train relations: 4000
[INFO] Test relations : 1000
[INFO] Training Poincaré (dim=5, epochs=100) ...
RUNNING EXPERIMENT: subset_size=5000, dim=10
[INFO] Loading edges from: /Users/alineduthilleul/Desktop/facebook_combined.txt ...
[INFO] Total edges loaded: 88234
[INFO] Using a SUBSET of 5000 edges.
[INFO] Train relations: 4000
[INFO] Test relations : 1000
[INFO] Training Poincaré (dim=10, epochs=100) ...
RUNNING EXPERIMENT: subset_size=5000, dim=20
[INFO] Loading edges from: /Users/alineduthilleul/Desktop/facebook_combined.txt ...
[INFO] Total edges loaded: 88234
[INFO] Using a SUBSET of 5000 edges.
[INFO] Train relations: 4000
[INFO] Test relations : 1000
[INFO] Training Poincaré (dim=20, epochs=100) ...
RUNNING EXPERIMENT: subset_size=5000, dim=50
[INFO] Loading edges from: /Users/al