In [1]:
!pip install sparse-dot-topn

Collecting sparse-dot-topn
  Downloading sparse_dot_topn-1.1.5-cp39-cp39-win_amd64.whl.metadata (10 kB)
Downloading sparse_dot_topn-1.1.5-cp39-cp39-win_amd64.whl (423 kB)
Installing collected packages: sparse-dot-topn
Successfully installed sparse-dot-topn-1.1.5
Note: you may need to restart the kernel to use updated packages.


In [14]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix, csr_matrix
from sparse_dot_topn import awesome_cossim_topn  # Install using: pip install sparse-dot-topn

### 1️⃣ Load Mapping Files ###
def load_term_id_to_string(file_paths, include_concepts=True):
    """ Load multiple term ID to string mappings and merge them into a single dictionary. """
    term_id_map = {}
    concept_id_map = {}  # For concept mappings

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                try:
                    term_id, term_str = line.strip().split("\t")

                    # If it's a concept mapping file, store separately
                    if include_concepts and "concept" in file_path.lower():
                        concept_id_map[term_id] = term_str
                    else:
                        term_id_map[term_id] = term_str
                except ValueError:
                    continue  # Skip malformed lines

    return term_id_map, concept_id_map


### 2️⃣ Load Singleton & Co-Frequency Counts ###
def load_singleton_frequencies(file_path, term_id_map):
    """ Load singleton frequencies and convert IDs to terms """
    singleton_freq = {}
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            term_id, count = line.strip().split("\t")
            term_str = term_id_map.get(term_id, f"UNK_{term_id}")
            singleton_freq[term_str] = int(count)
    return singleton_freq

def load_cofreq_counts(file_path, concept_to_term_map):
    """ Load cofrequency counts and map concept IDs to their corresponding terms. """
    cofreq_counts = []

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            concept1_id, concept2_id, count = line.strip().split("\t")

            # Get all terms associated with each concept
            terms1 = concept_to_term_map.get(concept1_id, [f"UNK_{concept1_id}"])
            terms2 = concept_to_term_map.get(concept2_id, [f"UNK_{concept2_id}"])

            # Expand cofrequency counts for all term pairs
            for term1 in terms1:
                for term2 in terms2:
                    cofreq_counts.append((term1, term2, int(count)))

    return cofreq_counts



### 3️⃣ Compute NPMI Matrix (Sparse) ###
def compute_npmi(cofreq_counts, singleton_freq, total_count, k=1):
    """ Compute Normalized PMI (NPMI) for better synonym ranking. """
    terms = list(singleton_freq.keys())
    term_index = {term: i for i, term in enumerate(terms)}

    pmi_matrix = lil_matrix((len(terms), len(terms)), dtype=np.float32)

    for term1, term2, co_count in cofreq_counts:
        if term1 not in term_index or term2 not in term_index:
            continue

        i, j = term_index[term1], term_index[term2]
        p_x = singleton_freq.get(term1, 1) / total_count
        p_y = singleton_freq.get(term2, 1) / total_count
        p_xy = co_count / total_count

        # Compute PMI
        pmi = np.log2((p_xy + k) / (p_x * p_y + k))

        # Normalize PMI (NPMI)
        npmi = pmi / -np.log2(p_xy + k) if p_xy > 0 else 0
        pmi_matrix[i, j] = npmi
        pmi_matrix[j, i] = npmi  # Since PMI is symmetric

    return pmi_matrix.tocsr(), term_index  # Convert to efficient sparse CSR format


### 4️⃣ Efficient Synonym Retrieval ###
def find_synonyms_fast(pmi_matrix, term_index, target_word, top_n=5):
    """ Efficient synonym retrieval using sparse cosine similarity. """
    if target_word not in term_index:
        print(f"Word '{target_word}' not found in vocabulary.")
        return []

    words = list(term_index.keys())
    target_idx = term_index[target_word]

    # Compute sparse cosine similarity
    similarity_matrix = awesome_cossim_topn(pmi_matrix[target_idx], pmi_matrix.T, top_n, 0.01)
    sorted_indices = similarity_matrix.nonzero()[1]

    return [(words[i], similarity_matrix[0, i]) for i in sorted_indices if words[i] != target_word][:top_n]


### 5️⃣ Filter Synonyms Using Concept ID ###
def filter_synonyms_by_concept(synonyms, concept_id_map, target_word):
    """ Ensure synonyms belong to the same concept cluster. """
    target_concept = concept_id_map.get(target_word, None)
    if not target_concept:
        return synonyms  # No concept info available

    return [(word, score) for word, score in synonyms if concept_id_map.get(word) == target_concept]

def load_concept_to_term_map(file_path):
    """ Load concept-to-term mappings where multiple terms can map to the same concept. """
    concept_to_term_map = defaultdict(list)

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                term_id, concept_id = line.strip().split("\t")
                concept_to_term_map[concept_id].append(term_id)  # Store all terms for a concept
            except ValueError:
                continue  # Skip malformed lines

    return concept_to_term_map



### 🔥 Main Pipeline ###
if __name__ == "__main__":
    # File Paths
    # File Paths
    mapping_files = [
        "../3_ID_Mappings/1_term_ID_to_string.txt",
        "../3_ID_Mappings/2a_concept_ID_to_string.txt",
    ]

    concept_map_file = "../3_ID_Mappings/3_term_ID_to_concept_ID.txt"
    singleton_file = "../2_Singleton_Frequency_Counts/singlets_concepts_perBin_1d.txt"
    cofreq_file = "../1_Cofrequency_Counts/cofreqs_concepts_perBin_1d.txt"

    # Load mappings
    term_id_map, _ = load_term_id_to_string(mapping_files)
    concept_to_term_map = load_concept_to_term_map(concept_map_file)

    # Load Data
    singleton_freq = load_singleton_frequencies(singleton_file, term_id_map)
    cofreq_counts = load_cofreq_counts(cofreq_file, concept_to_term_map)  # 🔹 FIXED

    total_count = sum(singleton_freq.values())

    # Compute PMI
    pmi_matrix, term_index = compute_npmi(cofreq_counts, singleton_freq, total_count)



MemoryError: 

In [11]:
term_index

{'UNK_4012122': 0,
 'UNK_4846925': 1,
 'UNK_4160379': 2,
 'UNK_4022344': 3,
 'UNK_4071782': 4,
 'UNK_4625328': 5,
 'UNK_4015238': 6,
 'UNK_5152426': 7,
 'UNK_4035967': 8,
 'UNK_5128353': 9,
 'UNK_4136534': 10,
 'UNK_4301207': 11,
 'UNK_4742843': 12,
 'UNK_4211033': 13,
 'UNK_4137142': 14,
 'UNK_4542108': 15,
 'UNK_4078185': 16,
 'UNK_4003819': 17,
 'UNK_4189354': 18,
 'UNK_4124412': 19,
 'UNK_4016720': 20,
 'UNK_4196688': 21,
 'UNK_4018221': 22,
 'UNK_4411407': 23,
 'UNK_5153490': 24,
 'UNK_4125020': 25,
 'UNK_4738302': 26,
 'UNK_4111492': 27,
 'UNK_5171825': 28,
 'UNK_4012445': 29,
 'UNK_4139137': 30,
 'UNK_4253878': 31,
 'UNK_4411217': 32,
 'UNK_4698554': 33,
 'UNK_4125609': 34,
 'UNK_4351595': 35,
 'UNK_4909872': 36,
 'UNK_4111302': 37,
 'UNK_4456095': 38,
 'UNK_4021394': 39,
 'UNK_4183578': 40,
 'UNK_4412015': 41,
 'UNK_4226290': 42,
 'UNK_4147269': 43,
 'UNK_4247152': 44,
 'UNK_4083486': 45,
 'UNK_4002888': 46,
 'UNK_4550582': 47,
 'UNK_4753578': 48,
 'UNK_4625309': 49,
 'UNK_4184

In [13]:
# Find synonyms
target_word = "leukemia"
synonyms = find_synonyms_fast(pmi_matrix, term_index, target_word)

# Filter synonyms using concept ID
synonyms_filtered = filter_synonyms_by_concept(synonyms, concept_id_map, target_word)

# Output results
print(f"Top synonyms for '{target_word}':")
for word, score in synonyms_filtered:
    print(f"{word}: {score:.4f}")

Word 'leukemia' not found in vocabulary.
Top synonyms for 'leukemia':
