In [1]:
!pip install sparse-dot-topn

Collecting sparse-dot-topn
  Downloading sparse_dot_topn-1.1.5-cp39-cp39-win_amd64.whl.metadata (10 kB)
Downloading sparse_dot_topn-1.1.5-cp39-cp39-win_amd64.whl (423 kB)
Installing collected packages: sparse-dot-topn
Successfully installed sparse-dot-topn-1.1.5
Note: you may need to restart the kernel to use updated packages.


My next challenge to improve on:

Your current approach relies on co-occurrence-based PMI, which captures statistical relationships but lacks contextual meaning.

PMI works well but is sparse and sensitive to rare words.

Word Embeddings (Word2Vec, FastText, GloVe, etc.) generalize better by mapping similar words to close vectors.

Character Embeddings (useful for medical/clinical terms) help capture variations & typos.

In [None]:
def count_lines(file_path):
    """Count the total number of lines in a file."""
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        return sum(1 for _ in f)

# File paths
files = [
"../1_Cofrequency_Counts/cofreqs_concepts_perBin_1d.txt",
"../2_Singleton_Frequency_Counts/singlets_concepts_perBin_1d.txt",
"../3_ID_Mappings/1_term_ID_to_string.txt",
"../3_ID_Mappings/2a_concept_ID_to_string.txt",
"../3_ID_Mappings/term_id_to_string_test.txt"
]

# Count rows
for file in files:
    print(f"{file}: {count_lines(file)} lines")

In [10]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix, csr_matrix
from sparse_dot_topn import awesome_cossim_topn  # Install using: pip install sparse-dot-topn

### 1️⃣ Load Mapping Files ###
def load_term_id_to_string(file_paths, include_concepts=True):
    """ Load multiple term ID to string mappings and merge them into a single dictionary. """
    term_id_map = {}
    concept_id_map = {}  # For concept mappings

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                try:
                    term_id, term_str = line.strip().split("\t")

                    # If it's a concept mapping file, store separately
                    if include_concepts and "concept" in file_path.lower():
                        concept_id_map[term_id] = term_str
                    else:
                        term_id_map[term_id] = term_str
                except ValueError:
                    continue  # Skip malformed lines

    return term_id_map, concept_id_map


def load_singleton_frequencies(file_path, concept_to_term_map, term_id_map):
    """ Load up to max_lines of singleton frequencies and map concept IDs to term strings """
    singleton_freq = {}
    
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            
          concept_id, count = line.strip().split("\t")
          count = int(count)

          # Get term IDs associated with this concept
          term_ids = concept_to_term_map.get(concept_id, [f"UNK_{concept_id}"])

          # Convert term IDs to actual term names
          terms = [term_id_map.get(term_id, f"UNK_{term_id}") for term_id in term_ids]

          # Store frequencies for all mapped terms
          for term in terms:
              singleton_freq[term] = singleton_freq.get(term, 0) + count  # Sum counts if needed


    return singleton_freq


def load_cofreq_counts(file_path, concept_to_term_map, term_id_map, output_file="temp_cofreq.txt", max_lines=100000000000):
    """ Process cofrequency counts in a memory-efficient way for up to max_lines entries. """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f, open(output_file, 'w', encoding='utf-8') as out_f:
        for i, line in enumerate(f):
            if i >= max_lines:
                break  # Stop after max_lines
            concept1_id, concept2_id, count = line.strip().split("\t")
            count = int(count)

            # Convert concept IDs into actual term names
            terms1 = [term_id_map.get(t, f"UNK_{t}") for t in concept_to_term_map.get(concept1_id, [concept1_id])]
            terms2 = [term_id_map.get(t, f"UNK_{t}") for t in concept_to_term_map.get(concept2_id, [concept2_id])]

            # Write mapped term pairs to file
            for term1 in terms1:
                for term2 in terms2:
                    if term1 != term2:  # Avoid self-pairs
                        out_f.write(f"{term1}\t{term2}\t{count}\n")

    return output_file  # Return the file path




def compute_npmi(cofreq_file, singleton_freq, total_count, term_index, matrix_size, k=1, output_pmi="temp_pmi.npz"):
    """ Compute Normalized PMI in a memory-efficient way by writing directly to a sparse matrix file. """
    pmi_matrix = lil_matrix((matrix_size, matrix_size), dtype=np.float32)

    with open(cofreq_file, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            term1, term2, co_count = line.strip().split("\t")
            co_count = int(co_count)

            if term1 not in term_index or term2 not in term_index:
                continue  # Skip unknown terms

            i, j = term_index[term1], term_index[term2]
            p_x = singleton_freq.get(term1, 1) / total_count
            p_y = singleton_freq.get(term2, 1) / total_count
            p_xy = co_count / total_count

            # Compute PMI
            pmi = np.log2((p_xy + k) / (p_x * p_y + k))
            npmi = pmi / -np.log2(p_xy + k) if p_xy > 0 else 0

            pmi_matrix[i, j] = npmi
            pmi_matrix[j, i] = npmi  # PMI is symmetric

    # Save PMI as a compressed sparse matrix
    csr_pmi = pmi_matrix.tocsr()
    np.savez(output_pmi, data=csr_pmi.data, indices=csr_pmi.indices, indptr=csr_pmi.indptr, shape=csr_pmi.shape)

    return output_pmi  # Return file path instead of keeping it in memory




def find_synonyms_fast(pmi_file, term_index, target_word, top_n=5):
    """ Efficient synonym retrieval using a stored PMI matrix. """
    if target_word not in term_index:
        print(f"Word '{target_word}' not found in vocabulary.")
        return []

    # Load sparse PMI matrix from file
    loaded = np.load(pmi_file)
    pmi_matrix = csr_matrix((loaded["data"], loaded["indices"], loaded["indptr"]), shape=loaded["shape"])

    words = list(term_index.keys())
    target_idx = term_index[target_word]

    # Compute sparse cosine similarity
    similarities = cosine_similarity(pmi_matrix[target_idx], pmi_matrix)[0]
    sorted_indices = np.argsort(similarities)[::-1]  # Descending order

    return [(words[i], similarities[i]) for i in sorted_indices if words[i] != target_word][:top_n]



### 5️⃣ Filter Synonyms Using Concept ID ###
def filter_synonyms_by_concept(synonyms, concept_id_map, target_word):
    """ Ensure synonyms belong to the same concept cluster. """
    target_concept = concept_id_map.get(target_word, None)
    if not target_concept:
        return synonyms  # No concept info available

    return [(word, score) for word, score in synonyms if concept_id_map.get(word) == target_concept]

def load_concept_to_term_map(file_path):
    """ Load concept-to-term mappings where multiple terms can map to the same concept. """
    concept_to_term_map = defaultdict(list)

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                term_id, concept_id = line.strip().split("\t")
                concept_to_term_map[concept_id].append(term_id)  # Store all terms for a concept
            except ValueError:
                continue  # Skip malformed lines

    return concept_to_term_map



if __name__ == "__main__":
    # File Paths
    mapping_files = [
        "../3_ID_Mappings/1_term_ID_to_string.txt",
        "../3_ID_Mappings/2a_concept_ID_to_string.txt"
    ]
    concept_map_file = "../3_ID_Mappings/3_term_ID_to_concept_ID.txt"
    singleton_file = "../2_Singleton_Frequency_Counts/singlets_concepts_perBin_1d.txt"
    cofreq_file = "../1_Cofrequency_Counts/cofreqs_concepts_perBin_1d.txt"

    # Load mappings
    term_id_map, _ = load_term_id_to_string(mapping_files)
    concept_to_term_map = load_concept_to_term_map(concept_map_file)

    # Load Data (Only First 100 Lines)
    singleton_freq = load_singleton_frequencies(singleton_file, concept_to_term_map, term_id_map)
    cofreq_output_file = load_cofreq_counts(cofreq_file, concept_to_term_map, term_id_map, max_lines=1000000)

    total_count = sum(singleton_freq.values())

    # Compute PMI (Streaming, Small Dataset)
    term_index = {term: i for i, term in enumerate(singleton_freq.keys())}
    matrix_size = len(term_index)

    pmi_output_file = compute_npmi(cofreq_output_file, singleton_freq, total_count, term_index, matrix_size)

    # Find synonyms
    target_word = "leukemia"
    synonyms = find_synonyms_fast(pmi_output_file, term_index, target_word)

    # Output results
    print(f"Top synonyms for '{target_word}':")
    for word, score in synonyms:
        print(f"{word}: {score:.4f}")






Top synonyms for 'leukemia':
[m]leukemia nos (morphologic abnormality): 1.0000
blood (leukemia): 1.0000
leucocythemias: 1.0000
[m]leukemia unspecified, nos (morphologic abnormality): 1.0000
leukemia (disorder): 1.0000


In [9]:
singleton_freq

{'UNK_UNK_4012122': 12264,
 'UNK_UNK_4846925': 314,
 'UNK_UNK_4160379': 427,
 'UNK_UNK_4022344': 36883,
 'UNK_UNK_4071782': 153,
 'UNK_UNK_4625328': 373,
 'UNK_UNK_4015238': 5555,
 'UNK_UNK_5152426': 157,
 'UNK_UNK_4035967': 8853,
 'UNK_UNK_5128353': 104,
 'UNK_UNK_4136534': 1192,
 'UNK_UNK_4301207': 270,
 'UNK_UNK_4742843': 267,
 'UNK_UNK_4211033': 12714,
 'UNK_UNK_4137142': 904,
 'UNK_UNK_4542108': 3137,
 'UNK_UNK_4078185': 224,
 'UNK_UNK_4003819': 4074,
 'UNK_UNK_4189354': 2723,
 'UNK_UNK_4124412': 229,
 'UNK_UNK_4016720': 3306,
 'UNK_UNK_4196688': 772,
 'UNK_UNK_4018221': 1552,
 'UNK_UNK_4411407': 8223,
 'UNK_UNK_5153490': 165,
 'UNK_UNK_4125020': 8112,
 'UNK_UNK_4738302': 4514,
 'UNK_UNK_4111492': 647,
 'UNK_UNK_5171825': 1329,
 'UNK_UNK_4012445': 11658,
 'UNK_UNK_4139137': 1060,
 'UNK_UNK_4253878': 110,
 'UNK_UNK_4411217': 468,
 'UNK_UNK_4698554': 745,
 'UNK_UNK_4125609': 165715,
 'UNK_UNK_4351595': 1438,
 'UNK_UNK_4909872': 122,
 'UNK_UNK_4111302': 1008,
 'UNK_UNK_4456095': 359,

In [6]:
term_index

{'UNK_UNK_4012122': 0,
 'UNK_UNK_4846925': 1,
 'UNK_UNK_4160379': 2,
 'UNK_UNK_4022344': 3,
 'UNK_UNK_4071782': 4,
 'UNK_UNK_4625328': 5,
 'UNK_UNK_4015238': 6,
 'UNK_UNK_5152426': 7,
 'UNK_UNK_4035967': 8,
 'UNK_UNK_5128353': 9,
 'UNK_UNK_4136534': 10,
 'UNK_UNK_4301207': 11,
 'UNK_UNK_4742843': 12,
 'UNK_UNK_4211033': 13,
 'UNK_UNK_4137142': 14,
 'UNK_UNK_4542108': 15,
 'UNK_UNK_4078185': 16,
 'UNK_UNK_4003819': 17,
 'UNK_UNK_4189354': 18,
 'UNK_UNK_4124412': 19,
 'UNK_UNK_4016720': 20,
 'UNK_UNK_4196688': 21,
 'UNK_UNK_4018221': 22,
 'UNK_UNK_4411407': 23,
 'UNK_UNK_5153490': 24,
 'UNK_UNK_4125020': 25,
 'UNK_UNK_4738302': 26,
 'UNK_UNK_4111492': 27,
 'UNK_UNK_5171825': 28,
 'UNK_UNK_4012445': 29,
 'UNK_UNK_4139137': 30,
 'UNK_UNK_4253878': 31,
 'UNK_UNK_4411217': 32,
 'UNK_UNK_4698554': 33,
 'UNK_UNK_4125609': 34,
 'UNK_UNK_4351595': 35,
 'UNK_UNK_4909872': 36,
 'UNK_UNK_4111302': 37,
 'UNK_UNK_4456095': 38,
 'UNK_UNK_4021394': 39,
 'UNK_UNK_4183578': 40,
 'UNK_UNK_4412015': 41,
 '

In [11]:
# Find synonyms
target_word = "leukemia"
synonyms = find_synonyms_fast(pmi_output_file, term_index, target_word)

# Output results
print(f"Top synonyms for '{target_word}':")
for word, score in synonyms:
    print(f"{word}: {score:.4f}")

Top synonyms for 'leukemia':
[m]leukemia nos (morphologic abnormality): 1.0000
blood (leukemia): 1.0000
leucocythemias: 1.0000
[m]leukemia unspecified, nos (morphologic abnormality): 1.0000
leukemia (disorder): 1.0000


this next code chunk is trying to optimize memory

In [None]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix, csr_matrix
from sparse_dot_topn import awesome_cossim_topn  # Install using: pip install sparse-dot-topn

### 1️⃣ Load Mapping Files ###
def load_term_id_to_string(file_paths, include_concepts=True):
    """ Load multiple term ID to string mappings and merge them into a single dictionary. """
    term_id_map = {}
    concept_id_map = {}  # For concept mappings

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                try:
                    term_id, term_str = line.strip().split("\t")

                    # If it's a concept mapping file, store separately
                    if include_concepts and "concept" in file_path.lower():
                        concept_id_map[term_id] = term_str
                    else:
                        term_id_map[term_id] = term_str
                except ValueError:
                    continue  # Skip malformed lines

    return term_id_map, concept_id_map


def load_singleton_frequencies(file_path, concept_to_term_map, term_id_map, chunk_size=1000):
    """ Load singleton frequencies in chunks and map concept IDs to term strings """
    singleton_freq = {}

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            concept_id, count = line.strip().split("\t")
            count = int(count)

            # Get term IDs associated with this concept
            term_ids = concept_to_term_map.get(concept_id, [f"UNK_{concept_id}"])

            # Convert term IDs to actual term names
            terms = [term_id_map.get(term_id, f"UNK_{term_id}") for term_id in term_ids]

            # Store frequencies for all mapped terms
            for term in terms:
                singleton_freq[term] = singleton_freq.get(term, 0) + count  # Sum counts if needed

            if (i + 1) % chunk_size == 0:  # Process in chunks
                print(f"Processed {i + 1} lines")
                # Optionally, write intermediate results to disk here to manage memory

    return singleton_freq


def load_cofreq_counts(file_path, concept_to_term_map, term_id_map, output_file="temp_cofreq.txt", chunk_size=1000):
    """ Process cofrequency counts in a memory-efficient way for up to chunk_size entries. """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f, open(output_file, 'w', encoding='utf-8') as out_f:
        for i, line in enumerate(f):
            concept1_id, concept2_id, count = line.strip().split("\t")
            count = int(count)

            # Convert concept IDs into actual term names
            terms1 = [term_id_map.get(t, f"UNK_{t}") for t in concept_to_term_map.get(concept1_id, [concept1_id])]
            terms2 = [term_id_map.get(t, f"UNK_{t}") for t in concept_to_term_map.get(concept2_id, [concept2_id])]

            # Write mapped term pairs to file
            for term1 in terms1:
                for term2 in terms2:
                    if term1 != term2:  # Avoid self-pairs
                        out_f.write(f"{term1}\t{term2}\t{count}\n")

            if (i + 1) % chunk_size == 0:  # Process in chunks
                print(f"Processed {i + 1} lines")
                # Optionally, write intermediate results to disk here to manage memory

    return output_file  # Return the file path


def compute_npmi(cofreq_file, singleton_freq, total_count, term_index, matrix_size, k=1, output_pmi="temp_pmi.npz"):
    """ Compute Normalized PMI in a memory-efficient way by writing directly to a sparse matrix file. """
    pmi_matrix = lil_matrix((matrix_size, matrix_size), dtype=np.float32)

    with open(cofreq_file, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            term1, term2, co_count = line.strip().split("\t")
            co_count = int(co_count)

            if term1 not in term_index or term2 not in term_index:
                continue  # Skip unknown terms

            i, j = term_index[term1], term_index[term2]
            p_x = singleton_freq.get(term1, 1) / total_count
            p_y = singleton_freq.get(term2, 1) / total_count
            p_xy = co_count / total_count

            # Compute PMI
            pmi = np.log2((p_xy + k) / (p_x * p_y + k))
            npmi = pmi / -np.log2(p_xy + k) if p_xy > 0 else 0

            pmi_matrix[i, j] = npmi
            pmi_matrix[j, i] = npmi  # PMI is symmetric

    # Save PMI as a compressed sparse matrix
    csr_pmi = pmi_matrix.tocsr()
    np.savez(output_pmi, data=csr_pmi.data, indices=csr_pmi.indices, indptr=csr_pmi.indptr, shape=csr_pmi.shape)

    return output_pmi  # Return file path instead of keeping it in memory


def find_synonyms_fast(pmi_file, term_index, target_word, top_n=5):
    """ Efficient synonym retrieval using a stored PMI matrix. """
    if target_word not in term_index:
        print(f"Word '{target_word}' not found in vocabulary.")
        return []

    # Load sparse PMI matrix from file
    loaded = np.load(pmi_file)
    pmi_matrix = csr_matrix((loaded["data"], loaded["indices"], loaded["indptr"]), shape=loaded["shape"])

    words = list(term_index.keys())
    target_idx = term_index[target_word]

    # Compute sparse cosine similarity
    similarities = cosine_similarity(pmi_matrix[target_idx], pmi_matrix)[0]
    sorted_indices = np.argsort(similarities)[::-1]  # Descending order

    return [(words[i], similarities[i]) for i in sorted_indices if words[i] != target_word][:top_n]

### 5️⃣ Filter Synonyms Using Concept ID ###
def filter_synonyms_by_concept(synonyms, concept_id_map, target_word):
    """ Ensure synonyms belong to the same concept cluster. """
    target_concept = concept_id_map.get(target_word, None)
    if not target_concept:
        return synonyms  # No concept info available

    return [(word, score) for word, score in synonyms if concept_id_map.get(word) == target_concept]

def load_concept_to_term_map(file_path):
    """ Load concept-to-term mappings where multiple terms can map to the same concept. """
    concept_to_term_map = defaultdict(list)

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                term_id, concept_id = line.strip().split("\t")
                concept_to_term_map[concept_id].append(term_id)  # Store all terms for a concept
            except ValueError:
                continue  # Skip malformed lines

    return concept_to_term_map


if __name__ == "__main__":
    # File Paths
    mapping_files = [
        "../3_ID_Mappings/1_term_ID_to_string.txt",
        "../3_ID_Mappings/2a_concept_ID_to_string.txt"
    ]
    concept_map_file = "../3_ID_Mappings/3_term_ID_to_concept_ID.txt"
    singleton_file = "../2_Singleton_Frequency_Counts/singlets_concepts_perBin_1d.txt"
    cofreq_file = "../1_Cofrequency_Counts/cofreqs_concepts_perBin_1d.txt"

    # Load mappings
    term_id_map, _ = load_term_id_to_string(mapping_files)
    concept_to_term_map = load_concept_to_term_map(concept_map_file)

    # Load Data (Only First 100 Lines)
    singleton_freq = load_singleton_frequencies(singleton_file, concept_to_term_map, term_id_map)
    cofreq_output_file = load_cofreq_counts(cofreq_file, concept_to_term_map, term_id_map)

    total_count = sum(singleton_freq.values())

    # Compute PMI (Streaming, Small Dataset)
    term_index = {term: i for i, term in enumerate(singleton_freq.keys())}
    matrix_size = len(term_index)

    pmi_output_file = compute_npmi(cofreq_output_file, singleton_freq, total_count, term_index, matrix_size)

    # Find synonyms
    target_word = "leukemia"
    synonyms = find_synonyms_fast(pmi_output_file, term_index, target_word)

    # Output results
    print(f"Top synonyms for '{target_word}':")
    for word, score in synonyms:
        print(f"{word}: {score:.4f}")


Processed 1000 lines
Processed 2000 lines
Processed 3000 lines
Processed 4000 lines
Processed 5000 lines
Processed 6000 lines
Processed 7000 lines
Processed 8000 lines
Processed 9000 lines
Processed 10000 lines
Processed 11000 lines
Processed 12000 lines
Processed 13000 lines
Processed 14000 lines
Processed 15000 lines
Processed 16000 lines
Processed 17000 lines
Processed 18000 lines
Processed 19000 lines
Processed 20000 lines
Processed 21000 lines
Processed 22000 lines
Processed 23000 lines
Processed 1000 lines
Processed 2000 lines
Processed 3000 lines
Processed 4000 lines
Processed 5000 lines
Processed 6000 lines
Processed 7000 lines
Processed 8000 lines
Processed 9000 lines
Processed 10000 lines
Processed 11000 lines
Processed 12000 lines
Processed 13000 lines
Processed 14000 lines
Processed 15000 lines
Processed 16000 lines
Processed 17000 lines
Processed 18000 lines
Processed 19000 lines
Processed 20000 lines
Processed 21000 lines
Processed 22000 lines
Processed 23000 lines
Proces

This next chunk is to try embedding (work on it tomorrow)

In [None]:
import numpy as np
import gensim
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix, csr_matrix
from sparse_dot_topn import awesome_cossim_topn  # Install using: pip install sparse-dot-topn

### 1️⃣ Load Pre-Trained Embeddings (GloVe, FastText, etc.) ###
def load_embeddings(embedding_path):
    """ Load pre-trained word embeddings (GloVe, FastText, etc.) """
    print(f"Loading word embeddings from {embedding_path}...")
    try:
        word_vectors = gensim.models.KeyedVectors.load_word2vec_format(embedding_path, binary=False)
        print("Word embeddings loaded successfully!")
        return word_vectors
    except Exception as e:
        print(f"Error loading embeddings: {e}")
        return None


### 2️⃣ Load Mapping Files ###
def load_term_id_to_string(file_paths, include_concepts=True):
    """ Load multiple term ID to string mappings and merge them into a single dictionary. """
    term_id_map = {}
    concept_id_map = {}  # For concept mappings

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                try:
                    term_id, term_str = line.strip().split("\t")

                    # If it's a concept mapping file, store separately
                    if include_concepts and "concept" in file_path.lower():
                        concept_id_map[term_id] = term_str
                    else:
                        term_id_map[term_id] = term_str
                except ValueError:
                    continue  # Skip malformed lines

    return term_id_map, concept_id_map


### 3️⃣ Compute Hybrid Similarity (PMI + Embeddings) ###
def hybrid_similarity(pmi_file, term_index, embeddings, target_word, alpha=0.5, top_n=5):
    """
    Compute synonym similarity using a combination of PMI and word embeddings.
    alpha: 0.5 means equal weight for PMI and embeddings; adjust as needed.
    """
    if target_word not in term_index:
        print(f"Word '{target_word}' not found in vocabulary.")
        return []

    # Load sparse PMI matrix
    loaded = np.load(pmi_file)
    pmi_matrix = csr_matrix((loaded["data"], loaded["indices"], loaded["indptr"]), shape=loaded["shape"])
    
    words = list(term_index.keys())
    target_idx = term_index[target_word]

    # Compute sparse cosine similarity for PMI
    pmi_similarities = cosine_similarity(pmi_matrix[target_idx], pmi_matrix)[0]

    # Compute word embedding similarities
    if embeddings and target_word in embeddings:
        word_emb_similarities = []
        for word in words:
            if word in embeddings:
                sim = embeddings.similarity(target_word, word)
            else:
                sim = 0  # Default to 0 if word not in embedding
            word_emb_similarities.append(sim)
        word_emb_similarities = np.array(word_emb_similarities)
    else:
        word_emb_similarities = np.zeros(len(words))  # No embeddings available

    # Combine PMI and embedding similarity
    final_similarities = alpha * pmi_similarities + (1 - alpha) * word_emb_similarities

    # Rank by similarity
    sorted_indices = np.argsort(final_similarities)[::-1]
    return [(words[i], final_similarities[i]) for i in sorted_indices if words[i] != target_word][:top_n]


### 🔥 Main Execution ###
if __name__ == "__main__":
    # File Paths
    mapping_files = [
        "../3_ID_Mappings/1_term_ID_to_string.txt",
        "../3_ID_Mappings/2a_concept_ID_to_string.txt"
    ]
    concept_map_file = "../3_ID_Mappings/3_term_ID_to_concept_ID.txt"
    singleton_file = "../2_Singleton_Frequency_Counts/singlets_concepts_perBin_1d.txt"
    cofreq_file = "../1_Cofrequency_Counts/cofreqs_concepts_perBin_1d.txt"
    embedding_path = "glove.6B.100d.txt"  # Update to your pre-trained embeddings file

    # Load embeddings
    word_vectors = load_embeddings(embedding_path)

    # Load mappings
    term_id_map, _ = load_term_id_to_string(mapping_files)
    concept_to_term_map = load_concept_to_term_map(concept_map_file)

    # Load Data
    singleton_freq = load_singleton_frequencies(singleton_file, concept_to_term_map, term_id_map)
    cofreq_output_file = load_cofreq_counts(cofreq_file, concept_to_term_map, term_id_map, max_lines=1000000)

    total_count = sum(singleton_freq.values())

    # Compute PMI (Streaming, Small Dataset)
    term_index = {term: i for i, term in enumerate(singleton_freq.keys())}
    matrix_size = len(term_index)

    pmi_output_file = compute_npmi(cofreq_output_file, singleton_freq, total_count, term_index, matrix_size)

    # Find synonyms using hybrid similarity
    target_word = "leukemia"
    synonyms = hybrid_similarity(pmi_output_file, term_index, word_vectors, target_word)

    # Output results
    print(f"Top synonyms for '{target_word}':")
    for word, score in synonyms:
        print(f"{word}: {score:.4f}")
