<a href="https://colab.research.google.com/github/bradyprice/CSE-6250-Final-Project/blob/main/final_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from google.colab import drive

# Show where the kernel starts so the directory change below is visible in the output.
print("Current Working Directory:", os.getcwd())

# Mount Google Drive into the Colab filesystem (`drive` comes from google.colab).
drive.mount('/content/gdrive')

# Move into the project folder on Drive. The data files used later in this
# notebook are opened by bare file name, so those cells rely on this chdir.
# Edit `path` if the project folder lives somewhere else in your Drive.
path = "/content/gdrive/My Drive/CSE_6250_BD4H_Final_Project"
os.chdir(path)

# Confirm the switch took effect.
print("Working Directory:", os.getcwd())


Current Working Directory: /content
Mounted at /content/gdrive
Working Directory: /content/gdrive/My Drive/CSE_6250_BD4H_Final_Project


This notebook requires the `sparse_dot_topn` package; install it with the cell below.

In [2]:
!pip install sparse_dot_topn

Collecting sparse_dot_topn
  Downloading sparse_dot_topn-1.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading sparse_dot_topn-1.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (266 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.7/266.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sparse_dot_topn
Successfully installed sparse_dot_topn-1.1.5


This section implements the context component of SurfCon using an NPMI matrix. We wanted this piece working on its own before expanding the rest of the model.

## Context Part of SurfCon

In [None]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix, csr_matrix
from sparse_dot_topn import awesome_cossim_topn  # Install using: pip install sparse-dot-topn
from scipy.sparse import dok_matrix, csr_matrix

### 1️⃣ Load Mapping Files ###
def load_term_id_to_string(file_paths, include_concepts=True):
    """Read tab-separated ``ID<TAB>string`` files into two lookup dictionaries.

    Args:
        file_paths: Paths to read, in order.
        include_concepts: When true, rows from any file whose path contains
            "concept" (case-insensitive) are stored in the concept dictionary
            instead of the term dictionary.

    Returns:
        ``(term_id_map, concept_id_map)``; both map an ID string to its text.
        A later row overwrites an earlier row with the same ID. Lines that do
        not split into exactly two tab-separated fields are ignored.
    """
    term_id_map, concept_id_map = {}, {}

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split("\t")
                if len(fields) != 2:
                    continue  # malformed row: wrong number of columns
                key, text = fields

                # The destination is decided by the file's path, not by the row.
                is_concept_row = include_concepts and "concept" in file_path.lower()
                destination = concept_id_map if is_concept_row else term_id_map
                destination[key] = text

    return term_id_map, concept_id_map


def load_singleton_frequencies(file_path, concept_to_term_map, term_id_map, chunk_size=1000):
    """Sum per-concept occurrence counts onto the term strings of each concept.

    Args:
        file_path: File with one ``concept_id<TAB>count`` row per line.
        concept_to_term_map: Concept ID -> list of term IDs.
        term_id_map: Term ID -> term string.
        chunk_size: A progress message is printed at multiples of this many lines.

    Returns:
        Dict mapping each term string to the summed count of every concept it
        belongs to. A term ID with no string is reported as ``UNK_<term_id>``;
        a concept with no term mapping is reported as ``UNK_<concept_id>``
        (the same placeholder form ``load_cofreq_counts`` writes for such
        concepts, so both outputs can be joined on it).

    Blank lines, rows with the wrong number of columns and rows whose count is
    not an integer are skipped, as the mapping loaders already do.
    """
    singleton_freq = {}

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            try:
                concept_id, count = line.strip().split("\t")
                count = int(count)
            except ValueError:
                continue  # Skip malformed lines instead of aborting the whole load

            term_ids = concept_to_term_map.get(concept_id)
            if term_ids is None:
                # Unmapped concept: label it once, directly. Routing the
                # placeholder through term_id_map would prefix it a second time.
                terms = [f"UNK_{concept_id}"]
            else:
                terms = [term_id_map.get(term_id, f"UNK_{term_id}") for term_id in term_ids]

            # Every term of the concept receives the concept's full count.
            for term in terms:
                singleton_freq[term] = singleton_freq.get(term, 0) + count

            if (i + 1) % chunk_size == 0:
                print(f"Processed {i + 1} lines")

    return singleton_freq


def load_cofreq_counts(file_path, concept_to_term_map, term_id_map, output_file="temp_cofreq.txt", chunk_size=1000):
    """Expand concept-level co-occurrence counts into term-level pairs on disk.

    The whole input file is streamed line by line; results are written to
    `output_file` rather than held in memory.

    Args:
        file_path: File with ``concept1_id<TAB>concept2_id<TAB>count`` rows.
        concept_to_term_map: Concept ID -> list of term IDs.
        term_id_map: Term ID -> term string.
        output_file: Destination; overwritten on every call. One
            ``term1<TAB>term2<TAB>count`` row is written per term pair.
        chunk_size: A progress message is printed at multiples of this many lines.

    Returns:
        `output_file`.

    A term ID with no string is written as ``UNK_<term_id>`` and a concept with
    no term mapping as ``UNK_<concept_id>``. Pairs whose two term strings are
    equal are dropped. Blank or malformed rows are skipped.
    """
    def terms_for(concept_id):
        """Return the term strings of one concept, or its UNK placeholder."""
        term_ids = concept_to_term_map.get(concept_id)
        if term_ids is None:
            # Do not look the concept ID up in term_id_map: that map is keyed
            # by term ID, so a coinciding key would attach an unrelated term.
            return [f"UNK_{concept_id}"]
        return [term_id_map.get(t, f"UNK_{t}") for t in term_ids]

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f, open(output_file, 'w', encoding='utf-8') as out_f:
        for i, line in enumerate(f):
            try:
                concept1_id, concept2_id, count = line.strip().split("\t")
                count = int(count)
            except ValueError:
                continue  # Skip malformed lines instead of aborting the whole run

            terms1 = terms_for(concept1_id)
            terms2 = terms_for(concept2_id)

            # Cartesian product of the two concepts' terms, each with the full count.
            for term1 in terms1:
                for term2 in terms2:
                    if term1 != term2:  # Avoid self-pairs
                        out_f.write(f"{term1}\t{term2}\t{count}\n")

            if (i + 1) % chunk_size == 0:
                print(f"Processed {i + 1} lines")

    return output_file  # Return the file path


import numpy as np
from scipy.sparse import dok_matrix, csr_matrix

def compute_npmi(cofreq_file, singleton_freq, total_count, term_index, matrix_size, k=1, output_pmi="temp_pmi.npz", chunk_size=100000, max_rows=1000000):
    """
    Build a symmetric sparse matrix of smoothed NPMI-style scores and save it to disk.

    Each usable line of `cofreq_file` (``term1<TAB>term2<TAB>count``) yields one
    score that is written to both ``[term1, term2]`` and ``[term2, term1]``; a
    pair that appears more than once keeps the score from its last line.
    Lines that are blank, have the wrong number of columns, carry a
    non-integer count, or mention a term missing from `term_index` are skipped.

    Args:
        cofreq_file: Path of the term-level co-occurrence file.
        singleton_freq: Term -> occurrence count; a missing term counts as 1.
        total_count: Denominator that turns counts into probabilities.
        term_index: Term -> row/column position in the matrix.
        matrix_size: Number of rows (and columns) of the square matrix.
        k: Constant added to the joint probability and to the product of the
            marginals before the logarithms are taken.
        output_pmi: Path handed to ``np.savez``; the CSR arrays are stored
            under the keys ``data``, ``indices``, ``indptr`` and ``shape``.
        chunk_size: Progress is reported at multiples of this many lines.
        max_rows: Only the first `max_rows` lines of the file are read;
            ``None`` reads the whole file.

    Returns:
        `output_pmi`, so the caller can reload the matrix on demand instead of
        keeping it in memory.
    """
    limit_label = "all" if max_rows is None else max_rows
    print(f"Initializing sparse PMI matrix (limiting to {limit_label} rows)...")
    pmi_matrix = dok_matrix((matrix_size, matrix_size), dtype=np.float32)

    print(f"Processing up to {limit_label} rows from co-occurrence file in chunks...")

    with open(cofreq_file, 'r', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            if max_rows is not None and i >= max_rows:
                break

            try:
                term1, term2, co_count = line.strip().split("\t")
                co_count = int(co_count)
            except ValueError:
                continue  # Skip malformed lines instead of aborting the run

            if term1 not in term_index or term2 not in term_index:
                continue  # Skip unknown terms

            i_idx, j_idx = term_index[term1], term_index[term2]
            p_x = singleton_freq.get(term1, 1) / total_count
            p_y = singleton_freq.get(term2, 1) / total_count
            p_xy = co_count / total_count

            # NOTE(review): textbook NPMI is log2(p_xy / (p_x * p_y)) / -log2(p_xy).
            # Adding k to the probabilities (default k=1) makes -log2(p_xy + k)
            # negative, so the sign convention differs from published NPMI.
            # The formula is kept as-is; confirm the intended smoothing.
            pmi = np.log2((p_xy + k) / (p_x * p_y + k))
            npmi = pmi / -np.log2(p_xy + k) if p_xy > 0 else 0

            # Written straight into the DOK matrix: an intermediate batch list
            # would only delay the same assignments.
            pmi_matrix[i_idx, j_idx] = npmi
            pmi_matrix[j_idx, i_idx] = npmi  # PMI is symmetric

            if (i + 1) % chunk_size == 0:
                print(f"Processed {i + 1} lines...")

    print("Converting PMI matrix to CSR format...")
    csr_pmi = pmi_matrix.tocsr()

    print("Saving PMI matrix to disk...")
    np.savez(output_pmi, data=csr_pmi.data, indices=csr_pmi.indices, indptr=csr_pmi.indptr, shape=csr_pmi.shape)

    print("PMI computation complete!")
    return output_pmi  # Return file path instead of keeping it in memory


def find_synonyms_fast(pmi_file, term_index, target_word, top_n=5):
    """Rank vocabulary terms by cosine similarity of their stored PMI rows.

    Args:
        pmi_file: ``.npz`` file holding the CSR arrays ``data``, ``indices``,
            ``indptr`` and ``shape`` (the layout ``compute_npmi`` saves).
        term_index: Term -> row position in the matrix.
        target_word: Term whose neighbours are wanted.
        top_n: Maximum number of results.

    Returns:
        Up to `top_n` ``(term, similarity)`` tuples in descending similarity,
        excluding `target_word` itself. An empty list (after printing a
        message) when `target_word` is not in `term_index`.
    """
    if target_word not in term_index:
        print(f"Word '{target_word}' not found in vocabulary.")
        return []

    # Load sparse PMI matrix from file; the context manager closes the archive
    # handle once the arrays have been read.
    with np.load(pmi_file) as loaded:
        pmi_matrix = csr_matrix(
            (loaded["data"], loaded["indices"], loaded["indptr"]),
            shape=tuple(int(n) for n in loaded["shape"]),
        )

    # Label rows by their stored index rather than by dict insertion order, so
    # the result is right even if term_index was not built in row order.
    index_to_term = {idx: term for term, idx in term_index.items()}
    target_idx = term_index[target_word]

    # Compute sparse cosine similarity
    similarities = cosine_similarity(pmi_matrix[target_idx], pmi_matrix)[0]
    sorted_indices = np.argsort(similarities)[::-1]  # Descending order

    # Stop as soon as enough neighbours are collected instead of building a
    # vocabulary-sized list and slicing it.
    results = []
    for i in sorted_indices:
        if len(results) >= top_n:
            break
        word = index_to_term.get(int(i))
        if word is None or word == target_word:
            continue  # row without a label, or the query term itself
        results.append((word, similarities[i]))
    return results

### 5️⃣ Filter Synonyms Using Concept ID ###
def filter_synonyms_by_concept(synonyms, concept_id_map, target_word):
    """Keep only the candidates that map to the same concept as the target.

    Args:
        synonyms: Iterable of ``(word, score)`` pairs.
        concept_id_map: Mapping queried with `target_word` and with each
            candidate word. NOTE(review): the maps built by
            ``load_term_id_to_string`` are keyed by ID, not by word -- verify
            the caller passes a word-keyed mapping.
        target_word: Word whose concept acts as the filter.

    Returns:
        `synonyms` itself when the target has no (truthy) concept entry,
        otherwise a new list of the matching ``(word, score)`` pairs.
    """
    anchor = concept_id_map.get(target_word)
    if anchor:
        kept = []
        for word, score in synonyms:
            if concept_id_map.get(word) == anchor:
                kept.append((word, score))
        return kept

    return synonyms  # No concept info available

def load_concept_to_term_map(file_path):
    """Invert a ``term_id<TAB>concept_id`` file into concept -> list of term IDs.

    Term IDs are appended in file order. Lines that do not split into exactly
    two tab-separated fields are ignored.

    Returns:
        A ``defaultdict(list)`` keyed by concept ID.
    """
    terms_by_concept = defaultdict(list)

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            columns = raw_line.strip().split("\t")
            if len(columns) == 2:
                term_id, concept_id = columns
                terms_by_concept[concept_id].append(term_id)

    return terms_by_concept


if __name__ == "__main__":
    # Input files, given by bare name and therefore resolved against the
    # current working directory.
    mapping_files = [
        "1_term_ID_to_string.txt",
        "2a_concept_ID_to_string.txt"
    ]
    concept_map_file = "3_term_ID_to_concept_ID.txt"
    singleton_file = "singlets_concepts_perBin_1d.txt"
    cofreq_file = "cofreqs_concepts_perBin_1d.txt"

    # Load mappings (the concept-ID -> string map is not needed here, hence `_`)
    term_id_map, _ = load_term_id_to_string(mapping_files)
    concept_to_term_map = load_concept_to_term_map(concept_map_file)

    # Load data. Both loaders read their input file in full; the co-occurrence
    # loader writes its term-level output to disk and returns that file's path.
    singleton_freq = load_singleton_frequencies(singleton_file, concept_to_term_map, term_id_map)
    cofreq_output_file = load_cofreq_counts(cofreq_file, concept_to_term_map, term_id_map)

    # Denominator used to turn counts into probabilities.
    total_count = sum(singleton_freq.values())

    # Compute the NPMI matrix. Row/column positions follow the insertion order
    # of singleton_freq; only the first `max_rows` co-occurrence rows are used.
    term_index = {term: i for i, term in enumerate(singleton_freq.keys())}
    matrix_size = len(term_index)

    pmi_output_file = compute_npmi(cofreq_output_file, singleton_freq, total_count, term_index, matrix_size, max_rows=1000000)


    # Find synonyms
    target_word = "leukemia"
    synonyms = find_synonyms_fast(pmi_output_file, term_index, target_word)

    # Output results
    print(f"Top synonyms for '{target_word}':")
    for word, score in synonyms:
        print(f"{word}: {score:.4f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed 3577000 lines
Processed 3578000 lines
Processed 3579000 lines
Processed 3580000 lines
Processed 3581000 lines
Processed 3582000 lines
Processed 3583000 lines
Processed 3584000 lines
Processed 3585000 lines
Processed 3586000 lines
Processed 3587000 lines
Processed 3588000 lines
Processed 3589000 lines
Processed 3590000 lines
Processed 3591000 lines
Processed 3592000 lines
Processed 3593000 lines
Processed 3594000 lines
Processed 3595000 lines
Processed 3596000 lines
Processed 3597000 lines
Processed 3598000 lines
Processed 3599000 lines
Processed 3600000 lines
Processed 3601000 lines
Processed 3602000 lines
Processed 3603000 lines
Processed 3604000 lines
Processed 3605000 lines
Processed 3606000 lines
Processed 3607000 lines
Processed 3608000 lines
Processed 3609000 lines
Processed 3610000 lines
Processed 3611000 lines
Processed 3612000 lines
Processed 3613000 lines
Processed 3614000 lines
Processed 3615000 lines

## CharNGram - Surface

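This section builds surface-form embeddings from character n-grams (CharNGram). The extraction cell below only needs to be run once. Before running it, download `jmt_pre-trained_embeddings.tar.gz` from https://github.com/hassyGo/charNgram2vec/releases and place it in the project folder.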

In [None]:
!gzip -d "/content/gdrive/My Drive/CSE_6250_BD4H_Final_Project/jmt_pre-trained_embeddings.tar.gz"
!tar -xvf "/content/gdrive/My Drive/CSE_6250_BD4H_Final_Project/jmt_pre-trained_embeddings.tar" "charNgram.txt" -C "/content/gdrive/My Drive/CSE_6250_BD4H_Final_Project/"


In [3]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix, csr_matrix
from sparse_dot_topn import awesome_cossim_topn  # Install using: pip install sparse-dot-topn
from scipy.sparse import dok_matrix, csr_matrix

### 1️⃣ Load Mapping Files ###
def load_term_id_to_string(file_paths, include_concepts=True):
    """Read tab-separated ``ID<TAB>string`` files into two lookup dictionaries.

    Args:
        file_paths: Paths to read, in order.
        include_concepts: When true, rows from any file whose path contains
            "concept" (case-insensitive) are stored in the concept dictionary
            instead of the term dictionary.

    Returns:
        ``(term_id_map, concept_id_map)``; both map an ID string to its text.
        A later row overwrites an earlier row with the same ID. Lines that do
        not split into exactly two tab-separated fields are ignored.
    """
    term_id_map, concept_id_map = {}, {}

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split("\t")
                if len(fields) != 2:
                    continue  # malformed row: wrong number of columns
                key, text = fields

                # The destination is decided by the file's path, not by the row.
                is_concept_row = include_concepts and "concept" in file_path.lower()
                destination = concept_id_map if is_concept_row else term_id_map
                destination[key] = text

    return term_id_map, concept_id_map



### 5️⃣ Filter Synonyms Using Concept ID ###
def filter_synonyms_by_concept(synonyms, concept_id_map, target_word):
    """Keep only the candidates that map to the same concept as the target.

    Args:
        synonyms: Iterable of ``(word, score)`` pairs.
        concept_id_map: Mapping queried with `target_word` and with each
            candidate word. NOTE(review): the maps built by
            ``load_term_id_to_string`` are keyed by ID, not by word -- verify
            the caller passes a word-keyed mapping.
        target_word: Word whose concept acts as the filter.

    Returns:
        `synonyms` itself when the target has no (truthy) concept entry,
        otherwise a new list of the matching ``(word, score)`` pairs.
    """
    anchor = concept_id_map.get(target_word)
    if anchor:
        kept = []
        for word, score in synonyms:
            if concept_id_map.get(word) == anchor:
                kept.append((word, score))
        return kept

    return synonyms  # No concept info available

def load_concept_to_term_map(file_path):
    """Invert a ``term_id<TAB>concept_id`` file into concept -> list of term IDs.

    Term IDs are appended in file order. Lines that do not split into exactly
    two tab-separated fields are ignored.

    Returns:
        A ``defaultdict(list)`` keyed by concept ID.
    """
    terms_by_concept = defaultdict(list)

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            columns = raw_line.strip().split("\t")
            if len(columns) == 2:
                term_id, concept_id = columns
                terms_by_concept[concept_id].append(term_id)

    return terms_by_concept

def extract_char_ngrams(term, min_n=3, max_n=13):
    """List every character n-gram of ``<term>`` for n in ``min_n..max_n``.

    The term is wrapped in ``<`` and ``>`` before slicing, and each n-gram is
    emitted as ``"{n}gram-{text}"``. Sizes longer than the wrapped term simply
    contribute nothing. Output is ordered by size, then by start position.

    NOTE(review): pre-trained n-gram tables may mark word boundaries with
    something other than ``<`` / ``>`` -- confirm against the embedding file.
    """
    padded = f"<{term}>"
    length = len(padded)

    grams = []
    for size in range(min_n, max_n + 1):
        for start in range(length - size + 1):
            grams.append(f"{size}gram-{padded[start:start + size]}")
    return grams


def load_char_ngram_embeddings(file_path):
    """Parse a whitespace-separated embedding file into ``{ngram: float32 vector}``.

    The first token of a line is the n-gram key and the remaining tokens are
    its vector components. Lines with fewer than ten tokens (headers, blanks,
    truncated rows) are ignored.
    """
    vectors_by_gram = {}

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            tokens = raw_line.split()
            if len(tokens) >= 10:
                components = list(map(float, tokens[1:]))
                vectors_by_gram[tokens[0]] = np.array(components, dtype=np.float32)

    return vectors_by_gram

def compute_term_embedding(term, ngram_embeds, min_n=1, max_n=13, dim=100):
    """Average the embeddings of a term's character n-grams.

    The term is lower-cased and stripped first. N-grams missing from
    `ngram_embeds` are ignored; if none is found, a zero vector of length
    `dim` is returned.
    """
    normalized = term.lower().strip()

    found = []
    for gram in extract_char_ngrams(normalized, min_n, max_n):
        if gram in ngram_embeds:
            found.append(ngram_embeds[gram])

    if len(found) == 0:
        return np.zeros(dim)
    return np.mean(found, axis=0)


def build_term_embeddings(term_list, ngram_embeds, dim=100):
    """Return ``{term: embedding}`` with one ``compute_term_embedding`` call per list entry."""
    return {
        term: compute_term_embedding(term, ngram_embeds, dim=dim)
        for term in term_list
    }

def find_surface_form_synonyms(term_vectors, target_term, ngram_embeddings, dim=100, top_n=5):
    """Rank vocabulary terms by cosine similarity to the target's embedding.

    Args:
        term_vectors: Term -> embedding vector for the whole vocabulary.
        target_term: Query term. When absent from `term_vectors`, its
            embedding is computed on the fly from `ngram_embeddings`.
        ngram_embeddings: N-gram -> vector table used for that fallback.
        dim: Embedding length passed to the fallback computation.
        top_n: Maximum number of results.

    Returns:
        Up to `top_n` ``(term, similarity)`` pairs, best first, never including
        `target_term`. An empty list when the target embedding has zero norm.
    """
    if target_term not in term_vectors:
        print(f"'{target_term}' not in vocabulary — computing from n-grams.")
        target_vec = compute_term_embedding(target_term, ngram_embeddings, dim=dim)
    else:
        target_vec = term_vectors[target_term]

    # A zero vector has no direction, so cosine similarity would be meaningless.
    if np.linalg.norm(target_vec) == 0:
        print(f"Could not compute embedding for '{target_term}' (no matching n-grams).")
        return []

    query = target_vec.reshape(1, -1)
    terms = list(term_vectors)
    matrix = np.stack([term_vectors[name] for name in terms])
    sims = cosine_similarity(query, matrix)[0]

    ranked = sorted(zip(terms, sims), key=lambda pair: -pair[1])

    results = []
    for term, sim in ranked:
        if term != target_term:
            results.append((term, sim))
    return results[:top_n]





if __name__ == "__main__":
    # Input files, given by bare name and therefore resolved against the
    # current working directory.
    mapping_files = [
        "1_term_ID_to_string.txt",
        "2a_concept_ID_to_string.txt"
    ]
    concept_map_file = "3_term_ID_to_concept_ID.txt"
    # singleton_file and cofreq_file are assigned but not read in this cell.
    singleton_file = "singlets_concepts_perBin_1d.txt"
    cofreq_file = "cofreqs_concepts_perBin_1d.txt"
    charngram_embedding_file = 'charNgram.txt'

    # Load mappings
    term_id_map, concept_id_map = load_term_id_to_string(mapping_files)
    concept_to_term_map = load_concept_to_term_map(concept_map_file)

    # === Build Term List ===
    # One entry per term ID, so a string shared by several IDs appears repeatedly.
    term_list = list(term_id_map.values())

    # === Load CharNGram Embeddings ===
    ngram_embeddings = load_char_ngram_embeddings(charngram_embedding_file)

    # === Compute Embeddings for All Terms ===
    # term_vectors and ngram_embeddings stay in the kernel namespace; the
    # follow-up query cells below reuse them.
    print("Computing term embeddings...")
    term_vectors = build_term_embeddings(term_list, ngram_embeddings, dim=100)

    # === Find Synonyms ===
    target_word = "leukemia"
    synonyms = find_surface_form_synonyms(term_vectors, target_word, ngram_embeddings, top_n=10)

    print(f"Top CharNGram-based synonyms for '{target_word}':")
    for word, score in synonyms:
        print(f"{word}: {score:.4f}")


Computing term embeddings...
Top CharNGram-based synonyms for 'leukemia':
cns leukemia: 0.9925
leukemias: 0.9921
leukemia of cns: 0.9880
leukemia nos: 0.9879
[m]leukemia nos: 0.9855
leukemias nec: 0.9854
rat leukemia: 0.9842
aleukemic leukemia: 0.9837
leukemia aleukemic: 0.9837
lgl leukemia: 0.9826


In [None]:
    # === Find Synonyms ===
    # Re-runs the query only. Requires `term_vectors` and `ngram_embeddings`
    # from the CharNGram cell above to already be in the kernel namespace.
    target_word = "leukemia"
    synonyms = find_surface_form_synonyms(term_vectors, target_word, ngram_embeddings, top_n=10)

    print(f"Top CharNGram-based synonyms for '{target_word}':")
    for word, score in synonyms:
        print(f"{word}: {score:.4f}")

Top CharNGram-based synonyms for 'leukemia':
cns leukemia: 0.9925
leukemias: 0.9921
leukemia of cns: 0.9880
leukemia nos: 0.9879
[m]leukemia nos: 0.9855
leukemias nec: 0.9854
rat leukemia: 0.9842
aleukemic leukemia: 0.9837
leukemia aleukemic: 0.9837
lgl leukemia: 0.9826


In [None]:
    # === Find Synonyms ===
    # Same query with a different target word. Requires `term_vectors` and
    # `ngram_embeddings` from the CharNGram cell above to already be in the
    # kernel namespace.
    target_word = "cool"
    synonyms = find_surface_form_synonyms(term_vectors, target_word, ngram_embeddings, top_n=10)

    print(f"Top CharNGram-based synonyms for '{target_word}':")
    for word, score in synonyms:
        print(f"{word}: {score:.4f}")



'cool' not in vocabulary — computing from n-grams.
Top CharNGram-based synonyms for 'cool':
cool1: 0.9936
cool2: 0.9925
coolers: 0.9478
li(x)coo2: 0.9468
coos: 0.9459
pscoo: 0.9410
coombs: 0.9404
cool skin: 0.9395
oil bleo: 0.9362
dermacool: 0.9357


## Context - word2vec

Download the GloVe embedding file (`glove.6B.50d.txt`) here: https://www.kaggle.com/datasets/watts2/glove6b50dtxt. The code below converts it to word2vec format on first use.

In [None]:
pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
pip install --upgrade scipy numpy

Collecting scipy
  Downloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting numpy
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.6/37.6 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m102.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, scipy
  Attempting uninstall: numpy
   

In [None]:
pip install --upgrade --force-reinstall gensim

Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Using cached scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38

In [None]:
pip install numpy==1.24.4

Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m96.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.4 which is incompatible.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.4 which is incompatible.
jax 0.5.2 requires n

In [None]:

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import os


### 1️⃣ Load Term ID to String Mapping ###
def load_term_id_to_string(file_paths):
    """Merge several ``ID<TAB>string`` files into one ``{ID: string}`` dictionary.

    Files are read in the given order and a later row overwrites an earlier
    row with the same ID. Lines that do not split into exactly two
    tab-separated fields are ignored.
    """
    id_to_text = {}

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split("\t")
                if len(fields) == 2:
                    id_to_text[fields[0]] = fields[1]

    return id_to_text



### 2️⃣ Convert and Load GloVe Vectors ###
def load_glove_model(glove_input_file, word2vec_output_file):
    """Load GloVe vectors through a cached word2vec-format copy.

    The GloVe text file is converted to `word2vec_output_file` only when that
    file does not exist yet; the converted file is then loaded as text
    (``binary=False``) and the resulting vectors are returned.
    """
    already_converted = os.path.exists(word2vec_output_file)
    if not already_converted:
        # One-off conversion; later calls reuse the file written here.
        print(f"Converting {glove_input_file} to word2vec format...")
        glove2word2vec(glove_input_file, word2vec_output_file)
        print("Conversion done.")

    print(f"Loading vectors from: {word2vec_output_file}")
    vectors = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
    print("Model loaded.")
    return vectors


### 3️⃣ Find Semantic Synonyms ###
def find_word2vec_synonyms(model, target_word, top_n=10):
    """Return the model's `top_n` nearest neighbours of `target_word`.

    The result is whatever ``model.most_similar`` returns. When the word is
    not in ``model.key_to_index`` a message is printed and an empty list is
    returned instead.
    """
    if target_word in model.key_to_index:
        return model.most_similar(target_word, topn=top_n)

    print(f"'{target_word}' not in GloVe vocabulary.")
    return []



### 4️⃣ Main Execution ###
if __name__ == "__main__":
    # The whole driver lives under the guard. Previously only the path
    # constants were inside it while the statements that use them sat at
    # column 0 beneath indented comments.

    # File paths, given by bare name and therefore resolved against the
    # current working directory.
    mapping_files = [
        "1_term_ID_to_string.txt",
        "2a_concept_ID_to_string.txt"
    ]
    glove_input_file = "glove.6B.50d.txt"
    word2vec_output_file = "glove.6B.50d.word2vec.txt"

    # Load term mappings (term and concept files merged into one dictionary)
    term_id_map = load_term_id_to_string(mapping_files)
    term_list = list(term_id_map.values())

    # Convert (first run only) and load GloVe vectors
    model = load_glove_model(glove_input_file, word2vec_output_file)

    # Filter term list to those in the model's vocabulary
    in_vocab_terms = [term for term in term_list if term in model.key_to_index]
    print(f"Number of terms in both your list and the model's vocabulary: {len(in_vocab_terms)}")

    # Find and print synonyms for each target word
    target_words = ["leukemia", "cool"]
    for target_word in target_words:
        print(f"\nTop GloVe-based semantic synonyms for '{target_word}':")
        synonyms = find_word2vec_synonyms(model, target_word, top_n=10)
        for word, score in synonyms:
            print(f"{word}: {score:.4f}")


Loading vectors from: glove.6B.50d.word2vec.txt
Model loaded.
Number of terms in both your list and the model's vocabulary: 25562

Top GloVe-based semantic synonyms for 'leukemia':
leukaemia: 0.8707
myeloid: 0.8264
lymphoma: 0.8256
diagnosed: 0.8164
alzheimer: 0.8011
diabetes: 0.7896
parkinson: 0.7881
myelogenous: 0.7858
cancer: 0.7851
schizophrenia: 0.7781

Top GloVe-based semantic synonyms for 'cool':
hot: 0.8605
warm: 0.8107
cold: 0.8104
bit: 0.7934
dry: 0.7666
cooler: 0.7619
little: 0.7598
mix: 0.7533
soft: 0.7454
bright: 0.7423


## SurfCon Final

In [None]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix, csr_matrix
from sparse_dot_topn import awesome_cossim_topn  # Install using: pip install sparse-dot-topn
from scipy.sparse import dok_matrix, csr_matrix

# Load Term and Concept Mapping Files
def load_term_id_to_string(file_paths, include_concepts=True):
    """Read tab-separated ``ID<TAB>string`` files into two lookup dictionaries.

    Args:
        file_paths: Paths to read, in order.
        include_concepts: When true, rows from any file whose path contains
            "concept" (case-insensitive) are stored in the concept dictionary
            instead of the term dictionary.

    Returns:
        ``(term_id_map, concept_id_map)``; both map an ID string to its text.
        A later row overwrites an earlier row with the same ID. Lines that do
        not split into exactly two tab-separated fields are ignored.
    """
    term_id_map, concept_id_map = {}, {}

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split("\t")
                if len(fields) != 2:
                    continue  # malformed row: wrong number of columns
                key, text = fields

                # The destination is decided by the file's path, not by the row.
                is_concept_row = include_concepts and "concept" in file_path.lower()
                destination = concept_id_map if is_concept_row else term_id_map
                destination[key] = text

    return term_id_map, concept_id_map


# Load Singleton Frequencies
def load_singleton_frequencies(file_path, concept_to_term_map, term_id_map, chunk_size=1000):
    """Sum per-concept occurrence counts onto the term strings of each concept.

    Args:
        file_path: File with one ``concept_id<TAB>count`` row per line.
        concept_to_term_map: Concept ID -> list of term IDs.
        term_id_map: Term ID -> term string.
        chunk_size: A progress message is printed at multiples of this many lines.

    Returns:
        Dict mapping each term string to the summed count of every concept it
        belongs to. A term ID with no string is reported as ``UNK_<term_id>``;
        a concept with no term mapping is reported as ``UNK_<concept_id>``
        (the same placeholder form ``load_cofreq_counts`` writes for such
        concepts, so both outputs can be joined on it).

    Blank lines, rows with the wrong number of columns and rows whose count is
    not an integer are skipped, as the mapping loaders already do.
    """
    singleton_freq = {}

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            try:
                concept_id, count = line.strip().split("\t")
                count = int(count)
            except ValueError:
                continue  # Skip malformed lines instead of aborting the whole load

            term_ids = concept_to_term_map.get(concept_id)
            if term_ids is None:
                # Unmapped concept: label it once, directly. Routing the
                # placeholder through term_id_map would prefix it a second time.
                terms = [f"UNK_{concept_id}"]
            else:
                terms = [term_id_map.get(term_id, f"UNK_{term_id}") for term_id in term_ids]

            # Every term of the concept receives the concept's full count.
            for term in terms:
                singleton_freq[term] = singleton_freq.get(term, 0) + count

            if (i + 1) % chunk_size == 0:
                print(f"Processed {i + 1} lines")

    return singleton_freq

# Load Co-occurrence Frequencies
def load_cofreq_counts(file_path, concept_to_term_map, term_id_map, output_file="temp_cofreq.txt", chunk_size=1000):
    """Expand concept-level co-occurrence counts into term-level pairs on disk.

    The whole input file is streamed line by line; results are written to
    `output_file` rather than held in memory.

    Args:
        file_path: File with ``concept1_id<TAB>concept2_id<TAB>count`` rows.
        concept_to_term_map: Concept ID -> list of term IDs.
        term_id_map: Term ID -> term string.
        output_file: Destination; overwritten on every call. One
            ``term1<TAB>term2<TAB>count`` row is written per term pair.
        chunk_size: A progress message is printed at multiples of this many lines.

    Returns:
        `output_file`.

    A term ID with no string is written as ``UNK_<term_id>`` and a concept with
    no term mapping as ``UNK_<concept_id>``. Pairs whose two term strings are
    equal are dropped. Blank or malformed rows are skipped.
    """
    def terms_for(concept_id):
        """Return the term strings of one concept, or its UNK placeholder."""
        term_ids = concept_to_term_map.get(concept_id)
        if term_ids is None:
            # Do not look the concept ID up in term_id_map: that map is keyed
            # by term ID, so a coinciding key would attach an unrelated term.
            return [f"UNK_{concept_id}"]
        return [term_id_map.get(t, f"UNK_{t}") for t in term_ids]

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f, open(output_file, 'w', encoding='utf-8') as out_f:
        for i, line in enumerate(f):
            try:
                concept1_id, concept2_id, count = line.strip().split("\t")
                count = int(count)
            except ValueError:
                continue  # Skip malformed lines instead of aborting the whole run

            terms1 = terms_for(concept1_id)
            terms2 = terms_for(concept2_id)

            # Cartesian product of the two concepts' terms, each with the full count.
            for term1 in terms1:
                for term2 in terms2:
                    if term1 != term2:  # Avoid self-pairs
                        out_f.write(f"{term1}\t{term2}\t{count}\n")

            if (i + 1) % chunk_size == 0:
                print(f"Processed {i + 1} lines")

    return output_file  # Return the file path


import numpy as np
from scipy.sparse import dok_matrix, csr_matrix

# Compute NPMI Matrix
def compute_npmi(cofreq_file, singleton_freq, total_count, term_index, matrix_size, k=1, output_pmi="temp_pmi.npz", chunk_size=100000, max_rows=1000000):
    """
    Build a symmetric sparse matrix of smoothed NPMI-style scores and save it to disk.

    Each usable line of `cofreq_file` (``term1<TAB>term2<TAB>count``) yields one
    score that is written to both ``[term1, term2]`` and ``[term2, term1]``; a
    pair that appears more than once keeps the score from its last line.
    Lines that are blank, have the wrong number of columns, carry a
    non-integer count, or mention a term missing from `term_index` are skipped.

    Args:
        cofreq_file: Path of the term-level co-occurrence file.
        singleton_freq: Term -> occurrence count; a missing term counts as 1.
        total_count: Denominator that turns counts into probabilities.
        term_index: Term -> row/column position in the matrix.
        matrix_size: Number of rows (and columns) of the square matrix.
        k: Constant added to the joint probability and to the product of the
            marginals before the logarithms are taken.
        output_pmi: Path handed to ``np.savez``; the CSR arrays are stored
            under the keys ``data``, ``indices``, ``indptr`` and ``shape``.
        chunk_size: Progress is reported at multiples of this many lines.
        max_rows: Only the first `max_rows` lines of the file are read;
            ``None`` reads the whole file.

    Returns:
        `output_pmi`, so the caller can reload the matrix on demand instead of
        keeping it in memory.
    """
    limit_label = "all" if max_rows is None else max_rows
    print(f"Initializing sparse PMI matrix (limiting to {limit_label} rows)...")
    pmi_matrix = dok_matrix((matrix_size, matrix_size), dtype=np.float32)

    print(f"Processing up to {limit_label} rows from co-occurrence file in chunks...")

    with open(cofreq_file, 'r', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            if max_rows is not None and i >= max_rows:
                break

            try:
                term1, term2, co_count = line.strip().split("\t")
                co_count = int(co_count)
            except ValueError:
                continue  # Skip malformed lines instead of aborting the run

            if term1 not in term_index or term2 not in term_index:
                continue  # Skip unknown terms

            i_idx, j_idx = term_index[term1], term_index[term2]
            p_x = singleton_freq.get(term1, 1) / total_count
            p_y = singleton_freq.get(term2, 1) / total_count
            p_xy = co_count / total_count

            # NOTE(review): textbook NPMI is log2(p_xy / (p_x * p_y)) / -log2(p_xy).
            # Adding k to the probabilities (default k=1) makes -log2(p_xy + k)
            # negative, so the sign convention differs from published NPMI.
            # The formula is kept as-is; confirm the intended smoothing.
            pmi = np.log2((p_xy + k) / (p_x * p_y + k))
            npmi = pmi / -np.log2(p_xy + k) if p_xy > 0 else 0

            # Written straight into the DOK matrix: an intermediate batch list
            # would only delay the same assignments.
            pmi_matrix[i_idx, j_idx] = npmi
            pmi_matrix[j_idx, i_idx] = npmi  # PMI is symmetric

            if (i + 1) % chunk_size == 0:
                print(f"Processed {i + 1} lines...")

    print("Converting PMI matrix to CSR format...")
    csr_pmi = pmi_matrix.tocsr()

    print("Saving PMI matrix to disk...")
    np.savez(output_pmi, data=csr_pmi.data, indices=csr_pmi.indices, indptr=csr_pmi.indptr, shape=csr_pmi.shape)

    print("PMI computation complete!")
    return output_pmi  # Return file path instead of keeping it in memory


# Find Synonyms Based on Contextual Similarity
def find_synonyms_fast(pmi_file, term_index, target_word, top_n=5):
    """Return the terms whose stored NPMI rows are most cosine-similar to `target_word`.

    Args:
        pmi_file: Path to the ``.npz`` archive holding the CSR components
            (``data``, ``indices``, ``indptr``, ``shape``) of the NPMI matrix.
        term_index: Mapping term -> row index. Its key order is used to map
            row indices back to terms, so keys must be in index order.
        target_word: Term to look up.
        top_n: Maximum number of (term, similarity) pairs to return.

    Returns:
        A list of ``(term, similarity)`` pairs in descending similarity,
        excluding `target_word` itself; an empty list when `target_word`
        is not in `term_index`.
    """
    if target_word not in term_index:
        print(f"Word '{target_word}' not found in vocabulary.")
        return []

    # Rebuild the sparse matrix, then release the archive's file handle
    # (np.load keeps .npz files open until the NpzFile is closed).
    with np.load(pmi_file) as loaded:
        pmi_matrix = csr_matrix(
            (loaded["data"], loaded["indices"], loaded["indptr"]),
            shape=tuple(loaded["shape"]),
        )

    words = list(term_index.keys())
    target_idx = term_index[target_word]

    # Compute sparse cosine similarity of the target row against every row
    similarities = cosine_similarity(pmi_matrix[target_idx], pmi_matrix)[0]
    sorted_indices = np.argsort(similarities)[::-1]  # Descending order

    return [(words[i], similarities[i]) for i in sorted_indices if words[i] != target_word][:top_n]

# Filter Synonyms to Same Concept Cluster
def filter_synonyms_by_concept(synonyms, concept_id_map, target_word):
    """Keep only the synonyms mapped to the same concept as `target_word`.

    When `target_word` has no (truthy) entry in `concept_id_map`, the input
    list is returned unchanged.
    """
    target_concept = concept_id_map.get(target_word)
    if target_concept:
        same_concept = []
        for word, score in synonyms:
            if concept_id_map.get(word) == target_concept:
                same_concept.append((word, score))
        return same_concept
    # No concept info available for the target
    return synonyms

# Load Concept to Term Map
def load_concept_to_term_map(file_path):
    """Load concept-to-term mappings where multiple terms can map to the same concept.

    Each well-formed line is ``term_id<TAB>concept_id``. Lines that do not
    split into exactly two tab-separated fields are skipped.

    Args:
        file_path: Path to the tab-separated mapping file.

    Returns:
        ``defaultdict(list)`` mapping concept_id -> list of term_ids, in file order.
    """
    concept_to_term_map = defaultdict(list)

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            fields = line.strip().split("\t")
            if len(fields) != 2:
                continue  # Skip malformed lines
            term_id, concept_id = fields
            concept_to_term_map[concept_id].append(term_id)  # Store all terms for a concept

    return concept_to_term_map


# from CharNGram code
# (load_concept_to_term_map was previously re-defined here with an identical
# body; the single definition above is shared by both parts of the pipeline.)

# Extract Character n-grams from Term
def extract_char_ngrams(term, min_n=3, max_n=13):
    """List every character n-gram of ``<term>`` for n in [min_n, max_n].

    The term is wrapped in ``<`` and ``>`` boundary markers first, and each
    n-gram is emitted as ``"{n}gram-{chars}"``, shortest n first.
    """
    wrapped = f"<{term}>"
    length = len(wrapped)
    collected = []
    for size in range(min_n, max_n + 1):
        for start in range(length - size + 1):
            collected.append(f"{size}gram-{wrapped[start:start + size]}")
    return collected

# Load Character n-gram Embeddings
def load_char_ngram_embeddings(file_path):
    """Read whitespace-separated n-gram embeddings into a dict of float32 vectors.

    Each usable line is ``<gram> <v1> <v2> ...``. Lines with fewer than 10
    whitespace-separated fields (headers or bad lines) are ignored.
    """
    vectors_by_gram = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) >= 10:
                gram, *values = fields
                vectors_by_gram[gram] = np.array([float(v) for v in values], dtype=np.float32)
    return vectors_by_gram

# Compute Embedding for a Single Term
def compute_term_embedding(term, ngram_embeds, min_n=1, max_n=13, dim=100):
    """Average the embeddings of the term's character n-grams.

    The term is lower-cased and stripped before n-gram extraction. Returns a
    zero vector of length `dim` when none of its n-grams are in `ngram_embeds`.
    """
    cleaned = term.lower().strip()
    matched = []
    for gram in extract_char_ngrams(cleaned, min_n, max_n):
        if gram in ngram_embeds:
            matched.append(ngram_embeds[gram])
    if matched:
        return np.mean(matched, axis=0)
    return np.zeros(dim)

# Build Embeddings for All Terms
def build_term_embeddings(term_list, ngram_embeds, dim=100):
    """Map every term in `term_list` to its character n-gram embedding."""
    return {
        term: compute_term_embedding(term, ngram_embeds, dim=dim)
        for term in term_list
    }

# Find Surface Form Synonyms
def find_surface_form_synonyms(term_vectors, target_term, ngram_embeddings, dim=100, top_n=5):
    """Rank vocabulary terms by cosine similarity of their surface-form embeddings.

    Uses the cached vector for `target_term` when present, otherwise builds
    one from its n-grams. Returns up to `top_n` ``(term, similarity)`` pairs
    in descending similarity (excluding the target), or ``[]`` when the
    target's vector has zero norm.
    """
    if target_term not in term_vectors:
        print(f"'{target_term}' not in vocabulary — computing from n-grams.")
        target_vec = compute_term_embedding(target_term, ngram_embeddings, dim=dim)
    else:
        target_vec = term_vectors[target_term]

    if np.linalg.norm(target_vec) == 0:
        print(f"Could not compute embedding for '{target_term}' (no matching n-grams).")
        return []

    vocabulary = list(term_vectors.keys())
    stacked = np.stack([term_vectors[name] for name in vocabulary])
    similarity_row = cosine_similarity(target_vec.reshape(1, -1), stacked)[0]

    # Highest similarity first; the target itself is dropped before truncating.
    ordered = sorted(zip(vocabulary, similarity_row), key=lambda pair: -pair[1])
    others = [(name, value) for name, value in ordered if name != target_term]
    return others[:top_n]

# Hybrid code

# Normalize Scores
def normalize_scores(score_list):
    """Min-max scale the scores of a ``[(term, score), ...]`` list into [0, 1].

    Returns a new list in the same order. When all scores are equal every
    normalized score is 0.0; an empty input yields an empty list.
    """
    values = np.array([pair[1] for pair in score_list])
    if len(values) == 0:
        return []
    lowest, highest = np.min(values), np.max(values)

    rescaled = []
    for term, score in score_list:
        if highest > lowest:
            rescaled.append((term, (score - lowest) / (highest - lowest)))
        else:
            rescaled.append((term, 0.0))
    return rescaled

# Combine Surface and Contextual Scores
def combine_hybrid_scores(surface_scores, contextual_scores, alpha=0.5):
    """Blend surface and contextual scores as ``alpha*s + (1-alpha)*c``.

    Both inputs are ``[(term, score), ...]`` lists; a term missing from one
    list contributes 0.0 for that side. Returns ``(term, score)`` pairs
    sorted by descending combined score.
    """
    surface_lookup = dict(surface_scores)
    context_lookup = dict(contextual_scores)

    blended = {
        term: alpha * surface_lookup.get(term, 0.0) + (1 - alpha) * context_lookup.get(term, 0.0)
        for term in set(surface_lookup.keys()).union(context_lookup.keys())
    }
    return sorted(blended.items(), key=lambda item: -item[1])

# Main Execution Pipeline
# End-to-end hybrid synonym demo: build the NPMI context matrix, build the
# CharNGram surface vectors, then rank synonyms for one target word by a 50/50
# blend of the two normalized similarity scores.
if __name__ == "__main__":
    # File Paths (relative to the current working directory)
    mapping_files = [
        "1_term_ID_to_string.txt",
        "2a_concept_ID_to_string.txt"
    ]
    concept_map_file = "3_term_ID_to_concept_ID.txt"
    singleton_file = "singlets_concepts_perBin_1d.txt"
    cofreq_file = "cofreqs_concepts_perBin_1d.txt"

    # Load term and concept string mappings
    term_id_map, concept_id_map = load_term_id_to_string(mapping_files)
    concept_to_term_map = load_concept_to_term_map(concept_map_file)

    # Load singleton and co-occurrence frequencies
    singleton_freq = load_singleton_frequencies(singleton_file, concept_to_term_map, term_id_map)
    cofreq_output_file = load_cofreq_counts(cofreq_file, concept_to_term_map, term_id_map)

    # Calculate total frequency count and build term index
    # (row/column i of the NPMI matrix corresponds to the i-th key of singleton_freq)
    total_count = sum(singleton_freq.values())
    term_index = {term: i for i, term in enumerate(singleton_freq.keys())}
    matrix_size = len(term_index)

    # Compute contextual PMI matrix and save to disk.
    # compute_npmi stops reading once max_rows lines are consumed; this value is
    # far above the ~209M lines visible in the captured progress log.
    pmi_output_file = compute_npmi(cofreq_output_file, singleton_freq, total_count, term_index, matrix_size, max_rows=3000000000)

    # Surface Model (CharNGram) Setup
    charngram_embedding_file = 'charNgram.txt'
    term_list = list(term_id_map.values())

    # Load n-gram embeddings and build term vectors (dim must match the embedding file)
    ngram_embeddings = load_char_ngram_embeddings(charngram_embedding_file)
    print("Computing term embeddings...")
    term_vectors = build_term_embeddings(term_list, ngram_embeddings, dim=100)


    # === Choose Target Word ===
    target_word = "leukemia"

    # Surface-form synonyms using CharNGram similarity (top 100, then min-max normalized)
    print(f"\n🔍 Finding surface-form synonyms for: {target_word}")
    surface_synonyms = find_surface_form_synonyms(term_vectors, target_word, ngram_embeddings, dim=100, top_n=100)
    surface_synonyms = normalize_scores(surface_synonyms)

    # Contextual synonyms using NPMI-based similarity (top 100, then min-max normalized)
    print(f"\n🔍 Finding contextual synonyms for: {target_word}")
    contextual_synonyms = find_synonyms_fast(pmi_output_file, term_index, target_word, top_n=100)
    contextual_synonyms = normalize_scores(contextual_synonyms)

    # Combine both types of synonyms using hybrid model; a term present in only
    # one of the two top-100 lists gets 0.0 for the other side.
    print(f"\n🔀 Combining hybrid scores for: {target_word}")
    hybrid_synonyms = combine_hybrid_scores(surface_synonyms, contextual_synonyms, alpha=0.5)

    # Print top hybrid synonyms
    print(f"\n✅ Top Hybrid Synonyms for '{target_word}':")
    for term, score in hybrid_synonyms[:10]:
        print(f"{term}: {score:.4f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed 206300000 lines...
Processed 206400000 lines...
Processed 206500000 lines...
Processed 206600000 lines...
Processed 206700000 lines...
Processed 206800000 lines...
Processed 206900000 lines...
Processed 207000000 lines...
Processed 207100000 lines...
Processed 207200000 lines...
Processed 207300000 lines...
Processed 207400000 lines...
Processed 207500000 lines...
Processed 207600000 lines...
Processed 207700000 lines...
Processed 207800000 lines...
Processed 207900000 lines...
Processed 208000000 lines...
Processed 208100000 lines...
Processed 208200000 lines...
Processed 208300000 lines...
Processed 208400000 lines...
Processed 208500000 lines...
Processed 208600000 lines...
Processed 208700000 lines...
Processed 208800000 lines...
Processed 208900000 lines...
Processed 209000000 lines...
Processed 209100000 lines...
Processed 209200000 lines...
Processed 209300000 lines...
Processed 209400000 lines...
Process

### MAP for all terms (not split out for IV and OOV Terms)

Next, download the UMLS MRCONSO file from https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html. Note that you must first obtain approval to download the file.

In [None]:
!unzip umls-2024AB-mrconso.zip -d umls_extracted


Archive:  umls-2024AB-mrconso.zip
  inflating: umls_extracted/2024AB/META/MRCONSO.RRF  


In [None]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import random

# === Load UMLS Synonym Clusters ===
# This function loads UMLS concept clusters from MRCONSO.RRF, filtering by language and source vocabulary (SAB).
# Returns a dictionary: CUI -> set of terms.
def load_umls_synonym_clusters(mrconso_path, lang="ENG", sab_filter=None):
    """Load UMLS synonym clusters (CUI -> set of lower-cased terms) from MRCONSO.RRF.

    Args:
        mrconso_path: Path to the pipe-delimited MRCONSO.RRF file.
        lang: Only rows whose language field (column 1) equals this are kept.
        sab_filter: Optional collection of source vocabularies; when given,
            only rows whose SAB field (column 11) is in it are kept.

    Returns:
        ``defaultdict(set)`` mapping CUI (column 0) to the set of lower-cased
        term strings (column 14).
    """
    concept_to_terms = defaultdict(set)
    with open(mrconso_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.strip().split('|')
            if len(parts) < 15:
                # Blank or truncated row: the fields read below do not exist.
                continue
            if parts[1] != lang:
                continue
            if sab_filter and parts[11] not in sab_filter:
                continue
            concept_to_terms[parts[0]].add(parts[14].lower())
    return concept_to_terms


# === Sample Negative Candidates for Evaluation ===
# Randomly sample negative (non-synonym) candidates, avoiding positives and the true synonym.
def sample_negative_candidates(true_synonym, all_terms, positives, sample_size=100):
    """Randomly pick non-synonym terms to use as negatives for one query.

    Args:
        true_synonym: The gold synonym; never returned as a negative.
        all_terms: Iterable of candidate terms to draw from.
        positives: Any iterable (list, set, tuple, ...) of terms that must
            not be returned as negatives.
        sample_size: Number of negatives wanted.

    Returns:
        `sample_size` randomly sampled negatives, or every eligible term (in
        `all_terms` order) when fewer than `sample_size` are available.
    """
    # Build the exclusion set without assuming `positives` is a list.
    blacklist = set(positives)
    blacklist.add(true_synonym)
    candidates = [term for term in all_terms if term not in blacklist]
    if len(candidates) >= sample_size:
        return random.sample(candidates, sample_size)
    return candidates

# === Calculate Average Precision for One Query ===
# Given a ranked list of candidates and known relevant terms, compute average precision.
def average_precision(ranked_terms, relevant_terms):
    """Average precision of a ranked list against a collection of relevant terms.

    Sums precision-at-rank at every position holding a relevant term and
    divides by the number of relevant terms; 0.0 when there are none.
    """
    found = 0
    precision_total = 0.0
    for rank, term in enumerate(ranked_terms, start=1):
        if term not in relevant_terms:
            continue
        found += 1
        precision_total += found / rank
    if not relevant_terms:
        return 0.0
    return precision_total / len(relevant_terms)

# === MAP Evaluation Using Random Candidate Pools ===
# This function samples query-synonym pairs from gold clusters, evaluates model performance using random negatives,
# and computes the Mean Average Precision (MAP) score over many queries.
def evaluate_map_random_negatives(gold_clusters, model_func, all_terms, sample_size=100, top_k=101, num_queries=1000):
    """Mean Average Precision of `model_func` over random query/synonym pairs.

    Every unordered pair of terms inside a gold cluster is a potential query;
    the pairs are shuffled and the first `num_queries` kept. Each query is
    ranked against its true synonym plus randomly sampled negatives.
    `top_k` is accepted but not used. Returns 0.0 when there are no queries.
    """
    queries = []
    for cluster in gold_clusters.values():
        members = list(cluster)
        if len(members) >= 2:
            queries.extend(
                (members[i], members[j], members)
                for i in range(len(members))
                for j in range(i + 1, len(members))
            )

    random.shuffle(queries)
    queries = queries[:num_queries]
    if not queries:
        return 0.0

    ap_sum = 0.0
    for query_term, true_synonym, cluster_terms in queries:
        pool = [true_synonym]
        pool += sample_negative_candidates(true_synonym, all_terms, cluster_terms, sample_size)
        random.shuffle(pool)
        scored = model_func(query_term, pool)
        by_score = sorted(scored.items(), key=lambda item: -item[1])
        ap_sum += average_precision([name for name, _ in by_score], [true_synonym])

    return ap_sum / len(queries)


# === Surface Model Scorer (CharNGram-based Embeddings) ===
# Given a query term and a list of candidates, computes cosine similarity between their CharNGram embeddings.
def surface_model_scoring(term_vectors, ngram_embeddings, compute_term_embedding):
    """Build a scorer based on cosine similarity of CharNGram embeddings.

    Args:
        term_vectors: Mapping term -> precomputed embedding vector.
        ngram_embeddings: n-gram embedding table passed to `compute_term_embedding`.
        compute_term_embedding: Callable ``(term, ngram_embeddings) -> vector``
            used only for terms missing from `term_vectors`.

    Returns:
        ``score_fn(query, candidates) -> {candidate: similarity}``. All scores
        are 0.0 when the query vector has zero norm; a candidate whose vector
        has zero norm scores 0.0.
    """
    def _embed(term):
        # Look up first and compute only on a miss: dict.get(key, default)
        # would evaluate the (expensive) default even for cached terms.
        vec = term_vectors.get(term)
        if vec is None:
            vec = compute_term_embedding(term, ngram_embeddings)
        return vec

    def score_fn(query, candidates):
        query_vec = _embed(query)
        if np.linalg.norm(query_vec) == 0:
            return {c: 0.0 for c in candidates}
        query_vec = query_vec.reshape(1, -1)
        scores = {}
        for c in candidates:
            vec = _embed(c)
            if np.linalg.norm(vec) > 0:
                scores[c] = cosine_similarity(query_vec, vec.reshape(1, -1))[0][0]
            else:
                scores[c] = 0.0
        return scores
    return score_fn

# === Contextual Model Scorer (NPMI-based) ===
# Loads a precomputed NPMI matrix and uses it to compute similarity between terms based on their contextual PMI vectors.
def npmi_model_scoring(pmi_file, term_index):
    """Build a scorer based on cosine similarity of rows of a stored NPMI matrix.

    Args:
        pmi_file: Path to the ``.npz`` archive with the CSR components
            ``data``, ``indices``, ``indptr`` and ``shape``.
        term_index: Mapping term -> row index in that matrix.

    Returns:
        ``score_fn(query, candidates) -> {candidate: similarity}``. Candidates
        (or a query) missing from `term_index` score 0.0.
    """
    # Rebuild the matrix once, then release the archive's file handle.
    with np.load(pmi_file) as loaded:
        pmi_matrix = csr_matrix(
            (loaded["data"], loaded["indices"], loaded["indptr"]),
            shape=tuple(loaded["shape"]),
        )

    def score_fn(query, candidates):
        scores = {c: 0.0 for c in candidates}
        if query not in term_index:
            return scores
        known = [c for c in scores if c in term_index]
        if not known:
            return scores
        # Compare the query row only with the candidate rows instead of the
        # whole vocabulary; each row's cosine similarity is independent of the others.
        rows = [term_index[c] for c in known]
        sims = cosine_similarity(pmi_matrix[term_index[query]], pmi_matrix[rows])[0]
        scores.update(zip(known, sims))
        return scores
    return score_fn

# === Compute Average Word2Vec Vector for Multi-word Terms ===
# For a given term, returns the average of its component token embeddings (if any exist in the model).
def get_avg_vector(term, model):
    """Mean of the model vectors of the term's whitespace tokens.

    The term is lower-cased and split on whitespace; tokens absent from
    ``model.key_to_index`` are ignored. Returns a zero vector of length
    ``model.vector_size`` when no token is known.
    """
    token_vectors = []
    for token in term.lower().split():
        if token in model.key_to_index:
            token_vectors.append(model[token])
    if token_vectors:
        return np.mean(token_vectors, axis=0)
    return np.zeros(model.vector_size)


# === Contextual Model Scorer (Word2Vec-based) ===
# Computes cosine similarity between average Word2Vec embeddings for query and candidate terms.
def context_model_word2vec_fn(w2v_model):
    """Build a scorer using cosine similarity of averaged Word2Vec vectors.

    The returned ``scorer(query, candidates)`` gives ``{candidate: score}``;
    every score is 0.0 when the query has a zero-norm vector, and a candidate
    with a zero-norm vector scores 0.0.
    """
    def scorer(query, candidates):
        query_vec = get_avg_vector(query, w2v_model)
        if np.linalg.norm(query_vec) == 0:
            return {cand: 0.0 for cand in candidates}
        query_row = query_vec.reshape(1, -1)

        scores = {}
        for cand in candidates:
            cand_vec = get_avg_vector(cand, w2v_model)
            if np.linalg.norm(cand_vec) == 0:
                scores[cand] = 0.0
                continue
            scores[cand] = cosine_similarity(query_row, cand_vec.reshape(1, -1))[0][0]
        return scores
    return scorer


# === Normalize Dictionary of Scores ===
# Scales all scores into [0, 1] range based on min-max normalization.
def normalize_scores(score_dict):
    """Min-max scale the values of a ``{term: score}`` dict into [0, 1].

    Returns a new dict with the same keys. When all scores are equal every
    normalized score is 0.0; an empty input yields an empty dict.
    """
    values = np.array(list(score_dict.values()))
    if len(values) == 0:
        return {}
    lowest, highest = np.min(values), np.max(values)

    rescaled = {}
    for term, score in score_dict.items():
        if highest > lowest:
            rescaled[term] = (score - lowest) / (highest - lowest)
        else:
            rescaled[term] = 0.0
    return rescaled

# === Combine Surface and Context Scores into Hybrid Score ===
# Combines normalized surface-form and contextual scores via a weighted average using `alpha`.
def combine_hybrid_scores_dict(surface_dict, context_dict, alpha=0.5):
    """Blend two ``{term: score}`` dicts as ``alpha*surface + (1-alpha)*context``.

    The result covers the union of both key sets; a term missing from one
    dict contributes 0.0 for that side.
    """
    return {
        term: alpha * surface_dict.get(term, 0.0) + (1 - alpha) * context_dict.get(term, 0.0)
        for term in set(surface_dict.keys()).union(context_dict.keys())
    }

# === Hybrid Model Scorer (Surface + NPMI) ===
# Builds a scoring function that combines CharNGram similarity and NPMI-based contextual similarity.
def hybrid_model_npmi_scoring(term_vectors, ngram_embeddings, pmi_file, term_index, compute_term_embedding, alpha=0.5):
    """Build a scorer that blends CharNGram and NPMI similarity.

    Both component scorers are constructed once. For each query their scores
    are min-max normalized separately and then combined with weight `alpha`
    on the surface side.
    """
    score_surface = surface_model_scoring(term_vectors, ngram_embeddings, compute_term_embedding)
    score_context = npmi_model_scoring(pmi_file, term_index)

    def score_fn(query, candidates):
        raw_surface = score_surface(query, candidates)
        raw_context = score_context(query, candidates)
        return combine_hybrid_scores_dict(
            normalize_scores(raw_surface),
            normalize_scores(raw_context),
            alpha,
        )
    return score_fn



# === Main MAP Evaluation Pipeline ===
# This function loads UMLS synonym clusters, sets up scoring functions for all models (surface, context, hybrid),
# evaluates their MAP performance on a subset of queries, and prints the final results.
def run_umls_map_evaluation_with_npmi(mrconso_path, term_list, term_vectors, ngram_embeddings,
                                      pmi_file, term_index, compute_term_embedding, w2v_model,
                                      num_queries=1000, seed=None):
    """Load UMLS clusters, score three models with MAP, and print the results.

    The models are the CharNGram surface scorer, the Word2Vec context scorer,
    and the CharNGram+NPMI hybrid scorer.

    Args:
        mrconso_path: Path to MRCONSO.RRF.
        term_list: Pool of terms from which negatives are sampled.
        term_vectors: Mapping term -> precomputed CharNGram embedding.
        ngram_embeddings: n-gram embedding table.
        pmi_file: Path to the stored NPMI matrix (``.npz``).
        term_index: Mapping term -> NPMI matrix row index.
        compute_term_embedding: Callable used to embed uncached terms.
        w2v_model: Word2Vec vectors used by the context scorer.
        num_queries: Number of query pairs evaluated per model.
        seed: Optional seed. When given, the global ``random`` generator is
            re-seeded before each model is evaluated so that, within one run,
            all models are scored on the same queries and negatives. With the
            default ``None`` each model draws its own random sample.

    Returns:
        None; results are printed.
    """
    print("\n📦 Loading UMLS Synonym Clusters...")
    clusters = load_umls_synonym_clusters(mrconso_path, sab_filter={"MSH", "SNOMEDCT_US"})
    print("✅ UMLS concepts loaded:", len(clusters))

    print("\n📊 Preparing model functions for scoring...")
    surface_fn = surface_model_scoring(term_vectors, ngram_embeddings, compute_term_embedding)
    context_fn = context_model_word2vec_fn(w2v_model)
    hybrid_fn = hybrid_model_npmi_scoring(term_vectors, ngram_embeddings, pmi_file, term_index, compute_term_embedding, alpha=0.5)

    def _evaluate(model_fn):
        # Re-seeding here makes the query shuffle and negative sampling
        # identical for every model, so the MAP numbers are comparable.
        if seed is not None:
            random.seed(seed)
        return evaluate_map_random_negatives(clusters, model_fn, term_list, num_queries=num_queries)

    print("\n🚀 Running MAP evaluation (Random Candidate Selection)...")
    map_surface = _evaluate(surface_fn)
    map_context = _evaluate(context_fn)
    map_hybrid = _evaluate(hybrid_fn)

    print("\n📈 Evaluation Results (MAP):")
    print(f"Surface (CharNGram): {map_surface:.4f}")
    print(f"Context (Word2Vec):      {map_context:.4f}")
    print(f"Hybrid (Char+NPMI):  {map_hybrid:.4f}")


In [None]:
# Run the overall MAP evaluation on 1000 sampled query pairs.
# term_list, term_vectors, ngram_embeddings, pmi_output_file and term_index are
# produced by the hybrid pipeline cell above.
# NOTE(review): `model` is not defined in this part of the notebook; presumably
# it is the Word2Vec model loaded in an earlier cell — confirm it exists after
# a fresh Restart & Run All.
mrconso_path="umls_extracted/2024AB/META/MRCONSO.RRF"
run_umls_map_evaluation_with_npmi(
    mrconso_path=mrconso_path,
    term_list=term_list,
    term_vectors=term_vectors,
    ngram_embeddings=ngram_embeddings,
    pmi_file=pmi_output_file,
    term_index=term_index,
    compute_term_embedding=compute_term_embedding,
    w2v_model=model,
    num_queries=1000
)



📦 Loading UMLS Synonym Clusters...
✅ UMLS concepts loaded: 866217

📊 Preparing model functions for scoring...

🚀 Running MAP evaluation (Random Candidate Selection)...

📈 Evaluation Results (MAP):
Surface (CharNGram): 0.7471
Context (Word2Vec):      0.6376
Hybrid (Char+NPMI):  0.7719


### MAP split out for IV and OOV terms

In [None]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import random

# === Load UMLS Synonym Clusters ===
# Parses the MRCONSO.RRF file to load clusters of synonymous terms (CUI -> set of terms),
# filtered by language and optionally source vocabularies (SABs).
def load_umls_synonym_clusters(mrconso_path, lang="ENG", sab_filter=None):
    """Parse MRCONSO.RRF into clusters of synonymous terms (CUI -> set of terms).

    Args:
        mrconso_path: Path to the pipe-delimited MRCONSO.RRF file.
        lang: Language code a row's column 1 must equal to be kept.
        sab_filter: Optional collection of source vocabularies (SABs); when
            given, rows whose column 11 is not in it are dropped.

    Returns:
        ``defaultdict(set)`` keyed by CUI (column 0) whose values are the
        lower-cased term strings (column 14) of the retained rows.
    """
    concept_to_terms = defaultdict(set)
    with open(mrconso_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.strip().split('|')
            # Guard against blank/truncated rows, which would otherwise
            # raise IndexError on the column lookups below.
            if len(parts) < 15:
                continue
            cui, lang_code, sab = parts[0], parts[1], parts[11]
            if lang_code != lang:
                continue
            if sab_filter and sab not in sab_filter:
                continue
            concept_to_terms[cui].add(parts[14].lower())
    return concept_to_terms


# === MAP Evaluation with IV/OOV Term Breakdown ===
# Randomly samples non-synonym terms (negatives) for candidate selection during MAP evaluation.
def sample_negative_candidates(true_synonym, all_terms, positives, sample_size=100):
    """Sample non-synonym terms (negatives) for one query's candidate pool.

    Args:
        true_synonym: Gold synonym for the query; excluded from the negatives.
        all_terms: Iterable of terms to draw negatives from.
        positives: Iterable of terms (list, set, tuple, ...) that are
            synonyms of the query and must be excluded.
        sample_size: Desired number of negatives.

    Returns:
        A random sample of `sample_size` eligible terms, or all eligible
        terms in their original order if there are fewer than that.
    """
    # set(...) | {...} accepts any iterable, unlike list concatenation.
    excluded = set(positives) | {true_synonym}
    candidates = [term for term in all_terms if term not in excluded]
    if len(candidates) < sample_size:
        return candidates
    return random.sample(candidates, sample_size)

# Computes average precision for a single ranked result list and a set of known relevant items.
def average_precision(ranked_terms, relevant_terms):
    """Compute average precision for one ranked result list.

    Precision is accumulated at each rank where a relevant term appears and
    the total is divided by ``len(relevant_terms)``; returns 0.0 when
    `relevant_terms` is empty.
    """
    matched_so_far = 0
    running_total = 0.0
    position = 0
    for term in ranked_terms:
        position += 1
        if term in relevant_terms:
            matched_so_far += 1
            running_total += matched_so_far / position
    if relevant_terms:
        return running_total / len(relevant_terms)
    return 0.0

# Evaluates MAP by distinguishing between in-vocabulary (IV) and out-of-vocabulary (OOV) terms.
# Splits query evaluation results based on vocab_checker function and returns MAP and counts for IV/OOV.
def evaluate_map_by_vocab_status(gold_clusters, model_func, all_terms, sample_size=100, top_k=101, num_queries=1000, vocab_checker=None):
    """Evaluate MAP separately for in-vocabulary (IV) and out-of-vocabulary (OOV) queries.

    Every unordered pair of terms inside a gold cluster is a potential query;
    the pairs are shuffled and the first `num_queries` kept. A query counts as
    IV only when `vocab_checker` accepts both the query term and its true synonym.

    Args:
        gold_clusters: Mapping concept id -> collection of synonymous terms.
        model_func: ``(query, candidates) -> {candidate: score}``.
        all_terms: Pool of terms negatives are sampled from.
        sample_size: Number of negatives per query.
        top_k: Accepted for interface compatibility; not used.
        num_queries: Maximum number of query pairs evaluated.
        vocab_checker: ``term -> bool``. When ``None`` every query is treated
            as in-vocabulary (previously ``None`` raised ``TypeError``).

    Returns:
        ``(map_iv, map_oov, count_iv, count_oov)``; a MAP value is 0.0 when
        its count is 0.
    """
    if vocab_checker is None:
        vocab_checker = lambda _term: True

    queries = []
    for terms in gold_clusters.values():
        terms = list(terms)
        if len(terms) < 2:
            continue
        for i in range(len(terms)):
            for j in range(i + 1, len(terms)):
                queries.append((terms[i], terms[j], terms))

    random.shuffle(queries)
    queries = queries[:num_queries]

    map_total_iv, map_total_oov = 0.0, 0.0
    count_iv, count_oov = 0, 0

    for query_term, true_synonym, all_positives in queries:
        is_iv = vocab_checker(query_term) and vocab_checker(true_synonym)
        negatives = sample_negative_candidates(true_synonym, all_terms, all_positives, sample_size)
        candidate_pool = [true_synonym] + negatives
        random.shuffle(candidate_pool)

        scores = model_func(query_term, candidate_pool)
        ranked = [term for term, _ in sorted(scores.items(), key=lambda x: -x[1])]
        ap = average_precision(ranked, [true_synonym])

        if is_iv:
            map_total_iv += ap
            count_iv += 1
        else:
            map_total_oov += ap
            count_oov += 1

    map_iv = map_total_iv / count_iv if count_iv > 0 else 0.0
    map_oov = map_total_oov / count_oov if count_oov > 0 else 0.0
    return map_iv, map_oov, count_iv, count_oov


# === Vocabulary Checkers ===
# Checks if a term is in the CharNGram model’s vocabulary by checking existing embeddings or computing one.
def is_in_vocab_charn(term, term_vectors, ngram_embeddings, compute_term_embedding):
    """Report whether the CharNGram model can represent `term`.

    True when `term_vectors` holds a (non-None) vector for the term;
    otherwise true only if the embedding computed from its n-grams has
    non-zero norm.
    """
    if term_vectors.get(term) is not None:
        return True
    computed = compute_term_embedding(term, ngram_embeddings)
    return np.linalg.norm(computed) > 0

# Checks if a term (or one of its tokens) exists in the Word2Vec model’s vocabulary.
def is_in_vocab_w2v(term, w2v_model):
    """True when at least one whitespace token of the lower-cased term is in ``w2v_model.key_to_index``."""
    for token in term.lower().split():
        if token in w2v_model.key_to_index:
            return True
    return False

# Checks if a term exists in the NPMI term index.
def is_in_vocab_npmi(term, term_index):
    """True when `term` is a key of the NPMI `term_index`."""
    has_row = term in term_index
    return has_row


# === Model Scorers ===
# Computes cosine similarity scores between query and candidate terms using surface-form CharNGram embeddings.
def surface_model_scoring(term_vectors, ngram_embeddings, compute_term_embedding):
    """Create a CharNGram scorer: cosine similarity between query and candidate embeddings.

    Args:
        term_vectors: Mapping term -> cached embedding vector.
        ngram_embeddings: n-gram embedding table handed to `compute_term_embedding`.
        compute_term_embedding: Callable ``(term, ngram_embeddings) -> vector``,
            invoked only for terms that have no cached vector.

    Returns:
        ``score_fn(query, candidates) -> {candidate: similarity}``. A zero-norm
        query yields 0.0 for every candidate; a zero-norm candidate scores 0.0.
    """
    def _vector_for(term):
        # Avoid dict.get(term, compute(...)): its default argument is evaluated
        # even on a cache hit, re-embedding every known term.
        cached = term_vectors.get(term)
        if cached is not None:
            return cached
        return compute_term_embedding(term, ngram_embeddings)

    def score_fn(query, candidates):
        query_vec = _vector_for(query)
        if np.linalg.norm(query_vec) == 0:
            return {c: 0.0 for c in candidates}
        query_vec = query_vec.reshape(1, -1)
        scores = {}
        for c in candidates:
            vec = _vector_for(c)
            scores[c] = cosine_similarity(query_vec, vec.reshape(1, -1))[0][0] if np.linalg.norm(vec) > 0 else 0.0
        return scores
    return score_fn

# Computes average vector from Word2Vec model for a potentially multi-token term.
def get_avg_vector(term, model):
    """Average the Word2Vec vectors of a (possibly multi-token) term.

    Tokens come from lower-casing and whitespace-splitting the term; only
    tokens found in ``model.key_to_index`` contribute. With no known token
    the result is ``np.zeros(model.vector_size)``.
    """
    known_vectors = [model[tok] for tok in term.lower().split() if tok in model.key_to_index]
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Computes similarity using Word2Vec embeddings for both query and candidate terms.
def context_model_word2vec_fn(w2v_model):
    """Create a Word2Vec scorer over averaged token embeddings.

    ``scorer(query, candidates)`` returns ``{candidate: cosine similarity}``.
    If the query's averaged vector has zero norm all candidates score 0.0;
    any candidate with a zero-norm vector scores 0.0.
    """
    def scorer(query, candidates):
        query_vec = get_avg_vector(query, w2v_model)
        if np.linalg.norm(query_vec) == 0:
            return {name: 0.0 for name in candidates}

        query_row = query_vec.reshape(1, -1)
        results = {}
        for name in candidates:
            candidate_vec = get_avg_vector(name, w2v_model)
            similarity = 0.0
            if not np.linalg.norm(candidate_vec) == 0:
                similarity = cosine_similarity(query_row, candidate_vec.reshape(1, -1))[0][0]
            results[name] = similarity
        return results
    return scorer

# Computes similarity using a precomputed NPMI matrix loaded from disk.
def npmi_model_scoring(pmi_file, term_index):
    """Create a scorer from a precomputed NPMI matrix stored on disk.

    Args:
        pmi_file: Path to the ``.npz`` archive holding the CSR arrays
            ``data``, ``indices``, ``indptr`` and ``shape``.
        term_index: Mapping term -> row index of that matrix.

    Returns:
        ``score_fn(query, candidates) -> {candidate: similarity}`` using cosine
        similarity between matrix rows. A query or candidate that is not in
        `term_index` scores 0.0.
    """
    # Close the archive as soon as the matrix has been rebuilt.
    with np.load(pmi_file) as loaded:
        pmi_matrix = csr_matrix(
            (loaded["data"], loaded["indices"], loaded["indptr"]),
            shape=tuple(loaded["shape"]),
        )

    def score_fn(query, candidates):
        scores = {c: 0.0 for c in candidates}
        if query not in term_index:
            return scores
        known = [c for c in scores if c in term_index]
        if not known:
            return scores
        # Only the candidates' rows are needed; comparing against the whole
        # matrix for every query is wasted work on a large vocabulary.
        candidate_rows = pmi_matrix[[term_index[c] for c in known]]
        sims = cosine_similarity(pmi_matrix[term_index[query]], candidate_rows)[0]
        for c, sim in zip(known, sims):
            scores[c] = sim
        return scores
    return score_fn

# Normalizes scores to [0, 1] range using min-max normalization.
def normalize_scores(score_dict):
    """Rescale a ``{term: score}`` dict to the [0, 1] range by min-max normalization.

    Keys are preserved. If every score is identical all outputs are 0.0;
    an empty dict is returned for empty input.
    """
    raw = np.array(list(score_dict.values()))
    if len(raw) == 0:
        return {}
    floor, ceiling = np.min(raw), np.max(raw)

    def _scale(value):
        if ceiling > floor:
            return (value - floor) / (ceiling - floor)
        return 0.0

    return {term: _scale(value) for term, value in score_dict.items()}

# Combines scores from two sources (e.g., surface and context models) with a weight alpha.
def combine_hybrid_scores_dict(surface_dict, context_dict, alpha=0.5):
    """Weighted blend of two score dicts: ``alpha*surface + (1-alpha)*context``.

    Covers every term present in either dict; a missing side counts as 0.0.
    """
    all_terms = set(surface_dict.keys()).union(context_dict.keys())
    blended = {}
    for term in all_terms:
        surface_part = alpha * surface_dict.get(term, 0.0)
        context_part = (1 - alpha) * context_dict.get(term, 0.0)
        blended[term] = surface_part + context_part
    return blended

# Constructs a hybrid model that combines CharNGram and NPMI-based similarity.
# Scores are normalized and blended with weight alpha.
def hybrid_model_npmi_scoring(term_vectors, ngram_embeddings, pmi_file, term_index, compute_term_embedding, alpha=0.5):
    """Construct a hybrid scorer combining CharNGram and NPMI similarity.

    The surface and NPMI scorers are created once up front. Per query, each
    scorer's output is min-max normalized and the two are blended with
    weight `alpha` on the surface scores.
    """
    surface_fn = surface_model_scoring(term_vectors, ngram_embeddings, compute_term_embedding)
    context_fn = npmi_model_scoring(pmi_file, term_index)

    def score_fn(query, candidates):
        surface_scores = surface_fn(query, candidates)
        context_scores = context_fn(query, candidates)
        surface_norm = normalize_scores(surface_scores)
        context_norm = normalize_scores(context_scores)
        return combine_hybrid_scores_dict(surface_norm, context_norm, alpha)
    return score_fn



# === Main Evaluation Pipeline with IV/OOV Reporting ===
# Loads clusters, prepares scoring functions for CharNGram, Word2Vec, and Hybrid models.
# Evaluates each model separately, splitting results by whether the terms are IV or OOV.
def run_umls_map_evaluation_with_vocab_splits(mrconso_path, term_list, term_vectors, ngram_embeddings,
                                              pmi_file, term_index, compute_term_embedding, w2v_model,
                                              num_queries=1000, seed=None):
    """Evaluate CharNGram, Word2Vec and Hybrid models, reporting MAP split by IV/OOV.

    A query is in-vocabulary (IV) for a model when that model's vocabulary
    check accepts both the query term and its true synonym; for the hybrid
    model a term must pass both the CharNGram and the NPMI checks.

    Args:
        mrconso_path: Path to MRCONSO.RRF.
        term_list: Pool of terms negatives are sampled from.
        term_vectors: Mapping term -> cached CharNGram embedding.
        ngram_embeddings: n-gram embedding table.
        pmi_file: Path to the stored NPMI matrix (``.npz``).
        term_index: Mapping term -> NPMI matrix row index.
        compute_term_embedding: Callable used to embed uncached terms.
        w2v_model: Word2Vec vectors for the context model.
        num_queries: Number of query pairs evaluated per model.
        seed: Optional seed. When given, the global ``random`` generator is
            re-seeded before each model's evaluation so that, within one run,
            all models see the same queries and negatives. ``None`` (default)
            leaves the generator untouched.

    Returns:
        None; results are printed.
    """
    print("\n📦 Loading UMLS Synonym Clusters...")
    clusters = load_umls_synonym_clusters(mrconso_path, sab_filter={"MSH", "SNOMEDCT_US"})
    print("✅ UMLS concepts loaded:", len(clusters))

    print("\n📊 Preparing model functions...")
    surface_fn = surface_model_scoring(term_vectors, ngram_embeddings, compute_term_embedding)
    context_fn = context_model_word2vec_fn(w2v_model)
    hybrid_fn = hybrid_model_npmi_scoring(term_vectors, ngram_embeddings, pmi_file, term_index, compute_term_embedding)

    def _evaluate(model_fn, checker):
        # Identical seeding before each run gives every model the same
        # shuffled queries and sampled negatives.
        if seed is not None:
            random.seed(seed)
        return evaluate_map_by_vocab_status(
            clusters, model_fn, term_list, num_queries=num_queries,
            vocab_checker=checker
        )

    print("\n🚀 Evaluating CharNGram...")
    map_iv_s, map_oov_s, c_iv_s, c_oov_s = _evaluate(
        surface_fn,
        lambda t: is_in_vocab_charn(t, term_vectors, ngram_embeddings, compute_term_embedding)
    )

    print("\n🚀 Evaluating Word2Vec...")
    map_iv_c, map_oov_c, c_iv_c, c_oov_c = _evaluate(
        context_fn,
        lambda t: is_in_vocab_w2v(t, w2v_model)
    )

    print("\n🚀 Evaluating Hybrid...")
    map_iv_h, map_oov_h, c_iv_h, c_oov_h = _evaluate(
        hybrid_fn,
        lambda t: is_in_vocab_charn(t, term_vectors, ngram_embeddings, compute_term_embedding)
                  and is_in_vocab_npmi(t, term_index)
    )

    print("\n📈 Evaluation Results (MAP):")
    print(f"🔹 CharNGram:  IV={map_iv_s:.4f} ({c_iv_s}), OOV={map_oov_s:.4f} ({c_oov_s})")
    print(f"🔹 Word2Vec:   IV={map_iv_c:.4f} ({c_iv_c}), OOV={map_oov_c:.4f} ({c_oov_c})")
    print(f"🔹 Hybrid:     IV={map_iv_h:.4f} ({c_iv_h}), OOV={map_oov_h:.4f} ({c_oov_h})")


In [None]:
# Run the IV/OOV-split MAP evaluation on 1000 sampled query pairs.
# Inputs other than mrconso_path come from the hybrid pipeline cell above.
# NOTE(review): `model` is not defined in this part of the notebook; presumably
# it is the Word2Vec model loaded in an earlier cell — confirm.
mrconso_path="umls_extracted/2024AB/META/MRCONSO.RRF"
run_umls_map_evaluation_with_vocab_splits(
    mrconso_path=mrconso_path,
    term_list=term_list,
    term_vectors=term_vectors,
    ngram_embeddings=ngram_embeddings,
    pmi_file=pmi_output_file,
    term_index=term_index,
    compute_term_embedding=compute_term_embedding,
    w2v_model=model,
    num_queries=1000
)


📦 Loading UMLS Synonym Clusters...
✅ UMLS concepts loaded: 866217

📊 Preparing model functions...

🚀 Evaluating CharNGram...

🚀 Evaluating Word2Vec...

🚀 Evaluating Hybrid...


### Model tests on random words

In [None]:
pip install nltk


In [None]:
import random
import nltk
from nltk.corpus import words
nltk.download('words')

# === 1. Get Random Non-Medical English Words ===
# Returns a list of `num_words` English words that are not present in the given term list (e.g., medical terms).
# Used to simulate out-of-domain query terms for model evaluation.
def get_non_umls_english_words(term_list, num_words=1000, seed=42):
    """Sample English words that do not appear in `term_list`.

    Used to simulate out-of-domain query terms for model evaluation.

    Args:
        term_list: Terms to exclude (compared lower-cased), e.g. medical terms.
        num_words: Maximum number of words to return.
        seed: Seed applied to the global ``random`` generator before sampling.

    Returns:
        A list of at most `num_words` lower-cased alphabetic words longer than
        two characters, drawn from the NLTK ``words`` corpus.
    """
    english_vocab = {w.lower() for w in words.words() if w.isalpha() and len(w) > 2}
    medical_vocab = {t.lower() for t in term_list}
    # Sort before sampling: set iteration order for strings varies between
    # interpreter sessions (hash randomization), so seeding alone would not
    # make the sample reproducible.
    non_medical_words = sorted(english_vocab - medical_vocab)
    random.seed(seed)
    return random.sample(non_medical_words, min(num_words, len(non_medical_words)))


# === 2. Evaluate Model on Random Non-Medical Query Terms ===
# For a list of random query terms, applies the model to a fixed candidate pool and prints top-K results for each query.
# Only the first 10 queries are printed for brevity.
def test_model_on_random_queries(random_terms, model_func, candidate_pool, top_k=5, model_name=""):
    print(f"\n📊 Evaluating {model_name} on {len(random_terms)} non-medical queries...\n")
    for i, query_term in enumerate(random_terms[:10]):  # Just show first 10 queries
        scores = model_func(query_term, candidate_pool)
        ranked = sorted(scores.items(), key=lambda x: -x[1])[:top_k]
        print(f"\n🔍 Query [{i+1}]: {query_term}")
        for rank, (term, score) in enumerate(ranked, start=1):
            print(f"  {rank}. {term} ({score:.4f})")

# === 3. Surface Model Wrapper (CharNGram) ===
# Computes similarity between query and candidates using character n-gram embeddings.
# Falls back on computing term embedding if not found in the cached vectors.
def surface_model_fn(term_vectors, ngram_embeddings):
    """Build a scorer that compares terms by their character n-gram embeddings.

    Args:
        term_vectors: Mapping from candidate term to a cached embedding vector;
            looked up with ``.get`` so missing terms are embedded on the fly.
        ngram_embeddings: Passed through to ``compute_term_embedding``.

    Returns:
        ``scorer(query, candidates) -> {candidate: cosine similarity}``. Every
        candidate scores 0.0 when the query embedding has zero norm.
    """
    def scorer(query, candidates):
        query_vec = compute_term_embedding(query, ngram_embeddings, dim=100)
        if np.linalg.norm(query_vec) == 0:
            # Nothing to compare against: the query produced an all-zero vector.
            return dict.fromkeys(candidates, 0.0)

        query_row = query_vec.reshape(1, -1)
        result = {}
        for cand in candidates:
            cand_vec = term_vectors.get(cand)
            if cand_vec is None:
                # Not cached, so embed the candidate the same way as the query.
                cand_vec = compute_term_embedding(cand, ngram_embeddings, dim=100)
            result[cand] = cosine_similarity(query_row, cand_vec.reshape(1, -1))[0][0]
        return result

    return scorer


# === Get Average Word2Vec Vector ===
# Splits a term into tokens and returns the average of Word2Vec vectors for known tokens.
def get_avg_vector(term, model):
    """Average the vectors of the whitespace-separated tokens of ``term``.

    Args:
        term: Text to embed; it is lower-cased before splitting.
        model: Object exposing ``key_to_index`` (membership test),
            ``vector_size`` and ``model[token]`` lookup.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.key_to_index``, or a zero vector of length
        ``model.vector_size`` when no token is found.
    """
    known = [model[tok] for tok in term.lower().split() if tok in model.key_to_index]
    if known:
        return np.mean(known, axis=0)
    return np.zeros(model.vector_size)

# === Contextual Model Wrapper (Word2Vec) ===
# Scores query and candidate terms based on cosine similarity between their average Word2Vec vectors.
def context_model_word2vec_fn(w2v_model):
    """Build a scorer based on averaged word vectors.

    Args:
        w2v_model: Model handed to ``get_avg_vector`` for both query and
            candidate terms.

    Returns:
        ``scorer(query, candidates) -> {candidate: cosine similarity}``. A
        candidate (or the whole result, when the query itself has a zero-norm
        vector) scores 0.0 if its averaged vector has zero norm.
    """
    def scorer(query, candidates):
        query_avg = get_avg_vector(query, w2v_model)
        if np.linalg.norm(query_avg) == 0:
            return {c: 0.0 for c in candidates}
        query_row = query_avg.reshape(1, -1)

        def similarity_to_query(candidate):
            cand_avg = get_avg_vector(candidate, w2v_model)
            if np.linalg.norm(cand_avg) == 0:
                return 0.0
            return cosine_similarity(query_row, cand_avg.reshape(1, -1))[0][0]

        return {c: similarity_to_query(c) for c in candidates}

    return scorer


# === NPMI Contextual Model Scorer ===
# Loads the precomputed sparse PMI matrix and scores query vs. candidates using cosine similarity of row vectors.
def npmi_model_scoring(pmi_file, term_index):
    """Build a scorer from a sparse PMI matrix stored as CSR components.

    Args:
        pmi_file: Path to an ``.npz`` archive holding the arrays ``data``,
            ``indices``, ``indptr`` and ``shape`` of a CSR matrix.
        term_index: Mapping from term to its row number in that matrix.

    Returns:
        ``score_fn(query, candidates) -> {candidate: score}`` where the score
        is the cosine similarity between the query's row and the candidate's
        row. Candidates missing from ``term_index`` score 0.0, and every
        candidate scores 0.0 when the query itself is missing.
    """
    # The archive object keeps the file open until closed; the context manager
    # releases it once the component arrays have been read into memory.
    with np.load(pmi_file) as loaded:
        pmi_matrix = csr_matrix(
            (loaded["data"], loaded["indices"], loaded["indptr"]),
            shape=tuple(loaded["shape"]),
        )

    def score_fn(query, candidates):
        candidates = list(candidates)  # iterated more than once below
        scores = {c: 0.0 for c in candidates}
        if query not in term_index:
            return scores
        known = [c for c in candidates if c in term_index]
        if not known:
            return scores
        # Compare against the candidate rows only; similarity to one row does
        # not depend on the others, so there is no need to scan the full matrix.
        candidate_rows = pmi_matrix[[term_index[c] for c in known]]
        sims = cosine_similarity(pmi_matrix[term_index[query]], candidate_rows)[0]
        scores.update(zip(known, sims))
        return scores

    return score_fn


# === Normalize Scores Dictionary ===
# Scales all scores in a dictionary to the [0, 1] range using min-max normalization.
def normalize_scores(score_dict):
    """Min-max rescale the values of a ``{term: score}`` dictionary.

    Args:
        score_dict: Mapping from term to numeric score.

    Returns:
        A new dictionary with the same keys. The smallest score maps to 0 and
        the largest to 1; if the maximum is not greater than the minimum,
        every term maps to 0.0. An empty input yields an empty dictionary.
    """
    if not score_dict:
        return {}

    values = np.array(list(score_dict.values()))
    lowest, highest = np.min(values), np.max(values)
    if not highest > lowest:
        # Degenerate range: no meaningful scale, so everything collapses to 0.
        return {term: 0.0 for term in score_dict}

    spread = highest - lowest
    return {term: (score - lowest) / spread for term, score in score_dict.items()}


# === Combine Surface and Context Scores ===
# Blends two score dictionaries using weight alpha to create a hybrid score for each candidate.
def combine_hybrid_scores_dict(surface_dict, context_dict, alpha=0.5):
    """Blend two ``{term: score}`` dictionaries into one weighted score per term.

    Args:
        surface_dict: Scores weighted by ``alpha``.
        context_dict: Scores weighted by ``1 - alpha``.
        alpha: Weight given to ``surface_dict``.

    Returns:
        A dictionary over the union of both key sets; a term missing from one
        input contributes 0.0 for that side. Keys appear in a deterministic
        order: all of ``surface_dict``'s keys first, then the remaining
        ``context_dict`` keys.
    """
    # Build the union as an ordered list rather than a set: set iteration order
    # for strings changes between interpreter sessions, which made tie-breaking
    # in any later stable sort of the result non-reproducible.
    ordered_terms = list(surface_dict)
    ordered_terms.extend(term for term in context_dict if term not in surface_dict)
    return {
        term: alpha * surface_dict.get(term, 0.0) + (1 - alpha) * context_dict.get(term, 0.0)
        for term in ordered_terms
    }

# === Hybrid Model Wrapper (CharNGram + NPMI) ===
# Combines normalized surface (CharNGram) and contextual (NPMI) scores using weighted average.
def hybrid_model_npmi_scoring(term_vectors, ngram_embeddings, pmi_file, term_index, compute_term_embedding, alpha=0.5):
    """Build a scorer that mixes surface and NPMI-based context scores.

    Args:
        term_vectors, ngram_embeddings, compute_term_embedding: Forwarded to
            ``surface_model_scoring`` to create the surface scorer.
        pmi_file, term_index: Forwarded to ``npmi_model_scoring`` to create
            the context scorer.
        alpha: Weight passed to ``combine_hybrid_scores_dict``.

    Returns:
        ``score_fn(query, candidates)`` which runs both scorers, normalizes
        each result with ``normalize_scores`` and returns their combination.
    """
    score_surface = surface_model_scoring(term_vectors, ngram_embeddings, compute_term_embedding)
    score_context = npmi_model_scoring(pmi_file, term_index)

    def score_fn(query, candidates):
        raw_surface = score_surface(query, candidates)
        raw_context = score_context(query, candidates)
        # Normalize each side separately before blending them.
        return combine_hybrid_scores_dict(
            normalize_scores(raw_surface),
            normalize_scores(raw_context),
            alpha,
        )

    return score_fn

# === Run Full Non-Medical Evaluation Pipeline ===
# Selects non-medical words, builds model scorers, and evaluates performance using each model type (surface, context, hybrid).
def run_non_medical_query_test(term_list, term_vectors, ngram_embeddings, w2v_model, pmi_file, term_index,
                               num_queries=1000, pool_size=500):
    """Run the out-of-domain evaluation for the surface, context and hybrid scorers.

    Args:
        term_list: Sequence of known terms; used both to filter the query
            words and as the population for the candidate pool.
        term_vectors, ngram_embeddings: Inputs for the surface scorer.
        w2v_model: Model for the Word2Vec context scorer.
        pmi_file, term_index: Inputs for the NPMI part of the hybrid scorer.
        num_queries: Number of non-medical query words to request (default 1000).
        pool_size: Desired candidate pool size (default 500); clamped to
            ``len(term_list)``.

    Returns:
        None. Each evaluation prints its own summary.
    """
    print(f"\n📦 Getting {num_queries:,} non-medical query terms...")
    non_medical_queries = get_non_umls_english_words(term_list, num_words=num_queries)
    # Clamp the pool size: random.sample raises ValueError when asked for more
    # items than the population holds. This draw uses the module-level RNG,
    # which get_non_umls_english_words seeds via random.seed, so it must stay
    # after that call to remain repeatable.
    candidate_pool = random.sample(term_list, min(pool_size, len(term_list)))

    # Build every scorer before evaluating any of them.
    scorers = [
        ("Surface", surface_model_fn(term_vectors, ngram_embeddings)),
        ("Context (Word2Vec)", context_model_word2vec_fn(w2v_model)),
        ("Hybrid", hybrid_model_npmi_scoring(term_vectors, ngram_embeddings, pmi_file, term_index,
                                             compute_term_embedding)),
    ]
    for label, scorer in scorers:
        evaluate_non_medical_queries_summary(non_medical_queries, scorer, candidate_pool, model_name=label)



In [None]:
# === Evaluate Model on Non-Medical Queries and Summarize Results ===
# For a list of non-medical (out-of-domain) query terms, this function:
# 1. Applies the given model to score each query against a candidate pool.
# 2. Records the top-1 score for each query.
# 3. Counts how often the top prediction meets or exceeds a specified confidence threshold (i.e., high-confidence false positive).
# 4. Computes and prints summary statistics: average top-1 score and FP rate.
# Returns the average top-1 score and the false positive rate (FP@1).
def evaluate_non_medical_queries_summary(query_terms, model_func, candidate_pool, top_k=1, score_threshold=0.5, model_name=""):
    """Summarize how confidently a model matches out-of-domain queries.

    Args:
        query_terms: Query strings to evaluate.
        model_func: Callable ``(query, candidate_pool) -> {candidate: score}``.
        candidate_pool: Candidates handed unchanged to ``model_func``.
        top_k: Size of the ranked slice taken per query; only its first entry
            is used.
        score_threshold: A top score at or above this value counts as a
            high-confidence false positive.
        model_name: Label used in the printed header.

    Returns:
        ``(avg_top1_score, fp_rate)``: the mean top-1 score and the share of
        scored queries whose top-1 score reaches the threshold. Both are 0.0
        when no query produced any score.
    """
    print(f"\n📊 Evaluating {model_name} on {len(query_terms)} non-medical queries...")

    top1_scores = []
    for query_term in query_terms:
        scores = model_func(query_term, candidate_pool)
        # Stable descending sort; the first entry is the best-scoring candidate.
        ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
        if not ranked:
            # The model returned nothing for this query; leave it out of the statistics.
            continue
        top1_scores.append(ranked[0][1])

    num_queries = len(top1_scores)
    high_conf_fp_count = sum(1 for score in top1_scores if score >= score_threshold)

    if num_queries:
        avg_top1_score = np.mean(top1_scores)
        fp_rate = high_conf_fp_count / num_queries
    else:
        # np.mean([]) would yield NaN plus a RuntimeWarning; report zeros instead,
        # consistent with how the false-positive rate is guarded.
        avg_top1_score = 0.0
        fp_rate = 0.0

    print(f"🔹 Avg Top-1 Score:         {avg_top1_score:.4f}")
    print(f"🔹 High-Confidence FP@1:   {fp_rate:.4f} (score ≥ {score_threshold})")

    return avg_top1_score, fp_rate


In [None]:
# Run the out-of-domain evaluation; every argument is defined outside this cell, and `model` fills the w2v_model parameter.
run_non_medical_query_test(term_list, term_vectors, ngram_embeddings, model, pmi_output_file, term_index)
