<a href="https://colab.research.google.com/github/bradyprice/CSE-6250-Final-Project/blob/main/hybrid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# View and modify the working path
import os
from google.colab import drive

# Show where the kernel starts (Colab's default working directory) before anything is changed.
print("Current Working Directory:", os.getcwd())

# Mount Google Drive so the project files become visible under /content/gdrive.
drive.mount('/content/gdrive')

# Switch into the project folder: every later cell opens its data files with
# bare relative names, so they resolve against this directory.
# NOTE(review): the path is hard-coded to one user's Drive layout; adjust it if the
# project folder lives elsewhere.
path = "/content/gdrive/My Drive/CSE_6250_BD4H_Final_Project"
os.chdir(path)

# Confirm the change
print("Working Directory:", os.getcwd())


Current Working Directory: /content
Mounted at /content/gdrive
Working Directory: /content/gdrive/My Drive/CSE_6250_BD4H_Final_Project


In [3]:
!pip install sparse_dot_topn

Collecting sparse_dot_topn
  Downloading sparse_dot_topn-1.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading sparse_dot_topn-1.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (266 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/266.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.7/266.7 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sparse_dot_topn
Successfully installed sparse_dot_topn-1.1.5


In [6]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix, csr_matrix
from sparse_dot_topn import awesome_cossim_topn  # Install using: pip install sparse-dot-topn
from scipy.sparse import dok_matrix, csr_matrix

### 1️⃣ Load Mapping Files ###
def load_term_id_to_string(file_paths, include_concepts=True):
    """Read tab-separated ``ID<TAB>string`` files into two lookup dictionaries.

    Args:
        file_paths: iterable of paths to mapping files.
        include_concepts: when true, rows from any file whose path contains
            "concept" (case-insensitive) go into the concept dictionary;
            otherwise every row goes into the term dictionary.

    Returns:
        ``(term_id_map, concept_id_map)``; both map an ID string to its text.
        Lines that do not split into exactly two tab-separated fields are
        ignored. Later rows overwrite earlier rows with the same ID.
    """
    terms_by_id, concepts_by_id = {}, {}

    for source_path in file_paths:
        with open(source_path, 'r', encoding='utf-8', errors='ignore') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split("\t")
                if len(fields) != 2:
                    # Malformed row: not exactly "id<TAB>text".
                    continue
                key, label = fields

                # Route the row by file name, not by row content.
                from_concept_file = include_concepts and "concept" in source_path.lower()
                destination = concepts_by_id if from_concept_file else terms_by_id
                destination[key] = label

    return terms_by_id, concepts_by_id


def load_singleton_frequencies(file_path, concept_to_term_map, term_id_map, chunk_size=1000):
    """Read per-concept counts and accumulate them per term string.

    The whole file is read; ``chunk_size`` only controls how often a progress
    line is printed.

    Args:
        file_path: tab-separated file with ``concept_id<TAB>count`` rows.
        concept_to_term_map: mapping concept ID -> list of term IDs.
        term_id_map: mapping term ID -> term string.
        chunk_size: print progress every ``chunk_size`` lines (0/None disables it).

    Returns:
        dict mapping term string -> summed count. A concept with no term
        mapping is stored under ``"UNK_<concept_id>"``; a term ID with no
        string is stored under ``"UNK_<term_id>"``.
    """
    singleton_freq = {}
    skipped = 0

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            # Skip blank/malformed rows instead of crashing, matching the
            # behaviour of the mapping loaders in this notebook.
            try:
                concept_id, raw_count = line.strip().split("\t")
                count = int(raw_count)
            except ValueError:
                skipped += 1
                continue

            term_ids = concept_to_term_map.get(concept_id)
            if term_ids is None:
                # Label the unmapped concept directly. Pushing a placeholder
                # through the term lookup would yield a doubled "UNK_UNK_" prefix.
                terms = [f"UNK_{concept_id}"]
            else:
                terms = [term_id_map.get(term_id, f"UNK_{term_id}") for term_id in term_ids]

            # Several concepts can share a term, so counts are summed per term.
            for term in terms:
                singleton_freq[term] = singleton_freq.get(term, 0) + count

            if chunk_size and (i + 1) % chunk_size == 0:
                print(f"Processed {i + 1} lines")

    if skipped:
        print(f"Skipped {skipped} malformed lines in {file_path}")

    return singleton_freq


def load_cofreq_counts(file_path, concept_to_term_map, term_id_map, output_file="temp_cofreq.txt", chunk_size=1000):
    """Expand concept-level co-occurrence counts into term-level rows on disk.

    Every input row ``concept1<TAB>concept2<TAB>count`` is expanded into one
    output row ``term1<TAB>term2<TAB>count`` for each pair of terms mapped to
    the two concepts (pairs with identical term strings are dropped). The whole
    input is streamed; ``chunk_size`` only controls progress printing.

    Args:
        file_path: tab-separated concept co-frequency file.
        concept_to_term_map: mapping concept ID -> list of term IDs.
        term_id_map: mapping term ID -> term string.
        output_file: path of the file to (over)write with term-level rows.
        chunk_size: print progress every ``chunk_size`` lines (0/None disables it).

    Returns:
        ``output_file``, so the caller can stream the result later.
    """
    def _terms_for(concept_id):
        # Resolve a concept to its term strings. An unmapped concept gets an
        # explicit "UNK_" label rather than being looked up in term_id_map,
        # where a concept ID could collide with an unrelated term ID.
        term_ids = concept_to_term_map.get(concept_id)
        if term_ids is None:
            return [f"UNK_{concept_id}"]
        return [term_id_map.get(t, f"UNK_{t}") for t in term_ids]

    skipped = 0
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f, open(output_file, 'w', encoding='utf-8') as out_f:
        for i, line in enumerate(f):
            # Skip blank/malformed rows instead of aborting the whole run.
            try:
                concept1_id, concept2_id, raw_count = line.strip().split("\t")
                count = int(raw_count)
            except ValueError:
                skipped += 1
                continue

            terms1 = _terms_for(concept1_id)
            terms2 = _terms_for(concept2_id)

            for term1 in terms1:
                for term2 in terms2:
                    if term1 != term2:  # Avoid self-pairs
                        out_f.write(f"{term1}\t{term2}\t{count}\n")

            if chunk_size and (i + 1) % chunk_size == 0:
                print(f"Processed {i + 1} lines")

    if skipped:
        print(f"Skipped {skipped} malformed lines in {file_path}")

    return output_file  # Return the file path


import numpy as np
from scipy.sparse import dok_matrix, csr_matrix

def compute_npmi(cofreq_file, singleton_freq, total_count, term_index, matrix_size, k=1, output_pmi="temp_pmi.npz", chunk_size=100000, max_rows=1000000):
    """Build a symmetric sparse NPMI matrix from a term co-occurrence file and save it.

    For each row ``term1<TAB>term2<TAB>count`` whose terms are both in
    ``term_index`` the score is

        NPMI(x, y) = log2(p_xy / (p_x * p_y)) / -log2(p_xy)

    with ``p_x = singleton_freq[x] / total_count`` and
    ``p_xy = count / total_count``. Degenerate rows (``p_xy`` outside (0, 1) or
    a zero marginal) score 0. If a pair occurs more than once, the last row wins.

    Args:
        cofreq_file: path to the tab-separated term co-occurrence file.
        singleton_freq: mapping term -> count (a missing term counts as 1).
        total_count: denominator used to turn counts into probabilities.
        term_index: mapping term -> row/column index in the matrix.
        matrix_size: number of rows/columns of the square matrix.
        k: accepted for backward compatibility and ignored. It used to be added
            to every probability, which with the default ``k=1`` made the
            normaliser ``-log2(p_xy + 1)`` negative and flipped the sign of the
            score; the result now equals the old formula evaluated at ``k=0``.
        output_pmi: destination of the ``.npz`` archive.
        chunk_size: print progress every ``chunk_size`` lines (0/None disables it).
        max_rows: only the first ``max_rows`` lines of the file are read.

    Returns:
        Path of the written archive (with the ``.npz`` suffix ``np.savez`` adds
        when a string path lacks it). The archive stores the CSR components
        ``data``, ``indices``, ``indptr`` and ``shape``.
    """
    print(f"Initializing sparse PMI matrix (limiting to {max_rows} rows)...")
    pmi_matrix = dok_matrix((matrix_size, matrix_size), dtype=np.float32)

    print(f"Processing up to {max_rows} rows from co-occurrence file in chunks...")

    with open(cofreq_file, 'r', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            if i >= max_rows:
                break

            term1, term2, co_count = line.strip().split("\t")
            co_count = int(co_count)

            # Rows mentioning a term outside the vocabulary are ignored.
            if term1 in term_index and term2 in term_index:
                i_idx, j_idx = term_index[term1], term_index[term2]
                p_x = singleton_freq.get(term1, 1) / total_count
                p_y = singleton_freq.get(term2, 1) / total_count
                p_xy = co_count / total_count

                if 0 < p_xy < 1 and p_x > 0 and p_y > 0:
                    pmi = np.log2(p_xy / (p_x * p_y))
                    npmi = pmi / -np.log2(p_xy)
                else:
                    # log2 is undefined (p_xy <= 0) or the normaliser is not
                    # positive (p_xy >= 1): treat as "no association".
                    npmi = 0.0

                # NPMI is symmetric, so mirror the value.
                pmi_matrix[i_idx, j_idx] = npmi
                pmi_matrix[j_idx, i_idx] = npmi

            if chunk_size and (i + 1) % chunk_size == 0:
                print(f"Processed {i + 1} lines...")

    print("Converting PMI matrix to CSR format...")
    csr_pmi = pmi_matrix.tocsr()

    print("Saving PMI matrix to disk...")
    np.savez(output_pmi, data=csr_pmi.data, indices=csr_pmi.indices, indptr=csr_pmi.indptr, shape=csr_pmi.shape)

    # np.savez appends ".npz" to string paths that lack it; return the name of
    # the file that actually exists so it can be passed straight to np.load.
    saved_path = output_pmi
    if isinstance(output_pmi, str) and not output_pmi.endswith(".npz"):
        saved_path = output_pmi + ".npz"

    print("PMI computation complete!")
    return saved_path  # Return file path instead of keeping it in memory


def find_synonyms_fast(pmi_file, term_index, target_word, top_n=5):
    """Rank vocabulary terms by cosine similarity of their rows in a saved PMI matrix.

    Args:
        pmi_file: ``.npz`` archive holding the CSR components ``data``,
            ``indices``, ``indptr`` and ``shape``.
        term_index: mapping term -> row index in the matrix.
        target_word: term whose nearest neighbours are requested.
        top_n: maximum number of results.

    Returns:
        List of ``(term, similarity)`` tuples in descending similarity,
        excluding ``target_word``; an empty list (after printing a message)
        when ``target_word`` is not in ``term_index``.
    """
    if target_word not in term_index:
        print(f"Word '{target_word}' not found in vocabulary.")
        return []

    # Load sparse PMI matrix from file; the context manager closes the archive
    # handle once the arrays have been read.
    with np.load(pmi_file) as loaded:
        pmi_matrix = csr_matrix(
            (loaded["data"], loaded["indices"], loaded["indptr"]),
            shape=tuple(loaded["shape"]),
        )

    # Invert the index explicitly rather than assuming that the dict's
    # iteration order matches the row numbers.
    index_to_word = {idx: word for word, idx in term_index.items()}
    target_idx = term_index[target_word]

    # One row against every row: a dense vector with one score per matrix row.
    similarities = cosine_similarity(pmi_matrix[target_idx], pmi_matrix)[0]
    sorted_indices = np.argsort(similarities)[::-1]  # Descending order

    ranked = []
    for idx in sorted_indices:
        word = index_to_word.get(int(idx))
        # Rows without a vocabulary entry and the query itself are skipped.
        if word is not None and word != target_word:
            ranked.append((word, similarities[idx]))
    return ranked[:top_n]

### 5️⃣ Filter Synonyms Using Concept ID ###
def filter_synonyms_by_concept(synonyms, concept_id_map, target_word):
    """Keep only the synonyms whose concept equals the target word's concept.

    Args:
        synonyms: list of ``(word, score)`` pairs.
        concept_id_map: mapping queried with words (``.get(word)``) to obtain
            a concept value.
        target_word: word whose concept acts as the filter.

    Returns:
        The input list itself when the target has no (or a falsy) concept;
        otherwise a new list with the matching ``(word, score)`` pairs.
    """
    reference_concept = concept_id_map.get(target_word, None)
    if reference_concept:
        kept = []
        for candidate, similarity in synonyms:
            if concept_id_map.get(candidate) == reference_concept:
                kept.append((candidate, similarity))
        return kept

    # No concept info available for the target: nothing to filter on.
    return synonyms

def load_concept_to_term_map(file_path):
    """Group term IDs by concept ID from a ``term_id<TAB>concept_id`` file.

    Args:
        file_path: path to the tab-separated mapping file.

    Returns:
        ``defaultdict(list)`` mapping concept ID -> list of term IDs in file
        order. Lines without exactly two tab-separated fields are ignored.
    """
    terms_by_concept = defaultdict(list)

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            columns = raw_line.strip().split("\t")
            if len(columns) == 2:
                term_key, concept_key = columns
                # A concept can have many terms, so collect them all.
                terms_by_concept[concept_key].append(term_key)

    return terms_by_concept


# Driver for the co-occurrence (NPMI) approach: load the mappings, expand the
# concept-level counts to term level, build the NPMI matrix, then query it.
if __name__ == "__main__":
    # Input files, given as bare names (resolved against the current working directory).
    mapping_files = [
        "1_term_ID_to_string.txt",
        "2a_concept_ID_to_string.txt"
    ]
    concept_map_file = "3_term_ID_to_concept_ID.txt"
    singleton_file = "singlets_concepts_perBin_1d.txt"
    cofreq_file = "cofreqs_concepts_perBin_1d.txt"

    # Load mappings (the concept-ID -> string map is discarded here).
    term_id_map, _ = load_term_id_to_string(mapping_files)
    concept_to_term_map = load_concept_to_term_map(concept_map_file)

    # Load data. Both loaders read their whole input file; no row limit is applied here.
    singleton_freq = load_singleton_frequencies(singleton_file, concept_to_term_map, term_id_map)
    cofreq_output_file = load_cofreq_counts(cofreq_file, concept_to_term_map, term_id_map)

    # Denominator for the probabilities: the sum of the per-term counts.
    total_count = sum(singleton_freq.values())

    # Vocabulary index: one matrix row/column per term seen in the singleton counts.
    term_index = {term: i for i, term in enumerate(singleton_freq.keys())}
    matrix_size = len(term_index)

    # Only the first max_rows lines of the expanded co-frequency file are used.
    pmi_output_file = compute_npmi(cofreq_output_file, singleton_freq, total_count, term_index, matrix_size, max_rows=1000000)


    # Find synonyms
    target_word = "leukemia"
    synonyms = find_synonyms_fast(pmi_output_file, term_index, target_word)

    # Output results
    print(f"Top synonyms for '{target_word}':")
    for word, score in synonyms:
        print(f"{word}: {score:.4f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed 3577000 lines
Processed 3578000 lines
Processed 3579000 lines
Processed 3580000 lines
Processed 3581000 lines
Processed 3582000 lines
Processed 3583000 lines
Processed 3584000 lines
Processed 3585000 lines
Processed 3586000 lines
Processed 3587000 lines
Processed 3588000 lines
Processed 3589000 lines
Processed 3590000 lines
Processed 3591000 lines
Processed 3592000 lines
Processed 3593000 lines
Processed 3594000 lines
Processed 3595000 lines
Processed 3596000 lines
Processed 3597000 lines
Processed 3598000 lines
Processed 3599000 lines
Processed 3600000 lines
Processed 3601000 lines
Processed 3602000 lines
Processed 3603000 lines
Processed 3604000 lines
Processed 3605000 lines
Processed 3606000 lines
Processed 3607000 lines
Processed 3608000 lines
Processed 3609000 lines
Processed 3610000 lines
Processed 3611000 lines
Processed 3612000 lines
Processed 3613000 lines
Processed 3614000 lines
Processed 3615000 lines

Next, I try the CharNGram (character n-gram embedding) approach. The archive-extraction cells below only need to be run once.

In [11]:
!gzip -d "/content/gdrive/My Drive/CSE_6250_BD4H_Final_Project/jmt_pre-trained_embeddings.tar.gz"


In [15]:
!tar -xvf "/content/gdrive/My Drive/CSE_6250_BD4H_Final_Project/jmt_pre-trained_embeddings.tar" "charNgram.txt" -C "/content/gdrive/My Drive/CSE_6250_BD4H_Final_Project/"


charNgram.txt


In [4]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix, csr_matrix
from sparse_dot_topn import awesome_cossim_topn  # Install using: pip install sparse-dot-topn
from scipy.sparse import dok_matrix, csr_matrix

### 1️⃣ Load Mapping Files ###
def load_term_id_to_string(file_paths, include_concepts=True):
    """Read tab-separated ``ID<TAB>string`` files into two lookup dictionaries.

    Args:
        file_paths: iterable of paths to mapping files.
        include_concepts: when true, rows from any file whose path contains
            "concept" (case-insensitive) go into the concept dictionary;
            otherwise every row goes into the term dictionary.

    Returns:
        ``(term_id_map, concept_id_map)``; both map an ID string to its text.
        Lines that do not split into exactly two tab-separated fields are
        ignored. Later rows overwrite earlier rows with the same ID.
    """
    terms_by_id, concepts_by_id = {}, {}

    for source_path in file_paths:
        with open(source_path, 'r', encoding='utf-8', errors='ignore') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split("\t")
                if len(fields) != 2:
                    # Malformed row: not exactly "id<TAB>text".
                    continue
                key, label = fields

                # Route the row by file name, not by row content.
                from_concept_file = include_concepts and "concept" in source_path.lower()
                destination = concepts_by_id if from_concept_file else terms_by_id
                destination[key] = label

    return terms_by_id, concepts_by_id



### 5️⃣ Filter Synonyms Using Concept ID ###
def filter_synonyms_by_concept(synonyms, concept_id_map, target_word):
    """Keep only the synonyms whose concept equals the target word's concept.

    Args:
        synonyms: list of ``(word, score)`` pairs.
        concept_id_map: mapping queried with words (``.get(word)``) to obtain
            a concept value.
        target_word: word whose concept acts as the filter.

    Returns:
        The input list itself when the target has no (or a falsy) concept;
        otherwise a new list with the matching ``(word, score)`` pairs.
    """
    reference_concept = concept_id_map.get(target_word, None)
    if reference_concept:
        kept = []
        for candidate, similarity in synonyms:
            if concept_id_map.get(candidate) == reference_concept:
                kept.append((candidate, similarity))
        return kept

    # No concept info available for the target: nothing to filter on.
    return synonyms

def load_concept_to_term_map(file_path):
    """Group term IDs by concept ID from a ``term_id<TAB>concept_id`` file.

    Args:
        file_path: path to the tab-separated mapping file.

    Returns:
        ``defaultdict(list)`` mapping concept ID -> list of term IDs in file
        order. Lines without exactly two tab-separated fields are ignored.
    """
    terms_by_concept = defaultdict(list)

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            columns = raw_line.strip().split("\t")
            if len(columns) == 2:
                term_key, concept_key = columns
                # A concept can have many terms, so collect them all.
                terms_by_concept[concept_key].append(term_key)

    return terms_by_concept

def extract_char_ngrams(term, min_n=3, max_n=13):
    """List the labelled character n-grams of ``term`` wrapped in ``<`` and ``>``.

    Args:
        term: text to split.
        min_n: smallest n-gram length (inclusive).
        max_n: largest n-gram length (inclusive).

    Returns:
        List of strings ``"{n}gram-{chars}"``, ordered by increasing ``n`` and,
        within one ``n``, by start position. Lengths longer than the wrapped
        term contribute nothing.
    """
    padded = f"<{term}>"
    padded_len = len(padded)

    collected = []
    for size in range(min_n, max_n + 1):
        for start in range(padded_len - size + 1):
            collected.append(f"{size}gram-{padded[start:start + size]}")
    return collected


def load_char_ngram_embeddings(file_path):
    """Load whitespace-separated n-gram vectors from a text file.

    Each usable line is ``<ngram> <v1> <v2> ...``. Lines with fewer than ten
    whitespace-separated tokens are ignored (headers or bad lines).

    Args:
        file_path: path to the UTF-8 embedding text file.

    Returns:
        dict mapping the first token of each line to a ``float32`` numpy
        vector built from the remaining tokens.
    """
    vectors_by_gram = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            tokens = raw_line.strip().split()
            if len(tokens) >= 10:
                label, *components = tokens
                vectors_by_gram[label] = np.array(
                    [float(component) for component in components],
                    dtype=np.float32,
                )
    return vectors_by_gram

def compute_term_embedding(term, ngram_embeds, min_n=1, max_n=13, dim=100):
    """Average the embeddings of a term's known character n-grams.

    Args:
        term: text to embed; it is lower-cased and stripped first.
        ngram_embeds: mapping n-gram label -> vector.
        min_n: smallest n-gram length passed to ``extract_char_ngrams``.
        max_n: largest n-gram length passed to ``extract_char_ngrams``.
        dim: length of the zero vector returned when no n-gram is known.

    Returns:
        Element-wise mean of the matching n-gram vectors, or ``np.zeros(dim)``
        when none of the term's n-grams is in ``ngram_embeds``.
    """
    normalized = term.lower().strip()

    matched = []
    for gram in extract_char_ngrams(normalized, min_n, max_n):
        if gram in ngram_embeds:
            matched.append(ngram_embeds[gram])

    if matched:
        return np.mean(matched, axis=0)
    return np.zeros(dim)


def build_term_embeddings(term_list, ngram_embeds, dim=100):
    """Compute one embedding per term via ``compute_term_embedding``.

    Args:
        term_list: iterable of term strings (a repeated term is recomputed and
            keeps a single entry).
        ngram_embeds: mapping n-gram label -> vector.
        dim: forwarded as the zero-vector length for terms without known n-grams.

    Returns:
        dict mapping term -> embedding vector.
    """
    return {
        term: compute_term_embedding(term, ngram_embeds, dim=dim)
        for term in term_list
    }

def find_surface_form_synonyms(term_vectors, target_term, ngram_embeddings, dim=100, top_n=5):
    """Rank vocabulary terms by cosine similarity of their n-gram embeddings.

    Args:
        term_vectors: mapping term -> embedding vector (the search vocabulary).
        target_term: query term; if it is not in ``term_vectors`` its vector is
            computed on the fly from ``ngram_embeddings``.
        ngram_embeddings: mapping n-gram label -> vector.
        dim: zero-vector length used when the query has to be embedded on the fly.
        top_n: maximum number of results.

    Returns:
        List of ``(term, similarity)`` pairs in descending similarity, excluding
        ``target_term``; an empty list (after printing a message) when the
        query vector has zero norm.
    """
    if target_term not in term_vectors:
        print(f"'{target_term}' not in vocabulary — computing from n-grams.")
        query_vec = compute_term_embedding(target_term, ngram_embeddings, dim=dim)
    else:
        query_vec = term_vectors[target_term]

    # A zero vector has no direction, so there is nothing to compare against.
    if np.linalg.norm(query_vec) == 0:
        print(f"Could not compute embedding for '{target_term}' (no matching n-grams).")
        return []

    query_row = query_vec.reshape(1, -1)
    vocabulary = list(term_vectors.keys())
    stacked = np.stack([term_vectors[name] for name in vocabulary])
    scores = cosine_similarity(query_row, stacked)[0]

    ordered = sorted(zip(vocabulary, scores), key=lambda pair: -pair[1])
    neighbours = []
    for name, score in ordered:
        if name != target_term:
            neighbours.append((name, score))
    return neighbours[:top_n]





# Driver for the surface-form (CharNGram) approach: embed every term string as
# the mean of its character n-gram vectors, then query by cosine similarity.
if __name__ == "__main__":
    # Input files, given as bare names (resolved against the current working directory).
    mapping_files = [
        "1_term_ID_to_string.txt",
        "2a_concept_ID_to_string.txt"
    ]
    concept_map_file = "3_term_ID_to_concept_ID.txt"
    # NOTE(review): singleton_file and cofreq_file are assigned but not used in this cell.
    singleton_file = "singlets_concepts_perBin_1d.txt"
    cofreq_file = "cofreqs_concepts_perBin_1d.txt"
    charngram_embedding_file = 'charNgram.txt'

    # Load mappings
    term_id_map, concept_id_map = load_term_id_to_string(mapping_files)
    concept_to_term_map = load_concept_to_term_map(concept_map_file)

    # === Build Term List ===
    # The search vocabulary is every term string in the term-ID map.
    term_list = list(term_id_map.values())

    # === Load CharNGram Embeddings ===
    ngram_embeddings = load_char_ngram_embeddings(charngram_embedding_file)

    # === Compute Embeddings for All Terms ===
    print("Computing term embeddings...")
    term_vectors = build_term_embeddings(term_list, ngram_embeddings, dim=100)

    # === Find Synonyms ===
    target_word = "leukemia"
    synonyms = find_surface_form_synonyms(term_vectors, target_word, ngram_embeddings, top_n=10)

    print(f"Top CharNGram-based synonyms for '{target_word}':")
    for word, score in synonyms:
        print(f"{word}: {score:.4f}")


Computing term embeddings...


TypeError: find_surface_form_synonyms() missing 1 required positional argument: 'ngram_embeddings'

In [5]:
    # === Find Synonyms ===
    # Re-runs only the query step; relies on term_vectors and ngram_embeddings
    # already being defined in the kernel by the cell that built them.
    target_word = "leukemia"
    synonyms = find_surface_form_synonyms(term_vectors, target_word, ngram_embeddings, top_n=10)

    print(f"Top CharNGram-based synonyms for '{target_word}':")
    for word, score in synonyms:
        print(f"{word}: {score:.4f}")

Top CharNGram-based synonyms for 'leukemia':
cns leukemia: 0.9925
leukemias: 0.9921
leukemia of cns: 0.9880
leukemia nos: 0.9879
[m]leukemia nos: 0.9855
leukemias nec: 0.9854
rat leukemia: 0.9842
aleukemic leukemia: 0.9837
leukemia aleukemic: 0.9837
lgl leukemia: 0.9826


In [8]:
    # === Find Synonyms ===
    # Same query with a different word. If the word is not a key of term_vectors,
    # find_surface_form_synonyms builds its vector from the n-gram embeddings.
    # Relies on term_vectors and ngram_embeddings already existing in the kernel.
    target_word = "cool"
    synonyms = find_surface_form_synonyms(term_vectors, target_word, ngram_embeddings, top_n=10)

    print(f"Top CharNGram-based synonyms for '{target_word}':")
    for word, score in synonyms:
        print(f"{word}: {score:.4f}")



'cool' not in vocabulary — computing from n-grams.
Top CharNGram-based synonyms for 'cool':
cool1: 0.9936
cool2: 0.9925
coolers: 0.9478
li(x)coo2: 0.9468
coos: 0.9459
pscoo: 0.9410
coombs: 0.9404
cool skin: 0.9395
oil bleo: 0.9362
dermacool: 0.9357


I need to work on a hybrid approach

Next, I try a hybrid approach that combines the CharNGram surface-form similarity scores with the co-occurrence (NPMI) similarity scores.

In [9]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix, csr_matrix
from sparse_dot_topn import awesome_cossim_topn  # Install using: pip install sparse-dot-topn
from scipy.sparse import dok_matrix, csr_matrix

### 1️⃣ Load Mapping Files ###
def load_term_id_to_string(file_paths, include_concepts=True):
    """Read tab-separated ``ID<TAB>string`` files into two lookup dictionaries.

    Args:
        file_paths: iterable of paths to mapping files.
        include_concepts: when true, rows from any file whose path contains
            "concept" (case-insensitive) go into the concept dictionary;
            otherwise every row goes into the term dictionary.

    Returns:
        ``(term_id_map, concept_id_map)``; both map an ID string to its text.
        Lines that do not split into exactly two tab-separated fields are
        ignored. Later rows overwrite earlier rows with the same ID.
    """
    terms_by_id, concepts_by_id = {}, {}

    for source_path in file_paths:
        with open(source_path, 'r', encoding='utf-8', errors='ignore') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split("\t")
                if len(fields) != 2:
                    # Malformed row: not exactly "id<TAB>text".
                    continue
                key, label = fields

                # Route the row by file name, not by row content.
                from_concept_file = include_concepts and "concept" in source_path.lower()
                destination = concepts_by_id if from_concept_file else terms_by_id
                destination[key] = label

    return terms_by_id, concepts_by_id


def load_singleton_frequencies(file_path, concept_to_term_map, term_id_map, chunk_size=1000):
    """Read per-concept counts and accumulate them per term string.

    The whole file is read; ``chunk_size`` only controls how often a progress
    line is printed.

    Args:
        file_path: tab-separated file with ``concept_id<TAB>count`` rows.
        concept_to_term_map: mapping concept ID -> list of term IDs.
        term_id_map: mapping term ID -> term string.
        chunk_size: print progress every ``chunk_size`` lines (0/None disables it).

    Returns:
        dict mapping term string -> summed count. A concept with no term
        mapping is stored under ``"UNK_<concept_id>"``; a term ID with no
        string is stored under ``"UNK_<term_id>"``.
    """
    singleton_freq = {}
    skipped = 0

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            # Skip blank/malformed rows instead of crashing, matching the
            # behaviour of the mapping loaders in this notebook.
            try:
                concept_id, raw_count = line.strip().split("\t")
                count = int(raw_count)
            except ValueError:
                skipped += 1
                continue

            term_ids = concept_to_term_map.get(concept_id)
            if term_ids is None:
                # Label the unmapped concept directly. Pushing a placeholder
                # through the term lookup would yield a doubled "UNK_UNK_" prefix.
                terms = [f"UNK_{concept_id}"]
            else:
                terms = [term_id_map.get(term_id, f"UNK_{term_id}") for term_id in term_ids]

            # Several concepts can share a term, so counts are summed per term.
            for term in terms:
                singleton_freq[term] = singleton_freq.get(term, 0) + count

            if chunk_size and (i + 1) % chunk_size == 0:
                print(f"Processed {i + 1} lines")

    if skipped:
        print(f"Skipped {skipped} malformed lines in {file_path}")

    return singleton_freq


def load_cofreq_counts(file_path, concept_to_term_map, term_id_map, output_file="temp_cofreq.txt", chunk_size=1000):
    """Expand concept-level co-occurrence counts into term-level rows on disk.

    Every input row ``concept1<TAB>concept2<TAB>count`` is expanded into one
    output row ``term1<TAB>term2<TAB>count`` for each pair of terms mapped to
    the two concepts (pairs with identical term strings are dropped). The whole
    input is streamed; ``chunk_size`` only controls progress printing.

    Args:
        file_path: tab-separated concept co-frequency file.
        concept_to_term_map: mapping concept ID -> list of term IDs.
        term_id_map: mapping term ID -> term string.
        output_file: path of the file to (over)write with term-level rows.
        chunk_size: print progress every ``chunk_size`` lines (0/None disables it).

    Returns:
        ``output_file``, so the caller can stream the result later.
    """
    def _terms_for(concept_id):
        # Resolve a concept to its term strings. An unmapped concept gets an
        # explicit "UNK_" label rather than being looked up in term_id_map,
        # where a concept ID could collide with an unrelated term ID.
        term_ids = concept_to_term_map.get(concept_id)
        if term_ids is None:
            return [f"UNK_{concept_id}"]
        return [term_id_map.get(t, f"UNK_{t}") for t in term_ids]

    skipped = 0
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f, open(output_file, 'w', encoding='utf-8') as out_f:
        for i, line in enumerate(f):
            # Skip blank/malformed rows instead of aborting the whole run.
            try:
                concept1_id, concept2_id, raw_count = line.strip().split("\t")
                count = int(raw_count)
            except ValueError:
                skipped += 1
                continue

            terms1 = _terms_for(concept1_id)
            terms2 = _terms_for(concept2_id)

            for term1 in terms1:
                for term2 in terms2:
                    if term1 != term2:  # Avoid self-pairs
                        out_f.write(f"{term1}\t{term2}\t{count}\n")

            if chunk_size and (i + 1) % chunk_size == 0:
                print(f"Processed {i + 1} lines")

    if skipped:
        print(f"Skipped {skipped} malformed lines in {file_path}")

    return output_file  # Return the file path


import numpy as np
from scipy.sparse import dok_matrix, csr_matrix

def compute_npmi(cofreq_file, singleton_freq, total_count, term_index, matrix_size, k=1, output_pmi="temp_pmi.npz", chunk_size=100000, max_rows=1000000):
    """Build a symmetric sparse NPMI matrix from a term co-occurrence file and save it.

    For each row ``term1<TAB>term2<TAB>count`` whose terms are both in
    ``term_index`` the score is

        NPMI(x, y) = log2(p_xy / (p_x * p_y)) / -log2(p_xy)

    with ``p_x = singleton_freq[x] / total_count`` and
    ``p_xy = count / total_count``. Degenerate rows (``p_xy`` outside (0, 1) or
    a zero marginal) score 0. If a pair occurs more than once, the last row wins.

    Args:
        cofreq_file: path to the tab-separated term co-occurrence file.
        singleton_freq: mapping term -> count (a missing term counts as 1).
        total_count: denominator used to turn counts into probabilities.
        term_index: mapping term -> row/column index in the matrix.
        matrix_size: number of rows/columns of the square matrix.
        k: accepted for backward compatibility and ignored. It used to be added
            to every probability, which with the default ``k=1`` made the
            normaliser ``-log2(p_xy + 1)`` negative and flipped the sign of the
            score; the result now equals the old formula evaluated at ``k=0``.
        output_pmi: destination of the ``.npz`` archive.
        chunk_size: print progress every ``chunk_size`` lines (0/None disables it).
        max_rows: only the first ``max_rows`` lines of the file are read.

    Returns:
        Path of the written archive (with the ``.npz`` suffix ``np.savez`` adds
        when a string path lacks it). The archive stores the CSR components
        ``data``, ``indices``, ``indptr`` and ``shape``.
    """
    print(f"Initializing sparse PMI matrix (limiting to {max_rows} rows)...")
    pmi_matrix = dok_matrix((matrix_size, matrix_size), dtype=np.float32)

    print(f"Processing up to {max_rows} rows from co-occurrence file in chunks...")

    with open(cofreq_file, 'r', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            if i >= max_rows:
                break

            term1, term2, co_count = line.strip().split("\t")
            co_count = int(co_count)

            # Rows mentioning a term outside the vocabulary are ignored.
            if term1 in term_index and term2 in term_index:
                i_idx, j_idx = term_index[term1], term_index[term2]
                p_x = singleton_freq.get(term1, 1) / total_count
                p_y = singleton_freq.get(term2, 1) / total_count
                p_xy = co_count / total_count

                if 0 < p_xy < 1 and p_x > 0 and p_y > 0:
                    pmi = np.log2(p_xy / (p_x * p_y))
                    npmi = pmi / -np.log2(p_xy)
                else:
                    # log2 is undefined (p_xy <= 0) or the normaliser is not
                    # positive (p_xy >= 1): treat as "no association".
                    npmi = 0.0

                # NPMI is symmetric, so mirror the value.
                pmi_matrix[i_idx, j_idx] = npmi
                pmi_matrix[j_idx, i_idx] = npmi

            if chunk_size and (i + 1) % chunk_size == 0:
                print(f"Processed {i + 1} lines...")

    print("Converting PMI matrix to CSR format...")
    csr_pmi = pmi_matrix.tocsr()

    print("Saving PMI matrix to disk...")
    np.savez(output_pmi, data=csr_pmi.data, indices=csr_pmi.indices, indptr=csr_pmi.indptr, shape=csr_pmi.shape)

    # np.savez appends ".npz" to string paths that lack it; return the name of
    # the file that actually exists so it can be passed straight to np.load.
    saved_path = output_pmi
    if isinstance(output_pmi, str) and not output_pmi.endswith(".npz"):
        saved_path = output_pmi + ".npz"

    print("PMI computation complete!")
    return saved_path  # Return file path instead of keeping it in memory


def find_synonyms_fast(pmi_file, term_index, target_word, top_n=5):
    """Rank vocabulary terms by cosine similarity of their rows in a saved PMI matrix.

    Args:
        pmi_file: ``.npz`` archive holding the CSR components ``data``,
            ``indices``, ``indptr`` and ``shape``.
        term_index: mapping term -> row index in the matrix.
        target_word: term whose nearest neighbours are requested.
        top_n: maximum number of results.

    Returns:
        List of ``(term, similarity)`` tuples in descending similarity,
        excluding ``target_word``; an empty list (after printing a message)
        when ``target_word`` is not in ``term_index``.
    """
    if target_word not in term_index:
        print(f"Word '{target_word}' not found in vocabulary.")
        return []

    # Load sparse PMI matrix from file; the context manager closes the archive
    # handle once the arrays have been read.
    with np.load(pmi_file) as loaded:
        pmi_matrix = csr_matrix(
            (loaded["data"], loaded["indices"], loaded["indptr"]),
            shape=tuple(loaded["shape"]),
        )

    # Invert the index explicitly rather than assuming that the dict's
    # iteration order matches the row numbers.
    index_to_word = {idx: word for word, idx in term_index.items()}
    target_idx = term_index[target_word]

    # One row against every row: a dense vector with one score per matrix row.
    similarities = cosine_similarity(pmi_matrix[target_idx], pmi_matrix)[0]
    sorted_indices = np.argsort(similarities)[::-1]  # Descending order

    ranked = []
    for idx in sorted_indices:
        word = index_to_word.get(int(idx))
        # Rows without a vocabulary entry and the query itself are skipped.
        if word is not None and word != target_word:
            ranked.append((word, similarities[idx]))
    return ranked[:top_n]

### 5️⃣ Filter Synonyms Using Concept ID ###
def filter_synonyms_by_concept(synonyms, concept_id_map, target_word):
    """Keep only the synonyms whose concept equals the target word's concept.

    Args:
        synonyms: list of ``(word, score)`` pairs.
        concept_id_map: mapping queried with words (``.get(word)``) to obtain
            a concept value.
        target_word: word whose concept acts as the filter.

    Returns:
        The input list itself when the target has no (or a falsy) concept;
        otherwise a new list with the matching ``(word, score)`` pairs.
    """
    reference_concept = concept_id_map.get(target_word, None)
    if reference_concept:
        kept = []
        for candidate, similarity in synonyms:
            if concept_id_map.get(candidate) == reference_concept:
                kept.append((candidate, similarity))
        return kept

    # No concept info available for the target: nothing to filter on.
    return synonyms

def load_concept_to_term_map(file_path):
    """Group term IDs by concept ID from a ``term_id<TAB>concept_id`` file.

    Args:
        file_path: path to the tab-separated mapping file.

    Returns:
        ``defaultdict(list)`` mapping concept ID -> list of term IDs in file
        order. Lines without exactly two tab-separated fields are ignored.
    """
    terms_by_concept = defaultdict(list)

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            columns = raw_line.strip().split("\t")
            if len(columns) == 2:
                term_key, concept_key = columns
                # A concept can have many terms, so collect them all.
                terms_by_concept[concept_key].append(term_key)

    return terms_by_concept


# from CharNGram code

### 5️⃣ Filter Synonyms Using Concept ID ###
def filter_synonyms_by_concept(synonyms, concept_id_map, target_word):
    """Keep only the synonyms whose concept equals the target word's concept.

    Args:
        synonyms: list of ``(word, score)`` pairs.
        concept_id_map: mapping queried with words (``.get(word)``) to obtain
            a concept value.
        target_word: word whose concept acts as the filter.

    Returns:
        The input list itself when the target has no (or a falsy) concept;
        otherwise a new list with the matching ``(word, score)`` pairs.
    """
    reference_concept = concept_id_map.get(target_word, None)
    if reference_concept:
        kept = []
        for candidate, similarity in synonyms:
            if concept_id_map.get(candidate) == reference_concept:
                kept.append((candidate, similarity))
        return kept

    # No concept info available for the target: nothing to filter on.
    return synonyms

def load_concept_to_term_map(file_path):
    """Group term IDs by concept ID from a ``term_id<TAB>concept_id`` file.

    Args:
        file_path: path to the tab-separated mapping file.

    Returns:
        ``defaultdict(list)`` mapping concept ID -> list of term IDs in file
        order. Lines without exactly two tab-separated fields are ignored.
    """
    terms_by_concept = defaultdict(list)

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            columns = raw_line.strip().split("\t")
            if len(columns) == 2:
                term_key, concept_key = columns
                # A concept can have many terms, so collect them all.
                terms_by_concept[concept_key].append(term_key)

    return terms_by_concept

def extract_char_ngrams(term, min_n=3, max_n=13):
    """List the labelled character n-grams of ``term`` wrapped in ``<`` and ``>``.

    Args:
        term: text to split.
        min_n: smallest n-gram length (inclusive).
        max_n: largest n-gram length (inclusive).

    Returns:
        List of strings ``"{n}gram-{chars}"``, ordered by increasing ``n`` and,
        within one ``n``, by start position. Lengths longer than the wrapped
        term contribute nothing.
    """
    padded = f"<{term}>"
    padded_len = len(padded)

    collected = []
    for size in range(min_n, max_n + 1):
        for start in range(padded_len - size + 1):
            collected.append(f"{size}gram-{padded[start:start + size]}")
    return collected


def load_char_ngram_embeddings(file_path):
    """Load whitespace-separated n-gram vectors from a text file.

    Each usable line is ``<ngram> <v1> <v2> ...``. Lines with fewer than ten
    whitespace-separated tokens are ignored (headers or bad lines).

    Args:
        file_path: path to the UTF-8 embedding text file.

    Returns:
        dict mapping the first token of each line to a ``float32`` numpy
        vector built from the remaining tokens.
    """
    vectors_by_gram = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            tokens = raw_line.strip().split()
            if len(tokens) >= 10:
                label, *components = tokens
                vectors_by_gram[label] = np.array(
                    [float(component) for component in components],
                    dtype=np.float32,
                )
    return vectors_by_gram

def compute_term_embedding(term, ngram_embeds, min_n=1, max_n=13, dim=100):
    """Average the embeddings of a term's known character n-grams.

    Args:
        term: text to embed; it is lower-cased and stripped first.
        ngram_embeds: mapping n-gram label -> vector.
        min_n: smallest n-gram length passed to ``extract_char_ngrams``.
        max_n: largest n-gram length passed to ``extract_char_ngrams``.
        dim: length of the zero vector returned when no n-gram is known.

    Returns:
        Element-wise mean of the matching n-gram vectors, or ``np.zeros(dim)``
        when none of the term's n-grams is in ``ngram_embeds``.
    """
    normalized = term.lower().strip()

    matched = []
    for gram in extract_char_ngrams(normalized, min_n, max_n):
        if gram in ngram_embeds:
            matched.append(ngram_embeds[gram])

    if matched:
        return np.mean(matched, axis=0)
    return np.zeros(dim)


def build_term_embeddings(term_list, ngram_embeds, dim=100):
    """Compute one embedding per term via ``compute_term_embedding``.

    Args:
        term_list: iterable of term strings (a repeated term is recomputed and
            keeps a single entry).
        ngram_embeds: mapping n-gram label -> vector.
        dim: forwarded as the zero-vector length for terms without known n-grams.

    Returns:
        dict mapping term -> embedding vector.
    """
    return {
        term: compute_term_embedding(term, ngram_embeds, dim=dim)
        for term in term_list
    }

def find_surface_form_synonyms(term_vectors, target_term, ngram_embeddings, dim=100, top_n=5):
    """Rank vocabulary terms by cosine similarity of their n-gram embeddings.

    Args:
        term_vectors: mapping term -> embedding vector (the search vocabulary).
        target_term: query term; if it is not in ``term_vectors`` its vector is
            computed on the fly from ``ngram_embeddings``.
        ngram_embeddings: mapping n-gram label -> vector.
        dim: zero-vector length used when the query has to be embedded on the fly.
        top_n: maximum number of results.

    Returns:
        List of ``(term, similarity)`` pairs in descending similarity, excluding
        ``target_term``; an empty list (after printing a message) when the
        query vector has zero norm.
    """
    if target_term not in term_vectors:
        print(f"'{target_term}' not in vocabulary — computing from n-grams.")
        query_vec = compute_term_embedding(target_term, ngram_embeddings, dim=dim)
    else:
        query_vec = term_vectors[target_term]

    # A zero vector has no direction, so there is nothing to compare against.
    if np.linalg.norm(query_vec) == 0:
        print(f"Could not compute embedding for '{target_term}' (no matching n-grams).")
        return []

    query_row = query_vec.reshape(1, -1)
    vocabulary = list(term_vectors.keys())
    stacked = np.stack([term_vectors[name] for name in vocabulary])
    scores = cosine_similarity(query_row, stacked)[0]

    ordered = sorted(zip(vocabulary, scores), key=lambda pair: -pair[1])
    neighbours = []
    for name, score in ordered:
        if name != target_term:
            neighbours.append((name, score))
    return neighbours[:top_n]

# Hybrid code
def normalize_scores(score_list):
    """Min-max scale the scores of ``(term, score)`` pairs into [0, 1].

    Args:
        score_list: list of ``(term, score)`` pairs.

    Returns:
        A new list of ``(term, scaled_score)`` pairs in the same order. An
        empty input gives an empty list; when the maximum is not greater than
        the minimum (for example, all scores equal) every scaled score is 0.0.
    """
    values = np.array([value for _, value in score_list])
    if len(values) == 0:
        return []

    lowest, highest = np.min(values), np.max(values)
    if highest > lowest:
        return [
            (term, (value - lowest) / (highest - lowest))
            for term, value in score_list
        ]

    # No spread to scale by, so every score collapses to zero.
    return [(term, 0.0) for term, _ in score_list]


def combine_hybrid_scores(surface_scores, contextual_scores, alpha=0.5):
    """Blend two score lists into one ranking.

    The combined score of a term is
    ``alpha * surface + (1 - alpha) * contextual``; a term missing from one
    list contributes 0.0 for that side.

    Args:
        surface_scores: iterable of ``(term, score)`` pairs (surface-form side).
        contextual_scores: iterable of ``(term, score)`` pairs (contextual side).
        alpha: weight of the surface score; the contextual score gets ``1 - alpha``.

    Returns:
        List of ``(term, combined_score)`` pairs in descending score. Terms with
        equal scores keep a deterministic order: surface terms in their input
        order first, then contextual-only terms in their input order.
    """
    surface_dict = dict(surface_scores)
    contextual_dict = dict(contextual_scores)

    # Collect terms in first-seen order. Going through a set would make the
    # order of tied results depend on string hashing and change between runs.
    ordered_terms = list(surface_dict)
    ordered_terms.extend(term for term in contextual_dict if term not in surface_dict)

    combined = {}
    for term in ordered_terms:
        s_score = surface_dict.get(term, 0.0)
        c_score = contextual_dict.get(term, 0.0)
        combined[term] = alpha * s_score + (1 - alpha) * c_score

    # sorted() is stable, so ties keep the first-seen order established above.
    return sorted(combined.items(), key=lambda item: -item[1])


# Driver for the hybrid approach: build the NPMI matrix and the CharNGram term
# vectors, query both for the same word, min-max normalise each score list and
# blend them with equal weight.
if __name__ == "__main__":
    # Input files, given as bare names (resolved against the current working directory).
    mapping_files = [
        "1_term_ID_to_string.txt",
        "2a_concept_ID_to_string.txt"
    ]
    concept_map_file = "3_term_ID_to_concept_ID.txt"
    singleton_file = "singlets_concepts_perBin_1d.txt"
    cofreq_file = "cofreqs_concepts_perBin_1d.txt"

    # Load mappings
    term_id_map, concept_id_map = load_term_id_to_string(mapping_files)
    concept_to_term_map = load_concept_to_term_map(concept_map_file)

    # Load data. Both loaders read their whole input file; no row limit is applied here.
    singleton_freq = load_singleton_frequencies(singleton_file, concept_to_term_map, term_id_map)
    cofreq_output_file = load_cofreq_counts(cofreq_file, concept_to_term_map, term_id_map)

    # Denominator for the probabilities: the sum of the per-term counts.
    total_count = sum(singleton_freq.values())

    # Vocabulary index: one matrix row/column per term seen in the singleton counts.
    term_index = {term: i for i, term in enumerate(singleton_freq.keys())}
    matrix_size = len(term_index)

    # Only the first max_rows lines of the expanded co-frequency file are used.
    pmi_output_file = compute_npmi(cofreq_output_file, singleton_freq, total_count, term_index, matrix_size, max_rows=1000000)

    # CharNGram part
    charngram_embedding_file = 'charNgram.txt'

    # === Build Term List ===
    # The surface-form vocabulary is every term string in the term-ID map; it is
    # built independently of term_index, so the two vocabularies can differ.
    term_list = list(term_id_map.values())

    # === Load CharNGram Embeddings ===
    ngram_embeddings = load_char_ngram_embeddings(charngram_embedding_file)

    # === Compute Embeddings for All Terms ===
    print("Computing term embeddings...")
    term_vectors = build_term_embeddings(term_list, ngram_embeddings, dim=100)


    # === Choose Target Word ===
    target_word = "leukemia"

    # Take the top 100 candidates from each method, each rescaled to [0, 1].
    print(f"\n🔍 Finding surface-form synonyms for: {target_word}")
    surface_synonyms = find_surface_form_synonyms(term_vectors, target_word, ngram_embeddings, dim=100, top_n=100)
    surface_synonyms = normalize_scores(surface_synonyms)

    print(f"\n🔍 Finding contextual synonyms for: {target_word}")
    contextual_synonyms = find_synonyms_fast(pmi_output_file, term_index, target_word, top_n=100)
    contextual_synonyms = normalize_scores(contextual_synonyms)

    # A term that appears in only one candidate list scores 0.0 on the other side.
    print(f"\n🔀 Combining hybrid scores for: {target_word}")
    hybrid_synonyms = combine_hybrid_scores(surface_synonyms, contextual_synonyms, alpha=0.5)

    print(f"\n✅ Top Hybrid Synonyms for '{target_word}':")
    for term, score in hybrid_synonyms[:10]:
        print(f"{term}: {score:.4f}")

    # Disabled: the earlier surface-only query, kept for reference.
    # # === Find Synonyms ===
    # target_word = "leukemia"
    # synonyms = find_surface_form_synonyms(term_vectors, target_word, ngram_embeddings, top_n=10)

    # print(f"Top CharNGram-based synonyms for '{target_word}':")
    # for word, score in synonyms:
    #     print(f"{word}: {score:.4f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed 3590000 lines
Processed 3591000 lines
Processed 3592000 lines
Processed 3593000 lines
Processed 3594000 lines
Processed 3595000 lines
Processed 3596000 lines
Processed 3597000 lines
Processed 3598000 lines
Processed 3599000 lines
Processed 3600000 lines
Processed 3601000 lines
Processed 3602000 lines
Processed 3603000 lines
Processed 3604000 lines
Processed 3605000 lines
Processed 3606000 lines
Processed 3607000 lines
Processed 3608000 lines
Processed 3609000 lines
Processed 3610000 lines
Processed 3611000 lines
Processed 3612000 lines
Processed 3613000 lines
Processed 3614000 lines
Processed 3615000 lines
Processed 3616000 lines
Processed 3617000 lines
Processed 3618000 lines
Processed 3619000 lines
Processed 3620000 lines
Processed 3621000 lines
Processed 3622000 lines
Processed 3623000 lines
Processed 3624000 lines
Processed 3625000 lines
Processed 3626000 lines
Processed 3627000 lines
Processed 3628000 lines