In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install sparse_dot_topn

Collecting sparse_dot_topn
  Downloading sparse_dot_topn-1.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading sparse_dot_topn-1.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (266 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/266.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.7/266.7 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sparse_dot_topn
Successfully installed sparse_dot_topn-1.1.5


#**Word2Vec**

In [3]:
import numpy as np
from collections import defaultdict
from scipy.sparse import lil_matrix, csr_matrix
from sklearn.decomposition import TruncatedSVD

def load_term_id_to_string(file_paths):
    """ Load term ID to string mappings """
    term_id_map = {}
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                try:
                    term_id, term_str = line.strip().split("\t")
                    term_id_map[term_id] = term_str
                except ValueError:
                    continue
    return term_id_map

In [4]:
def load_concept_to_term_map(file_path):
    """ Load concept-to-term mappings """
    concept_to_term_map = defaultdict(list)
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                term_id, concept_id = line.strip().split("\t")
                concept_to_term_map[concept_id].append(term_id)
            except ValueError:
                continue
    return concept_to_term_map

In [5]:
def load_singleton_frequencies(file_path, concept_to_term_map, term_id_map):
    """ Load singleton frequencies """
    singleton_freq = defaultdict(int)
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            concept_id, count = line.strip().split("\t")
            term_ids = concept_to_term_map.get(concept_id, [])
            for term_id in term_ids:
                term = term_id_map.get(term_id, f"UNK_{term_id}")
                singleton_freq[term] += int(count)
    return singleton_freq

In [6]:
def process_cofreq_file(cofreq_file, concept_to_term_map, term_id_map, output_file):
    """ Process co-occurrence frequencies """
    with open(cofreq_file, 'r') as fin, open(output_file, 'w') as fout:
        for line in fin:
            concept1, concept2, count = line.strip().split("\t")
            terms1 = [term_id_map.get(t, f"UNK_{t}")
                     for t in concept_to_term_map.get(concept1, [])]
            terms2 = [term_id_map.get(t, f"UNK_{t}")
                     for t in concept_to_term_map.get(concept2, [])]
            for t1 in terms1:
                for t2 in terms2:
                    if t1 != t2:
                        fout.write(f"{t1}\t{t2}\t{count}\n")


In [7]:
"""def compute_sppmi_matrix(cofreq_path, singleton_freq, total_count, term_index, shift=1.0, chunk_size):
    vocab_size = len(term_index)
    sppmi_matrix = lil_matrix((vocab_size, vocab_size), dtype=np.float32)

    with open(cofreq_path, 'r') as f:
        for line in f:
            term1, term2, count = line.strip().split("\t")
            count = int(count)

            if term1 not in term_index or term2 not in term_index:
                continue

            i, j = term_index[term1], term_index[term2]
            p_x = singleton_freq[term1] / total_count
            p_y = singleton_freq[term2] / total_count
            p_xy = count / total_count

            if p_xy == 0:
                continue

            pmi = np.log(p_xy / (p_x * p_y))
            sppmi = max(pmi - np.log(shift), 0)

            sppmi_matrix[i, j] = sppmi
            sppmi_matrix[j, i] = sppmi  # Symmetric

    return sppmi_matrix.tocsr()"""

def compute_sppmi_matrix(cofreq_path, singleton_freq, total_count, term_index,
                        shift=1.0, chunk_size=500_000, output_file="sppmi.npz"):
    """Compute SPPMI matrix with chunked processing"""
    vocab_size = len(term_index)
    sppmi_matrix = lil_matrix((vocab_size, vocab_size), dtype=np.float32)

    # Buffer for batch processing
    batch_data = []

    with open(cofreq_path, 'r') as f:
        for i, line in enumerate(f):
            if i >= chunk_size:  # Process in chunks
                _process_batch(batch_data, sppmi_matrix)
                batch_data = []
                break  # Remove this for full processing

            term1, term2, count = line.strip().split("\t")
            count = int(count)

            if term1 not in term_index or term2 not in term_index:
                continue

            i_idx, j_idx = term_index[term1], term_index[term2]
            p_x = singleton_freq[term1] / total_count
            p_y = singleton_freq[term2] / total_count
            p_xy = count / total_count

            if p_xy == 0:
                continue

            pmi = np.log(p_xy / (p_x * p_y))
            sppmi = max(pmi - np.log(shift), 0)

            batch_data.append((i_idx, j_idx, sppmi))

            if len(batch_data) >= chunk_size:
                _process_batch(batch_data, sppmi_matrix)
                batch_data = []

    # Process final batch
    if batch_data:
        _process_batch(batch_data, sppmi_matrix)

    # Save and return matrix
    csr_matrix = sppmi_matrix.tocsr()
    np.savez(output_file,
             data=csr_matrix.data,
             indices=csr_matrix.indices,
             indptr=csr_matrix.indptr,
             shape=csr_matrix.shape)
    return output_file

def _process_batch(batch_data, matrix):
    """Process a batch of co-occurrence pairs"""
    for i_idx, j_idx, sppmi in batch_data:
        matrix[i_idx, j_idx] = sppmi
        matrix[j_idx, i_idx] = sppmi  # Maintain symmetry

In [8]:
"""def train_word2vec_embeddings(sppmi_matrix, dim=300):
    svd = TruncatedSVD(n_components=dim, random_state=42)
    embeddings = svd.fit_transform(sppmi_matrix)

    # L2-normalize embeddings
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / (norms + 1e-8)"""

def train_word2vec_embeddings(sppmi_file, dim=300):
    """Train embeddings with proper sparse matrix loading"""
    # Load sparse matrix components
    loaded = np.load(sppmi_file)
    sppmi_matrix = csr_matrix(
        (loaded["data"], loaded["indices"], loaded["indptr"]),
        shape=loaded["shape"]
    )

    # Dimensionality reduction
    svd = TruncatedSVD(n_components=dim, random_state=42)
    embeddings = svd.fit_transform(sppmi_matrix)

    return embeddings

In [9]:
def find_similar_words(word, embeddings, term_index, top_n=5):
    """ Find similar words using embeddings """
    if word not in term_index:
        return []

    reverse_index = {v: k for k, v in term_index.items()}
    word_vec = embeddings[term_index[word]]

    # Cosine similarity
    scores = embeddings.dot(word_vec)
    top_indices = np.argsort(scores)[::-1][1:top_n+1]  # Exclude self

    return [(reverse_index[i], scores[i]) for i in top_indices]

In [10]:
if __name__ == "__main__":
    # Configuration
    MAPPING_FILES = ["drive/MyDrive/BD4H_Final_Project/3_ID_Mappings/1_term_ID_to_string.txt", "drive/MyDrive/BD4H_Final_Project/3_ID_Mappings/2a_concept_ID_to_string.txt"]
    CONCEPT_MAP_FILE = "drive/MyDrive/BD4H_Final_Project/3_ID_Mappings/3_term_ID_to_concept_ID.txt"
    SINGLETON_FILE = "drive/MyDrive/BD4H_Final_Project/2_Singleton_Frequency_Counts/singlets_concepts_perBin_1d.txt"
    COFREQ_FILE = "drive/MyDrive/BD4H_Final_Project/1_Cofrequency_Counts/cofreqs_concepts_perBin_1d.txt"
    OUTPUT_COFREQ = "drive/MyDrive/BD4H_Final_Project/processed_cofreq.txt"
    EMBEDDING_DIM = 300
    SHIFT_PARAM = 5.0  # Typical value for SPPMI

    # 1. Load data
    print("Loading term mappings...")
    term_id_map = load_term_id_to_string(MAPPING_FILES)
    concept_to_term_map = load_concept_to_term_map(CONCEPT_MAP_FILE)



Loading term mappings...


In [11]:
    # 2. Process frequencies
    print("Processing singleton frequencies...")
    singleton_freq = load_singleton_frequencies(SINGLETON_FILE, concept_to_term_map, term_id_map)
    total_count = sum(singleton_freq.values())

    print("Processing co-occurrence frequencies...")
    process_cofreq_file(COFREQ_FILE, concept_to_term_map, term_id_map, OUTPUT_COFREQ)


Processing singleton frequencies...
Processing co-occurrence frequencies...


In [12]:
    # 3. Create term index
    print("Creating vocabulary index...")
    terms = list(singleton_freq.keys())
    term_index = {term: i for i, term in enumerate(terms)}


Creating vocabulary index...


In [13]:
    # 4. Compute SPPMI matrix
    print("Computing SPPMI matrix...")
    """sppmi_matrix = compute_sppmi_matrix(OUTPUT_COFREQ, singleton_freq, total_count, term_index, shift=SHIFT_PARAM, chunk_size=500_000)"""
    sppmi_matrix = compute_sppmi_matrix(OUTPUT_COFREQ, singleton_freq=singleton_freq, total_count=total_count, term_index=term_index, shift=5.0, chunk_size=500_000)


Computing SPPMI matrix...


In [14]:
    # 5. Train embeddings
    print(f"Training Word2Vec embeddings (dim={EMBEDDING_DIM})...")
    embeddings = train_word2vec_embeddings(sppmi_matrix, EMBEDDING_DIM)


Training Word2Vec embeddings (dim=300)...


In [15]:
    # 6. Example usage
    target_word = "leukemia"
    print(f"\nTop similar words for '{target_word}':")
    similar_words = find_similar_words(target_word, embeddings, term_index)
    for word, score in similar_words:
        print(f"{word}: {score:.4f}")


Top similar words for 'leukemia':
[m]leukemia nos: 9583.3760
[m]leukemias unspecified (morphologic abnormality): 9583.3760
leucocythaemias: 9583.3760
leukemias, general: 9583.3760
leukaemia: 9583.3760
