In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix, csr_matrix

def load_term_id_to_string(file_paths):
    """Build one ID -> string dictionary from several tab-separated mapping files.

    Each usable line holds exactly two tab-separated fields: an ID and its
    string. Lines with any other number of fields (after stripping surrounding
    whitespace) are ignored. Files are read in the order given, so when the
    same ID appears more than once, the last occurrence wins.

    Args:
        file_paths: Iterable of paths to the mapping files.

    Returns:
        dict mapping each ID (as a string) to its term string.
    """
    merged = {}

    for path in file_paths:
        with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split("\t")
                if len(fields) != 2:
                    # Not an "<id><TAB><string>" record; ignore it.
                    continue
                key, label = fields
                merged[key] = label

    return merged


def load_singleton_frequencies(file_path, term_id_map):
    """Load singleton frequencies and convert IDs to terms.

    Each usable line holds two tab-separated fields: an ID and an integer
    count. IDs missing from ``term_id_map`` are kept under the placeholder
    name ``UNK_<id>``. If several lines resolve to the same term string, the
    count from the last such line is the one retained.

    Blank lines, lines with the wrong number of fields, and lines whose count
    is not an integer are skipped instead of aborting the whole load, matching
    how load_term_id_to_string treats malformed lines.

    Args:
        file_path: Path to the tab-separated singleton count file.
        term_id_map: dict mapping ID strings to term strings.

    Returns:
        dict mapping term string -> int count.
    """
    singleton_freq = {}
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                term_id, count_str = line.strip().split("\t")
                count = int(count_str)
            except ValueError:
                continue  # Skip blank or malformed lines
            term_str = term_id_map.get(term_id, f"UNK_{term_id}")
            singleton_freq[term_str] = count
    return singleton_freq

def load_cofreq_counts(file_path, term_id_map):
    """Load cofrequency counts and convert IDs to terms.

    Each usable line holds three tab-separated fields: two IDs and an integer
    co-occurrence count. IDs missing from ``term_id_map`` are replaced by the
    placeholder name ``UNK_<id>``.

    Blank lines, lines with the wrong number of fields, and lines whose count
    is not an integer are skipped instead of aborting the whole load, matching
    how load_term_id_to_string treats malformed lines.

    Args:
        file_path: Path to the tab-separated cofrequency file.
        term_id_map: dict mapping ID strings to term strings.

    Returns:
        list of ``(term1, term2, count)`` tuples in file order.
    """
    cofreq_counts = []
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                term1_id, term2_id, count_str = line.strip().split("\t")
                count = int(count_str)
            except ValueError:
                continue  # Skip blank or malformed lines
            term1_str = term_id_map.get(term1_id, f"UNK_{term1_id}")
            term2_str = term_id_map.get(term2_id, f"UNK_{term2_id}")
            cofreq_counts.append((term1_str, term2_str, count))
    return cofreq_counts

def compute_pmi(cofreq_counts, singleton_freq, total_count, k=1):
    """Build a symmetric sparse matrix of smoothed PMI scores.

    For every co-occurrence triple whose two terms both appear in
    ``singleton_freq``, the stored score is

        log2((p_xy + k) / (p_x * p_y + k))

    where each probability is the corresponding count divided by
    ``total_count``. Note that ``k`` is added to the probabilities
    themselves (numerator and denominator), not to the raw counts.
    Triples that mention a term absent from ``singleton_freq`` are ignored.
    When a pair occurs more than once, the last triple determines the value.

    Args:
        cofreq_counts: Iterable of ``(term1, term2, count)`` triples.
        singleton_freq: dict mapping term -> occurrence count; its key order
            defines the row/column order of the matrix.
        total_count: Denominator used to turn counts into probabilities.
        k: Smoothing constant added inside the log ratio.

    Returns:
        ``(matrix, term_index)``: a float32 CSR matrix and a dict mapping each
        term to its row/column position.
    """
    # Position of each term follows the iteration order of singleton_freq.
    term_index = {}
    for term in singleton_freq:
        term_index[term] = len(term_index)

    size = len(term_index)
    scores = lil_matrix((size, size), dtype=np.float32)  # LIL allows cheap item assignment

    for left, right, pair_count in cofreq_counts:
        row = term_index.get(left)
        col = term_index.get(right)
        if row is None or col is None:
            continue

        prob_left = singleton_freq[left] / total_count
        prob_right = singleton_freq[right] / total_count
        prob_joint = pair_count / total_count

        value = np.log2((prob_joint + k) / (prob_left * prob_right + k))
        # Mirror the entry so the matrix stays symmetric.
        scores[row, col] = value
        scores[col, row] = value

    return scores.tocsr(), term_index

def find_synonyms(pmi_matrix, term_index, target_word, top_n=5):
    """Rank vocabulary terms by cosine similarity of their PMI rows to a target.

    The target's row of ``pmi_matrix`` is compared against every row with
    ``cosine_similarity``; terms are returned from most to least similar,
    with the target word itself left out.

    Args:
        pmi_matrix: Matrix whose rows are indexed by ``term_index``.
        term_index: dict mapping term -> row position in ``pmi_matrix``.
        target_word: Term to look up.
        top_n: Slice bound applied to the ranked list.

    Returns:
        List of ``(term, similarity)`` pairs. If ``target_word`` is not in
        ``term_index``, a message is printed and an empty list is returned.
    """
    if target_word not in term_index:
        print(f"Word '{target_word}' not found in vocabulary.")
        return []

    vocabulary = list(term_index.keys())
    query_row = pmi_matrix[term_index[target_word]].reshape(1, -1)

    scores = cosine_similarity(query_row, pmi_matrix)[0]
    ranking = np.argsort(scores)[::-1]  # Highest similarity first

    ranked = []
    for position in ranking:
        candidate = vocabulary[position]
        if candidate == target_word:
            continue  # Never report the query term as its own neighbour
        ranked.append((candidate, scores[position]))

    return ranked[:top_n]

# Input locations
mapping_files = [
    "../3_ID_Mappings/1_term_ID_to_string.txt",
    "../3_ID_Mappings/2a_concept_ID_to_string.txt",
    # Currently not loaded:
    # "../3_ID_Mappings/2b_concept_ID_to_CUI.txt",
    # "../3_ID_Mappings/3_term_ID_to_concept_ID.txt",
]
singleton_file = "../2_Singleton_Frequency_Counts/singlets_concepts_perBin_1d.txt"
cofreq_file = "../1_Cofrequency_Counts/cofreqs_concepts_perBin_1d.txt"

# Alternative test inputs (uncomment to use)
# singleton_file = "../2_Singleton_Frequency_Counts/singleton_test.txt"
# cofreq_file = "../1_Cofrequency_Counts/cofreq_test.txt"
# term_map_file = "../3_ID_Mappings/term_id_to_string_test.txt"

# Load the ID -> string mapping first; both count loaders translate IDs through it
term_id_map = load_term_id_to_string(mapping_files)
singleton_freq = load_singleton_frequencies(singleton_file, term_id_map)
cofreq_counts = load_cofreq_counts(cofreq_file, term_id_map)

# Denominator for the probabilities: sum of all singleton counts
total_count = sum(singleton_freq.values())

# Compute PMI matrix (sparse)
pmi_matrix, term_index = compute_pmi(cofreq_counts, singleton_freq, total_count)


In [20]:
# Spot-check the merged ID mapping: show the string stored for ID '9'.
term_id_map['9']

'history'

In [26]:
# Number of terms indexed by compute_pmi (one row/column of the PMI matrix each).
len(term_index)

23206

In [2]:
# Find synonyms: rank terms by cosine similarity of their PMI rows to the target.
# NOTE: this cell needs find_synonyms, pmi_matrix and term_index to be defined
# already, so the cell containing the definitions and data loading must run first.
target_word = "leukemia"
synonyms = find_synonyms(pmi_matrix, term_index, target_word)

# Output results, one "term: score" line per match (score shown to 4 decimals)
print(f"Top synonyms for '{target_word}':")
for word, score in synonyms:
    print(f"{word}: {score:.4f}")

NameError: name 'find_synonyms' is not defined