In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

def load_singleton_frequencies(file_path, term_id_map):
    """Load per-term occurrence counts from a tab-separated file.

    Every non-blank line must hold exactly two tab-separated fields: a term
    ID and an integer count. Blank or whitespace-only lines (for example
    stray newlines at the end of the file) are skipped instead of aborting
    the load.

    Args:
        file_path: Path of the tab-separated counts file.
        term_id_map: Mapping from term ID (as read from the file, a str) to
            the term's string form.

    Returns:
        dict mapping term string -> int count. IDs missing from
        ``term_id_map`` are keyed as ``"UNK_<id>"``. If several IDs resolve
        to the same string, the last line read wins.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or
            its count is not an integer.
    """
    singleton_freq = {}
    with open(file_path, 'r') as f:
        for line in f:
            record = line.strip()
            if not record:
                # A blank line carries no record; unpacking it would raise.
                continue
            term_id, count = record.split("\t")
            term_str = term_id_map.get(term_id, f"UNK_{term_id}")  # Use ID if mapping is missing
            singleton_freq[term_str] = int(count)
    return singleton_freq

def load_cofreq_counts(file_path, term_id_map):
    """Load pairwise co-occurrence counts from a tab-separated file.

    Every non-blank line must hold exactly three tab-separated fields: two
    term IDs and an integer count. Blank or whitespace-only lines are
    skipped instead of aborting the load.

    Args:
        file_path: Path of the tab-separated co-frequency file.
        term_id_map: Mapping from term ID (as read from the file, a str) to
            the term's string form.

    Returns:
        dict mapping ``(term1_str, term2_str)`` -> int count, keyed in the
        order the two terms appear on the line. IDs missing from
        ``term_id_map`` are rendered as ``"UNK_<id>"``. If the same pair
        appears more than once, the last line read wins.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields
            or its count is not an integer.
    """
    cofreq_counts = {}
    with open(file_path, 'r') as f:
        for line in f:
            record = line.strip()
            if not record:
                # A blank line carries no record; unpacking it would raise.
                continue
            term1_id, term2_id, count = record.split("\t")
            term1_str = term_id_map.get(term1_id, f"UNK_{term1_id}")
            term2_str = term_id_map.get(term2_id, f"UNK_{term2_id}")
            cofreq_counts[(term1_str, term2_str)] = int(count)
    return cofreq_counts



In [2]:
def load_term_id_to_string(file_path):
    """Read a tab-separated ``<term ID>\\t<term string>`` file into a dict.

    The file is decoded as UTF-8 and undecodable bytes are dropped. A line
    that does not split into exactly two tab-separated fields is reported
    on stdout and skipped.

    Args:
        file_path: Path of the ID-to-string mapping file.

    Returns:
        dict mapping term ID (str) -> term string. When an ID occurs more
        than once, the last occurrence wins.
    """
    id_to_term = {}
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            if len(fields) != 2:
                # Wrong field count: report the raw line and move on.
                print(f"Skipping invalid line in {file_path}: {raw_line}")
                continue
            key, value = fields
            id_to_term[key] = value
    return id_to_term


In [3]:
def compute_pmi(cofreq_counts, singleton_freq, total_count, k=1):
    """Compute add-k smoothed pointwise mutual information for term pairs.

    For each pair the score is ``log2(p_xy / (p_x * p_y))`` where every
    probability is a smoothed count divided by ``total_count``:
    ``p_xy = (c_xy + k) / N``, ``p_x = (c_x + k) / N``, ``p_y = (c_y + k) / N``.

    The smoothing constant is added to the raw counts, not to the
    probabilities. Adding it to probabilities (which never exceed 1) would
    swamp the ratio and push every score towards zero.

    Args:
        cofreq_counts: dict mapping ``(term1, term2)`` -> co-occurrence count.
        singleton_freq: dict mapping term -> individual count. A term that
            is absent is treated as having a count of 1.
        total_count: Normalising total ``N`` for all probabilities.
        k: Additive smoothing applied to each count. ``k=0`` gives plain,
            unsmoothed PMI.

    Returns:
        dict mapping ``(term1, term2)`` -> PMI score in bits.

    Raises:
        ZeroDivisionError: If ``total_count`` is 0, or if ``k`` is 0 and a
            singleton count is 0.
    """
    pmi_matrix = {}

    for (term1, term2), co_count in cofreq_counts.items():
        # Smooth on the count scale, then normalise to probabilities.
        p_x = (singleton_freq.get(term1, 1) + k) / total_count
        p_y = (singleton_freq.get(term2, 1) + k) / total_count
        p_xy = (co_count + k) / total_count

        pmi = np.log2(p_xy / (p_x * p_y))
        pmi_matrix[(term1, term2)] = pmi

    return pmi_matrix


In [4]:
def build_pmi_vectors(pmi_matrix):
    """Turn pairwise PMI scores into one dense vector per term.

    The vocabulary is every term that appears in any key of ``pmi_matrix``,
    indexed in first-seen order so the column layout is the same on every
    run. It is computed once up front rather than inside the ``defaultdict``
    factory, which would rescan every pair each time a new term is added.

    Args:
        pmi_matrix: dict mapping ``(term1, term2)`` -> PMI score.

    Returns:
        A ``(word_vectors, word_index)`` tuple:
        - ``word_vectors``: ``defaultdict`` mapping term -> 1-D numpy array
          of length ``len(word_index)``. Entry ``word_index[other]`` holds
          the PMI of the term with ``other``, and 0 where no pair was given.
          Looking up an unknown term creates and stores a zero vector.
        - ``word_index``: dict mapping term -> column position.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    vocabulary = list(dict.fromkeys(term for pair in pmi_matrix for term in pair))
    vocab_size = len(vocabulary)
    word_index = {word: i for i, word in enumerate(vocabulary)}

    word_vectors = defaultdict(lambda: np.zeros(vocab_size))
    for (term1, term2), pmi in pmi_matrix.items():
        word_vectors[term1][word_index[term2]] = pmi
        word_vectors[term2][word_index[term1]] = pmi  # Since PMI is symmetric

    return word_vectors, word_index


In [5]:
def find_synonyms(word_vectors, word_index, target_word, top_n=5):
    """Rank vocabulary terms by cosine similarity to ``target_word``.

    Args:
        word_vectors: Mapping from term -> 1-D numpy vector.
        word_index: Mapping whose keys are the vocabulary terms to compare
            against.
        target_word: Term whose nearest neighbours are wanted.
        top_n: Maximum number of results to return.

    Returns:
        List of ``(term, similarity)`` tuples, most similar first, with
        ``target_word`` itself left out. If ``target_word`` is not in
        ``word_vectors``, a message is printed and an empty list is returned.
    """
    if target_word not in word_vectors:
        print(f"Word '{target_word}' not found in vocabulary.")
        return []

    vocabulary = list(word_index.keys())
    stacked = np.array([word_vectors[term] for term in vocabulary])

    # One row of similarities: the target against every vocabulary vector.
    query = word_vectors[target_word].reshape(1, -1)
    scores = cosine_similarity(query, stacked)[0]

    # Walk the vocabulary from highest to lowest score, dropping the target.
    ranked = []
    for position in np.argsort(scores)[::-1]:
        candidate = vocabulary[position]
        if candidate != target_word:
            ranked.append((candidate, scores[position]))
    return ranked[:top_n]


In [None]:
# Input file locations, given relative to the notebook's working directory.
singleton_file = "../2_Singleton_Frequency_Counts/singlets_concepts_perBin_1d.txt"
cofreq_file = "../1_Cofrequency_Counts/cofreqs_concepts_perBin_1d.txt"
term_map_file = "../3_ID_Mappings/1_term_ID_to_string.txt"

# The ID -> string map is loaded first: both count loaders below take it as
# an argument to translate term IDs into term strings.
term_id_map = load_term_id_to_string(term_map_file)

# Load singleton and co-occurrence counts, keyed by term string.
singleton_freq = load_singleton_frequencies(singleton_file, term_id_map)
cofreq_counts = load_cofreq_counts(cofreq_file, term_id_map)
# Normalising total passed to compute_pmi: the sum of all singleton counts.
total_count = sum(singleton_freq.values())

# Compute PMI for every loaded term pair (smoothing constant k left at its default).
pmi_matrix = compute_pmi(cofreq_counts, singleton_freq, total_count)

# Convert PMI into word vectors; word_index maps each term to its vector position.
word_vectors, word_index = build_pmi_vectors(pmi_matrix)

# Find synonyms for a given word (top_n left at its default of 5).
target_word = "heart_attack"
synonyms = find_synonyms(word_vectors, word_index, target_word)

# Report each neighbour with its cosine similarity to four decimal places.
print(f"Top synonyms for '{target_word}':")
for word, score in synonyms:
    print(f"{word}: {score:.4f}")
