# Text Embeddings
## * Model Scales

In [None]:
import gensim.downloader as api
from transformers import AutoModel
from sentence_transformers import SentenceTransformer

# Maps the short model names used in this notebook to the identifier that is
# handed to the matching loader in get_model_dimension(): api.load() for
# "Glove"/"Word2Vec", AutoModel.from_pretrained() for the bert-*/defsent-*
# names, and SentenceTransformer() for every other key.
MODELS = {
    "Glove": "glove-wiki-gigaword-300",
    "Word2Vec": "word2vec-google-news-300",
    "bert-base": "bert-base-uncased",
    "bert-large": "bert-large-uncased",
    # NOTE(review): same checkpoint as "bert-base" — confirm this is intended.
    "defsent-bert": "bert-base-uncased",
    "defsent-roberta": "sentence-transformers/all-roberta-large-v1",
    # NOTE(review): the key ends in "vl" while the checkpoint ends in "v1" —
    # possibly a typo; the key is what gets printed, so it is left unchanged.
    "distilroberta-vl": "sentence-transformers/all-distilroberta-v1",
    "mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
    "sentence-t5-large": "sentence-transformers/sentence-t5-large"
}

def get_model_dimension(model_name: str) -> int:
    """Return the embedding dimension of the model registered as *model_name*.

    The model is loaded with the loader that matches its family and the
    dimension is read from the loaded object:

    * "Glove" / "Word2Vec": ``api.load(...).vector_size``
    * bert-* / defsent-* names: ``AutoModel.from_pretrained(...).config.hidden_size``
    * any other key of ``MODELS``:
      ``SentenceTransformer(...).get_sentence_embedding_dimension()``

    Raises:
        KeyError: if *model_name* is not a key of ``MODELS``.
    """
    checkpoint = MODELS[model_name]

    # Static word vectors loaded through the gensim downloader.
    if model_name in ("Glove", "Word2Vec"):
        return api.load(checkpoint).vector_size

    # Checkpoints loaded as plain transformer encoders.
    if model_name in ("bert-base", "bert-large", "defsent-bert", "defsent-roberta"):
        return AutoModel.from_pretrained(checkpoint).config.hidden_size

    # Everything else is treated as a sentence-transformers model.
    return SentenceTransformer(checkpoint).get_sentence_embedding_dimension()


if __name__ == "__main__":
    # Report the embedding dimension of every model, in this fixed order.
    model_list = [
        "Glove",
        "Word2Vec",
        "bert-base",
        "bert-large",
        "defsent-bert",
        "defsent-roberta",
        "distilroberta-vl",
        "mpnet-base-v2",
        "sentence-t5-large",
    ]

    for model_name in model_list:
        # Each call loads the model just to read its dimension.
        dim = get_model_dimension(model_name)
        print(f"{model_name}: embedding dimension = {dim}")


## 1. Set Up OpenAI API Key

In [None]:
import openai
import os

# Prefer the OPENAI_API_KEY environment variable so the real key never has to
# be written into (and committed with) the notebook. The original placeholder
# is kept as the fallback, so behaviour is unchanged when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your_openai_api_key_here")
openai.api_key = OPENAI_API_KEY

## 2. Embeddings

In [None]:
import json
import numpy as np
import os
from tqdm import tqdm
from typing import List
from sentence_transformers import SentenceTransformer
import gensim.downloader as api
from transformers import AutoTokenizer, AutoModel
import torch

# ==== Model Mapping ====
# Maps the short model names used in this notebook to the identifier that
# embed_texts() passes to the matching loader: api.load() for
# "Glove"/"Word2Vec", AutoTokenizer/AutoModel.from_pretrained() for the
# bert-*/defsent-* names, and SentenceTransformer() for every other key.
# The keys also end up in the output file names built in the main section.
MODELS = {
    "Glove": "glove-wiki-gigaword-300",
    "Word2Vec": "word2vec-google-news-300",
    "bert-base": "bert-base-uncased",
    "bert-large": "bert-large-uncased",
    # NOTE(review): same checkpoint as "bert-base" — confirm this is intended.
    "defsent-bert": "bert-base-uncased",
    "defsent-roberta": "sentence-transformers/all-roberta-large-v1",
    # NOTE(review): the key ends in "vl" while the checkpoint ends in "v1" —
    # possibly a typo; left unchanged because output file names depend on it.
    "distilroberta-vl": "sentence-transformers/all-distilroberta-v1",
    "mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
    "sentence-t5-large": "sentence-transformers/sentence-t5-large"
}

# --- Static Word Vector Embedding ---
def embed_with_static_vectors(texts: List[str], wv) -> List[np.ndarray]:
    """Embed each text as the mean of the vectors of its known words.

    Every text is lower-cased and split on whitespace; words for which
    ``word in wv`` is false are ignored.

    Args:
        texts: Texts to embed.
        wv: Word-vector lookup supporting ``word in wv``, ``wv[word]`` and
            ``wv.vector_size``.

    Returns:
        One vector per input text, in input order. A text with no known word
        gets an all-zero vector of length ``wv.vector_size``.
    """
    sentence_vectors = []
    for sentence in tqdm(texts):
        known = [wv[token] for token in sentence.lower().split() if token in wv]
        # Fall back to zeros when nothing in the text is in the vocabulary.
        sentence_vectors.append(
            np.mean(known, axis=0) if known else np.zeros(wv.vector_size)
        )
    return sentence_vectors

# --- BERT Class Model Embedding ---
def embed_with_bert(text: str, tokenizer, model) -> np.ndarray:
    """Embed one text by mask-weighted mean pooling of the last hidden state.

    Args:
        text: The text to embed.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=True, padding=True)``;
            its result must provide an ``'attention_mask'`` entry and be
            unpackable into ``model(**...)``.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        The pooled embedding as a NumPy array, with size-1 dimensions removed.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: no gradients are needed for the forward pass or pooling.
    with torch.no_grad():
        # Expected to be (1, seq_len, hidden) for a single input text.
        hidden_states = model(**encoded).last_hidden_state

        # Extra trailing axis so the mask broadcasts over the hidden dimension.
        mask = encoded['attention_mask'].unsqueeze(-1)

        # Sum only the positions the mask keeps, then divide by their count.
        summed = (hidden_states * mask).sum(dim=1)
        kept_positions = mask.sum(dim=1)
        sentence_vector = summed / kept_positions

    return sentence_vector.squeeze().numpy()

# --- Main Embedding Entry ---
def embed_texts(model_name: str, texts: List[str]) -> List[np.ndarray]:
    """Embed *texts* with the model registered under *model_name*.

    The model family decides how the texts are embedded:

    * "Glove" / "Word2Vec": ``embed_with_static_vectors`` on the vectors
      returned by ``api.load``.
    * bert-* / defsent-* names: ``embed_with_bert`` on each text, using a
      tokenizer and model loaded with ``from_pretrained``.
    * any other key of ``MODELS``: ``SentenceTransformer.encode``.

    Returns:
        The embeddings in input order. The first two families return a list
        with one vector per text; the last returns whatever ``encode`` yields
        with ``convert_to_numpy=True``.

    Raises:
        KeyError: if *model_name* is not a key of ``MODELS``.
    """
    print(f"\nEmbedding with model: {model_name}")
    checkpoint = MODELS[model_name]

    if model_name in ("Glove", "Word2Vec"):
        word_vectors = api.load(checkpoint)
        return embed_with_static_vectors(texts, word_vectors)

    if model_name in ("bert-base", "bert-large", "defsent-bert", "defsent-roberta"):
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        encoder = AutoModel.from_pretrained(checkpoint)
        encoder.eval()  # switch the model to evaluation mode before embedding
        vectors = []
        for text in tqdm(texts):
            vectors.append(embed_with_bert(text, tokenizer, encoder))
        return vectors

    sentence_encoder = SentenceTransformer(checkpoint)
    return sentence_encoder.encode(
        texts, convert_to_numpy=True, show_progress_bar=True
    )

# --- Batch File + Merge Processing ---
def process_all_files(input_folder: str, model_name: str, output_path: str):
    """Merge all JSON files in a folder, embed their concepts, and save.

    Every ``*.json`` file in *input_folder* is loaded and its top-level
    entries are merged into one dictionary (a later file overwrites an
    earlier one on duplicate keys). For each concept found under
    ``definitions[*].semantic_analysis.concepts`` that has a non-blank
    ``"informal_definition"``, that text is embedded with ``embed_texts`` and
    the vector is stored on the concept as ``"embedding_vector"``.

    Args:
        input_folder: Directory containing the JSON files to merge.
        model_name: Model key forwarded to ``embed_texts``.
        output_path: Path of the merged JSON file to write. Its parent
            directory is created when it does not exist.
    """
    merged_data = {}
    texts = []
    concept_refs = []

    # os.listdir() returns names in arbitrary order; sorting makes the merge
    # (and which file wins on duplicate keys) reproducible across runs.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(input_folder, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Merge into the main dictionary and collect the texts to embed.
        for key, value in data.items():
            merged_data[key] = value
            for definition in value.get("definitions", []):
                concepts = definition.get("semantic_analysis", {}).get("concepts", [])
                for concept in concepts:
                    if "informal_definition" in concept and concept["informal_definition"].strip():
                        texts.append(concept["informal_definition"])
                        # Keep the dict itself so the vector can be written
                        # back in place after embedding.
                        concept_refs.append(concept)

    print(f"🔍 {model_name}: Total {len(texts)} texts to embed")

    # Generate embeddings (same order as `texts`, hence as `concept_refs`).
    embeddings = embed_texts(model_name, texts)

    # Write the vectors back into the merged structure as plain lists so that
    # they are JSON-serialisable.
    for concept, emb in zip(concept_refs, embeddings):
        concept["embedding_vector"] = emb.tolist()

    # os.path.dirname() is "" for a bare file name, and os.makedirs("") raises,
    # so only create the directory when the path actually has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=2)
    print(f"{model_name} embeddings saved to {output_path}")

# --- Main Execution ---
if __name__ == "__main__":
    # Folder containing multiple JSON files
    input_folder = "./Informalisation_and_Mathematical_DSRL/informal_data"
    # One merged output file per model is written into this directory.
    output_dir = "./embedding_results"

    model_list = [
        "Glove",
        "Word2Vec",
        "bert-base",
        "bert-large",
        "defsent-bert",
        "defsent-roberta",
        "distilroberta-vl",
        "mpnet-base-v2",
        "sentence-t5-large",
    ]

    for model_name in model_list:
        # The model name is part of the file name, so runs do not overwrite
        # each other's results.
        output_file = os.path.join(
            output_dir, f"merged_with_embeddings_{model_name}.json"
        )
        process_all_files(input_folder, model_name, output_file)

## 3. Cleaning

In [None]:
import json
import re

def clean_concept_name(name: str) -> str:
    """Normalise a concept name to lowercase ASCII alphanumerics and spaces.

    The name is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed, runs of whitespace are collapsed to one space, and
    the result is trimmed.

    Args:
        name: The raw concept name.

    Returns:
        The cleaned name, with no leading or trailing whitespace.
    """
    name = name.lower()
    # Remove special characters (keep alphanumeric and whitespace)
    name = re.sub(r'[^a-z0-9\s]', '', name)
    # Compress extra whitespace into single spaces
    name = re.sub(r'\s+', ' ', name)
    # Trim last: removing a leading/trailing special character (e.g. "- foo")
    # can expose whitespace that an earlier strip() would have missed.
    return name.strip()

# Source file (sentence-t5-large embeddings) and destination of the cleaned copy
input_path = "./embedding_results/merged_with_embeddings_sentence-t5-large.json"
output_path = "./embedding_results/merged_with_embeddings_sentence-t5-large_cleaned.json"

# Load the merged dataset
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Add a normalised "name_cleaned" next to every concept "name"
for entry in data.values():
    for definition in entry.get("definitions", []):
        semantic_analysis = definition.get("semantic_analysis", {})
        for concept in semantic_analysis.get("concepts", []):
            if "name" not in concept:
                continue  # nothing to clean for unnamed concepts
            concept["name_cleaned"] = clean_concept_name(concept["name"])

# Write the augmented dataset to the new file; the input is left untouched
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"Cleaning completed, saved to {output_path}")