In [4]:
import json
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np

In [5]:
def get_bert_embedding(text, tokenizer, model):
    """Return the final-layer [CLS] embedding of a word/phrase as a NumPy array.

    Parameters
    ----------
    text : str
        Word or phrase to embed. It is wrapped in ``[CLS]`` / ``[SEP]``
        markers before being tokenized.
    tokenizer
        Object providing ``tokenize(str)`` and ``convert_tokens_to_ids(list)``
        (e.g. a ``BertTokenizer``).
    model
        Callable accepting ``input_ids`` and ``attention_mask`` keyword
        tensors and returning an object with a ``last_hidden_state``
        attribute (e.g. a ``BertModel``).

    Returns
    -------
    numpy.ndarray
        1-D vector taken from position 0 (the ``[CLS]`` token) of the first
        (and only) sequence in ``last_hidden_state``.

    Raises
    ------
    TypeError
        If ``text`` is not a string (the marker concatenation fails).
    """
    # Add the special tokens by hand, then map tokens to vocabulary ids.
    marked_text = "[CLS] " + text + " [SEP]"
    tokens = tokenizer.tokenize(marked_text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Batch of one sequence.
    input_ids = torch.tensor([token_ids])

    # Keep the inputs on the same device as the model when it exposes one,
    # so a GPU-resident model does not fail on CPU tensors.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    # One unpadded sequence: every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        # Call by keyword. In transformers' BertModel.forward the second
        # positional parameter is attention_mask (not token_type_ids), so a
        # positional "segment ids" tensor silently becomes the mask.
        # token_type_ids is left to the model's default for a single segment.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

    # First sequence, first position == the [CLS] token. Move to CPU before
    # converting, because .numpy() is not available for non-CPU tensors.
    return hidden_states[0, 0].cpu().numpy()

def process_stereotype_dictionary(json_data, model_name='bert-base-uncased',
                                  tokenizer=None, model=None):
    """Embed every word of every category and collect the results in a DataFrame.

    Parameters
    ----------
    json_data : dict
        Mapping of category name -> iterable of words/phrases.
    model_name : str, optional
        Checkpoint name passed to ``from_pretrained`` when ``tokenizer``
        and/or ``model`` are not supplied. Defaults to ``'bert-base-uncased'``.
    tokenizer, model : optional
        Already-loaded tokenizer / model to reuse. When omitted they are
        loaded from ``model_name``; passing them avoids reloading the
        checkpoint on every call.

    Returns
    -------
    pandas.DataFrame
        One row per (category, word) pair with columns ``category``,
        ``word`` and ``embedding`` (the value returned by
        ``get_bert_embedding`` for that word). Rows follow the iteration
        order of ``json_data`` and of each word list.
    """
    # Load only what the caller did not provide.
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(model_name)
    if model is None:
        model = BertModel.from_pretrained(model_name)

    # Parallel lists, one entry per processed word.
    categories = []
    words = []
    embeddings = []

    for category, word_list in json_data.items():
        print(f"Processing category: {category}")
        for word in word_list:
            # Compute first so a failure leaves the three lists aligned.
            embedding = get_bert_embedding(word, tokenizer, model)

            categories.append(category)
            words.append(word)
            embeddings.append(embedding)

    # Dict-of-lists keeps the three columns even when no word was processed.
    return pd.DataFrame({
        'category': categories,
        'word': words,
        'embedding': embeddings
    })

# Load the category -> word-list mapping.
file_path = 'Stereotype_Dictionary.json'
# Read as UTF-8 explicitly: without it open() uses the platform's locale
# encoding, which can mis-decode non-ASCII words in the JSON file.
with open(file_path, 'r', encoding='utf-8') as file:
    stereotype_dict = json.load(file)

# Process the dictionary and get embeddings
df = process_stereotype_dictionary(stereotype_dict)

# Save to CSV. NOTE: the 'embedding' column holds NumPy arrays, so each CSV
# cell contains the array's string representation rather than numeric
# columns; use the pickle option below if the arrays must round-trip.
df.to_csv('BERT_Stereotypes_word_embeddings.csv', index=False)

# # Optional: Save to pickle to preserve numpy arrays
# df.to_pickle('word_embeddings.pkl')

# Print first few rows
print("\nFirst few rows of the DataFrame:")
print(df.head())

# Print embedding shape (skipped when nothing was processed, since
# .iloc[0] on an empty frame raises IndexError).
if not df.empty:
    print("\nEmbedding shape:", df['embedding'].iloc[0].shape)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing category: Incompetence
Processing category: Warm
Processing category: Cold
Processing category: Competence
Processing category: Jewish
Processing category: Muslim
Processing category: Arabic
Processing category: Israeli
Processing category: IDF
Processing category: Hamas

First few rows of the DataFrame:
       category          word  \
0  Incompetence    unreliable   
1  Incompetence  supernatural   
2  Incompetence  uneconomical   
3  Incompetence          flat   
4  Incompetence   unconvinced   

                                           embedding  
0  [-0.31127185, 0.19724026, -0.10238215, -0.0328...  
1  [-0.24771827, 0.19920933, -0.23368365, -0.1174...  
2  [-0.9678349, 0.19151619, -0.5341579, 0.1020674...  
3  [-0.552664, 0.115240745, -0.06589937, -0.42486...  
4  [-0.4181416, 0.03374875, -0.28296804, 0.213325...  

Embedding shape: (768,)


In [6]:
# # Load and process the data
# file_path = 'Stereotype_Dictionary.json'
# with open(file_path, 'r') as file:
#     stereotype_dict = json.load(file)

# # Process the dictionary and get embeddings
# df = process_stereotype_dictionary(stereotype_dict)

# # Save to CSV (embeddings will be stored as string representation)
# df.to_csv('Stereotypes_word_embeddings.csv', index=False)

# # # Optional: Save to pickle to preserve numpy arrays
# # df.to_pickle('word_embeddings.pkl')

# # Print first few rows
# print("\nFirst few rows of the DataFrame:")
# print(df.head())

# # Print embedding shape
# print("\nEmbedding shape:", df['embedding'].iloc[0].shape)