In [1]:
import json
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
def get_bert_embedding(text, tokenizer, model):
    """Return the final-layer [CLS] embedding for a single word/phrase.

    Args:
        text: The word or phrase to embed.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``
            (the notebook passes a ``BertTokenizer``).
        model: Callable accepting ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments and returning an object with
            a ``last_hidden_state`` attribute (the notebook passes a
            ``BertModel``).

    Returns:
        A 1-D numpy array: the last hidden state at position 0, i.e. the
        ``[CLS]`` token, of the single input sequence.
    """
    # Wrap the text in BERT's special tokens and map tokens to vocabulary ids.
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized)

    # Batch of one sequence.
    tokens_tensor = torch.tensor([indexed_tokens])
    # Single unpadded sequence: attend to every position.
    attention_mask = torch.ones_like(tokens_tensor)
    # Single-segment input: every token belongs to segment 0.
    token_type_ids = torch.zeros_like(tokens_tensor)

    # Pass everything by keyword. In Hugging Face transformers the second
    # positional parameter of BertModel.forward is `attention_mask`, so the
    # previous positional call silently fed the segment ids in as the mask.
    # The values used here (mask of ones, segment ids of zeros) match what
    # that call effectively computed, so the embeddings are unchanged.
    with torch.no_grad():
        outputs = model(
            input_ids=tokens_tensor,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        hidden_states = outputs.last_hidden_state

    # hidden_states[0] selects the only sequence in the batch; index 0 within
    # it is the [CLS] position, used here as the phrase representation.
    token_embeddings = hidden_states[0]
    return token_embeddings[0].numpy()

def process_stereotype_dictionary(json_data):
    """Embed every word of a category -> word-list mapping with BERT.

    Loads the 'bert-base-uncased' tokenizer and model, then calls
    ``get_bert_embedding`` once per word, printing a progress line for
    each category as it is reached.

    Args:
        json_data: Mapping whose ``items()`` yield ``(category, word_list)``
            pairs.

    Returns:
        A pandas DataFrame with one row per word and the columns
        'category', 'word' and 'embedding'.
    """
    # Pre-trained tokenizer/model pair shared by every word below.
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')

    # One (category, word, embedding) triple per word, in iteration order.
    rows = []
    for label, terms in json_data.items():
        print(f"Processing category: {label}")
        rows.extend(
            (label, term, get_bert_embedding(term, bert_tokenizer, bert_model))
            for term in terms
        )

    # Split the triples back into columns so the column set is fixed even
    # when no rows were produced.
    return pd.DataFrame({
        'category': [row[0] for row in rows],
        'word': [row[1] for row in rows],
        'embedding': [row[2] for row in rows],
    })

# Load the category -> word-list dictionary from disk.
file_path = 'Stereotype_Dictionary.json'
# Read explicitly as UTF-8 instead of relying on the platform default
# encoding, which can mis-decode non-ASCII words on non-UTF-8 locales.
with open(file_path, 'r', encoding='utf-8') as file:
    stereotype_dict = json.load(file)

# Embed every word of every category (one DataFrame row per word).
df = process_stereotype_dictionary(stereotype_dict)

# Save to CSV (embeddings will be stored as string representation)
df.to_csv('Stereotypes_word_embeddings.csv', index=False)

# # Optional: Save to pickle to preserve numpy arrays
# df.to_pickle('word_embeddings.pkl')

# Preview the first few rows.
print("\nFirst few rows of the DataFrame:")
print(df.head())

# Shape of a single embedding vector (taken from the first row).
print("\nEmbedding shape:", df['embedding'].iloc[0].shape)

Processing category: Incompetence
Processing category: Warm
Processing category: Cold
Processing category: Competence
Processing category: Jews
Processing category: Christians

First few rows of the DataFrame:
       category           word  \
0  Incompetence      unnatural   
1  Incompetence           back   
2  Incompetence   uneconomical   
3  Incompetence      dependent   
4  Incompetence  unworkmanlike   

                                           embedding  
0  [-0.21742886, 0.25219482, -0.08264218, 0.05514...  
1  [-0.18149848, 0.13819107, -0.16554144, -0.0107...  
2  [-0.9678343, 0.1915159, -0.53415775, 0.1020671...  
3  [-0.19741559, 0.088816985, 0.06223922, 0.03611...  
4  [-0.7659353, 0.08473118, -0.66141856, 0.282256...  

Embedding shape: (768,)


In [None]:
# NOTE(review): this cell repeats the load/process/save steps that already
# appear at the end of the previous cell; consider keeping only one copy.

# Load the category -> word-list dictionary from disk.
file_path = 'Stereotype_Dictionary.json'
# Read explicitly as UTF-8 instead of relying on the platform default
# encoding, which can mis-decode non-ASCII words on non-UTF-8 locales.
with open(file_path, 'r', encoding='utf-8') as file:
    stereotype_dict = json.load(file)

# Embed every word of every category (one DataFrame row per word).
df = process_stereotype_dictionary(stereotype_dict)

# Save to CSV (embeddings will be stored as string representation)
df.to_csv('Stereotypes_word_embeddings.csv', index=False)

# # Optional: Save to pickle to preserve numpy arrays
# df.to_pickle('word_embeddings.pkl')

# Preview the first few rows.
print("\nFirst few rows of the DataFrame:")
print(df.head())

# Shape of a single embedding vector (taken from the first row).
print("\nEmbedding shape:", df['embedding'].iloc[0].shape)