In [56]:
import json
from tqdm import tqdm


def load_json_from_file(file_path):
    """Parse the JSON document stored at ``file_path``.

    The file is opened in text mode and decoded as UTF-8.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Placeholder written in place of any word that contains part of a person's
# name (see anonymize_sentence).
OSOBA_TOKEN = "[Osoba]"

# BERT-generated sentences. anonymize_and_evaluate_bert() reads this object as
# generated_bert[gender][name]['generated_sentences'] and annotates it in place.
generated_bert = load_json_from_file("data/generated/bert_spacy.json")

In [51]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F

# Identifier of the pretrained sequence-classification checkpoint used for
# sentiment scoring.
sentiment_analysis_model_name = "Voicelab/herbert-base-cased-sentiment"
# Load the classifier and its matching tokenizer once at module level;
# calculate_sentiment() reuses both on every call.
model = AutoModelForSequenceClassification.from_pretrained(sentiment_analysis_model_name)
tokenizer = AutoTokenizer.from_pretrained(sentiment_analysis_model_name)

def calculate_sentiment(text):
    """Score the sentiment of ``text`` as a single float in ``[-1, 1]``.

    The text is classified with the module-level ``model``/``tokenizer`` and
    the score is ``P(class 2) - P(class 0)``; class 1 contributes nothing.

    Args:
        text: A single string to classify.

    Returns:
        float: The probability of class 2 minus the probability of class 0.
    """
    # Calling the tokenizer directly is the documented replacement for the
    # deprecated ``encode_plus``. ``truncation=True`` clips over-long inputs to
    # the tokenizer's configured maximum length so they cannot overflow the
    # model's position embeddings; shorter inputs are encoded exactly as before.
    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**inputs)

    # The batch holds a single text: take row 0 of the class probabilities.
    probs = F.softmax(outputs.logits, dim=-1)[0]

    # Weights are -1 for class 0, 0 for class 1 and +1 for class 2
    # (presumably negative / neutral / positive -- TODO confirm against the
    # checkpoint's id2label mapping).
    result = probs[2] - probs[0]
    return result.item()

In [72]:
def anonymize_sentence(name_tokens, sentence, mask_token=None):
    """Replace every word that contains part of a name with a placeholder.

    The sentence is split on single spaces. A word is masked when any
    non-empty name token occurs in it as a substring, so attached punctuation
    or suffixes are masked together with the name. Consecutive placeholders
    are collapsed into one, so a multi-word name yields a single placeholder.

    Args:
        name_tokens: Iterable of name parts to hide (e.g. ``["Jan", "Kowalski"]``).
            Empty strings are ignored: ``""`` is a substring of every word and
            would otherwise mask the whole sentence.
        sentence: The text to anonymize.
        mask_token: Placeholder to insert. Defaults to the module-level
            ``OSOBA_TOKEN`` when ``None``.

    Returns:
        str: The sentence with name-bearing words replaced by the placeholder.
    """
    if mask_token is None:
        mask_token = OSOBA_TOKEN

    # Drop empty tokens (produced by e.g. "Jan_".split("_")) before matching.
    needles = [token for token in name_tokens if token]

    anonymized_words = []
    for word in sentence.split(" "):
        if any(needle in word for needle in needles):
            word = mask_token
        # Keep only the first placeholder of a consecutive run.
        if word == mask_token and anonymized_words and anonymized_words[-1] == mask_token:
            continue
        anonymized_words.append(word)

    return " ".join(anonymized_words)

def anonymize_and_evaluate_bert():
    """Annotate every BERT-generated sentence with its anonymized form and score.

    Walks the 'actors' and 'actresses' groups of the module-level
    ``generated_bert``. The dictionary key of each person is split on "_" to
    obtain the name tokens, and every entry under 'generated_sentences' gains
    two keys, written in place:

    * 'anonymized_sentence'  -- the entry's 'text' after anonymize_sentence()
    * 'anonymized_sentiment' -- calculate_sentiment() of that anonymized text
    """
    for gender in ('actors', 'actresses'):
        # One progress bar per gender group.
        for name, data in tqdm(generated_bert[gender].items()):
            name_tokens = name.split("_")
            for entry in data['generated_sentences']:
                masked = anonymize_sentence(name_tokens, entry['text'])
                entry['anonymized_sentence'] = masked
                entry['anonymized_sentiment'] = calculate_sentiment(masked)


# Annotate every sentence in generated_bert (in place); tqdm prints one
# progress bar per gender group.
anonymize_and_evaluate_bert()

100%|██████████| 876/876 [01:34<00:00,  9.23it/s]
100%|██████████| 776/776 [01:32<00:00,  8.37it/s]


In [73]:
# Save the data to a JSON file.
# UTF-8 is requested explicitly to match load_json_from_file instead of relying
# on the platform default encoding, and ensure_ascii=False keeps non-ASCII
# characters readable rather than \uXXXX-escaped. The file parses to the same
# data either way.
with open('bert_output.json', 'w', encoding='utf-8') as f:
    json.dump(generated_bert, f, indent=2, ensure_ascii=False)

In [74]:
# Llama generations. anonymize_and_evaluate_llama() iterates each collection
# and reads the 'name' and 'generated_sentence' keys of every record.
generated_llama_small = load_json_from_file("data/generated/llama7b.json")
generated_llama_big = load_json_from_file("data/generated/llama70b.json")


def anonymize_and_evaluate_llama(llama):
    """Annotate each Llama record with its anonymized sentence and score.

    Args:
        llama: Iterable of dict records, each holding a 'name' (name parts
            joined by "_") and a 'generated_sentence'. Every record is updated
            in place with 'anonymized_sentence' and 'anonymized_sentiment'.
    """
    for record in tqdm(llama):
        masked = anonymize_sentence(
            record['name'].split("_"),
            record['generated_sentence'],
        )
        record['anonymized_sentence'] = masked
        record['anonymized_sentiment'] = calculate_sentiment(masked)

# Annotate both Llama collections in place before anything is written to disk.
anonymize_and_evaluate_llama(generated_llama_small)
anonymize_and_evaluate_llama(generated_llama_big)

# Save the data to a JSON file.
# UTF-8 is requested explicitly to match load_json_from_file instead of relying
# on the platform default encoding, and ensure_ascii=False keeps non-ASCII
# characters readable rather than \uXXXX-escaped. The file parses to the same
# data either way.
with open('llama_7b_output.json', 'w', encoding='utf-8') as f:
    json.dump(generated_llama_small, f, indent=2, ensure_ascii=False)

# Save the data to a JSON file (same encoding settings as the 7b output).
with open('llama_70b_output.json', 'w', encoding='utf-8') as f:
    json.dump(generated_llama_big, f, indent=2, ensure_ascii=False)

  0%|          | 0/1652 [00:00<?, ?it/s]

100%|██████████| 1652/1652 [00:51<00:00, 31.91it/s]
100%|██████████| 1652/1652 [00:51<00:00, 32.29it/s]
