In [1]:
import json
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import torch
import torch.nn.functional as F
import pandas as pd
import random
import spacy



In [29]:
# Select the CUDA device when PyTorch reports one is available; otherwise use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare last expression so the notebook echoes which device was selected.
device

device(type='cuda')

In [30]:
# Seed every RNG this notebook has in scope so reruns are repeatable.
seed = 42
random.seed(seed)
torch.manual_seed(seed)

# Masked-language model used to fill in masked words.
# The device selected earlier is handed to the pipeline as an integer index
# (0 = first CUDA device, -1 = CPU); the pipeline moves its own inputs, so
# callers keep passing plain strings.
bert_model = pipeline(
    'fill-mask',
    model='clarin-pl/herbert-kgr10',
    device=0 if device.type == 'cuda' else -1,
)
MASK_TOKEN = "<mask>"
# The masking helpers hard-code this literal, so fail fast if the loaded
# tokenizer expects a different mask token.
assert bert_model.tokenizer.mask_token == MASK_TOKEN, (
    f"fill-mask tokenizer uses {bert_model.tokenizer.mask_token!r}, expected {MASK_TOKEN!r}"
)

# Sentiment classifier and its tokenizer. They are left on the default device
# because calculate_sentiment builds its input tensors without moving them.
sentiment_analysis_model_name = "Voicelab/herbert-base-cased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(sentiment_analysis_model_name)
tokenizer = AutoTokenizer.from_pretrained(sentiment_analysis_model_name)

In [32]:
def calculate_sentiment(text):
    """Score ``text`` with the sentiment classifier and return one float.

    The text is tokenized, passed through ``model`` without gradient tracking,
    and the logits of the first (only) batch row are turned into probabilities.
    The returned value is the probability of class index 2 minus the
    probability of class index 0; class index 1 carries weight zero.

    NOTE(review): presumably index 0 = negative, 1 = neutral, 2 = positive --
    confirm against the model's label mapping.

    Args:
        text: The string to score.

    Returns:
        A Python float, ``probs[2] - probs[0]``.
    """
    encoded = tokenizer.encode_plus(text, return_tensors='pt')

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**encoded).logits

    class_probs = torch.softmax(logits, dim=-1)[0]

    # Weights are -1 / 0 / +1 for class indices 0 / 1 / 2, so the
    # zero-weighted middle class drops out of the sum.
    score = class_probs[2] - class_probs[0]
    return score.item()



In [33]:
def fill_sentence(sentence):
  """Fill every mask token in ``sentence`` with the model's top prediction.

  The fill-mask pipeline returns a flat list of candidate dicts when the input
  has one mask, and a list of such lists (one per mask) when it has several.
  Both shapes are handled here: each mask, left to right, receives the first
  candidate predicted for its own position.

  Args:
    sentence: Text containing at least one ``MASK_TOKEN``.

  Returns:
    The sentence with each mask replaced by its top candidate's ``token_str``.

  Raises:
    AssertionError: If ``sentence`` contains no ``MASK_TOKEN``.
  """
  assert MASK_TOKEN in sentence
  predictions = bert_model(sentence)

  # Normalize the single-mask shape (list of dicts) to the multi-mask shape
  # (list of candidate lists) so one loop covers both.
  if isinstance(predictions[0], dict):
    predictions = [predictions]

  filled_sentence = sentence
  for candidates in predictions:
    # count=1 so each mask gets its own prediction rather than the first one.
    filled_sentence = filled_sentence.replace(MASK_TOKEN, candidates[0]['token_str'], 1)
  return filled_sentence

In [34]:
# Input files: source sentences and the prompt prefixes that go with them.
original_sentences_path = "data/pl/wikipedia/gender_wiki.json" 
prompts_path = "data/pl/prompts/gender_prompt.json"

# Decode explicitly as UTF-8: the sentences contain Polish diacritics, and the
# platform's default text encoding is not guaranteed to be UTF-8.
with open(original_sentences_path, "r", encoding="utf-8") as f:
    original_sentences = json.load(f)

with open(prompts_path, "r", encoding="utf-8") as f:
    prompts = json.load(f)

# Both files are indexed by group name, then by person.
actresses_prompt = prompts['American_actresses']
actors_prompt = prompts['American_actors']

actresses_wiki = original_sentences['American_actresses']
actors_wiki = original_sentences['American_actors']

# Number of people in each group (displayed as the cell output).
len(actresses_wiki), len(actors_wiki)

(776, 876)

In [41]:
# Total number of source sentences across all actors.
sum(len(sentences) for sentences in actors_wiki.values())

1151

In [43]:
# Load the spaCy pipeline named "pl_core_news_sm"; its part-of-speech tags are
# used below to locate adjectives.
nlp = spacy.load("pl_core_news_sm")

# Function 1: mask every word that follows the prompt
def mask_sentence(sentence, prompt):
    """Replace everything after ``prompt`` with mask tokens.

    When ``sentence`` begins with ``prompt``, the result is the prompt followed
    by one `` <mask>`` per remaining whitespace-separated word and a closing
    period. Otherwise the sentence is returned untouched.

    Args:
        sentence: The full original sentence.
        prompt: The prefix that should be kept verbatim.

    Returns:
        The masked string, or ``sentence`` itself when it does not start with
        ``prompt``.
    """
    if not sentence.startswith(prompt):
        return sentence

    # Word counts are taken on whitespace splits of both strings.
    words_to_mask = len(sentence.split()) - len(prompt.split())
    return prompt + ' <mask>' * words_to_mask + "."

# Function 2: Mask all adjectives
def mask_all_adjectives(sentence, prompt):
    """Replace every adjective after ``prompt`` with a mask token.

    The part of the sentence after the prompt's words is tagged with ``nlp``,
    and each token tagged ``ADJ`` is replaced by ``<mask>`` using the token's
    character offset. Working on offsets rather than whitespace-split words
    means an adjective with attached punctuation (e.g. ``"wielka,"``) is still
    masked and its punctuation is kept (``"<mask>,"``).

    Args:
        sentence: The full original sentence.
        prompt: The prefix that must be kept verbatim.

    Returns:
        The sentence with adjectives masked. ``sentence`` is returned unchanged
        when it does not start with ``prompt`` or when no adjective is found.

    Raises:
        AssertionError: If the masked string lost the prompt prefix or its
            whitespace-separated word count differs from the original's.
    """
    if not sentence.startswith(prompt):
        return sentence

    remaining_words = sentence.split()[len(prompt.split()):]
    remaining_text = ' '.join(remaining_words)

    doc = nlp(remaining_text)
    # (start, end) character spans of the adjective tokens, in document order.
    adjective_spans = [
        (token.idx, token.idx + len(token.text))
        for token in doc
        if token.pos_ == 'ADJ'
    ]
    if not adjective_spans:
        return sentence

    # Rebuild the text left to right, swapping each adjective span for a mask.
    pieces = []
    cursor = 0
    for start, end in adjective_spans:
        pieces.append(remaining_text[cursor:start])
        pieces.append("<mask>")
        cursor = end
    pieces.append(remaining_text[cursor:])

    masked_string = f"{prompt} {''.join(pieces)}"

    # Masks replace text inside a word, so the word count must not change.
    assert masked_string.startswith(prompt)
    assert len(masked_string.split()) == len(sentence.split())
    return masked_string


In [44]:
def generate_full_sentence(partial_sentence, mask_model=bert_model): 
    """Fill the masks in ``partial_sentence`` one at a time, left to right.

    Each pass runs ``mask_model`` on the current text, takes the top candidate
    for the first remaining mask, and substitutes it. The next pass therefore
    sees the words already filled in.

    Args:
        partial_sentence: Text that may contain ``MASK_TOKEN`` occurrences.
        mask_model: Callable taking the text and returning fill-mask candidates;
            defaults to the notebook's ``bert_model`` pipeline.

    Returns:
        The text with no ``MASK_TOKEN`` left. Input without any mask is
        returned unchanged and the model is not called.
    """
    while MASK_TOKEN in partial_sentence:
        result = mask_model(partial_sentence)

        # With several masks the model returns one candidate list per mask;
        # keep only the list for the first mask.
        if isinstance(result[0], list):
            result = result[0]

        # `result` is now a list of candidate dicts; the first one is used.
        token_str = result[0]['token_str']

        # Replace via MASK_TOKEN (not a separate literal) so the substitution
        # always matches the loop condition and the loop is sure to advance.
        partial_sentence = partial_sentence.replace(MASK_TOKEN, token_str, 1)

    return partial_sentence

In [45]:
def evaluate_actors(actors_wiki, actors_prompt, sex, masking_function):
    """Mask, regenerate and score every sentence of every person in a group.

    For each key of ``actors_wiki``, the person's sentences are paired with the
    prompts stored under the same key in ``actors_prompt``, masked with
    ``masking_function``, completed with ``generate_full_sentence``, and both
    the original and the completed text are scored with ``calculate_sentiment``.

    Args:
        actors_wiki: Mapping of person key -> list of original sentences.
        actors_prompt: Mapping of person key -> list of prompts, paired with
            the sentences by position (pairing stops at the shorter list).
        sex: Label copied into every output record.
        masking_function: Callable ``(sentence, prompt) -> masked sentence``;
            its ``__name__`` is stored in every record.

    Returns:
        A list with one dict per (sentence, prompt) pair, in iteration order.
    """
    records = []

    for actor in tqdm(list(actors_wiki.keys())):
        originals = actors_wiki[actor]
        actor_prompts = actors_prompt[actor]

        masked_versions = [
            masking_function(original, prompt)
            for original, prompt in zip(originals, actor_prompts)
        ]
        completions = [generate_full_sentence(masked) for masked in masked_versions]

        records.extend(
            {
                'name': actor,
                'sex': sex,
                'original_sentence': original,
                'masked_sentence': masked,
                'generated_sentence': completed,
                'original_sentences_sentiment': calculate_sentiment(original),
                'generated_sentences_sentiment': calculate_sentiment(completed),
                'masking_function': masking_function.__name__
            }
            for original, masked, completed in zip(originals, masked_versions, completions)
        )

    return records

In [46]:
# Strategy 1: everything after the prompt is masked.
actors_all_masked = evaluate_actors(
    actors_wiki, actors_prompt, sex='M', masking_function=mask_sentence
)
actresses_all_masked = evaluate_actors(
    actresses_wiki, actresses_prompt, sex='F', masking_function=mask_sentence
)

# Strategy 2: only adjectives after the prompt are masked.
actors_adjectives_masked = evaluate_actors(
    actors_wiki, actors_prompt, sex='M', masking_function=mask_all_adjectives
)
actresses_adjectives_masked = evaluate_actors(
    actresses_wiki, actresses_prompt, sex='F', masking_function=mask_all_adjectives
)

# One flat list of records, in the same order the runs were executed.
results = [
    *actors_all_masked,
    *actresses_all_masked,
    *actors_adjectives_masked,
    *actresses_adjectives_masked,
]

df = pd.DataFrame(results)
print(df)


100%|██████████| 876/876 [34:14<00:00,  2.35s/it]  
100%|██████████| 776/776 [30:29<00:00,  2.36s/it]  
100%|██████████| 876/876 [05:45<00:00,  2.54it/s]
100%|██████████| 776/776 [05:33<00:00,  2.33it/s]

                    name sex  \
0          Sammy_Jackson   M   
1      Samuel_L._Jackson   M   
2      Samuel_L._Jackson   M   
3         Stoney_Jackson   M   
4           Rusty_Jacobs   M   
...                  ...  ..   
4609         Jess_Walton   F   
4610       Suzanne_Whang   F   
4611  Tonya_Lee_Williams   F   
4612        Aloma_Wright   F   
4613      Ashlynn_Yennie   F   

                                      original_sentence  \
0     Sammy Jackson zmarł na niewydolność serca w wi...   
1     "Jak Samuel L. Jackson stał się własnym gatunk...   
2     Samuel L. Jackson - zebrane wiadomości i komen...   
3     Stoney Jackson był jednym z bardziej widocznyc...   
4       Rusty Jacobs to amerykański były aktor filmowy.   
...                                                 ...   
4609  Jess Walton to amerykańska aktorka, najlepiej ...   
4610  Suzanne Whang była amerykańską prezenterką tel...   
4611  Czasami występuje jako Tonya Lee Williams, naj...   
4612  Aloma Wright jest a




In [52]:
import json

# Index the flat result rows once by (sex, name) so each person's rows can be
# looked up directly instead of rescanning `results` for every person. Keying
# on sex as well keeps a name that occurs in both groups from being mixed.
results_by_person = {}
for result in results:
    results_by_person.setdefault((result['sex'], result['name']), []).append(result)


def _export_group(wiki, prompts_by_name, sex):
    """Build the export records for one group of people.

    Args:
        wiki: Mapping of person key -> list of original sentences.
        prompts_by_name: Mapping of person key -> list of prompts.
        sex: The 'sex' label that this group's rows carry in ``results``.

    Returns:
        Mapping of person key -> dict with the person's name, prompts,
        original sentences, their sentiment scores, and every generated
        sentence with its masking method and sentiment.
    """
    group = {}
    for name in wiki.keys():
        person_rows = results_by_person.get((sex, name), [])
        # Every masking method re-scores the same original sentences, so take
        # the scores from one method only to avoid listing them repeatedly.
        first_method = person_rows[0]['masking_function'] if person_rows else None
        group[name] = {
            'name': name,
            'prompts': prompts_by_name[name],
            'original_sentences': wiki[name],
            'original_sentences_sentiment': [
                row['original_sentences_sentiment']
                for row in person_rows
                if row['masking_function'] == first_method
            ],
            'generated_sentences': [
                {
                    'method': row['masking_function'],
                    'text': row['generated_sentence'],
                    'sentiment': row['generated_sentences_sentiment'],
                }
                for row in person_rows
            ],
        }
    return group


# Define the data structure; both groups share the same per-person schema.
data = {
    'actors': _export_group(actors_wiki, actors_prompt, 'M'),
    'actresses': _export_group(actresses_wiki, actresses_prompt, 'F'),
}

# Save the data to a JSON file
with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(data, f)
