In [85]:
#finetuning + debiasing integrated
import torch
from transformers import BertTokenizer, BertForMaskedLM, AdamW
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from tqdm import tqdm

#my vectors of w and c
warmth_vector = pd.read_csv('/Users/aleksandragarbat/Desktop/Thesis/warmth_direction.csv').values.flatten()
competence_vector = pd.read_csv('/Users/aleksandragarbat/Desktop/Thesis/competence_direction.csv').values.flatten()

#to PyTorch tensors
warmth_vector = torch.tensor(warmth_vector, dtype=torch.float32)
competence_vector = torch.tensor(competence_vector, dtype=torch.float32)

#normalization
warmth_vector = warmth_vector / warmth_vector.norm()
competence_vector = competence_vector / competence_vector.norm()

#bias subspace setup
def gram_schmidt(vectors):
    ortho = []
    for v in vectors:
        for u in ortho:
            v = v - torch.dot(v, u) * u
        v = v / torch.norm(v)
        ortho.append(v)
    return ortho

g0, g1 = gram_schmidt([warmth_vector, competence_vector])
components = [g0, g1]
weights = [0.6, 0.4]

#debiasing function
def debias_custom(h, components, weights, n_mask=None):
    debiased = h.clone()
    for i, g in enumerate(components):
        weight = weights[i]
        n = n_mask[i] if n_mask is not None else 1
        debiased -= weight * n * torch.dot(h, g) * g
    return debiased

#dataset for fine-tuning
class MLM_Dataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoding["input_ids"].flatten()
        attention_mask = encoding["attention_mask"].flatten()
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": input_ids  #labels= input_ids
        }

#f-t loop with debiasing
def train_finetune(model, train_dataset, tokenizer, epochs=3, batch_size=8, learning_rate=5e-5):
    model.train()
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        epoch_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
            input_ids = batch["input_ids"].to(model.device)
            attention_mask = batch["attention_mask"].to(model.device)
            labels = batch["labels"].to(model.device)

            optimizer.zero_grad()

            # Forward pass with debiasing logic
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs['loss']  #accessing loss as dictionary key
            loss.backward()

            optimizer.step()
            epoch_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_loader)}")

#extension of BERT with debiasing lkogic application
class DebiasedBertForMaskedLM(BertForMaskedLM):
    def __init__(self, config, components, weights, tokenizer):
        super().__init__(config)
        self.components = components
        self.weights = weights
        self.tokenizer = tokenizer  #tokenizer as attribute
    
    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        #forward pass through BERT
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True
        )

        sequence_output = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
        
        #debiasing all [MASK] token rep. in batch
        mask_token_index = (input_ids == self.tokenizer.mask_token_id)  # Use tokenizer for mask_token_id
        debiased_sequence_output = sequence_output.clone()

        for batch_idx in range(input_ids.size(0)):
            for token_idx in torch.where(mask_token_index[batch_idx])[0]:
                h = sequence_output[batch_idx, token_idx]
                debiased_h = debias_custom(h, self.components, self.weights)
                debiased_sequence_output[batch_idx, token_idx] = debiased_h

        #forward through LM head
        prediction_scores = self.cls(debiased_sequence_output)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return {"loss": loss, "logits": prediction_scores}  # Ensure loss is always returned as a part of the dictionary


#loading tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

#initializing model
debias_model = DebiasedBertForMaskedLM.from_pretrained(
    "bert-base-uncased", 
    components=components, 
    weights=weights,
    tokenizer=tokenizer  # Pass tokenizer explicitly here
)


#dataset for f-t (should be added more data- i took this from O'net job descriotions)
train_texts = [
    "[MASK] conduct subsurface surveys to identify the characteristics of potential land or mining development sites. May specify the ground support systems, processes, and equipment for safe, economical, and environmentally sound extraction or underground construction activities. May inspect areas for unsafe geological conditions, equipment, and working conditions. May design, implement, and coordinate mine safety programs.",
    "[MASK] conduct research on nuclear engineering projects or apply principles and theory of nuclear science to problems concerned with release, control, and use of nuclear energy and nuclear waste disposal.",
    "[MASK] devise methods to improve oil and gas extraction and production and determine the need for new or modified tool designs. Oversee drilling and offer technical advice.",
    "[MASK] all engineers not listed separately.",
    "[MASK] design, develop, or evaluate energy-related projects or programs to reduce energy costs or improve energy efficiency during the designing, building, or remodeling stages of construction. May specialize in electrical systems; heating, ventilation, and air-conditioning (HVAC) systems; green buildings; lighting; air quality; or energy procurement.",
    "[MASK] research, design, develop, or test automation, intelligent systems, smart devices, or industrial systems control.",
    "[MASK] research, design, develop, or test microelectromechanical systems (MEMS) devices.",
    "[MASK] design technologies specializing in light information or light energy, such as laser or fiber optics technology.",
    "[MASK] research, design, develop, or test robotic applications.",
    "[MASK] design, develop, or supervise the production of materials, devices, or systems of unique molecular or macromolecular composition, applying principles of nanoscale physics and electrical, chemical, or biological engineering.",
    "[MASK] design underground or overhead wind farm collector systems and prepare and develop site specifications.",
    "[MASK] perform site-specific engineering analysis or evaluation of energy efficiency and solar projects involving residential, commercial, or industrial customers. Design solar domestic hot water and space heating systems for new and existing structures, applying knowledge of structural energy requirements, local climates, solar technology, and thermodynamics."
]

train_dataset = MLM_Dataset(train_texts, tokenizer)


#fine tune model
train_finetune(debias_model, train_dataset, tokenizer, epochs=3, batch_size=8, learning_rate=5e-5)

#save fine-tuned debiased model
debias_model.save_pretrained("/Users/aleksandragarbat/Desktop/BERT_finetuned")
tokenizer.save_pretrained("/Users/aleksandragarbat/Desktop/BERT_finetuned")


  **scheduler_specific_kwargs,
Training Epoch 1/3: 100%|█████████████████████████| 2/2 [00:06<00:00,  3.42s/it]


Epoch 1, Loss: 11.689622402191162


Training Epoch 2/3: 100%|█████████████████████████| 2/2 [00:06<00:00,  3.10s/it]


Epoch 2, Loss: 8.28287672996521


Training Epoch 3/3: 100%|█████████████████████████| 2/2 [00:06<00:00,  3.01s/it]


Epoch 3, Loss: 5.846193075180054


('/Users/aleksandragarbat/Desktop/BERT_finetuned/tokenizer_config.json',
 '/Users/aleksandragarbat/Desktop/BERT_finetuned/special_tokens_map.json',
 '/Users/aleksandragarbat/Desktop/BERT_finetuned/vocab.txt',
 '/Users/aleksandragarbat/Desktop/BERT_finetuned/added_tokens.json')

In [98]:
#how to access model and get predictions
from transformers import BertTokenizer, BertForMaskedLM
import torch

tokenizer = BertTokenizer.from_pretrained("/Users/aleksandragarbat/Desktop/BERT_finetuned")
model = BertForMaskedLM.from_pretrained("/Users/aleksandragarbat/Desktop/BERT_finetuned")

#list of professions to predict from
professions = [
    'accountant', 'actor', 'architect', 'artist', 'bank teller', 'bartender', 
    'bus driver', 'cashier', 'c.e.o.', 'chef', 'childcare worker', 'computer programmer', 
    'construction worker', 'custodian', 'customer service representative', 'dentist', 
    'director', 'doctor', 'electrician', 'engineer', 'factory worker', 'farmer', 
    'financial advisor', 'firefighter', 'fisherman', 'garbage collector', 'graphic designer', 
    'lab technician', 'landscaper', 'lawyer', 'librarian', 'maid', 'manager', 'mechanic', 
    'medical assistant', 'musician', 'news anchor', 'nurse', 'paramedic', 'pilot', 'plumber', 
    'police', 'politician', 'postal worker', 'professor', 'psychiatrist', 'salesperson', 
    'scientist', 'security guard', 'secretary', 'taxi driver', 'teacher', 'technical support worker', 
    'truck driver', 'vet', 'waiter', 'welder', 'writer'
]

# sentences with masked token (created based on sets of warmth and competent associated words)
input_texts = ["The [MASK] was always affectionate towards others.",
    "Everyone described the [MASK] as agreeable and kind.",
    "The [MASK] performed an altruistic act without hesitation.",
    "The [MASK] was known for being amicable and helpful.",
    "The [MASK] was beneficent in all their actions.",
    "The [MASK] showed benevolence during the crisis.",
    "The [MASK] comforted the child with a caring gesture.",
    "The [MASK] was civil even under stress.",
    "The [MASK] responded with compassion.",
    "The [MASK] worked cooperatively with everyone.",
    "The [MASK] was fair to all team members.",
    "The [MASK] was faithful to their responsibilities.",
    "The [MASK] forgave the mistake without a second thought.",
    "The [MASK] always greeted us with friendliness.",
    "The [MASK] was generous with their time.",
    "The [MASK] was genuinely concerned about others.",
    "The [MASK] was always helpful when needed.",
    "The [MASK] displayed honesty in every situation.",
    "The [MASK] welcomed everyone into the group.",
    "The [MASK] was polite during the entire conversation.",
    "The [MASK] was responsible and reliable.",
    "The [MASK] acted with unselfish intentions.",
    "The [MASK] was known for being tender and thoughtful.",
    "The [MASK] was understanding toward their coworkers.",
    "The [MASK] treated everyone with justice and fairness.",
    "The [MASK] solved the problem efficiently.",
    "The [MASK] tackled the technical issue with intelligence.",
    "The [MASK] confidently presented the final report.",
    "The [MASK] is known for being highly competent and reliable.",
    "Everyone admired how the [MASK] handled the pressure.",
    "The [MASK] used logical reasoning to find the solution.",
    "Only a skilled [MASK] could have debugged the system so quickly.",
    "The [MASK] designed the system with remarkable precision.",
    "The [MASK] made an effective plan under tight deadlines.",
    "Our team needed a knowledgeable [MASK] to lead the project.",
    "The [MASK] demonstrated excellent problem-solving abilities.",
    "She’s a meticulous [MASK] with great attention to detail.",
    "A creative [MASK] can always find new approaches.",
    "The [MASK] worked efficiently and completed the task early.",
    "Thanks to the [MASK], the system ran without errors.",
    "The [MASK] applied smart strategies to optimize performance.",
    "The [MASK] had the practical skills needed for the task.",
    "He proved to be a resilient and determined [MASK].",
    "The [MASK] worked independently and stayed motivated.",
    "With a shrewd mind, the [MASK] negotiated the deal."
]


#process each specified above input text
for input_text in input_texts:
    print(f"\nInput: {input_text}")
    
    #tokenize input
    inputs = tokenizer(input_text, return_tensors="pt")

    # Get the logits from the model
    outputs = model(**inputs)
    logits = outputs.logits

    #obtaining predicted token ids for masked position
    masked_index = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]

    #for each profession predict likelihood of it being masked token
    predictions = []
    for profession in professions:
        # Tokenize the profession and get its token id
        profession_ids = tokenizer.encode(profession, add_special_tokens=False)
        
        #for each token in profession finding its predicted logit value
        profession_logit = 0
        for token_id in profession_ids:
            # Get the logit for the token at the masked position
            profession_logit += logits[0, masked_index, token_id].item()

        predictions.append((profession, profession_logit))

    #sorting the predictions by logit value (highest probability for masked token)
    predictions.sort(key=lambda x: x[1], reverse=True)

    #how many professions it will specify(k)
    k = 5

    #most likely predictions 
    top_k_professions = predictions[:k]

    #printitng top k professions for givrn input sentences
    print(f"Top {k} predicted professions:")
    for profession, logit in top_k_professions:
        print(f"{profession}: {logit}")


#larger logit larger prob- these can aslo be converted to prob (eg profession x is the most associated with masked token)
#the multi token professions needs to be verified


Input: The [MASK] was always affectionate towards others.
Top 5 predicted professions:
childcare worker: 9.999423146247864
customer service representative: 7.039122700691223
security guard: 6.462022304534912
doctor: 5.498518466949463
teacher: 5.397565841674805

Input: Everyone described the [MASK] as agreeable and kind.
Top 5 predicted professions:
customer service representative: 8.99599289894104
childcare worker: 7.8767805099487305
news anchor: 6.718981742858887
security guard: 5.610380411148071
doctor: 5.557197570800781

Input: The [MASK] performed an altruistic act without hesitation.
Top 5 predicted professions:
childcare worker: 10.607293963432312
security guard: 8.749783873558044
customer service representative: 8.258424580097198
doctor: 6.820766925811768
taxi driver: 6.639878630638123

Input: The [MASK] was known for being amicable and helpful.
Top 5 predicted professions:
customer service representative: 10.905338287353516
director: 6.229388236999512
security guard: 6.1331080