In [1]:
%load_ext autoreload
%autoreload 2

# A complete example of the proposed pipeline.

In [2]:
from occ_models import HF_Model, find_occupations_aws
from utils import extract_occupations_from_resp, Knowledge
from gender import * 
import os 


# --- Example inputs ---------------------------------------------------------
# The same sentence in the source language (English) and in the target
# language (Greek), each paired with its language code.
text_source = "A lawyer and a butcher."
language_source = "en"

text_target = "Ένας δικηγόρος και ένας χασάπης."
language_target = "el"

# Alternative French input, kept for reference:
# text = "C'est un boucher"
# language = "fr"


# --- Pipeline setup ---------------------------------------------------------
# One helper object per supported language; their `.nlp` attribute is what the
# rest of the notebook uses for tokenisation / gender analysis.
el = Greek()
en = English()
fr = French()

# Knowledge base pointed at the ISCO-08 CSV and its "Definition" column.
knowledge = Knowledge("ISCO-08-EN.csv", column="Definition")

# Alternative occupation finder, kept for reference:
# model = HF_Model()
# model.find_occupations(text)

# Placeholder AWS credentials (presumably read by find_occupations_aws — TODO
# confirm). setdefault keeps any real credentials that are already present in
# the environment instead of overwriting them with the '...' placeholder.
# Never commit real keys here; export them in the environment instead.
os.environ.setdefault('aws_access_key_id', '...')
os.environ.setdefault('aws_secret_access_key', '...')


def extract_occupations_and_gender(text, language, knowledge):
    """Find the occupations mentioned in `text` and the gender used for each.

    Args:
        text: The sentence to analyse.
        language: Language code of `text`; one of "el", "fr" or "en".
        knowledge: Object providing `connect(definition)` (whose first result
            is unpacked as `(index, p)`) and `describe_occ_dict(index)`.

    Returns:
        A list with one dict per occupation found, with the keys "index",
        "title", "p" (`p * 100` rounded to two decimals), "kg", "gender",
        "text" and "language".

    Raises:
        ValueError: If `language` is not one of the supported codes. This is
            checked before the LLM is called, so no request is wasted.
    """
    # Coreference checking is switched off for Greek and on for the others.
    coreference_by_language = {"el": False, "fr": True, "en": True}
    if language not in coreference_by_language:
        raise ValueError(
            f"Unsupported language {language!r}; expected one of "
            f"{sorted(coreference_by_language)}"
        )
    check_coreference = coreference_by_language[language]
    # The language helpers are the notebook-level objects; only the selected
    # helper's `.nlp` attribute is accessed.
    nlp = {"el": el, "fr": fr, "en": en}[language].nlp

    # Step 1: find occupations using the LLM.
    llm_resp = find_occupations_aws(text)
    # Step 2: parse the LLM output into occupation rows (each row is read
    # below through its 'title' and 'definition' keys).
    list_of_occs = extract_occupations_from_resp(llm_resp)

    responses = []
    for row in list_of_occs:
        # First entry returned by the knowledge base for this definition
        # (presumably the best match and its score — TODO confirm).
        index, p = knowledge.connect(row['definition'])[0]

        gender = find_gender(nlp, text, [row], check_coreference=check_coreference)

        responses.append({
            "index": index,
            "title": row['title'],
            "p": round(p * 100, 2),
            "kg": knowledge.describe_occ_dict(index),
            "gender": gender,
            "text": text,
            "language": language,
        })

    return responses

# Run the extraction once for the source sentence and once for the target one.
resp_source = extract_occupations_and_gender(
    text=text_source,
    language=language_source,
    knowledge=knowledge,
)
resp_target = extract_occupations_and_gender(
    text=text_target,
    language=language_target,
    knowledge=knowledge,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[nltk_data] Downloading package omw-1.4 to /home/geofila/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  warn(f"Failed to load image Python extension: {e}")
[nltk_data] Downloading package omw-1.4 to /home/geofila/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
import torch
import transformers
import itertools

# Single source of truth for the checkpoint, so the model and the tokenizer
# are always loaded from the same pretrained weights/vocabulary.
BERT_CHECKPOINT = 'bert-base-multilingual-cased'

model = transformers.BertModel.from_pretrained(BERT_CHECKPOINT)
tokenizer = transformers.BertTokenizer.from_pretrained(BERT_CHECKPOINT)

In [4]:
def find_word_start_end(src, token_index):
    """Return the character span of the `token_index`-th whitespace-separated word.

    Args:
        src: The text to scan.
        token_index: Zero-based index of the word to locate.

    Returns:
        `[start, end]` such that `src[start:end]` is the word (end is
        exclusive), or None when `src` has no word at that index.
    """
    import re

    # Scan the original string so the offsets stay valid even when `src` has
    # leading whitespace or several whitespace characters between words.
    for i, match in enumerate(re.finditer(r"\S+", src)):
        if i == token_index:
            return [match.start(), match.end()]

    return None


def align(src, tgt, ds, dt, model, tokenizer, align_layer=8, threshold=1e-3):
    """Align the words of `src` with the words of `tgt` using model embeddings.

    Args:
        src: Source sentence.
        tgt: Target sentence.
        ds: Source-language helper; `ds.nlp(src)` must yield the word tokens.
        dt: Target-language helper; `dt.nlp(tgt)` must yield the word tokens.
        model: Encoder called as `model(ids, output_hidden_states=True)`;
            element `[2]` of its output is indexed by layer.
        tokenizer: Tokenizer providing `tokenize`, `convert_tokens_to_ids`,
            `prepare_for_model` and `model_max_length`.
        align_layer: Index of the hidden-state layer used for the alignment.
        threshold: Minimum softmax probability, required in both directions,
            for two sub-words to count as aligned.

    Returns:
        A dict mapping a source word index to
        `[target word index, source word, target word]`. When a source word
        aligns with several target words, the highest target index is kept
        (pairs are visited in sorted order and later ones overwrite earlier).
        Source words without any alignment are absent from the dict.
    """
    # Word-level tokens come from each language's own pipeline.
    sent_src = [str(tok) for tok in ds.nlp(src)]
    sent_tgt = [str(tok) for tok in dt.nlp(tgt)]

    # Split every word into sub-words and convert them to vocabulary ids.
    token_src = [tokenizer.tokenize(word) for word in sent_src]
    token_tgt = [tokenizer.tokenize(word) for word in sent_tgt]
    wid_src = [tokenizer.convert_tokens_to_ids(x) for x in token_src]
    wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]

    ids_src = tokenizer.prepare_for_model(
        list(itertools.chain(*wid_src)),
        return_tensors='pt',
        model_max_length=tokenizer.model_max_length,
        truncation=True,
    )['input_ids']
    ids_tgt = tokenizer.prepare_for_model(
        list(itertools.chain(*wid_tgt)),
        return_tensors='pt',
        model_max_length=tokenizer.model_max_length,
        truncation=True,
    )['input_ids']

    # Position of each sub-word -> index of the word it belongs to. Both maps
    # are built independently, so an empty source no longer leaves the target
    # map undefined.
    sub2word_map_src = [i for i, pieces in enumerate(token_src) for _ in pieces]
    sub2word_map_tgt = [i for i, pieces in enumerate(token_tgt) for _ in pieces]

    # alignment
    model.eval()
    with torch.no_grad():
        # [0, 1:-1] drops the first and last positions of the single batch
        # item (presumably the special tokens added by prepare_for_model).
        out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
        out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]

        dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))

        # Normalise the similarities in both directions and keep only the
        # pairs that clear the threshold in each of them.
        softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
        softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)

        softmax_inter = (softmax_srctgt > threshold) & (softmax_tgtsrc > threshold)

    # Collapse sub-word alignments into word alignments.
    align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
    align_words = set()
    for i, j in align_subwords:
        align_words.add((sub2word_map_src[i], sub2word_map_tgt[j]))

    al = {}
    for i, j in sorted(align_words):
        al[i] = [j, sent_src[i], sent_tgt[j]]

    return al

def search_g(resp_target, m):
    """Look up the occupation whose gender mention starts at token `m`.

    Args:
        resp_target: Iterable of occupation dicts, each with a "title" and a
            "gender" list; every gender entry has a two-element "tokens"
            value and a "gender" value.
        m: Value compared against the first element of each "tokens" pair.

    Returns:
        `(title, gender)` for the first matching mention, or None when no
        mention starts at `m`.
    """
    for occupation in resp_target:
        for mention in occupation["gender"]:
            first_token, _last_token = mention["tokens"]
            mention_gender = mention["gender"]
            if first_token == m:
                return occupation["title"], mention_gender
    return None

In [5]:
# Align the source and target words, then compare the gender detected for each
# source occupation with the gender of the target occupation it aligns to.
mapping = align(text_source, text_target, en, el, model, tokenizer)
for row1 in resp_source:
    for r1 in row1["gender"]:
        s1, e1 = r1["tokens"]
        g1 = r1["gender"]

        # A source token may have no aligned target word, and the aligned
        # word may not be an occupation found in the target text; report and
        # skip those cases instead of failing with KeyError / TypeError.
        aligned = mapping.get(s1)
        match = search_g(resp_target, aligned[0]) if aligned is not None else None
        if match is None:
            print(f"No aligned target occupation found for: {row1['title']}")
            continue
        t2, g2 = match

        if g1 != g2:
            print (f"Found gender shift in the word: {row1['title']} → {t2}, from {g1} to {g2}")
                

Found gender shift in the word: lawyer → δικηγόρος, from NC to Masc
Found gender shift in the word: butcher → χασάπης, from NC to Masc
