In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils import shuffle

In [1]:
# Function to read the dataset
def read_dataset(file_path):
    """
    Read a CSV file into a DataFrame and print a short preview of it.

    All columns are read as strings and the first CSV column is used as the
    index of the returned frame.

    :param file_path: Location of the CSV file (handed to ``pd.read_csv``).
    :return: Pandas DataFrame holding the file's contents.
    """
    frame = pd.read_csv(file_path, index_col=0, dtype=str)

    # Quick sanity check for the reader: dimensions plus the first rows.
    preview_rows = 10
    print(f"Dataset loaded. Shape: {frame.shape}")
    print(frame.head(preview_rows))
    return frame


In [4]:
# File path to the adjective dataset
# NOTE(review): absolute, machine-specific path; the notebook will not run on
# another machine without editing it.
file_path = r"C:\Users\user1\Desktop\HarvestWE-main\HarvestWE-main\Data\Morphalou\all_adjs_v2.csv"

# Load the dataset (read_dataset also prints the shape and the first 10 rows)
adjs = read_dataset(file_path)


Dataset loaded. Shape: (97966, 3)
                 Gender    Number       Lemma
Word                                         
a-humain      masculine  singular    a-humain
a-raciste     masculine  singular   a-raciste
aalénien      masculine  singular    aalénien
aaléniens     masculine    plural    aalénien
aalénienne     feminine  singular    aalénien
aaléniennes    feminine    plural    aalénien
aaronide      masculine  singular    aaronide
abactérien    masculine  singular  abactérien
abactériens   masculine    plural  abactérien
abactérienne   feminine  singular  abactérien


In [5]:
# Keep only adjectives marked for both gender and number: rows where either
# feature is 'invariable' are dropped.
# .copy() turns the filtered selection into an independent frame, so adding
# columns to it later does not raise pandas' SettingWithCopyWarning.
adjs = adjs[(adjs.Gender != 'invariable') & (adjs.Number != 'invariable')].copy()
adjs

Unnamed: 0_level_0,Gender,Number,Lemma
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a-humain,masculine,singular,a-humain
a-raciste,masculine,singular,a-raciste
aalénien,masculine,singular,aalénien
aaléniens,masculine,plural,aalénien
aalénienne,feminine,singular,aalénien
...,...,...,...
œstroprogestatives,feminine,plural,œstroprogestatif
œuvé,masculine,singular,œuvé
œuvés,masculine,plural,œuvé
œuvée,feminine,singular,œuvé


In [6]:
# Function to encode features
def encode_feature(feature):
    """
    Turn a categorical column into integer codes.

    A LabelEncoder is fitted on the distinct values of the column and then
    used to translate every entry into its numeric label.

    :param feature: Pandas Series holding the categorical values.
    :return: The encoded values, one numeric label per entry of ``feature``.
    """
    # fit() returns the encoder itself, so it can be chained.
    encoder = LabelEncoder().fit(feature.unique())
    return encoder.transform(feature)



In [7]:
def preprocess_dataset(dataset, feature_name, encode_as1=None, normalize_columns=None, remove_original=False):
    """
    Preprocess a dataset by encoding one categorical feature and, optionally,
    min-max normalizing a set of columns.

    The input DataFrame is never modified: all changes are made on a copy,
    which is what gets returned.

    :param dataset: Input DataFrame.
    :param feature_name: Name of the column to encode. The result is stored in
        a new column named ``f"{feature_name}_encoded"``.
    :param encode_as1: If given, rows whose feature equals this value are
        encoded as 1 and all other rows as 0. If None, the feature is
        label-encoded with ``encode_feature`` instead.
    :param normalize_columns: List of columns to rescale with MinMaxScaler,
        or None/empty to skip normalization.
    :param remove_original: Whether to remove the original categorical column.
    :return: New, preprocessed DataFrame.
    """
    # Work on a copy so the caller's frame is left untouched (and so that
    # assigning a column to a filtered slice does not warn).
    dataset = dataset.copy()
    encoded_name = f"{feature_name}_encoded"

    # Encode the requested feature. `is not None` (rather than truthiness) so
    # that falsy category values such as "" or 0 can still be selected.
    if encode_as1 is not None:
        dataset[encoded_name] = (dataset[feature_name] == encode_as1).astype(int)
    else:
        dataset[encoded_name] = encode_feature(dataset[feature_name])

    print(f"Feature '{feature_name}' encoded. Sample:")
    print(dataset[[feature_name, encoded_name]].head(10))

    # Normalize specified columns
    if normalize_columns:
        scaler = MinMaxScaler()
        dataset[normalize_columns] = scaler.fit_transform(dataset[normalize_columns])
        print(f"Columns normalized: {normalize_columns}")
        print(dataset[normalize_columns].head(5))

    # Remove original categorical column if specified
    if remove_original:
        dataset = dataset.drop(columns=[feature_name])
        print(f"Original feature '{feature_name}' removed.")

    return dataset


In [8]:
# Binary-encode Gender ('masculine' -> 1, every other value -> 0), skip
# normalization, and drop the original text column from the result.
adjs_cleaned = preprocess_dataset(
    dataset=adjs,
    feature_name='Gender',
    encode_as1='masculine',
    normalize_columns=None,
    remove_original=True
)



Feature 'Gender' encoded. Sample:
                 Gender  Gender_encoded
Word                                   
a-humain      masculine               1
a-raciste     masculine               1
aalénien      masculine               1
aaléniens     masculine               1
aalénienne     feminine               0
aaléniennes    feminine               0
aaronide      masculine               1
abactérien    masculine               1
abactériens   masculine               1
abactérienne   feminine               0
Original feature 'Gender' removed.


In [9]:
# Print the updated DataFrame: 'Gender' has been replaced by the 0/1
# 'Gender_encoded' column, and 'Word' is still the index at this point.
print(adjs_cleaned)

                      Number             Lemma  Gender_encoded
Word                                                          
a-humain            singular          a-humain               1
a-raciste           singular         a-raciste               1
aalénien            singular          aalénien               1
aaléniens             plural          aalénien               1
aalénienne          singular          aalénien               0
...                      ...               ...             ...
œstroprogestatives    plural  œstroprogestatif               0
œuvé                singular              œuvé               1
œuvés                 plural              œuvé               1
œuvée               singular              œuvé               0
œuvées                plural              œuvé               0

[67982 rows x 3 columns]


In [10]:
# NOTE(review): identical to the print in the previous cell; one of the two
# cells can be removed.
print(adjs_cleaned)

                      Number             Lemma  Gender_encoded
Word                                                          
a-humain            singular          a-humain               1
a-raciste           singular         a-raciste               1
aalénien            singular          aalénien               1
aaléniens             plural          aalénien               1
aalénienne          singular          aalénien               0
...                      ...               ...             ...
œstroprogestatives    plural  œstroprogestatif               0
œuvé                singular              œuvé               1
œuvés                 plural              œuvé               1
œuvée               singular              œuvé               0
œuvées                plural              œuvé               0

[67982 rows x 3 columns]


In [11]:
# Move 'Word' out of the index into a regular column, so it is kept when the
# frame is saved with index=False and can be read back as row["Word"].
# NOTE(review): not idempotent; running this cell a second time would
# presumably add an extra 'index' column, so re-run from the preprocessing cell.
adjs_cleaned = adjs_cleaned.reset_index()

# Confirm the resulting column layout
print(adjs_cleaned.columns)

Index(['Word', 'Number', 'Lemma', 'Gender_encoded'], dtype='object')


In [12]:
# Save the cleaned table (without the positional index) to the working
# directory; the embedding cell reloads it from "cleaned_adjs_gender.csv".
adjs_cleaned.to_csv("cleaned_adjs_gender.csv", index=False)


In [13]:

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sn

class WENotFound(Exception):
    """Signals that no word embedding could be obtained for a word."""

# Function to load model and tokenizer
def load_model_and_tokenizer(model_name):
    """
    Fetch a pre-trained masked language model together with its tokenizer.

    :param model_name: Name of the pre-trained model, passed to
        ``from_pretrained`` for both the tokenizer and the model.
    :return: Tuple ``(model, tokenizer)``.
    """
    # The tokenizer is loaded before the model weights.
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModelForMaskedLM.from_pretrained(model_name)
    return loaded_model, loaded_tokenizer

# Function to get word embedding
def get_we(model, tokenizer, word):
    """
    Compute the last-layer hidden state for a word that is a single token.

    :param model: Pre-trained model; called with a 1x1 tensor of token ids and
        ``output_hidden_states=True``.
    :param tokenizer: Tokenizer corresponding to the model.
    :param word: Word to extract the embedding for.
    :return: Numpy array with the squeezed last-layer hidden state.
    :raises WENotFound: If the tokenizer does not encode the word as exactly
        three ids.
    """
    ids = tokenizer.encode(word)

    # Three ids are taken to mean: one leading special token, the word itself,
    # one trailing special token. Anything else means the word was split into
    # several pieces (or no special tokens were added).
    is_single_token = len(ids) == 3
    if not is_single_token:
        raise WENotFound(f'{word}: the word doesn\'t exist in the vocab')

    # Keep only the middle id, i.e. the word's own token.
    _, token_id, _ = ids
    model_input = torch.tensor([[token_id]])

    # NOTE(review): the token is fed to the model on its own, without the
    # special tokens that encode() produced. Confirm this is intended.
    with torch.no_grad():
        all_layers = model(model_input, output_hidden_states=True).hidden_states

    final_layer = all_layers[-1]
    return final_layer.squeeze().numpy()

# Function to generate embeddings for all words in dataset
def generate_embeddings(words_df, model, tokenizer):
    """
    Build a table of embeddings, one row per word that has an embedding.

    :param words_df: DataFrame with a "Word" column and a "Gender_encoded"
        column.
    :param model: Pre-trained language model, forwarded to ``get_we``.
    :param tokenizer: Tokenizer matching the model, forwarded to ``get_we``.
    :return: DataFrame whose columns are the embedding positions (0, 1, ...)
        followed by 'Word' and 'Gender'. Words for which ``get_we`` raises
        WENotFound are left out; only their count is printed.
    """
    records, missing = [], []

    for _, entry in words_df.iterrows():
        word, gender = entry["Word"], entry["Gender_encoded"]

        try:
            vector = get_we(model, tokenizer, word)
        except WENotFound:
            missing.append(word)
            continue

        # One column per embedding position, then the word and its label.
        records.append({**dict(enumerate(vector)), 'Word': word, 'Gender': gender})

    print(f"Skipped words due to out-of-vocabulary: {len(missing)}")
    return pd.DataFrame(records)



In [14]:
# Define model names: short label (used in log messages and output file
# names) -> pre-trained model name passed to from_pretrained.
model_names = {
    "XLM-R_large": "xlm-roberta-large",
    "XLM-Roberta-Base": "xlm-roberta-base",
    "mBERT-Base-Uncased": "bert-base-multilingual-uncased",
    "mBERT-Base-Cased": "bert-base-multilingual-cased",
    "DistilBERT-Base-Cased": "distilbert-base-multilingual-cased",
}

# Load adjective dataset
adjs_cleaned = pd.read_csv("cleaned_adjs_gender.csv")

# Generate embeddings for each model
for model_label, model_name in model_names.items():
    # Space added before "on": the message previously ran the label and the
    # next word together.
    print(f"Processing embeddings for {model_label} on adjectives...")
    model, tokenizer = load_model_and_tokenizer(model_name)
    embeddings_df = generate_embeddings(adjs_cleaned, model, tokenizer)

    # Set Word as Index (by assignment rather than inplace=True)
    embeddings_df = embeddings_df.set_index("Word")

    # Display sample
    print(f"Sample embeddings for {model_label}:")
    print(embeddings_df.head())

    # Save outputs: one CSV and one pickle per model, in the working directory
    embeddings_df.to_csv(f"{model_label}_adjective_embeddings_with_gender.csv")
    embeddings_df.to_pickle(f"{model_label}_adjective_embeddings.pkl")

    print(f"Saved adjective embeddings for {model_label}.")

    
    

Processing embeddings for XLM-R_largeon adjectives...


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Skipped words due to out-of-vocabulary: 67087
Sample embeddings for XLM-R_large:
                   0         1         2         3         4         5  \
Word                                                                     
abdominal  -0.035306 -0.068203  0.302270 -0.129037  0.472714 -0.213675   
abrupt      0.056057 -0.144392  0.012552 -0.372142 -0.091922  0.136170   
absent      0.131544 -0.391886 -0.075089 -0.358459 -0.235885  0.342959   
accepté     0.109618 -0.327010  0.183316 -0.358990 -0.368958  0.297833   
accompagné  0.080698  0.002290  0.134187  0.133029 -0.032265 -0.123115   

                   6         7         8         9  ...      1015      1016  \
Word                                                ...                       
abdominal  -0.502867  0.159517 -0.176921 -0.145372  ...  0.047238  0.332361   
abrupt     -0.534903  0.105950  0.042881 -0.218313  ...  0.547907 -0.288354   
absent     -0.024962 -0.320995  0.165644 -0.124614  ... -0.088018 -0.307133   
accep

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Skipped words due to out-of-vocabulary: 67087
Sample embeddings for XLM-Roberta-Base:
                   0         1         2         3         4         5  \
Word                                                                     
abdominal   0.076337  0.100679  0.046470 -0.019452  0.041524 -0.025531   
abrupt      0.076337  0.100678  0.046473 -0.019451  0.041521 -0.025532   
absent      0.076339  0.100679  0.046480 -0.019451  0.041518 -0.025530   
accepté     0.076339  0.100682  0.046480 -0.019450  0.041519 -0.025530   
accompagné  0.076339  0.100680  0.046480 -0.019450  0.041518 -0.025530   

                   6         7         8         9  ...       759       760  \
Word                                                ...                       
abdominal   0.011347  0.001995  0.076569 -0.106402  ...  0.007595  0.071308   
abrupt      0.011351  0.001994  0.076571 -0.106401  ...  0.007594  0.071309   
absent      0.011358  0.001990  0.076576 -0.106401  ...  0.007593  0.071316   


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Skipped words due to out-of-vocabulary: 65819
Sample embeddings for mBERT-Base-Uncased:
                  0         1         2         3         4         5  \
Word                                                                    
abandonné -0.516744  1.115343 -0.137685  0.288690 -1.494414  1.287974   
abrité    -0.522401  1.099905 -0.139889  0.300243 -1.492713  1.210976   
absent    -0.514369  1.121930 -0.141710  0.269820 -1.505412  1.276619   
absolue    0.011467  0.250145  0.061106  0.259305 -0.326736 -0.070246   
abusé     -0.446718  1.064334 -0.214545  0.285939 -1.405199  1.137035   

                  6         7         8         9  ...       759       760  \
Word                                               ...                       
abandonné -0.060082 -0.181060  0.312453  0.626785  ... -0.803875 -0.758321   
abrité    -0.037263 -0.202397  0.297605  0.614302  ... -0.766316 -0.765899   
absent    -0.070462 -0.178888  0.314820  0.636031  ... -0.786408 -0.755971   
absolue   

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Skipped words due to out-of-vocabulary: 66903
Sample embeddings for mBERT-Base-Cased:
                   0         1         2         3         4         5  \
Word                                                                     
abandonné   0.347691 -0.534404  0.285921  0.136949  0.116014  0.097085   
absent      0.349102 -0.534716  0.287126  0.137951  0.115974  0.096673   
absolue     0.348001 -0.534657  0.285109  0.137048  0.116047  0.098042   
accompagné  0.346162 -0.534618  0.283628  0.135667  0.115955  0.097196   
acteur      0.347982 -0.535534  0.285630  0.136340  0.116589  0.098362   

                   6         7         8         9  ...       759       760  \
Word                                                ...                       
abandonné   0.028410  0.941686 -0.610105  0.365626  ...  1.083181 -0.627648   
absent      0.028808  0.942350 -0.610411  0.365865  ...  1.082932 -0.628827   
absolue     0.028200  0.942263 -0.610996  0.366071  ...  1.083382 -0.628778   


In [None]:
 # File path to the mBERT-Base-Uncased adjective embeddings CSV (the file name
# refers to adjectives, not nouns).
# NOTE(review): absolute, machine-specific path; presumably the CSV written by
# the embedding loop. Confirm the location.
file_path = r"C:\Users\user1\Desktop\HarvestWE-main\mBERT-Base-Uncased_adjective_embeddings_with_gender.csv"

# Read the CSV and inspect the first rows and the column labels
dataset = pd.read_csv(file_path)
print(dataset.head())
print(dataset.columns)

