In [16]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils import shuffle

In [17]:
# Load a CSV table from disk and show a quick summary
def read_dataset(file_path):
    """
    Read a CSV file into a DataFrame and print a short sanity check.

    The file is parsed with ``dtype=str`` and its first column becomes the
    index. The shape and the first ten rows are printed before returning.
    :param file_path: Path to the CSV file.
    :return: Pandas DataFrame.
    """
    frame = pd.read_csv(file_path, index_col=0, dtype=str)
    preview = frame.head(10)
    print(f"Dataset loaded. Shape: {frame.shape}")
    print(preview)
    return frame


In [18]:
# Location of the Morphalou nouns CSV.
# NOTE(review): hardcoded absolute Windows path — the notebook only runs on a
# machine with this exact directory layout unless the path is edited.
file_path = r"C:\Users\user1\Desktop\HarvestWE-main\HarvestWE-main\Data\Morphalou\all_nouns_v2.csv"

# Read the CSV into `nouns`; read_dataset also prints the shape and the first rows.
nouns = read_dataset(file_path)

Dataset loaded. Shape: (179950, 3)
                 Number     Gender       Lemma
Word                                          
100-mÃ¨tres   invariable  masculine  100-mÃ¨tres
2D           invariable   feminine          2D
3D           invariable   feminine          3D
ÂµA           invariable  masculine           A
a            invariable  masculine           a
a b c          singular  masculine       a b c
 a demi-mot    singular  masculine  a demi-mot
a-mi-la        singular  masculine     a-mi-la
aa           invariable  masculine          aa
aabam          singular  masculine       aabam


In [19]:
# Keep every row whose Gender is not the literal string 'invariable', then display the frame.
# NOTE(review): in the printed samples 'invariable' appears under Number, not
# Gender — confirm that Gender is the column this filter is meant to target.
nouns = nouns[nouns.Gender != 'invariable']
nouns

Unnamed: 0_level_0,Number,Gender,Lemma
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100-mÃ¨tres,invariable,masculine,100-mÃ¨tres
2D,invariable,feminine,2D
3D,invariable,feminine,3D
ÂµA,invariable,masculine,A
a,invariable,masculine,a
...,...,...,...
Ï†,invariable,masculine,Ï†
Ï‡,invariable,masculine,Ï‡
Ïˆ,invariable,masculine,Ïˆ
Ï‰,invariable,masculine,Ï‰


In [20]:
# Drop the rows that lack a value in the columns of interest
def clean_dataset(dataset, feature_columns):
    """
    Return the rows of the dataset that have no missing value in the given
    columns, and print how many rows are left.
    :param dataset: Input DataFrame.
    :param feature_columns: Columns in which missing values are not allowed.
    :return: DataFrame restricted to the complete rows.
    """
    complete_rows = dataset.dropna(axis=0, how="any", subset=feature_columns)
    remaining = len(complete_rows)
    print(f"Dataset cleaned. Remaining rows: {remaining}")
    return complete_rows



In [21]:
# Step 2: Clean the dataset — drop the rows whose Gender is missing, then display the result.
nouns_cleaned  = clean_dataset(nouns, feature_columns=['Gender'])
nouns_cleaned 

Dataset cleaned. Remaining rows: 175708


Unnamed: 0_level_0,Number,Gender,Lemma
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100-mÃ¨tres,invariable,masculine,100-mÃ¨tres
2D,invariable,feminine,2D
3D,invariable,feminine,3D
ÂµA,invariable,masculine,A
a,invariable,masculine,a
...,...,...,...
Ï†,invariable,masculine,Ï†
Ï‡,invariable,masculine,Ï‡
Ïˆ,invariable,masculine,Ïˆ
Ï‰,invariable,masculine,Ï‰


In [22]:

# Turn a categorical column into integer codes
def encode_feature(feature):
    """
    Map every distinct value of a categorical feature to an integer label.
    :param feature: Pandas Series holding the categorical values.
    :return: Codes produced by a LabelEncoder, one per entry of the Series.
    """
    encoder = LabelEncoder()
    # The encoder only needs to see each distinct value once.
    distinct_values = feature.unique()
    encoder.fit(distinct_values)
    return encoder.transform(feature)



In [23]:
def preprocess_dataset(dataset, feature_name, encode_as1=None, normalize_columns=None, remove_original=False):
    """
    Preprocess the dataset by encoding one feature and optionally normalizing columns.

    The input frame is never modified: all work happens on a copy, which is
    what gets returned. The encoded values are stored in a new column named
    ``<feature_name>_encoded``.
    :param dataset: Input DataFrame (left untouched).
    :param feature_name: Name of the categorical column to encode.
    :param encode_as1: If given, rows equal to this value become 1 and all
        other rows become 0. If None, the column is label-encoded with
        ``encode_feature`` instead.
    :param normalize_columns: List of columns to rescale with MinMaxScaler,
        or None/empty to skip normalization.
    :param remove_original: Whether to drop the original categorical column.
    :return: New, preprocessed DataFrame.
    """
    # Work on a copy so the caller's frame keeps its original columns; this also
    # avoids SettingWithCopyWarning when the input is a filtered slice.
    dataset = dataset.copy()
    encoded_name = f"{feature_name}_encoded"

    # `is not None` (rather than truthiness) so that falsy targets such as 0 or ""
    # are still honoured as the value to map to 1.
    if encode_as1 is not None:
        dataset[encoded_name] = (dataset[feature_name] == encode_as1).astype(int)
    else:
        dataset[encoded_name] = encode_feature(dataset[feature_name])

    print(f"Feature '{feature_name}' encoded. Sample:")
    print(dataset[[feature_name, encoded_name]].head(10))

    # Normalize specified columns
    if normalize_columns:
        scaler = MinMaxScaler()
        dataset[normalize_columns] = scaler.fit_transform(dataset[normalize_columns])
        print(f"Columns normalized: {normalize_columns}")
        print(dataset[normalize_columns].head(5))

    # Remove original categorical column if specified
    if remove_original:
        dataset = dataset.drop(columns=[feature_name])
        print(f"Original feature '{feature_name}' removed.")

    return dataset


In [24]:
# Binary-encode Gender ('masculine' -> 1, every other value -> 0), drop the
# text column, and rebind `nouns_cleaned` to the returned frame.
nouns_cleaned = preprocess_dataset(
    nouns_cleaned,
    'Gender',
    encode_as1='masculine',
    remove_original=True,  # normalize_columns is left at its default (None)
)



Feature 'Gender' encoded. Sample:
                Gender  Gender_encoded
Word                                  
100-mÃ¨tres   masculine               1
2D            feminine               0
3D            feminine               0
ÂµA           masculine               1
a            masculine               1
a b c        masculine               1
 a demi-mot  masculine               1
a-mi-la      masculine               1
aa           masculine               1
aabam        masculine               1
Original feature 'Gender' removed.


In [25]:
# Print the first rows of the preprocessed frame (Gender is now Gender_encoded).
print(nouns_cleaned.head())

                Number       Lemma  Gender_encoded
Word                                              
100-mÃ¨tres  invariable  100-mÃ¨tres               1
2D          invariable          2D               0
3D          invariable          3D               0
ÂµA          invariable           A               1
a           invariable           a               1


In [26]:
# Print the truncated text view of the whole frame, including its row/column count.
print(nouns_cleaned)

                Number       Lemma  Gender_encoded
Word                                              
100-mÃ¨tres  invariable  100-mÃ¨tres               1
2D          invariable          2D               0
3D          invariable          3D               0
ÂµA          invariable           A               1
a           invariable           a               1
...                ...         ...             ...
Ï†           invariable           Ï†               1
Ï‡           invariable           Ï‡               1
Ïˆ           invariable           Ïˆ               1
Ï‰           invariable           Ï‰               1
â„¦           invariable           â„¦               1

[175708 rows x 3 columns]


In [27]:
# Move the 'Word' index back into a regular column and print the resulting column labels.
# NOTE(review): not idempotent — running this cell a second time would add an extra 'index' column.
nouns_cleaned = nouns_cleaned.reset_index()
print(nouns_cleaned.columns)

Index(['Word', 'Number', 'Lemma', 'Gender_encoded'], dtype='object')


In [28]:
# Write the labelled nouns to the working directory; index=False leaves the row index out of the file.
nouns_cleaned.to_csv("cleaned_nouns_gender.csv", index=False)


In [29]:

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sn

class WENotFound(Exception):
    """Signals that no word embedding could be obtained for a word."""

# Fetch the tokenizer and the masked-LM for one checkpoint
def load_model_and_tokenizer(model_name):
    """
    Instantiate the tokenizer and the masked language model of a checkpoint.
    :param model_name: Identifier handed to ``from_pretrained`` for both objects.
    :return: Tuple ``(model, tokenizer)``.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModelForMaskedLM.from_pretrained(model_name)
    return (loaded_model, loaded_tokenizer)

# Function to get word embedding
def get_we(model, tokenizer, word):
    """
    Extract the embedding for a single word.

    The word must be a non-empty string that the tokenizer maps to exactly one
    vocabulary token. That single token id (without any special tokens) is fed
    to the model and the last hidden-state layer is returned.
    :param model: Pre-trained model; called with ``output_hidden_states=True``.
    :param tokenizer: Tokenizer corresponding to the model.
    :param word: Word to extract embedding for.
    :return: Numpy array containing the word embedding.
    :raises WENotFound: If the word is not a non-empty string or is not
        encoded as exactly one token.
    """
    # Missing values loaded from CSV arrive as float NaN; such entries have no
    # embedding, so report them like any other unknown word instead of letting
    # the tokenizer fail on a non-string input.
    if not isinstance(word, str) or not word:
        raise WENotFound(f'{word}: the word doesn\'t exist in the vocab')

    # Ask for the word pieces only, so the single-token check does not depend on
    # how many special tokens a particular tokenizer wraps around its input.
    encoding = tokenizer.encode(word, add_special_tokens=False)
    if len(encoding) != 1:  # Word should be encoded as a single token
        raise WENotFound(f'{word}: the word doesn\'t exist in the vocab')

    word_id = encoding[0]
    # Build the input on the model's device when it exposes one (None = default device).
    token_ids = torch.tensor([[word_id]], device=getattr(model, "device", None))

    with torch.no_grad():
        outputs = model(token_ids, output_hidden_states=True)
        last_layer_hidden_states = outputs.hidden_states[-1]

    # .cpu() is a no-op for CPU tensors and makes .numpy() valid for other devices.
    return last_layer_hidden_states.squeeze().cpu().numpy()

# Function to generate embeddings for all words in dataset
def generate_embeddings(words_df, model, tokenizer):
    """
    Generate embeddings for a list of words.

    Words for which ``get_we`` raises ``WENotFound`` are skipped; the number of
    skipped words is printed.
    :param words_df: DataFrame with a 'Word' column and a 'Gender_encoded' column.
    :param model: Pre-trained language model.
    :param tokenizer: Corresponding tokenizer.
    :return: DataFrame with one row per embedded word: one column per embedding
        dimension (0, 1, ...) followed by 'Word' and 'Gender'. When no word
        could be embedded, an empty frame that still has the 'Word' and
        'Gender' columns is returned.
    """
    records = []
    skipped_words = []

    # Iterate the two needed columns directly instead of building a Series per row.
    for word, gender in zip(words_df["Word"], words_df["Gender_encoded"]):
        try:
            word_embedding = get_we(model, tokenizer, word)
        except WENotFound:
            skipped_words.append(word)
            continue

        # Embedding dimensions first, then the identifying columns.
        record = dict(enumerate(word_embedding))
        record['Word'] = word
        record['Gender'] = gender
        records.append(record)

    print(f"Skipped words due to out-of-vocabulary: {len(skipped_words)}")

    if not records:
        # Keep the key columns so callers can still do set_index("Word").
        return pd.DataFrame(columns=["Word", "Gender"])
    return pd.DataFrame(records)



In [None]:
# Checkpoints to process: display label -> model identifier passed to from_pretrained.
# Entries that are currently switched off are kept as comments for reference.
model_names = {
    # "TinyBERT": "huawei-noah/TinyBERT_General_4L_312D",
    # "DistilBERT": "distilbert-base-uncased",
    # "FlauBERT-small": "flaubert/flaubert_small_cased",
    # "CamemBERT-base": "camembert-base"
    "XLM-R_large": "xlm-roberta-large",
    "XLM-Roberta-Base": "xlm-roberta-base",
    # "mBERT-Base-Uncased": "bert-base-multilingual-uncased",
    "mBERT-Base-Cased": "bert-base-multilingual-cased",
    "DistilBERT-Base-Cased": "distilbert-base-multilingual-cased",
}

# Reload the labelled nouns (Word / Gender_encoded) written by the earlier cell.
nouns_cleaned = pd.read_csv("cleaned_nouns_gender.csv")

# One pass per checkpoint: embed every single-token word, preview, then save twice.
for model_label, model_name in model_names.items():
    print(f"Processing embeddings for {model_label}...")
    model, tokenizer = load_model_and_tokenizer(model_name)

    # Index the embeddings by word so the saved files are keyed on 'Word'.
    embeddings_df = generate_embeddings(nouns_cleaned, model, tokenizer).set_index("Word")

    print(f"Sample embeddings for {model_label}:")
    print(embeddings_df.head())

    # CSV copy of the embeddings
    embeddings_df.to_csv(f"{model_label}_embeddings_with_gender.csv")
    print(f"Saved embeddings for {model_label}.")

    # Pickle copy of the same frame
    # NOTE(review): this message is identical to the CSV one above, so the log
    # does not tell the two saves apart.
    embeddings_df.to_pickle(f"{model_label}_embeddings.pkl")
    print(f"Saved embeddings for {model_label}.")

    
    

Processing embeddings for XLM-R_large...


tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Skipped words due to out-of-vocabulary: 171851
Sample embeddings for XLM-R_large:
                  0         1         2         3         4         5  \
Word                                                                    
a          0.084792  0.019702  0.113618  0.132146 -0.027365 -0.116332   
aba        0.314796 -0.127990  0.359627 -0.249409  0.044068 -0.172515   
abandon    0.063121 -0.420607  0.473553 -0.230607 -0.404868  0.292548   
abba      -0.225727  0.108689  0.235761 -0.107173  0.285700  0.018964   
abdominal -0.035306 -0.068203  0.302270 -0.129037  0.472714 -0.213675   

                  6         7         8         9  ...      1015      1016  \
Word                                               ...                       
a          0.172065  0.042265  0.109370 -0.005246  ... -0.075809  0.073173   
aba       -0.421966 -0.244339 -0.157397 -0.053287  ...  0.024180 -0.251885   
abandon   -0.316067 -0.559579 -0.008430 -0.220676  ...  0.368944  0.107064   
abba      -0.419

model.safetensors:  62%|######2   | 692M/1.12G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Skipped words due to out-of-vocabulary: 171851
Sample embeddings for XLM-Roberta-Base:
                  0         1         2         3         4         5  \
Word                                                                    
a          0.076339  0.100680  0.046483 -0.019450  0.041517 -0.025529   
aba        0.076340  0.100680  0.046483 -0.019450  0.041517 -0.025529   
abandon    0.076338  0.100679  0.046478 -0.019451  0.041519 -0.025531   
abba       0.076340  0.100676  0.046482 -0.019451  0.041515 -0.025529   
abdominal  0.076337  0.100679  0.046470 -0.019452  0.041524 -0.025531   

                  6         7         8         9  ...       759       760  \
Word                                               ...                       
a          0.011361  0.001988  0.076578 -0.106401  ...  0.007592  0.071321   
aba        0.011362  0.001988  0.076578 -0.106402  ...  0.007593  0.071321   
abandon    0.011356  0.001991  0.076574 -0.106401  ...  0.007594  0.071314   
abba       

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Skipped words due to out-of-vocabulary: 171379
Sample embeddings for mBERT-Base-Cased:
                0         1         2         3         4         5         6  \
Word                                                                            
2D       0.370462 -0.543173  0.313777  0.114062  0.105755  0.101264  0.047496   
3D       0.354639 -0.545033  0.300137  0.125183  0.109216  0.103105  0.037388   
a        0.349609 -0.534463  0.286310  0.138533  0.116628  0.095070  0.029105   
aa       0.350173 -0.535272  0.285707  0.138325  0.116960  0.096526  0.028733   
abandon  0.348776 -0.534598  0.285159  0.137568  0.115842  0.096030  0.028421   

                7         8         9  ...       759       760       761  \
Word                                   ...                                 
2D       0.941036 -0.635095  0.351250  ...  1.044858 -0.638283 -0.908592   
3D       0.945385 -0.627031  0.362872  ...  1.067076 -0.626916 -0.920606   
a        0.942508 -0.610512  0.366846  ..

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Skipped words due to out-of-vocabulary: 171379
Sample embeddings for DistilBERT-Base-Cased:
                0         1         2         3         4         5         6  \
Word                                                                            
2D       1.044108  0.413215  0.149576  0.126064 -0.028639  0.923040 -0.880763   
3D       0.213090  0.312851  0.171519 -0.180463  0.048762  0.827893 -0.143109   
a        0.079794 -0.093255  0.553697  0.045576  0.298140  0.063133 -0.170536   
aa       0.152444 -0.343460  0.597466 -0.076894  0.045116  0.231373 -0.098664   
abandon -0.031310 -0.131179  0.523863  0.114205  0.392207  0.006046 -0.230375   

                7         8         9  ...       759       760       761  \
Word                                   ...                                 
2D       0.325376 -0.161253 -0.032791  ...  0.767849 -0.777275 -0.218974   
3D      -0.228023 -0.030123 -0.339493  ...  0.199764 -0.720399 -0.217338   
a       -0.048963  0.281889 -0.08412

In [33]:
 # Location of the saved mBERT-Base-Uncased embeddings CSV (not the nouns dataset).
# NOTE(review): hardcoded absolute local path. The "mBERT-Base-Uncased" entry is
# commented out in the embedding loop shown in this notebook, so this file was
# presumably produced by an earlier run — confirm it exists before running.
file_path = r"C:\Users\user1\Desktop\HarvestWE-main\mBERT-Base-Uncased_embeddings_with_gender.csv"

# Load the CSV with the default integer index ('Word' stays a regular column),
# then print the first rows and the column labels.
dataset = pd.read_csv(file_path)
print(dataset.head())
print(dataset.columns)



  Word         0         1         2         3         4         5         6  \
0   2D -0.458535  1.026120 -0.156639  0.282224 -1.437061  1.080444 -0.037819   
1   3D  0.056050  0.356133  0.071018  0.155525 -0.240813  0.212123 -0.128014   
2    a -0.528666  1.097722 -0.130273  0.290261 -1.492992  1.224025 -0.052981   
3   aa -0.533585  1.105228 -0.131850  0.289372 -1.501637  1.285083 -0.051327   
4  aba -0.512694  1.115770 -0.135617  0.293074 -1.500828  1.257268 -0.053031   

          7         8  ...       759       760       761       762       763  \
0 -0.188312  0.308035  ... -0.639540 -0.797463  0.159385 -0.637154  0.302692   
1 -0.157016  0.187442  ... -0.245784 -0.459303 -0.110808  0.154719 -0.240903   
2 -0.201115  0.315308  ... -0.775459 -0.770062  0.065690 -0.722923  0.384350   
3 -0.194557  0.318292  ... -0.794284 -0.759392  0.063101 -0.726022  0.362084   
4 -0.197878  0.316445  ... -0.792604 -0.775942  0.053793 -0.740963  0.364370   

        764       765       766       

In [32]:
import pandas as pd

# Location of the saved embeddings CSV.
# NOTE(review): hardcoded absolute local path.
file_path = r"C:\Users\user1\Desktop\HarvestWE-main\mBERT-Base-Uncased_embeddings_with_gender.csv"

# Read the CSV; at this point 'Word' is an ordinary column.
dataset = pd.read_csv(file_path)

# Move the 'Word' column into the index (skipped if there is no such column).
if 'Word' in dataset.columns:
    dataset.set_index('Word', inplace=True)

# Write the re-indexed frame to a second file.
# NOTE(review): to_csv writes the index as the first column, so a plain
# pd.read_csv of this file will again return 'Word' as a column — passing
# index_col='Word' when loading is probably what is needed; confirm.
fixed_file_path = r"C:\Users\user1\Desktop\HarvestWE-main\mBERT-Base-Uncased_embeddings_fixed.csv"
dataset.to_csv(fixed_file_path)

print(f"âœ… Fixed dataset saved successfully at: {fixed_file_path}")
print(dataset.head())  # Preview to confirm the fix


âœ… Fixed dataset saved successfully at: C:\Users\user1\Desktop\HarvestWE-main\mBERT-Base-Uncased_embeddings_fixed.csv
             0         1         2         3         4         5         6  \
Word                                                                         
2D   -0.458535  1.026120 -0.156639  0.282224 -1.437061  1.080444 -0.037819   
3D    0.056050  0.356133  0.071018  0.155525 -0.240813  0.212123 -0.128014   
a    -0.528666  1.097722 -0.130273  0.290261 -1.492992  1.224025 -0.052981   
aa   -0.533585  1.105228 -0.131850  0.289372 -1.501637  1.285083 -0.051327   
aba  -0.512694  1.115770 -0.135617  0.293074 -1.500828  1.257268 -0.053031   

             7         8         9  ...       759       760       761  \
Word                                ...                                 
2D   -0.188312  0.308035  0.545822  ... -0.639540 -0.797463  0.159385   
3D   -0.157016  0.187442 -0.101623  ... -0.245784 -0.459303 -0.110808   
a    -0.201115  0.315308  0.630490  ... -0