In [111]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
# Download required NLTK resources
nltk.download("stopwords")  # Stopwords for text processing
nltk.download('averaged_perceptron_tagger')  # POS tagger
nltk.download('punkt')  # Tokenizer

# Import tqdm for progress tracking
from tqdm.auto import tqdm


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [118]:
# Function to prepare an embedding map from GloVe vectors
def prepare_embedding_map(embedding_name):
    embedding_index = {}

    # Open the corresponding GloVe file based on the embedding name
    if embedding_name == 'glove50':
        f = open("/content/glove_models/glove.6B.50d.txt")
    elif embedding_name == 'glove100':
        f = open("/content/glove_models/glove.6B.100d.txt")

    # Read the embeddings and store them in a dictionary
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        coef = np.array(values[1:], dtype="float")
        embedding_index[word] = coef

    f.close()
    return embedding_index


In [119]:
# Load word embeddings and filter nouns
word_to_vec_map = prepare_embedding_map("glove100")
print(len(word_to_vec_map))

try:
    # Attempt to get words from the embedding map's vocabulary
    words = word_to_vec_map.vocab.keys()
    stopword = set(stopwords.words("english"))

    # Filter out stopwords
    words = [w for w in words if w.lower() not in stopword]
    tokens = nltk.word_tokenize(" ".join(words))
    pos_tagged = nltk.pos_tag(tokens)

    # Extract nouns from the tokens
    nouns = [w for w, pos in pos_tagged if pos.startswith("N")]
    print(len(words), len(nouns))

    print(len(list(word_to_vec_map.vocab.keys())), 'Vectors Found')

except:
    # Fallback in case of an AttributeError (if the vocab attribute does not exist)
    words = word_to_vec_map.keys()
    stopword = set(stopwords.words("english"))

    # Filter out stopwords
    words = [w for w in words if w.lower() not in stopword]
    tokens = nltk.word_tokenize(" ".join(words))
    pos_tagged = nltk.pos_tag(tokens)

    # Extract nouns from the tokens
    nouns = [w for w, pos in pos_tagged if pos.startswith("N")]
    print(len(words), len(nouns))
    print(len(list(word_to_vec_map.keys())), 'Vectors Found')


0it [00:00, ?it/s]

400000
399851 210076
400000 Vectors Found


In [120]:
len(words),len(nouns),nouns[0]

(399851, 210076, 'year')

In [121]:
### Cosine Similarity Calculation
# Cosine Similarity: CosineSim(u, v) = (u.v) / (||u|| ||v||) = cos(θ)

def cosine_similarity(u, v):
    # Calculate the dot product of u and v
    dot_prod = np.dot(u, v)

    # Calculate the norms (lengths) of u and v
    norm_u = np.sqrt(np.sum(u**2))
    norm_v = np.sqrt(np.sum(v**2))

    # Compute cosine similarity
    distance = dot_prod / (norm_u * norm_v)

    return distance


In [122]:
### Cosine Similarity Analysis for Word Pairs

# Retrieve word vectors from the embedding map
father = word_to_vec_map["father"]
mother = word_to_vec_map["mother"]

basket = word_to_vec_map["basket"]
dog = word_to_vec_map["dog"]

france = word_to_vec_map["france"]
italy = word_to_vec_map["italy"]
paris = word_to_vec_map["paris"]
rome = word_to_vec_map["rome"]

# Calculate and print cosine similarities
print(f"Cosine similarity (father, mother): {cosine_similarity(father, mother)}")
print(f"Cosine similarity (basket, dog): {cosine_similarity(basket, dog)}")
print(f"Cosine similarity (france-paris, italy-rome): {cosine_similarity(france - paris, italy - rome)}")


Cosine similarity (father, mother): 0.8656661174315731
Cosine similarity (basket, dog): 0.23200237592671563
Cosine similarity (france-paris, italy-rome): 0.7056238800453161


In [124]:
### Word Analogy: Completing the analogy a is to b as c is to __

def complete_analogy(word_a, word_b, word_c, words, word_to_vec_map):

    word_a = word_a.lower()
    word_b = word_b.lower()
    word_c = word_c.lower()

    e_a = word_to_vec_map[word_a]
    e_b = word_to_vec_map[word_b]
    e_c = word_to_vec_map[word_c]

    max_cosine_similarity = -100
    best_word = None

    # Iterate through the list of words to find the best match
    for w in words:
        if w in [word_a, word_b, word_c]:
            continue  # Skip if the word is part of the analogy

        if w in word_to_vec_map.keys():
            # Calculate cosine similarity for the analogy
            cosine_sim = cosine_similarity((e_b - e_a), (word_to_vec_map[w] - e_c))
            if cosine_sim > max_cosine_similarity:
                max_cosine_similarity = cosine_sim
                best_word = w  # Update the best matching word

    return best_word  # Return the best word that completes the analogy


In [125]:
### Testing Word Analogy Function with Examples

examples = [
    ('italy', 'italian', 'spain'),
    ('india', 'delhi', 'japan'),
    ('man', 'woman', 'boy'),
    ('small', 'smaller', 'large')
]

# Iterate through each example to print the analogy results
for ele in examples:
    print(f"Analogy of {ele[0]} : {ele[1]} :: {ele[2]} : {complete_analogy(ele[0], ele[1], ele[2], words, word_to_vec_map)}")


Analogy of italy : italian :: spain : spanish
Analogy of india : delhi :: japan : osaka
Analogy of man : woman :: boy : girl
Analogy of small : smaller :: large : larger


In [126]:
### Debiasing Word Vectors

# Calculate the gender direction vector
g = word_to_vec_map['woman'] - word_to_vec_map['man']

# Get the length of the gender direction vector
length_g = len(g)
print(f"Length of the gender direction vector: {length_g}")


Length of the gender direction vector: 100


In [127]:
### Similarity of Names with Gender Vector (Man-Woman)

# List of names to compare with the gender direction vector
names_list = ['john', 'marie', 'sophie', 'ronaldo', 'priya', 'rahul', 'danielle', 'reza']

print("List of names and their similarity with the vector (man-woman):")

# Calculate and print cosine similarity of each name vector with the gender vector g
for name in names_list:
    similarity = cosine_similarity(word_to_vec_map[name], g)
    print(f"{name}: {similarity}")

# Note: Female names are expected to show positive correlation with the constructed vector g.


List of names and their similarity with the vector (man-woman):
john: -0.22835017436721872
marie: 0.24537338638963357
sophie: 0.20268358510223194
ronaldo: -0.3328964498414265
priya: 0.13922857114427084
rahul: -0.0639072688424743
danielle: 0.14913149265020167
reza: -0.08192654617678002


In [129]:
### Similarity of Other Words with Gender Vector (Man-Woman)

# List of words to compare with the gender direction vector
word_list = [
    'lipstick', 'guns', 'science', 'arts', 'literature',
    'warrior', 'doctor', 'tree', 'receptionist',
    'technology', 'fashion', 'teacher', 'engineer',
    'pilot', 'computer', 'singer'
]

print("Other words and their similarity with the vector (man-woman):")

# Calculate and print cosine similarity of each word vector with the gender vector g
for word in word_list:
    similarity = cosine_similarity(word_to_vec_map[word], g)
    print(f"{word}: {similarity}")

# This analysis highlights how various words relate to the gender bias represented by the vector g.


Other words and their similarity with the vector (man-woman):
lipstick: 0.18037245461893886
guns: -0.09964446323350887
science: -0.02147576571900459
arts: 0.01484674744156461
literature: 0.08261854474431136
warrior: -0.15634200481756028
doctor: 0.10942282324077059
tree: -0.08868359642037957
receptionist: 0.2806875926160281
technology: -0.14474526940138327
fashion: 0.08097436821066459
teacher: 0.1523369596791014
engineer: -0.12300012058033458
pilot: -0.04113394172314754
computer: -0.11545715478097537
singer: 0.11372642801434334


In [None]:
"""
Highest Similarity: The word "receptionist" has the highest positive similarity score (0.2807),
indicating a stronger association with the vector (man-woman).

Lowest Similarity: The word "warrior" has the most negative similarity (-0.1563),
suggesting it is least associated with the concept represented by (man-woman).

Overall Trends: Words related to traditionally feminine roles (like "lipstick" and "receptionist")
tend to have positive similarity values, while terms associated
with masculinity or neutrality (like "guns" and "warrior") tend to show negative values.

This is a biased trend as expected.

"""

In [None]:
### Neutralize bias for non-gender specific words

"""
This formula helps in debiasing word embeddings by removing the component of a word vector that lies in the bias direction.
Here’s a step-by-step explanation:

Word Embeddings: Words are represented as vectors in an n-dimensional space.
For example, the word "receptionist" is represented by a vector e.
In this vector space, certain directions can encode bias, such as gender bias.

Bias Direction
g: The vector g represents the "bias direction" in the embedding space,
 which might, for example, encode male/female associations.

Neutralization: The goal is to "neutralize" a word like "receptionist" by removing any gender-related information,
i.e., the component of the word vector that lies in the direction of bias g.

Step 1 – Calculating the Bias Component:

The formula
bias_component = (𝑒⋅𝑔 / ∣∣𝑔∣∣2)∗𝑔 computes the projection of the word vector e onto the bias direction g.
This projection is the portion of the word vector that lies along the bias direction.
It tells us how much of the vector e is aligned with g.

Step 2 – Debiasing:

The next step is to subtract this bias component from the original word vector:
debias = 𝑒 − e bias_component

This subtraction "neutralizes" the bias by removing the part of the word vector that lies in the bias direction g,
leaving only the part of the vector that is orthogonal (perpendicular) to g.

By performing this operation, the vector for "receptionist" no longer contains gender information
but retains all other meaningful information.
This ensures that words like "receptionist" are not biased by gender in downstream tasks.

"""

In [136]:
### Neutralize Gender Bias in Word Vectors

def neutralize(word, g, word_to_vec_map):
    # Calculate the dot product of the word vector and the gender vector
    dot_p = np.dot(word_to_vec_map[word], g)

    # Determine the bias component to be removed
    e_bias = (dot_p / np.sum(g ** 2)) * g

    # Remove the bias from the original word vector
    e_unbiased = word_to_vec_map[word] - e_bias
    return e_unbiased


In [138]:
# Define the word to analyze
e = "receptionist"

# Calculate and print cosine similarity before neutralizing bias
similarity_before = cosine_similarity(word_to_vec_map[e], g)
print(f"Cosine similarity between '{e}' and g before neutralizing: {similarity_before}")

# Neutralize the bias and calculate the similarity again
e_unbiased = neutralize(e, g, word_to_vec_map)
similarity_after = cosine_similarity(e_unbiased, g)
print(f"Cosine similarity between '{e}' and g after neutralizing: {similarity_after}")


Cosine similarity between 'receptionist' and g before neutralizing: 0.2806875926160281
Cosine similarity between 'receptionist' and g after neutralizing: -2.9382195433746986e-17


In [None]:
### Equalization algorithm for gender-specific words
"""
Equalization is applied to pairs of words that you might want to have differ only through the gender property.
As a concrete example, suppose that "actress" is closer to "babysit" than "actor." By applying neutralizing to "babysit" we can reduce the gender-stereotype
associated with babysitting. But this still does not guarantee that "actor" and "actress" are equidistant from "babysit."
The equalization algorithm takes care of this.
"""

In [104]:
def equalize(pair, bias_axis, word_to_vec_map):
    w1, w2 = pair
    e_w1 = word_to_vec_map[w1]
    e_w2 = word_to_vec_map[w2]

    mu = (e_w1 + e_w2) / 2
    mu_b = (np.dot(mu, bias_axis) / np.sum(bias_axis**2)) * bias_axis  # Normalized
    mu_orth = mu - mu_b

    e_w1B = (np.dot(e_w1, bias_axis) / np.sum(bias_axis**2)) * bias_axis  # Normalized
    e_w2B = (np.dot(e_w2, bias_axis) / np.sum(bias_axis**2)) * bias_axis  # Normalized

    # Avoid division by zero
    norm_w1 = np.linalg.norm(e_w1B - mu_orth)
    norm_w2 = np.linalg.norm(e_w2B - mu_orth)

    if norm_w1 == 0:
        e_w1B_unbiased = np.zeros_like(e_w1B)
    else:
        e_w1B_unbiased = np.sqrt(np.abs(1 - mu_orth**2)) * (e_w1B - mu_b) / norm_w1

    if norm_w2 == 0:
        e_w2B_unbiased = np.zeros_like(e_w2B)
    else:
        e_w2B_unbiased = np.sqrt(np.abs(1 - mu_orth**2)) * (e_w2B - mu_b) / norm_w2

    e_w1B_corrected = e_w1B_unbiased + mu_orth
    e_w2B_corrected = e_w2B_unbiased + mu_orth

    return e_w1B_corrected, e_w2B_corrected


In [139]:
# Display cosine similarities before equalizing
print("Cosine similarities before equalizing:")
print("cosine_similarity(word_to_vec_map[\"man\"], gender) =", cosine_similarity(word_to_vec_map["man"], g))
print("cosine_similarity(word_to_vec_map[\"woman\"], gender) =", cosine_similarity(word_to_vec_map["woman"], g))
print()

# Equalize the embeddings for 'man' and 'woman'
e1, e2 = equalize(("man", "woman"), g, word_to_vec_map)

# Display cosine similarities after equalizing
print("Cosine similarities after equalizing:")
print(f"cosine_similarity(e1, gender) = {cosine_similarity(e1, g)}")
print(f"cosine_similarity(e2, gender) = {cosine_similarity(e2, g)}")


Cosine similarities before equalizing:
cosine_similarity(word_to_vec_map["man"], gender) = -0.18769064329512627
cosine_similarity(word_to_vec_map["woman"], gender) = 0.388177000149502

Cosine similarities after equalizing:
cosine_similarity(e1, gender) = -0.045804611352558575
cosine_similarity(e2, gender) = 0.04331400245481595


In [None]:
############################################################################### END OF NOTEBOOK #########################################################################