# Word Embedding
### Gender bias: Science VS Art

In [1]:
%pip install numpy pandas gensim matplotlib

Collecting numpy
  Downloading numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl.metadata (5.6 kB)
Collecting pandas
  Downloading pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl.metadata (18 kB)
Collecting gensim
  Downloading gensim-4.3.2-cp38-cp38-macosx_10_9_x86_64.whl.metadata (8.5 kB)
Collecting matplotlib
  Downloading matplotlib-3.7.4-cp38-cp38-macosx_10_12_x86_64.whl.metadata (5.7 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2023.3.post1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting scipy>=1.7.0 (from gensim)
  Downloading scipy-1.10.1-cp38-cp38-macosx_10_9_x86_64.whl (35.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.0/35.0 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting sma

In [2]:
# Core packages for word embeddings
import gensim
from gensim.models import Word2Vec
import gensim.downloader as api


In [4]:
# more pachages
import numpy as np  # For creating the embedding matrix
# prepare a small model
model_name = 'glove-twitter-25'
w2v_small = api.load(model_name)



In [6]:
# Define the target and attribute word sets
target_male = ['he', 'his', 'him', 'son', 'father', 'boy', 'brother', 'man', 'male']
target_female = ['she', 'her', 'hers', 'daughter', 'mother', 'girl', 'sister', 'woman', 'female']
attribute_science = ['physics', 'chemistry', 'biology', 'mathematics', 'research',
                  'experiment', 'scientist', 'lab', 'hypothesis', 'discovery']
attribute_arts = ['music', 'painting', 'literature', 'poetry', 'dance',
               'sculpture', 'theater', 'drawing', 'creativity', 'performance']

In [7]:
# Calculate the WEAT score
def cosine_similarity(word, attribute_set):
        word_vector = w2v_small[word]
        attribute_vectors = [w2v_small[attr] for attr in attribute_set]
        return np.mean([np.dot(word_vector, attr_vector) / (np.linalg.norm(word_vector) * np.linalg.norm(attr_vector))
                        for attr_vector in attribute_vectors])

def weat_score(word_set_X, word_set_Y, word_set_A, word_set_B):

    X_association = np.mean([cosine_similarity(x, attribute_science) - cosine_similarity(x, attribute_arts)
                             for x in word_set_X])
    Y_association = np.mean([cosine_similarity(y, attribute_science) - cosine_similarity(y, attribute_arts)
                             for y in word_set_Y])
    A_association = np.mean([cosine_similarity(a, attribute_science) - cosine_similarity(a, attribute_arts)
                             for a in word_set_A])
    B_association = np.mean([cosine_similarity(b, attribute_science) - cosine_similarity(b, attribute_arts)
                             for b in word_set_B])

    return X_association - Y_association / np.sqrt(np.var([X_association, Y_association]) +
                                                    np.var([A_association, B_association]))


In [8]:
# Calculate the WEAT score for the given word sets
weat_result = weat_score(target_male, target_female, attribute_science, attribute_arts)

# Step 4: Print the WEAT result
print(f"WEAT score: {weat_result}")

WEAT score: 0.8146985173225403


In [10]:
?np.var

[0;31mSignature:[0m
[0mnp[0m[0;34m.[0m[0mvar[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0ma[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mddof[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkeepdims[0m[0;34m=[0m[0;34m<[0m[0mno[0m [0mvalue[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mwhere[0m[0;34m=[0m[0;34m<[0m[0mno[0m [0mvalue[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Compute the variance along the specified axis.

Returns the variance of the array elements, a measure of the spread of a
distribution.  The variance is computed for the flattened array by
default, otherwise over th