# Bias in word embeddings
Word embeddings is a technique in NLP and text mining to represent words in order to be able to compare words based on similarity. The word embeddings are trained on large data sets with text written by humans. Therefore, the bias we have and include in our writungs will be transferred to the word embeddings. This project looks at what bias there are in the swedish word embeddings "Swectors" based on text from Göteborgsposten. They will also be compared with another set of word ebedding that will be trained on antoher set of data.

In [1]:
import bz2
import pandas as pd
import numpy as np
import time

## Importing datasets
Importing the large set of vectors might take a little while, please be patient.

In [2]:
colnames = ["word"] + ["dim" + str(x) for x in range(1,301)]
with bz2.open("swectors-300dim.txt.bz2") as source:
    swectors = pd.read_csv(source, header=None, names=colnames, delimiter=" ", skiprows=[0])
    
swectors.tail()

Unnamed: 0,word,dim1,dim2,dim3,dim4,dim5,dim6,dim7,dim8,dim9,...,dim291,dim292,dim293,dim294,dim295,dim296,dim297,dim298,dim299,dim300
173239,todasjev,0.553938,-1.896754,0.113465,0.024538,1.762654,0.935111,-0.624131,0.525502,-0.427481,...,-0.130935,-0.801437,-0.826258,-0.379275,0.196962,-0.732813,-0.129504,0.545349,-0.344061,-0.068219
173240,chalkida,0.857497,0.43781,0.654625,-1.195745,0.517182,-1.495259,-0.327865,-0.981732,-0.834751,...,-0.593594,-0.443583,0.425608,-0.378365,-0.535857,0.441556,-0.77059,1.113439,1.034144,0.063788
173241,kompressionskläder,-0.094503,-0.671708,-0.225601,1.264434,-0.098052,1.013893,1.089978,0.199295,0.691439,...,-0.45582,0.724241,-0.76077,0.445536,-0.6873,-0.138718,-0.248716,-0.057516,0.927693,-0.746867
173242,geziparken,-0.921283,-2.042769,0.297705,0.265134,0.694947,-0.057586,-1.094668,0.120941,-1.732892,...,-0.836532,-0.230353,-0.744922,-0.497342,-1.273071,-1.724398,-1.052579,-0.134821,-0.197775,0.47228
173243,yellen,1.236722,-1.233162,1.953644,-0.15951,-0.067402,-1.22527,0.396844,-0.979577,0.916076,...,1.009823,0.130401,-0.318467,-1.169484,0.045342,1.56512,-1.110025,-0.283444,-2.150597,0.081822


### Import small sized dataset

In [3]:
colnames = ["word"] + ["dim" + str(x) for x in range(1,301)]
with bz2.open("swectors_short-300dim.txt.bz2") as source:
    swectors_short = pd.read_csv(source, header=None, names=colnames, delimiter=" ", skiprows=[0])
    
swectors_short.tail()

Unnamed: 0,word,dim1,dim2,dim3,dim4,dim5,dim6,dim7,dim8,dim9,...,dim291,dim292,dim293,dim294,dim295,dim296,dim297,dim298,dim299,dim300
7592,morötter,1.377063,-0.320867,-0.891954,2.830373,0.825997,1.932514,-2.163011,1.587833,1.440585,...,0.179074,0.17022,-2.506192,-2.462804,0.740179,0.003449,2.325763,1.43701,0.303144,0.385211
7593,hjältar,2.921456,-0.467303,-0.351971,-1.563029,2.213287,2.744328,0.347316,2.747632,-0.504926,...,-3.08441,-1.780909,-1.892354,0.929151,-1.926698,1.750462,-0.188903,-0.364937,0.745645,2.605387
7594,träff,0.857412,-1.90517,1.293573,1.165565,2.914954,0.528677,-1.657378,-1.05424,-1.095805,...,-2.051143,1.165688,-1.32577,-1.513558,-2.06685,-0.498036,1.108481,2.500199,-1.232332,0.314965
7595,varmare,0.044312,-3.489696,-3.128916,-0.886223,1.268917,-0.423831,1.517649,1.637759,1.587272,...,2.403448,2.708916,2.563368,0.494266,2.623131,-0.113758,-1.264208,0.773594,-1.736908,2.501356
7596,meddela,-2.005358,0.995493,-1.234387,-2.401334,-1.615172,-1.57513,2.422385,-1.812807,-0.771301,...,0.91842,-1.519924,0.996619,-2.035296,0.857632,2.701465,-0.780385,-0.184154,-3.955059,0.887238


Extract the vector for the word 'kvinna', in order to look att similar words for bias measures.
Get the 300 dimensions from the dataframe, convert to numpy and get the following format: `[[dim1 dim2 ... dim299 dim300]]`, take the first element to get a single list. Save it as a tuple with word first and vector second.

In [4]:
kvinna = ('kvinna', swectors.loc[swectors['word'] == 'kvinna'].loc[:, 'dim1':'dim300'].to_numpy()[0])
man = ('man', swectors.loc[swectors['word'] == 'man'].loc[:, 'dim1':'dim300'].to_numpy()[0])

In [5]:
def cosine_similarity(word1, word2):
    # Takes two vectors and calculates the cosine similarity between them
    # @ is dot product
    v1 = word1[1]
    v2 = word2[1]
    return (v1 @ v2) / (np.linalg.norm(v1)*np.linalg.norm(v2))

In [6]:
cosine_similarity(kvinna, man)

0.32839477881769635

## Applying function to whole dataframe
This way of doing the calculation will apply a function to each row of the dataframe, and return a Series of same length with all results in it.

In [21]:
def n_most_similar(n, word, adj=False):
    
    if adj:
        df = swectors_filtered
        print("filtered")
    else:
        df = swectors
        print("not filtered")
        
    word_vec = (word, swectors.loc[swectors['word'] == word].loc[:, 'dim1':'dim300'].to_numpy()[0])
    
    def similarity(row):
        row_vec = (row['word'], row.loc['dim1':'dim300'].to_numpy())
        return cosine_similarity(word_vec, row_vec)

    start = time.time()
    similarities = df.apply(similarity, axis=1)

    # Concatenate the top n words (plus the word itself) to the similarity values of each word.
    # Also set the correct coulmn name.
    s1 = df.loc[similarities.nlargest(n+1).index, 'word']
    s2 = similarities.nlargest(n+1)
    similars = pd.concat([s1, s2], axis=1).rename(columns={0: "similarity"})
    end = time.time()
    print("Time elapsed: ", end - start)
    return similars

In [8]:
similars_kvinna = n_most_similar(10, 'kvinna')
print(similars_kvinna)

Time elapsed:  115.17177414894104
               word  similarity
493          kvinna    1.000000
2089         flicka    0.809585
2744          pojke    0.707318
588         kvinnan    0.679172
451          person    0.671369
2760           tjej    0.669906
54028  tonårsflicka    0.649514
10239       väninna    0.604904
15129       yngling    0.597423
2182          kille    0.594290
14060       kvinnas    0.591184


In [9]:
similars_män = n_most_similar(10, 'män')
print(similars_män)

Time elapsed:  74.31563758850098
                word  similarity
375              män    1.000000
287          kvinnor    0.787824
1077          männen    0.772569
1712         flickor    0.710005
1492       kvinnorna    0.698098
2108          pojkar    0.689605
163         personer    0.651116
1642          killar    0.640503
5826      tonåringar    0.625749
31239  tonårsflickor    0.622238
22484      ynglingar    0.596201


## Filtering out all adjectives in the word embeddings
To see bias in adjectives, the dataframe with the swectors is filtered to only keep words that are in the dataframe with adjectves from Språkrådet.

In [10]:
with bz2.open("adjektiv.txt.bz2") as source:
    adjectives = pd.read_csv(source)
    
swectors_filtered = swectors.loc[swectors['word'].isin(adjectives['Word'])]

swectors_filtered.tail()

Unnamed: 0,word,dim1,dim2,dim3,dim4,dim5,dim6,dim7,dim8,dim9,...,dim291,dim292,dim293,dim294,dim295,dim296,dim297,dim298,dim299,dim300
76,nya,-0.729743,-0.449422,-1.287769,0.516173,0.007949,-0.250131,0.413187,-0.834256,-1.210332,...,-1.447273,-0.578143,3.350518,1.259971,1.164465,-3.448653,0.770229,1.136032,2.21996,1.946708
77,första,1.882675,2.777934,-3.836164,0.102493,-2.66155,4.04724,3.636628,5.325146,-0.883794,...,2.248815,0.405223,1.828185,0.516869,1.052677,-5.365753,-1.827669,-3.354366,1.580842,2.091712
106,stora,-4.209235,0.697929,1.326392,-1.678565,-1.118567,-0.23046,2.493664,1.760369,-2.091855,...,-1.836666,-0.910354,-1.760951,-0.298046,-0.862193,-2.568033,-2.611245,0.198783,1.491845,-1.954573
166,förra,-1.872293,-2.111872,-0.642844,0.804811,-0.556115,0.995949,-1.326142,2.775627,4.454869,...,0.449069,0.24488,2.26037,0.567367,-0.740135,-5.40545,-0.423753,-2.765256,3.046515,2.580383
168,samtidigt,-2.116588,-0.7603,-1.383661,-2.212478,0.856674,-0.129116,1.010777,-3.779737,-1.128603,...,-2.616983,-0.308054,-0.227243,0.824155,0.570524,1.142409,3.110606,0.934515,-0.068022,0.613209


In [45]:
swectors_filtered.shape

(14706, 301)

In [42]:
similars_kvinna_adj = n_most_similar(10, 'kvinna', adj=True)
print(similars_kvinna_adj)

filtered
Time elapsed:  5.812798976898193
                  word  similarity
23018     prostituerad    0.578047
3534          kvinnlig    0.520999
94347   pistolbeväpnad    0.506082
10471          anhörig    0.485582
71722        halvnaken    0.471111
21093         jämnårig    0.465865
47714     knivbeväpnad    0.450802
8349             naken    0.437641
77713   butiksanställd    0.436922
126828     sjuttonårig    0.416900
5792            manlig    0.416660
