In [16]:
! pip install adjustText



In [17]:
# These are all the modules we'll be using later. Make sure you can import them before proceeding further.
%matplotlib inline
import zipfile
import re
import numpy as np
import pandas as pd
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='3'
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from adjustText import adjust_text

In [None]:
url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'


def download_data(url, data_dir):
    """Download a file if not present, and make sure it's the right size."""

    os.makedirs(data_dir, exist_ok=True)

    file_path = os.path.join(data_dir, 'bbc-fulltext.zip')

    if not os.path.exists(file_path):
        print('Downloading file...')
        filename, _ = urlretrieve(url, file_path)
    else:
        print("File already exists")

    extract_path = os.path.join(data_dir, 'bbc')
    if not os.path.exists(extract_path):

        with zipfile.ZipFile(os.path.join(data_dir, 'bbc-fulltext.zip'), 'r') as zipf:
            zipf.extractall(data_dir)

    else:
        print("bbc-fulltext.zip has already been extracted")

download_data(url, 'data')

In [18]:
def read_data(data_dir):

    # This will contain the full list of stories
    news_stories = []

    print("Reading files")

    i = 0 # Just used for printing progress
    for root, dirs, files in os.walk(data_dir):

        for fi, f in enumerate(files):

            # We don't read the README file
            if 'README' in f:
                continue

            # Printing progress
            i += 1
            print("."*i, f, end='\r')

            # Open the file
            with open(os.path.join(root, f), encoding='latin-1') as f:

                story = []
                # Read all the lines
                for row in f:

                    story.append(row.strip())

                # Create a single string with all the rows in the doc
                story = ' '.join(story)
                # Add that to the list
                news_stories.append(story)

        print('', end='\r')

    print(f"\nDetected {len(news_stories)} stories")
    return news_stories


news_stories = read_data(os.path.join('data', 'bbc'))

# Printing some stats and sample data
print(f"{sum([len(story.split(' ')) for story in news_stories])} words found in the total news set")
print('Example words (start): ',news_stories[0][:50])
print('Example words (end): ',news_stories[-1][-50:])

Reading files
..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, split=' '
)

tokenizer.fit_on_texts(news_stories)
print("Data fitted on the tokenizer")

Data fitted on the tokenizer


In [31]:
n_vocab = len(tokenizer.word_index.items()) + 1
print(f"Vocabulary size: {n_vocab}")

print("\nWords at the top")
print('\t', dict(list(tokenizer.word_index.items())[:10]))
print("\nWords at the bottom")
print('\t', dict(list(tokenizer.word_index.items())[-10:]))

Vocabulary size: 32360

Words at the top
	 {'the': 1, 'to': 2, 'of': 3, 'and': 4, 'a': 5, 'in': 6, 'for': 7, 'is': 8, 'that': 9, 'on': 10}

Words at the bottom
	 {'counsellor': 32350, "'frag'": 32351, 'relasing': 32352, "'real'": 32353, 'hrs': 32354, 'enviroment': 32355, 'trifling': 32356, '24hours': 32357, 'ahhhh': 32358, 'lol': 32359}


In [32]:
from tensorflow.keras.preprocessing.text import Tokenizer

n_vocab = 15000 + 1
tokenizer = Tokenizer(
    num_words=n_vocab-1,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, split=' ', oov_token='',
)

tokenizer.fit_on_texts(news_stories)
print("Data fitted on the tokenizer")

Data fitted on the tokenizer


In [33]:
news_sequences = tokenizer.texts_to_sequences(news_stories)

In [34]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(n_vocab, sampling_factor=1e-05)

In [36]:
def cbow_grams(sequence, vocabulary_size,
              window_size=4, negative_samples=1., shuffle=True,
              categorical=False, sampling_table=None, seed=None):

    targets, contexts, labels = [], [], []

    for i, wi in enumerate(sequence):


        if not wi or i < window_size or i + 1 > len(sequence)-window_size:
            continue
        if sampling_table is not None:
            if sampling_table[wi] < random.random():
                continue

        window_start = max(0, i - window_size)
        window_end = min(len(sequence), i + window_size + 1)

        context_words = [wj for j, wj in enumerate(sequence[window_start:window_end]) if j+window_start != i]
        target_word = wi

        context_classes = tf.expand_dims(tf.constant(context_words, dtype="int64"), 0)

        negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_classes,
          num_true=window_size * 2,
          num_sampled=negative_samples,
          unique=True,
          range_max=vocabulary_size,
          name="negative_sampling")

        # Build context and label vectors (for one target word)
        negative_targets = negative_sampling_candidates.numpy().tolist()

        target = [target_word] + negative_targets
        label = [1] + [0]*negative_samples

        # Append each element from the training example to global lists.
        targets.extend(target)
        contexts.extend([context_words]*(negative_samples+1))
        labels.extend(label)

    couples = list(zip(targets, contexts))

    seed = random.randint(0, 10e6)
    random.seed(seed)
    random.shuffle(couples)
    random.seed(seed)
    random.shuffle(labels)


    return couples, labels


window_size = 1 # How many words to consider left and right.


inputs, labels = cbow_grams(
    tokenizer.texts_to_sequences(["I am going to the store"])[0],
    vocabulary_size=len(tokenizer.word_index.items())+1,
    window_size=window_size, negative_samples=4, shuffle=False,
    categorical=False, sampling_table=None, seed=None
)

print(tokenizer.texts_to_sequences(["I am going to the store"]))
i = 0
for inp, lbl in zip(inputs, labels):
    i += 1
    print(f"Input: {inp} ({[[tokenizer.index_word[wi] for wi in inp[1] ]] + [tokenizer.index_word[inp[0]] if inp[0] > 0 else None]})/ Label: {lbl}")
    #
    if i >= 20:
        break

[[28, 428, 132, 3, 2, 1505]]
Input: (3, [132, 2]) ([['going', 'the'], 'to'])/ Label: 1
Input: (58, [3, 1505]) ([['to', 'store'], 'can'])/ Label: 0
Input: (540, [132, 2]) ([['going', 'the'], 'asked'])/ Label: 0
Input: (132, [428, 3]) ([['am', 'to'], 'going'])/ Label: 1
Input: (399, [3, 1505]) ([['to', 'store'], 'major'])/ Label: 0
Input: (671, [132, 2]) ([['going', 'the'], '17'])/ Label: 0
Input: (387, [28, 132]) ([['i', 'going'], 'support'])/ Label: 0
Input: (2, [428, 3]) ([['am', 'to'], 'the'])/ Label: 0
Input: (22, [132, 2]) ([['going', 'the'], 'by'])/ Label: 0
Input: (106, [3, 1505]) ([['to', 'store'], 'while'])/ Label: 0
Input: (9284, [28, 132]) ([['i', 'going'], 'bulletin'])/ Label: 0
Input: (25, [132, 2]) ([['going', 'the'], 'are'])/ Label: 0
Input: (31, [428, 3]) ([['am', 'to'], 'they'])/ Label: 0
Input: (105, [428, 3]) ([['am', 'to'], 'her'])/ Label: 0
Input: (428, [28, 132]) ([['i', 'going'], 'am'])/ Label: 1
Input: (7669, [28, 132]) ([['i', 'going'], 'mogul'])/ Label: 0
Input

In [37]:
window_size = 1 # How many words to consider left and right.

inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    tokenizer.texts_to_sequences([news_stories[0][:150]])[0],
    vocabulary_size=len(tokenizer.word_index.items())+1,
    window_size=window_size, negative_samples=4, shuffle=False,
    categorical=False, sampling_table=None, seed=None
)

i = 0
for inp, lbl in zip(inputs, labels):
    i += 1
    print(f"Input: {inp} ({[tokenizer.index_word[wi] for wi in inp]}) / Label: {lbl}")

Input: [4223, 187] (['ad', 'sales']) / Label: 1
Input: [187, 4223] (['sales', 'ad']) / Label: 1
Input: [187, 716] (['sales', 'boost']) / Label: 1
Input: [716, 187] (['boost', 'sales']) / Label: 1
Input: [716, 66] (['boost', 'time']) / Label: 1
Input: [66, 716] (['time', 'boost']) / Label: 1
Input: [66, 3596] (['time', 'warner']) / Label: 1
Input: [3596, 66] (['warner', 'time']) / Label: 1
Input: [3596, 1050] (['warner', 'profit']) / Label: 1
Input: [1050, 3596] (['profit', 'warner']) / Label: 1
Input: [1050, 3938] (['profit', 'quarterly']) / Label: 1
Input: [3938, 1050] (['quarterly', 'profit']) / Label: 1
Input: [3938, 626] (['quarterly', 'profits']) / Label: 1
Input: [626, 3938] (['profits', 'quarterly']) / Label: 1
Input: [626, 21] (['profits', 'at']) / Label: 1
Input: [21, 626] (['at', 'profits']) / Label: 1
Input: [21, 49] (['at', 'us']) / Label: 1
Input: [49, 21] (['us', 'at']) / Label: 1
Input: [49, 303] (['us', 'media']) / Label: 1
Input: [303, 49] (['media', 'us']) / Label: 1


In [38]:
batch_size = 4096 # Data points in a single batch

embedding_size = 128 # Dimension of the embedding vector.

window_size=1 # We use a window size of 1 on either side of target word
epochs = 5 # Number of epochs to train for
negative_samples = 4 # Number of negative samples generated per example

# We pick a random validation set to sample nearest neighbors
valid_size = 16 # Random set of words to evaluate similarity on.
# We sample valid data points randomly from a large window without always being deterministic
valid_window = 250

# When selecting valid examples, we select some of the most frequent words as well as
# some moderately rare words
np.random.seed(54321)
random.seed(54321)

valid_term_ids = np.array(random.sample(range(valid_window), valid_size))
valid_term_ids = np.append(
    valid_term_ids, random.sample(range(1000, 1000+valid_window), valid_size),
    axis=0
)

In [57]:
import tensorflow.keras.backend as K

K.clear_session()


# Inputs; target input layer will have the final shape [None]
# context will have [None, 2xwindow_size] shape
input_1 = tf.keras.layers.Input(shape=())

input_2 = tf.keras.layers.Input(shape=(window_size*2,))

# Target and context embedding layers
target_embedding_layer = tf.keras.layers.Embedding(
    input_dim=n_vocab, output_dim=embedding_size, name='target_embedding'
)

context_embedding_layer = tf.keras.layers.Embedding(
    input_dim=n_vocab, output_dim=embedding_size, name='context_embedding'
)

# Outputs of the target and context embedding lookups
context_out = context_embedding_layer(input_2)
target_out = target_embedding_layer(input_1)

# Taking the mean over the all the context words to produce [None, embedding_size]
mean_context_out = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(context_out)

# Computing the dot product between the two
out = tf.keras.layers.Dot(axes=-1)([mean_context_out, target_out])

cbow_model = tf.keras.models.Model(inputs=[input_1, input_2], outputs=out, name='cbow_model')

cbow_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam'
)

cbow_model.summary()

In [58]:
class ValidationCallback(tf.keras.callbacks.Callback):

    def __init__(self, valid_term_ids, model_with_embeddings, tokenizer):

        self.valid_term_ids = valid_term_ids
        self.model_with_embeddings = model_with_embeddings
        self.tokenizer = tokenizer

        super().__init__()

    def on_epoch_end(self, epoch, logs=None):
        """ Validation logic """

        # We will use context embeddings to get the most similar words
        # Other strategies include: using target embeddings, mean embeddings after avaraging context/target
        embedding_weights = self.model_with_embeddings.get_layer("context_embedding").get_weights()[0]
        normalized_embeddings = embedding_weights / np.sqrt(np.sum(embedding_weights**2, axis=1, keepdims=True))

        # Get the embeddings corresponding to valid_term_ids
        valid_embeddings = normalized_embeddings[self.valid_term_ids, :]

        # Compute the similarity between valid_term_ids and all the embeddings
        # V x d (d x D) => V x D
        top_k = 5 # Top k items will be displayed
        similarity = np.dot(valid_embeddings, normalized_embeddings.T)

        # Invert similarity matrix to negative
        # Ignore the first one because that would be the same word as the probe word
        similarity_top_k = np.argsort(-similarity, axis=1)[:, 1: top_k+1]

        # Print the output
        for i, term_id in enumerate(valid_term_ids):

            similar_word_str = ', '.join([self.tokenizer.index_word[j] for j in similarity_top_k[i, :] if j >= 1])
            print(f"{self.tokenizer.index_word[term_id]}: {similar_word_str}")

        print('\n')

In [59]:
def cbow_data_generator(sequences, window_size, batch_size, negative_samples):

    rand_sequence_ids = np.arange(len(sequences))
    np.random.shuffle(rand_sequence_ids)

    for si in rand_sequence_ids:
        inputs, labels = cbow_grams(
            sequences[si],
            vocabulary_size=n_vocab,
            window_size=window_size,
            negative_samples=negative_samples,
            shuffle=True,
            sampling_table=sampling_table,
            seed=None
        )

        inputs_context, inputs_target, labels = np.array([inp[1] for inp in inputs]), np.array([inp[0] for inp in inputs]), np.array(labels).reshape(-1,1)

        assert inputs_context.shape[0] == inputs_target.shape[0]
        assert inputs_context.shape[0] == labels.shape[0]

        #print(inputs_context.shape, inputs_target.shape, labels.shape)
        for eg_id_start in range(0, inputs_context.shape[0], batch_size):

            yield (
                inputs_target[eg_id_start: min(eg_id_start+batch_size, inputs_target.shape[0])],
                inputs_context[eg_id_start: min(eg_id_start+batch_size, inputs_context.shape[0]),:]
            ), labels[eg_id_start: min(eg_id_start+batch_size, labels.shape[0])]

In [60]:
cbow_validation_callback = ValidationCallback(valid_term_ids, cbow_model, tokenizer)

for ei in range(epochs):
    print(f"Epoch: {ei+1}/{epochs} started")
    news_cbow_gen = cbow_data_generator(news_sequences, window_size, batch_size, negative_samples)
    cbow_model.fit(
        news_cbow_gen,
        epochs=1,
        callbacks=cbow_validation_callback,
    )

Epoch: 1/5 started
   2226/Unknown [1m47s[0m 21ms/step - loss: 0.4978election: lee, rubber, titled, argentine, keith
me: around, whether, airport, websites, virtual
with: but, during, absolutely, these, some
you: they, not, we, help, could
were: or, until, v, ross, these
win: china, michalak, arsenal's, takes, before
those: v, before, around, i'm, lions
music: china's, some, illegal, uk, despite
also: spokesman, weren't, how, mcgeady, worked
third: fourth, 40, first, russian, most
best: irish, idol, csi, including, scotland's
him: fish, beat, outside, must, v
too: neither, goalkeeper, paid, explains, v
some: our, reported, greek, my, illegal
through: v, pretty, criminal, greater, clocked
mr: tony, michael, who, ron, said
file: whose, presenter, v, euros, include
pair: china, presenter, candidate, whose, islamic
ceremony: whose, turns, candidate, plasma, chairman
believed: be, doing, sinking, chorus, infection
post: singer, infected, rapper, spam, men
indian: euros, became, receive, q

  self.gen.throw(typ, value, traceback)


   2226/Unknown [1m48s[0m 22ms/step - loss: 0.3888election: motors, attorney, spanish, land, cities
me: them, didn't, whether, him, tuned
with: reacting, tough, bringing, declaring, voice
you: they, we, not, help, don't
were: are, being, was, get, if
win: delivered, yen, â£4, 4m, gold
those: ellen, antonio, arrival, dates, gael
music: business, â£48, bond, north, china's
also: never, already, quite, been, thought
third: fourth, first, final, fifth, total
best: category, festival, expectations, supporting, musical
him: them, doesn't, himself, successfully, whether
too: pretty, dedicated, extremely, pig, fresh
some: marrying, tim's, taxi, gameplay, scots
through: specially, renner, henry's, successfully, publicly
mr: tony, bernie, michael, hughes, gordon
file: illegally, hopes, carmaker, presenter, older
pair: presenter, subs, fashion, afi, turns
ceremony: turns, winner, whose, monsanto, barroso
believed: stallone, dominic, interviewed, farrell's, featured
post: athlete, ingram, perry,

In [62]:
# example 1 : word vector for "dog"
word_vector_dog = cbow_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["dog"]]

print(len(word_vector_dog))
print(word_vector_dog)

128
[ 0.11085108 -0.25469235 -0.15203784  0.07325227 -0.14737785  0.14830768
 -0.27429166 -0.09608977  0.03704134 -0.17361996  0.01531993 -0.0465077
 -0.23753951 -0.25014573 -0.0253557   0.40826246 -0.16949935  0.01692696
 -0.20239356  0.14066818  0.22502121  0.14511508  0.21380092 -0.32108378
  0.15176813  0.13242695 -0.2073735   0.0660826  -0.14078793  0.30938998
  0.2563888  -0.11537783 -0.057417    0.16718397 -0.06057169  0.12312715
 -0.26447493  0.14412111 -0.24513826  0.20877922 -0.10309254  0.26646736
  0.1685377   0.12303187 -0.1645519   0.09164843  0.19170655 -0.13821886
  0.21663491  0.05457383 -0.31042415 -0.17717057 -0.12289714  0.20443682
  0.23057601  0.0138363  -0.11138823 -0.2888349  -0.28080335  0.16014308
 -0.20665485  0.09823053 -0.25267482  0.11103029  0.10458331 -0.20751747
 -0.15295331  0.02635166 -0.15592682 -0.31063843  0.14527409  0.37395683
  0.28519127 -0.04358221  0.0892965  -0.20123228 -0.25392875  0.251636
 -0.0364037   0.05037522 -0.22806315 -0.4000495   

In [64]:
# example 2 : word vector for "cat"
word_vector_cat = cbow_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["cat"]]

print(len(word_vector_cat))
print(word_vector_cat)

128
[ 0.127196   -0.11693152  0.09308042  0.12874417 -0.2682941   0.2792257
 -0.16069162 -0.18182817  0.14409874 -0.18055683  0.18285744 -0.15229836
 -0.16488212 -0.02559988  0.05636497  0.18549453 -0.18495917  0.14344405
 -0.2388128   0.1661385   0.02947032  0.2424594   0.2088696  -0.06145832
  0.21484898  0.20510563 -0.27681673  0.14117916 -0.18595581  0.20219317
  0.175478   -0.11449622 -0.23579505  0.16107257 -0.09269111  0.19590852
 -0.25171912  0.10727876 -0.2352388   0.16605994 -0.12294747  0.1159846
  0.24661736  0.32330567 -0.25087246  0.24554014  0.11941528 -0.31488204
  0.00839808 -0.21290123 -0.03791186 -0.16514829 -0.14536686  0.08121478
  0.18461722  0.1893544  -0.15681821 -0.2482141  -0.1162774   0.19971709
 -0.05162466  0.30691355 -0.10760359  0.23771767  0.23367436  0.15234634
 -0.1659805  -0.27187628 -0.14614631 -0.11600802  0.0781921  -0.10022354
  0.13923761  0.17586756  0.26802883 -0.21206842 -0.26949224 -0.01797035
 -0.15069278 -0.00377987 -0.23673578 -0.14481755 

In [65]:
# example 3 : word vector for "man"
word_vector_man = cbow_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["man"]]

print(len(word_vector_man))
print(word_vector_man)

128
[ 6.86640143e-02 -5.90903699e-01  3.10078174e-01  1.72983125e-01
  2.41807267e-01  5.43345094e-01  2.31243357e-01 -2.29310557e-01
 -1.76859405e-02 -1.75965786e-01  3.34119797e-01  5.26852794e-02
  7.04236478e-02 -2.15318263e-01  2.41668685e-03  2.40028322e-01
 -3.42839777e-01  4.21943843e-01 -1.38924852e-01 -2.80066907e-01
 -1.63855374e-01  4.69146715e-03 -8.69998187e-02 -6.29312634e-01
  3.13283026e-01 -1.95082814e-01 -3.25315177e-01 -1.87254459e-01
 -1.81876370e-04  5.31756580e-01 -2.61715055e-02  2.41853774e-01
 -4.42755371e-01 -5.55196293e-02 -9.73677486e-02 -1.19620278e-01
 -4.69525784e-01 -8.47011358e-02 -5.05936861e-01  3.19596171e-01
 -1.28067613e-01  4.31536585e-01  3.07860300e-02 -5.19085899e-02
 -5.43045290e-02 -1.22711517e-01  1.36814132e-01  5.43368421e-02
 -5.22495471e-02 -1.80196136e-01  5.21681488e-01 -2.45501578e-01
 -1.76035956e-01  2.38810629e-01 -1.66951284e-01 -1.26013026e-01
 -1.09804198e-01 -3.73923123e-01 -1.64061233e-01  3.14823955e-01
 -1.07101746e-01 -2.6

In [66]:
# example 4 : word vector for "woman"
word_vector_woman = cbow_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["woman"]]

print(len(word_vector_woman))
print(word_vector_woman)

128
[ 0.39148128 -0.34370282  0.19532591  0.06584781 -0.15752243  0.44233474
  0.01061332 -0.07237288  0.21496254 -0.02258785  0.07142099 -0.0707683
 -0.58668065 -0.23823674  0.06922557  0.00662435 -0.07818599 -0.15131135
  0.01688558  0.02565692  0.03561234 -0.00243727  0.04619027 -0.3215033
  0.09538077  0.074044   -0.13624005 -0.13199133 -0.13095568  0.4468145
  0.07778834 -0.02715021 -0.25235435 -0.11935147 -0.26052564  0.02816335
  0.02029142  0.2005923   0.09127384  0.0535125   0.02829764  0.33092332
  0.32427287  0.09690747 -0.00766895  0.24605803 -0.05985556 -0.10512602
  0.0754553  -0.19038695  0.1628446  -0.26858997 -0.3392183   0.13743076
  0.14457273  0.32185763 -0.08765571 -0.30394438 -0.25770885  0.10989341
 -0.11284456 -0.01070174 -0.10117409 -0.01480832  0.27981734  0.09945339
 -0.2003053   0.01811188 -0.52966523 -0.16829725 -0.15140848  0.09773593
  0.30029017 -0.2774232   0.00939985 -0.12097496 -0.07648011  0.5156046
  0.06540091 -0.36145192  0.05068285  0.01117715  0

In [67]:
# example 1 : similarity score between dog and cat
similarity = np.dot(word_vector_dog, word_vector_cat) / (np.linalg.norm(word_vector_dog) * np.linalg.norm(word_vector_cat))

print(similarity)

0.7114462


In [68]:
# example 2 : similarity score between dog and man
similarity = np.dot(word_vector_dog, word_vector_man) / (np.linalg.norm(word_vector_dog) * np.linalg.norm(word_vector_man))

print(similarity)

0.4212371


In [69]:
# example 3 : similarity score between dog and man
similarity = np.dot(word_vector_man, word_vector_woman) / (np.linalg.norm(word_vector_man) * np.linalg.norm(word_vector_woman))

print(similarity)

0.4896157


In [71]:
king_vector = cbow_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["king"]]
man_vector = cbow_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["man"]]
woman_vector = cbow_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["woman"]]

In [72]:
tokenizer.word_index["Queen"] ## doesnn't exist in the word index

KeyError: 'Queen'

In [73]:
# application of analogy to calculate Queen vector

queen_vector = king_vector - man_vector + woman_vector
print(len(queen_vector))
print(queen_vector)

128
[ 0.7886466  -0.3211837  -0.30918813 -0.128811   -0.45703763  0.5096866
 -0.25101283  0.04727484  0.20742783  0.33350104 -0.27290428 -0.0850784
 -1.0004001  -0.39230192  0.12155542  0.05303115  0.20978881  0.21571879
 -0.16429172  0.7593962   0.34604892  0.02282587  0.18897085  0.2065222
 -0.25181735  0.27411595 -0.38478714  0.0096622  -0.3301951   0.23001012
  0.2630404  -0.17466757 -0.3652876   0.03548632 -0.20340088  0.23894495
  0.21910855  0.4870485  -0.00606919 -0.23775662  0.27549258  0.39024654
  0.36853722  0.30672196  0.21343686  0.5633356   0.05765507 -0.2453014
  0.0044115  -0.8759689  -0.6061471  -0.632746   -0.49215776  0.3611431
  0.32715482  0.5493635  -0.28762263  0.02026632 -0.7933093  -0.07088664
  0.05956777  0.4000916  -0.28030956  0.08640276  0.21596453  0.27023456
 -0.21178803  0.13050488 -0.6706782  -0.6504279   0.22607262 -0.33090222
  0.45109263 -0.3017638  -0.2966171   0.35299832 -0.03177927  1.5211345
 -0.04413279 -0.21507785 -0.10389377 -0.02145489 -0.3