In [27]:
! pip install adjustText



In [28]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
import zipfile
import re
import numpy as np
import pandas as pd
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='3'
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from adjustText import adjust_text

# 1. Downloading the data

In [6]:
url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'


def download_data(url, data_dir):
    """Download a file if not present, and make sure it's the right size."""

    os.makedirs(data_dir, exist_ok=True)

    file_path = os.path.join(data_dir, 'bbc-fulltext.zip')

    if not os.path.exists(file_path):
        print('Downloading file...')
        filename, _ = urlretrieve(url, file_path)
    else:
        print("File already exists")

    extract_path = os.path.join(data_dir, 'bbc')
    if not os.path.exists(extract_path):

        with zipfile.ZipFile(os.path.join(data_dir, 'bbc-fulltext.zip'), 'r') as zipf:
            zipf.extractall(data_dir)

    else:
        print("bbc-fulltext.zip has already been extracted")

download_data(url, 'data')

Downloading file...


# 2. Reading the data without preprocessing

In [29]:
def read_data(data_dir):

    # This will contain the full list of stories
    news_stories = []

    print("Reading files")

    i = 0 # Just used for printing progress
    for root, dirs, files in os.walk(data_dir):

        for fi, f in enumerate(files):

            # We don't read the README file
            if 'README' in f:
                continue

            # Printing progress
            i += 1
            print("."*i, f, end='\r')

            # Open the file
            with open(os.path.join(root, f), encoding='latin-1') as f:

                story = []
                # Read all the lines
                for row in f:

                    story.append(row.strip())

                # Create a single string with all the rows in the doc
                story = ' '.join(story)
                # Add that to the list
                news_stories.append(story)

        print('', end='\r')

    print(f"\nDetected {len(news_stories)} stories")
    return news_stories


news_stories = read_data(os.path.join('data', 'bbc'))

# Printing some stats and sample data
print(f"{sum([len(story.split(' ')) for story in news_stories])} words found in the total news set")
print('Example words (start): ',news_stories[0][:50])
print('Example words (end): ',news_stories[-1][-50:])

Reading files
..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

# 3. Building a tokenizer

In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, split=' '
)

tokenizer.fit_on_texts(news_stories)
print("Data fitted on the tokenizer")

Data fitted on the tokenizer


# 4. Exploring the tokenizer

In [31]:
n_vocab = len(tokenizer.word_index.items()) + 1
print(f"Vocabulary size: {n_vocab}")

print("\nWords at the top")
print('\t', dict(list(tokenizer.word_index.items())[:10]))
print("\nWords at the bottom")
print('\t', dict(list(tokenizer.word_index.items())[-10:]))

Vocabulary size: 32360

Words at the top
	 {'the': 1, 'to': 2, 'of': 3, 'and': 4, 'a': 5, 'in': 6, 'for': 7, 'is': 8, 'that': 9, 'on': 10}

Words at the bottom
	 {'counsellor': 32350, "'frag'": 32351, 'relasing': 32352, "'real'": 32353, 'hrs': 32354, 'enviroment': 32355, 'trifling': 32356, '24hours': 32357, 'ahhhh': 32358, 'lol': 32359}


# 5. Building a refined tokenizer

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer

n_vocab = 15000 + 1
tokenizer = Tokenizer(
    num_words=n_vocab-1,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, split=' ', oov_token='',
)

tokenizer.fit_on_texts(news_stories)
print("Data fitted on the tokenizer")

Data fitted on the tokenizer


# 6. Checking the results of the tokenizer

In [8]:
print(f"Original: {news_stories[0][:100]}")
print(f"Sequence IDs: {tokenizer.texts_to_sequences([news_stories[0][:100]])[0]}")

Original: Ad sales boost Time Warner profit  Quarterly profits at US media giant TimeWarner jumped 76% to $1.1
Sequence IDs: [4223, 187, 716, 66, 3596, 1050, 3938, 626, 21, 49, 303, 717, 8263, 2972, 5321, 3, 108, 108]


# 7. Converting all articles to word ID sequences

In [9]:
news_sequences = tokenizer.texts_to_sequences(news_stories)

# 8. Generating skip-grams from the corpus

In [10]:
sample_word_ids = news_sequences[0][:5]
sample_phrase = ' '.join([tokenizer.index_word[wid] for wid in sample_word_ids])
print(f"Sample phrase: {sample_phrase}")
print(f"Sample word IDs: {sample_word_ids}\n")

window_size = 1 # How many words to consider left and right.

inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    sample_word_ids,
    vocabulary_size=n_vocab,
    window_size=window_size, negative_samples=1.0, shuffle=False,
    categorical=False, sampling_table=None, seed=None
)

print("Sample skip-grams")

for inp, lbl in zip(inputs, labels):
    print(f"\tInput: {inp} ({[tokenizer.index_word[wi] for wi in inp]}) / Label: {lbl}")

Sample phrase: ad sales boost time warner
Sample word IDs: [4223, 187, 716, 66, 3596]

Sample skip-grams
	Input: [4223, 187] (['ad', 'sales']) / Label: 1
	Input: [187, 4223] (['sales', 'ad']) / Label: 1
	Input: [187, 716] (['sales', 'boost']) / Label: 1
	Input: [716, 187] (['boost', 'sales']) / Label: 1
	Input: [716, 66] (['boost', 'time']) / Label: 1
	Input: [66, 716] (['time', 'boost']) / Label: 1
	Input: [66, 3596] (['time', 'warner']) / Label: 1
	Input: [3596, 66] (['warner', 'time']) / Label: 1
	Input: [4223, 11275] (['ad', 'instability']) / Label: 0
	Input: [187, 6894] (['sales', 'merit']) / Label: 0
	Input: [3596, 4062] (['warner', 'cinemas']) / Label: 0
	Input: [716, 12161] (['boost', 'chaplin']) / Label: 0
	Input: [187, 10395] (['sales', 'destiny']) / Label: 0
	Input: [716, 3285] (['boost', 'parker']) / Label: 0
	Input: [66, 5499] (['time', 'incredibles']) / Label: 0
	Input: [66, 10219] (['time', 'buccaneers']) / Label: 0


# 9. Generating negative candidates

In [11]:
inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    sample_word_ids,
    vocabulary_size=len(tokenizer.word_index.items())+1,
    window_size=window_size, negative_samples=0, shuffle=False,
)

inputs, labels = np.array(inputs), np.array(labels)

negative_sampling_candidates, true_expected_count, sampled_expected_count = tf.random.log_uniform_candidate_sampler(
    # A true context word that appears in the context of the target
    true_classes=inputs[:1,1:], # [b, 1] sized tensor
    num_true=1, # number of true words per example
    num_sampled=10,
    unique=True,
    range_max=n_vocab,
    name="negative_sampling"
)

print(f"Positive sample: {inputs[:1,1:]}")
print(f"Negative samples: {negative_sampling_candidates}")
print(f"true_expected_count: {true_expected_count}")
print(f"sampled_expected_count: {sampled_expected_count}")

Positive sample: [[187]]
Negative samples: [   7  390 1336    0   98   78 7943 1701 6331 1507]
true_expected_count: [[0.00605192]]
sampled_expected_count: [1.2678023e-01 2.9180504e-03 8.5494545e-04 5.6086338e-01 1.1437029e-02
 1.4295553e-02 1.4398129e-04 6.7170913e-04 1.8063012e-04 7.5806427e-04]


In [12]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(n_vocab, sampling_factor=1e-05)

# 10. Generating data ( positive + negative candidates )

In [13]:
def skip_gram_data_generator(sequences, window_size, batch_size, negative_samples, vocab_size, seed=None):

    rand_sequence_ids = np.arange(len(sequences))
    np.random.shuffle(rand_sequence_ids)


    for si in rand_sequence_ids:

        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequences[si],
            vocabulary_size=vocab_size,
            window_size=window_size,
            negative_samples=0.0,
            shuffle=False,
            sampling_table=sampling_table,
            seed=seed
        )

        targets, contexts, labels = [], [], []

        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)

            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
              true_classes=context_class,
              num_true=1,
              num_sampled=negative_samples,
              unique=True,
              range_max=vocab_size,
              name="negative_sampling")

            # Build context and label vectors (for one target word)
            context = tf.concat(
                [tf.constant([context_word], dtype='int64'), negative_sampling_candidates],
                axis=0
            )

            label = tf.constant([1] + [0]*negative_samples, dtype="int64")

            # Append each element from the training example to global lists.
            targets.extend([target_word]*(negative_samples+1))
            contexts.append(context)
            labels.append(label)

        contexts, targets, labels = np.concatenate(contexts), np.array(targets), np.concatenate(labels)

        assert contexts.shape[0] == targets.shape[0]
        assert contexts.shape[0] == labels.shape[0]

        # If seed is not provided, generate a random one
        if not seed:
            seed = random.randint(0, 10e6)

        np.random.seed(seed)
        np.random.shuffle(contexts)
        np.random.seed(seed)
        np.random.shuffle(targets)
        np.random.seed(seed)
        np.random.shuffle(labels)


        for eg_id_start in range(0, contexts.shape[0], batch_size):
            yield (
                targets[eg_id_start: min(eg_id_start+batch_size, targets.shape[0])],
                contexts[eg_id_start: min(eg_id_start+batch_size, contexts.shape[0])]
            ), labels[eg_id_start: min(eg_id_start+batch_size, labels.shape[0])]


news_skip_gram_gen = skip_gram_data_generator(
    news_sequences, 4, 10, 5, n_vocab
)

for btc, bl in news_skip_gram_gen:

    print(btc)
    print(bl)

    break

(array([ 3369,  8176, 10982, 10981,   817,   798,  2469,  2963,  3796,
        2469]), array([   4, 2169,   21,    8, 2004,  539,    0, 2157,   61,   35],
      dtype=int64))
[0 0 0 0 0 0 0 0 0 0]


# 11. Defining the hyperparameters

In [14]:
batch_size = 4096 # Data points in a single batch

embedding_size = 128 # Dimension of the embedding vector.

window_size=1 # We use a window size of n on either side of target word
negative_samples = 4 # Number of negative samples generated per example

epochs = 5 # Number of epochs to train for

# We pick a random validation set to sample nearest neighbors
valid_size = 16 # Random set of words to evaluate similarity on.
# We sample valid data points randomly from a large window without always being deterministic
valid_window = 250

# When selecting valid examples, we select some of the most frequent words as well as
# some moderately rare words
np.random.seed(54321)
random.seed(54321)

valid_term_ids = np.array(random.sample(range(valid_window), valid_size))
valid_term_ids = np.append(
    valid_term_ids, random.sample(range(1000, 1000+valid_window), valid_size),
    axis=0
)

# 12. Defining the model

In [15]:
import tensorflow.keras.backend as K

K.clear_session()

# Inputs - skipgrams() function outputs target, context in that order
# we will use the same order
input_1 = tf.keras.layers.Input(shape=(), name='target')
input_2 = tf.keras.layers.Input(shape=(), name='context')

# Two embeddings layers are used, one for the context and one for the target
context_embedding_layer = tf.keras.layers.Embedding(
    input_dim=n_vocab, output_dim=embedding_size, name='context_embedding'
)
target_embedding_layer = tf.keras.layers.Embedding(
    input_dim=n_vocab, output_dim=embedding_size, name='target_embedding'
)

# Look up outputs of the embedding layers
target_out = target_embedding_layer(input_1)
context_out = context_embedding_layer(input_2)

# Computing the dot product between the two
out = tf.keras.layers.Dot(axes=-1)([context_out, target_out])

# Defining the model
skip_gram_model = tf.keras.models.Model(inputs=[input_1, input_2],
                                        outputs=out, name='skip_gram_model')

# Compiling the model
skip_gram_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                        optimizer='adam', metrics=['accuracy'])

skip_gram_model.summary()




# 13. Training the model

In [16]:
class ValidationCallback(tf.keras.callbacks.Callback):

    def __init__(self, valid_term_ids, model_with_embeddings, tokenizer):

        self.valid_term_ids = valid_term_ids
        self.model_with_embeddings = model_with_embeddings
        self.tokenizer = tokenizer

        super().__init__()

    def on_epoch_end(self, epoch, logs=None):
        """ Validation logic """

        # We will use context embeddings to get the most similar words
        # Other strategies include: using target embeddings, mean embeddings after avaraging context/target
        embedding_weights = self.model_with_embeddings.get_layer("context_embedding").get_weights()[0]
        normalized_embeddings = embedding_weights / np.sqrt(np.sum(embedding_weights**2, axis=1, keepdims=True))

        # Get the embeddings corresponding to valid_term_ids
        valid_embeddings = normalized_embeddings[self.valid_term_ids, :]

        # Compute the similarity between valid_term_ids and all the embeddings
        # V x d (d x D) => V x D
        top_k = 5 # Top k items will be displayed
        similarity = np.dot(valid_embeddings, normalized_embeddings.T)

        # Invert similarity matrix to negative
        # Ignore the first one because that would be the same word as the probe word
        similarity_top_k = np.argsort(-similarity, axis=1)[:, 1: top_k+1]

        # Print the output
        for i, term_id in enumerate(valid_term_ids):

            similar_word_str = ', '.join([self.tokenizer.index_word[j] for j in similarity_top_k[i, :] if j >= 1])
            print(f"{self.tokenizer.index_word[term_id]}: {similar_word_str}")

        print('\n')


# 14. Running the skip-gram algorithms

In [17]:
skipgram_validation_callback = ValidationCallback(valid_term_ids, skip_gram_model, tokenizer)

for ei in range(epochs):

    print(f"Epoch: {ei+1}/{epochs} started")

    news_skip_gram_gen = skip_gram_data_generator(
        news_sequences, window_size, batch_size, negative_samples, n_vocab
    )

    skip_gram_model.fit(
        news_skip_gram_gen, epochs=1,
        callbacks=skipgram_validation_callback,
    )

Epoch: 1/5 started
   2233/Unknown [1m85s[0m 38ms/step - accuracy: 0.8001 - loss: 0.6272election: promise, elliot, look, decline, fill
me: ability, lot, bafta, result, him
with: or, over, against, jealous, switzerland
you: them, him, do, just, we
were: already, now, have, do, did
win: attempting, 2003, produce, work, want
those: named, lot, ensure, used, work
music: â£21m, misled, slashed, seafarers, games
also: already, now, do, she, did
third: second, edge, taken, conduct
best: 1997, ignore, tight, appears, capability
him: them, help, look, me, forsyth
too: challenge, better, way, resolved, ensure
some: quest, peak, way, charge
through: come, ensure, produce, get
mr: tony, said, gordon, jack, charles
file: dominates, fabulous, mainstay, someone, affected
pair: kind, chance, release, keypad, nothing
ceremony: wanted, sure, able, democracy, capable
believed: lot, trying, better, chance, decision
post: way, peak, lot, wake, sure
indian: island, prescott, hosford, times, released
succe

  self.gen.throw(typ, value, traceback)


   2233/Unknown [1m84s[0m 37ms/step - accuracy: 0.8059 - loss: 0.4579election: advertising, broad, 1997, anticipated, consulates
me: him, squandered, achieving, things, probably
with: between, stringer, tanks, over, or
you: we, they, don't, doesn't, didn't
were: are, being, have, already, been
win: attempting, mention, cheapest, survive, backbenchers
those: publish, encouraged, shore, treasure, navigation
music: games, women's, extended, sought, entertainment
also: already, never, previously, widely, being
third: fourth, second, premiere, historic, recruitment
best: supporting, roles, association, title, actor
him: me, them, didn't, things, probably
too: extremely, better, easier, councillors, very
some: overwhelming, value, dates, potential, petrov
through: tactically, lvmh, zheng, gonna, obscene
mr: tony, charles, jack, said, michael
file: privately, centres, hosts, properties, hilarious
pair: hampered, verge, objections, 2014, stolen
ceremony: ignored, series, â£15, criticised, ma

# 15. Usecase 1 : Visualize the word embeddings of some words

In [38]:
# example 1 : word vector for "dog"
word_vector_dog = skip_gram_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["dog"]]

print(len(word_vector_dog))
print(word_vector_dog)

128
[ 0.22746673  0.07138389  0.11764798  0.03281855 -0.21460196 -0.5240372
 -0.3249284   0.23186986 -0.204936    0.14132696 -0.09498704 -0.4589789
 -0.19136354  0.22529069  0.31748393 -0.15478028 -0.12603697 -0.06850798
 -0.33499986  0.09418409 -0.1837122  -0.06763232 -0.06832171 -0.13301939
  0.16878468  0.19128338  0.0676093   0.13918488 -0.2992557  -0.13232473
  0.04048631 -0.26357535  0.42646378  0.6451285   0.46789193  0.13119088
 -0.4487828  -0.15849559 -0.31067586  0.19438551  0.05548285 -0.16670148
 -0.6951338  -0.14330108  0.50013965  0.20417349 -0.03738758 -0.09156351
  0.02752526  0.524384    0.0429455   0.08014193 -0.06908175 -0.15712656
 -0.06504448  0.17285568  0.5792302   0.04035402  0.09936652 -0.11419605
  0.04411833  0.03865562  0.17002721 -0.32316512  0.5061779   0.06208441
  0.01708469 -0.07603393 -0.7451564   0.10636906  0.06881762  0.29216084
 -0.01405409  0.34407157  0.16130112  0.07232776  0.07603461  0.3722347
  0.5033964  -0.21041484  0.11422145  0.32182866  

In [42]:
# example 2 : word vector for "cat"
word_vector_cat = skip_gram_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["cat"]]

print(len(word_vector_cat))
print(word_vector_cat)

128
[ 0.22118916  0.1273138   0.13676815  0.3060822   0.08332124 -0.18960623
 -0.2369323   0.23993364 -0.37014     0.3379539  -0.20337565 -0.30013242
  0.24026766  0.26052666 -0.08621705  0.13927417  0.17124383  0.12321689
 -0.14217646 -0.2471389  -0.16004278 -0.38916877  0.13592231 -0.274355
  0.25753668  0.24689075 -0.08672582  0.21177335 -0.02355324 -0.13768663
  0.06724598 -0.12024601  0.1784991   0.17655182  0.15259972  0.16850936
 -0.23959818 -0.18561083 -0.12562609  0.21827549  0.1752871  -0.24447939
  0.01715354 -0.2675737   0.13125795 -0.00123536  0.2473496  -0.16863863
 -0.10351441  0.18114561  0.4576279  -0.18226895 -0.28612912  0.0771334
  0.10465913  0.1921185   0.09149513 -0.07210246  0.05147637  0.07265592
 -0.25594163 -0.13637617  0.25891873 -0.17306577 -0.14051317  0.34425125
  0.20302482  0.10817692 -0.17956921  0.20847966  0.3505246  -0.01836295
  0.15275636  0.3135562   0.22703566 -0.33653134  0.22314174  0.27401575
  0.07934818 -0.3662857   0.03376219  0.19614582  

In [46]:
# example 3 : word vector for "man"
word_vector_man = skip_gram_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["man"]]

print(len(word_vector_man))
print(word_vector_man)

128
[ 0.02534899  0.12047102  0.3764374  -0.07255813  0.51958036 -0.17348845
 -0.08853953  0.16519456  0.14518167  0.17024663  0.05739426 -0.15342963
  0.12667334  0.00944092  0.42350096  0.22238661 -0.16007358  0.12519915
 -0.11583033 -0.35662946 -0.12330738 -0.20741612  0.14250234 -0.45800918
  0.166765    0.15616044 -0.12254994  0.22289832 -0.04220576 -0.10405624
  0.33918127 -0.51516974  0.48797056  0.03449334  0.1049895  -0.06961536
 -0.3008898  -0.24954242 -0.263484    0.2840222   0.06046656 -0.19347697
 -0.33714348 -0.01203065  0.21022484 -0.09915531  0.02330409  0.26657274
  0.16765523  0.2313001  -0.13988364  0.2389606   0.03046129  0.22584376
  0.06365097 -0.22275613  0.17984752 -0.12988643 -0.03387088  0.1413761
 -0.33251852  0.08744261 -0.1585932  -0.00141644  0.2432045   0.2246335
  0.04431519 -0.05536337 -0.48505428  0.38819832  0.04292851  0.00095119
  0.26299125  0.25060925  0.00234479 -0.24130784  0.20687337  0.04303828
 -0.19877939 -0.08837221  0.32335946 -0.26095125 

In [47]:
# example 4 : word vector for "woman"
word_vector_woman = skip_gram_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["woman"]]

print(len(word_vector_woman))
print(word_vector_woman)

128
[ 0.26309174  0.4147605   0.11572185  0.18393585  0.37184054 -0.18069829
 -0.19339332  0.17029996 -0.13269204  0.15365273 -0.283435   -0.19152485
  0.34755978  0.11869116  0.08013584  0.16610299  0.22172268  0.4283985
 -0.17933767 -0.13001654 -0.41411307 -0.22224103  0.26052335 -0.3568804
  0.26483512  0.212596   -0.3139042   0.38621023 -0.01998098 -0.1838497
 -0.1726185  -0.28903136  0.3464941   0.27167618  0.18794398  0.31031498
 -0.12902099 -0.18391809 -0.14488178  0.12919763  0.14627652 -0.3605748
 -0.23907977  0.0346491   0.02198527 -0.41710517  0.11936651 -0.20457141
 -0.24958594  0.21314938  0.21846868 -0.1826636  -0.3087275   0.36232403
  0.38507396  0.23603722  0.181535   -0.30046725 -0.13386211  0.28226987
 -0.28213    -0.28206238  0.06141251 -0.07637422  0.04504185  0.3058593
  0.22075182  0.20905247 -0.02321019  0.1084257   0.12345778 -0.09653618
  0.26543063  0.23348318  0.14701082 -0.18342857  0.23847738  0.21532544
  0.04154864 -0.37797916  0.15749688  0.3358833   0.

# 16 . Usecase 2 : Similarity of the word embeddings

In [43]:
# example 1 : similarity score between dog and cat
similarity = np.dot(word_vector_dog, word_vector_cat) / (np.linalg.norm(word_vector_dog) * np.linalg.norm(word_vector_cat))

print(similarity)

0.45174366


In [48]:
# example 2 : similarity score between dog and man
similarity = np.dot(word_vector_dog, word_vector_man) / (np.linalg.norm(word_vector_dog) * np.linalg.norm(word_vector_man))

print(similarity)

0.38474867


In [50]:
# example 3 : similarity score between woman and man
similarity = np.dot(word_vector_man, word_vector_woman) / (np.linalg.norm(word_vector_man) * np.linalg.norm(word_vector_woman))

print(similarity)

0.57171756


# 17. Usecase 3 : Analogy task

In [54]:
king_vector = skip_gram_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["king"]]
man_vector = skip_gram_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["man"]]
woman_vector = skip_gram_model.get_layer("context_embedding").get_weights()[0][tokenizer.word_index["woman"]]

In [55]:
tokenizer.word_index["Queen"] ## doesnn't exist in the word index

KeyError: 'Queen'

In [56]:
# application of analogy to calculate Queen vector

queen_vector = king_vector - man_vector + woman_vector
print(len(queen_vector))
print(queen_vector)

128
[ 3.3882433e-01  4.2410660e-01  2.2715658e-01  3.9626640e-01
  1.3184062e-01 -4.4364864e-01 -3.5429621e-01  1.7759013e-01
 -6.3046116e-01  1.7294665e-01 -4.6839964e-01 -1.2489870e-02
  3.5591775e-01  4.0181404e-01 -5.6488550e-01  1.4087935e-01
  4.0124860e-01  3.9552587e-01 -2.9045260e-01  8.4689781e-02
 -4.7596377e-01 -1.8774298e-01  1.9676805e-01  7.4532449e-02
  5.0112104e-01  4.7105271e-01 -4.3799841e-01  6.4600265e-01
 -6.5998752e-03  2.6150757e-01 -6.9362462e-01 -1.9329089e-01
  1.0410756e-01  4.3062824e-01  2.3346071e-01  5.8081830e-01
 -2.0331767e-01 -5.8929153e-02  1.3925320e-01  4.3812141e-02
  3.7249768e-01 -6.9328189e-02  3.1111437e-01 -1.0825166e-01
 -7.4586645e-04 -5.0805795e-01  2.0949735e-01 -5.0011224e-01
 -6.4535534e-01  2.7712733e-01  5.6053448e-01 -5.3525692e-01
 -6.4959365e-01  3.4441936e-01  6.3610405e-01  6.4036858e-01
  1.7155142e-01 -3.7057552e-01 -8.2692251e-02  2.8188753e-01
  1.2343082e-01 -4.7418472e-01  1.9500925e-01 -5.7620931e-01
 -2.2975923e-01  2.7