# Load Word2Vec Models

## Helper Functions

In [188]:
from transformers import pipeline
from gensim.models import Word2Vec, Phrases
from gensim.models import KeyedVectors
import gensim.downloader
import random 
import numpy as np
import re
from scipy.spatial import distance
import keyboard

def get_vector(model, word):
    '''
    Model must be gensim object Word2Vec model
    Returns word vector if key present in its vocab
    '''
    try:
        vect = model[word]
    except:
        vect = None
        print("Word not in model vocabulary")
    return vect

def vec2word(model, vectors, top_n=1):
    '''
    Find most similar words as list in model given vector
    '''
    if isinstance(vectors, float):
        vectors = [vectors]

    # Find the word most similar to the given vector
    ave_vector = np.mean(vectors, axis=0)
    most_similar_words = model.similar_by_vector(ave_vector, topn=top_n+len(vectors))
    most_similar_words = [word for word, _ in most_similar_words]

    return most_similar_words[len(vectors):]

def cossim(model, vocab, word_1, word_2):
    # make sure both words in Word2Vec model
    if word_1 in vocab and word_2 in vocab:
        return (1 - distance.cosine(model[word_1], model[word_2])) 
    else:
        print("At least one word not in model vocab")
        return None
        

def n_most_similar_words(model, vocab, words, neg=None, n=10):
    '''
    negative is a list of words opposite of most similar n words
    '''

    if isinstance(words, str):
        words = [words]

    if (neg is None) and all(w in vocab for w in words):
        return model.most_similar(words, topn=n)
    elif (words is None) and all(n in vocab for n in neg):
        return model.most_similar(negative=neg, topn=n)
    elif all(w in vocab for w in words) and all(n in vocab for n in neg):
        return model.most_similar(positive=words, negative=neg, topn=n)
    else:
        print("Words not in model vocabulary")
        return None
    
def skip_gram(model, vocab, context_words, n=10):
    # Predict the most similar n words
    
    if isinstance(context_words, str):
        context_words = [context_words]

    if all(w in vocab for w in context_words):
        context_vectors = [model[word] for word in context_words]
        avg_vector = np.mean(context_vectors, axis=0)
        similar_words = model.similar_by_vector(avg_vector, topn=n+len(context_words))
        return similar_words[len(context_words):]

    else:
        print("Words not in model vocabulary")
        print("Not found:", context_words)
        return None
    
def normalize(v):
    '''
    Normalize a vector to unit magnitude
    '''
    v_norm = v / np.linalg.norm(v)
    #v_norm = v_norm / np.linalg.norm(v_norm)
    return v_norm

def get_orthogonal(model, word, max_return=5):
    '''
    Return N words from vectors orthogonal to word vector by finding projections on hyperplane
    '''
    v = get_vector(model, word)
    n = len(v)
    words = []
    valid = []

    # Create an identity matrix of size 300x300
    identity_matrix = np.eye(n)

    # Subtract the projection of v onto each standard basis vector
    # to obtain orthogonal vectors on the hyperplane perpendicular to v
    orthogonal_matrix = np.array([identity_matrix[i] - np.dot(v, identity_matrix[i]) / np.dot(v, v) * v for i in range(n)])

    # add valid words from orthogonal matrix
    count = 0
    for index, v in enumerate(orthogonal_matrix):
        #if count == max_return:
        #    break
        w = vec2word(model, [normalize(v)])
        if re.match("^[a-z]+$", w[0]):
            valid.append(w[0])
            #words.append(w)
            count += 1
    
    #orthogonal_vectors = orthogonal_matrix[:max_return]
    #normalized_vectors = [normalize(v) for v in orthogonal_vectors] # orthogonal_vectors / magnitude(orthogonal_vectors, axis=1)[:, np.newaxis]
    return np.random.choice(valid, 10, replace=False) #words # normalized_vectors

def magnitude(vector):
    '''
    Compute magnitude of vector
    '''
    return np.linalg.norm(vector)

def replace(model, words):
    '''
    words: tuple of words, 3rd word replaces concept of 2nd word in the context of 1st word
    '''
    vects = [get_vector(model, w) for w in words]
    v1, v2, v3 = vects
    new_vec = v1 - v2 + v3

    return vec2word(model, [new_vec])

def interpolate(model, vocab, words, scores):
    vects = [get_vector(model, word) for word in words]
    n = len(vects[0])
    ave_vect = np.zeros(n)

    for i in range(n):
        for j in range(len(vects)):
            ave_vect[i] += scores[j] * vects[j][i]

    ave_vect = ave_vect / np.linalg.norm(ave_vect)
    ave_word = vec2word(model, [ave_vect])
    ave_words = skip_gram(model, vocab, ave_word, 10)
    
    return ave_vect, ave_words


## Compute Vector Embeddings

In [2]:
import gensim.downloader

model = gensim.downloader.load('word2vec-google-news-300')

vocab = [word for word in model.index_to_key if re.match("^[a-zA-Z.-]+$", word)]



In [51]:
guesses = ['man', 'boy']

guess_vect = get_vector(model, guesses)
#print(guess_vect)
word = vec2word(model, guess_vect)
print(word)

score = cossim(model, vocab, 'man', 'girl')
print(score)

top_n = n_most_similar_words(model, vocab, guesses, None, 10)
print(top_n)

['girl']
At least one word not in model vocab
None
[('girl', 0.7885462045669556), ('teenager', 0.7737529277801514), ('woman', 0.743568480014801), ('teenage_girl', 0.7086503505706787), ('teen_ager', 0.6568499207496643), ('toddler', 0.6250425577163696), ('youngster', 0.5938030481338501), ('lad', 0.5809065699577332), ('kid', 0.5788402557373047), ('son', 0.577247142791748)]


In [61]:
score = cossim(model, vocab, 'man', 'female')
print(score)

0.2551112473011017


## Load Word2Vec Model

In [150]:
#your "sentences" object with the cleaned text data. 

#bigram_transformer = phrases.Phrases(docs)
#bigram = phrases.Phraser(bigram_transformer)

# model = Word2Vec(bigram[docs], workers=4, sg=0, min_count=5, window=5, sample=1e-3) #size=700 # N-dimensions
# model =  Word2Vec.load("./Word2Vec_Models/English_Sample") #name of YOUR model here, or file path to your 

# Load Google News Word2Vec Model

model_path = './Word2Vec_Models/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(model_path, limit=500000, binary=True)
model.init_sims(replace=True) #Precompute L2-normalized vectors. If replace is set to TRUE, forget the original vectors and only keep the normalized ones. Saves lots of memory, but can't continue to train the model.
vocab = [word for word in model.index_to_key if re.match("^[a-z]+$", word)]

'''
# Filter and clean the model
filtered_index_to_key = [word for word in model.index_to_key if re.match("^[a-z]+$", word)]
filtered_vectors = model.vectors[[model.index_to_key.index(word) for word in filtered_index_to_key]]

# Update the original model with the filtered version
model.index_to_key = filtered_index_to_key
model.vectors = filtered_vectors

# Now, vocab contains only lowercase valid words
vocab = list(filtered_index_to_key)
'''


  model.init_sims(replace=True) #Precompute L2-normalized vectors. If replace is set to TRUE, forget the original vectors and only keep the normalized ones. Saves lots of memory, but can't continue to train the model.


'\n# Filter and clean the model\nfiltered_index_to_key = [word for word in model.index_to_key if re.match("^[a-z]+$", word)]\nfiltered_vectors = model.vectors[[model.index_to_key.index(word) for word in filtered_index_to_key]]\n\n# Update the original model with the filtered version\nmodel.index_to_key = filtered_index_to_key\nmodel.vectors = filtered_vectors\n\n# Now, vocab contains only lowercase valid words\nvocab = list(filtered_index_to_key)\n'

In [175]:
word = 'dog'
print(word in vocab)
print(len(vocab))

words = ['king', 'man', 'woman']
new_word = replace(model, words)
print(new_word)
print(cossim(model, vocab, 'man', 'boy'))

w = 'ontext'
if re.match("^[a-z]+$", w):
    print("yayy")
print(vects)

True
75045
['queen']
0.6824870109558105
yayy
[]


In [153]:
import pickle

# Save the Word2Vec model using pickle
model_path = './Word2Vec_Models/google_word2vec.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

'''
# Load the Word2Vec model from the pickle file
with open(model_path, 'rb') as file:
    model = pickle.load(file)
'''


"\n# Load the Word2Vec model from the pickle file\nwith open(model_path, 'rb') as file:\n    model = pickle.load(file)\n"

In [13]:
# preprocess vocab

with open("./Word2Vec_Models/eng_vocab.txt", "w") as f:
    for word in vocab:
        f.write(word + "\n")
print("Model Complete")

Model Complete


In [62]:
# helper function usage

guess, target = "woman", random.choice(vocab)
guesses = ['target', 'goal']

print("Guess:", guess, "Target:", target)

guess_vect = get_vector(model, guess)
#print(guess_vect)

word = vec2word(model, [guess_vect])
print("Word:", word)

score = cossim(model, vocab, guess, target)
print("Guess to target similarity:", score)

top_n = n_most_similar_words(model, vocab, guesses, None, 10)
print("Top K Similar Words:\n", top_n)

context = skip_gram(model, vocab, guesses, 10)
print(context)


Guess: woman Target: harpoon
Word: ['man']
Guess to target similarity: 0.026805367320775986
Top K Similar Words:
 [('targets', 0.6589586734771729), ('goals', 0.6300962567329407), ('objective', 0.5616191625595093), ('aim', 0.5383525490760803), ('Goal', 0.4901687204837799), ('objectives', 0.4835728704929352), ('targeted', 0.4605602025985718), ('targeting', 0.45360010862350464), ('aiming', 0.4502216875553131), ('achievable', 0.4414016008377075)]
[('targets', 0.6589586734771729), ('goals', 0.6300962567329407), ('objective', 0.5616191625595093), ('aim', 0.5383525490760803), ('Goal', 0.4901687204837799), ('objectives', 0.4835728704929352), ('targeted', 0.4605602025985718), ('targeting', 0.45360010862350464), ('aiming', 0.4502216875553131), ('achievable', 0.4414016008377075)]


In [189]:
#print(guess_vect)
guess_vect = get_vector(model, "man")
print("reconvert:", vec2word(model, [guess_vect])) #vects[0]))
mag = magnitude(guess_vect)

words = get_orthogonal(model, 'kill')

for w in words:
    print(w)

context_words = ['man', 'dog']
scores = [0.5, 0.9]
vec, words = interpolate(model, vocab, context_words, scores)
print("Interpolate:", words)


reconvert: ['woman']
condensing
licensors
admired
haphazard
substitution
tigers
underplaying
pointed
goldrush
stinger
Interpolate: [('dog', 0.8680490255355835), ('canines', 0.8181710243225098), ('cats', 0.76517653465271), ('pit_bulls', 0.7548302412033081), ('pets', 0.7424418330192566), ('puppies', 0.7385991811752319), ('pooches', 0.7162365913391113), ('German_shepherds', 0.707106351852417), ('animals', 0.6985694169998169), ('pit_bull', 0.6983615159988403)]


In [190]:
s = 'o1'
print(s[-1])

1


## Model Inference

In [97]:
# iterate until guess close enough

delta = 0.05
score = 0
turns = 0
max_turns = 100
scores = []


while abs(score) < 1 - delta and turns < max_turns:
    turns += 1
    guess = vec2word(model, [guess_vect])[0]
    print("Guess: ", guess)
    score = cossim(model, vocab, guess, target)
    scores.append(score)
    
    # keyboard.wait('space')
    guess_vect = guess_vect #+ guess_vect/2 # update guess according to complex hueristic 

    print("Turn: ", turns)
    print("Guess: ", guess)
    print("Similarity Score: ", round(score * 100, 2))
    print()

print("Close Enough! Target Word: ", target)

Guess:  loved
Turn:  1
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  2
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  3
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  4
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  5
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  6
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  7
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  8
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  9
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  10
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  11
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  12
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  13
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  14
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  15
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  16
Guess:  loved
Similarity 

KeyboardInterrupt: 

# Querying Text Database Using Embeddings

In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a pre-trained model
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Your text message
text_message = "This is an example text message."

# Generate embeddings
embeddings = model.encode([text_message])

# Print the embeddings
#print("Embeddings:", embeddings)

# If you want to convert the embeddings to a numpy array
#embeddings_np = np.array(embeddings)
#print("Embeddings as numpy array:", embeddings_np)
