# Load Word2Vec Models

## Helper Functions

In [39]:
from transformers import pipeline
from gensim.models import Word2Vec, Phrases
from gensim.models import KeyedVectors
import gensim.downloader
import random 
import numpy as np
import re
from scipy.spatial import distance
import keyboard

def get_vector(model, word):
    '''
    Model must be gensim object Word2Vec model
    Returns word vector if key present in its vocab
    '''
    try:
        vect = model[word]
    except:
        vect = None
        print("Word not in model vocabulary")
    return vect

def vec2word(model, vectors, top_n=1):
    '''
    Find most similar word in model given vector
    '''
    if isinstance(vectors, float):
        vectors = [vectors]

    # Find the word most similar to the given vector
    ave_vector = np.mean(vectors, axis=0)
    most_similar_words = model.similar_by_vector(ave_vector, topn=top_n+len(vectors))
    most_similar_words = [word for word, _ in most_similar_words]

    return most_similar_words[len(vectors):]

def cossim(model, vocab, word_1, word_2):
    # make sure both words in Word2Vec model
    if word_1 in vocab and word_2 in vocab:
        return (1 - distance.cosine(model[word_1], model[word_2])) 
    else:
        print("At least one word not in model vocab")
        return None
        

def n_most_similar_words(model, vocab, words, neg=None, n=10):
    '''
    negative is a list of words opposite of most similar n words
    '''

    if isinstance(words, str):
        words = [words]

    if (neg is None) and all(w in vocab for w in words):
        return model.most_similar(words, topn=n)
    elif (words is None) and all(n in vocab for n in neg):
        return model.most_similar(negative=neg, topn=n)
    elif all(w in vocab for w in words) and all(n in vocab for n in neg):
        return model.most_similar(positive=words, negative=neg, topn=n)
    else:
        print("Words not in model vocabulary")
        return None
    
def skip_gram(model, vocab, context_words, n):
    # Predict the most similar n words
    
    if isinstance(context_words, str):
        context_words = [context_words]

    if all(w in vocab for w in context_words):
        context_vectors = [model[word] for word in context_words]
        avg_vector = np.mean(context_vectors, axis=0)
        similar_words = model.similar_by_vector(avg_vector, topn=n+len(context_words))
        return similar_words[len(context_words):]

    else:
        print("Words not in model vocabulary")
        return None
    

## Compute Vector Embeddings

In [None]:
import gensim.downloader

model = gensim.downloader.load('word2vec-google-news-300')

vocab = [word for word in model.index_to_key if re.match("^[a-zA-Z.-]+$", word)]



In [51]:
guesses = ['man', 'boy']

guess_vect = get_vector(model, guesses)
#print(guess_vect)
word = vec2word(model, guess_vect)
print(word)

score = cossim(model, vocab, 'man', 'girl')
print(score)

top_n = n_most_similar_words(model, vocab, guesses, None, 10)
print(top_n)

['girl']
At least one word not in model vocab
None
[('girl', 0.7885462045669556), ('teenager', 0.7737529277801514), ('woman', 0.743568480014801), ('teenage_girl', 0.7086503505706787), ('teen_ager', 0.6568499207496643), ('toddler', 0.6250425577163696), ('youngster', 0.5938030481338501), ('lad', 0.5809065699577332), ('kid', 0.5788402557373047), ('son', 0.577247142791748)]


In [61]:
score = cossim(model, vocab, 'man', 'female')
print(score)

0.2551112473011017


In [62]:
#your "sentences" object with the cleaned text data. 

#bigram_transformer = phrases.Phrases(docs)
#bigram = phrases.Phraser(bigram_transformer)

# model = Word2Vec(bigram[docs], workers=4, sg=0, min_count=5, window=5, sample=1e-3) #size=700 # N-dimensions
# model =  Word2Vec.load("./Word2Vec_Models/English_Sample") #name of YOUR model here, or file path to your 

# Load Google News Word2Vec Model

model_path = './Word2Vec_Models/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(model_path, limit=500000, binary=True)
model.init_sims(replace=True) #Precompute L2-normalized vectors. If replace is set to TRUE, forget the original vectors and only keep the normalized ones. Saves lots of memory, but can't continue to train the model.
#vocab = list(model.index_to_key)

# filter model

# Filter out vectors not in the vocab list
vocab = [word for word in model.index_to_key if re.match("^[a-zA-Z.-]+$", word)]
#model = KeyedVectors(vector_size=model.vector_size)
#model.vocab = {word: model.vocab[word] for word in vocab if word in model.vocab}

# If you want to keep the normalized vectors, you can initialize sims
model.init_sims(replace=True)

  model.init_sims(replace=True) #Precompute L2-normalized vectors. If replace is set to TRUE, forget the original vectors and only keep the normalized ones. Saves lots of memory, but can't continue to train the model.
  model.init_sims(replace=True)


In [13]:
# preprocess vocab



with open("./Word2Vec_Models/eng_vocab.txt", "w") as f:
    for word in vocab:
        f.write(word + "\n")
print("Model Complete")

Model Complete


In [76]:
# helper function usage

guess, target = "woman", random.choice(vocab)
guesses = ['target', 'goal']

print(guess, target)

guess_vect = get_vector(model, guess)
#print(guess_vect)

word = vec2word(model, [guess_vect])
print(word)

score = cossim(model, vocab, guess, target)
print(score)

top_n = n_most_similar_words(model, vocab, guesses, None, 10)
print(top_n)

context = skip_gram(model, vocab, guesses, 10)
print(context)


woman nba
['man']
0.030750319361686707
[('targets', 0.6589586734771729), ('goals', 0.6300962567329407), ('objective', 0.5616191625595093), ('aim', 0.5383525490760803), ('Goal', 0.4901687204837799), ('objectives', 0.4835728704929352), ('targeted', 0.4605602025985718), ('targeting', 0.45360010862350464), ('aiming', 0.4502216875553131), ('achievable', 0.4414016008377075)]
[('targets', 0.6589586734771729), ('goals', 0.6300962567329407), ('objective', 0.5616191625595093), ('aim', 0.5383525490760803), ('Goal', 0.4901687204837799), ('objectives', 0.4835728704929352), ('targeted', 0.4605602025985718), ('targeting', 0.45360010862350464), ('aiming', 0.4502216875553131), ('achievable', 0.4414016008377075)]


In [98]:
print(guess_vect)

[ 6.38618047e+33 -9.44307285e+33  1.60411125e+33  1.02299936e+34
 -1.02299936e+34  4.14647552e+33  1.81597460e+34 -1.63437697e+34
 -8.71668306e+33  1.24696957e+34 -1.62681089e+33 -5.08473101e+33
 -1.71761039e+33 -2.72396172e+33 -1.45883410e+34  1.05326563e+34
  7.99029265e+33  9.74573805e+33  4.69126817e+32 -4.32807668e+33
 -2.39103353e+33  4.93339965e+33  1.42251393e+34 -8.89828082e+33
  1.04115882e+34 -2.17917076e+33  3.42008759e+33  6.62831287e+33
  6.93096631e+33 -1.01089268e+34 -6.93096631e+33  8.65615336e+33
  9.64736499e+32  7.92975986e+33  9.56413410e+33  4.78206705e+33
  1.62227066e+34  5.35712424e+33 -1.55871197e+33  2.07021257e+34
  1.15617113e+34 -1.30144872e+34  4.38860576e+33  1.61167739e+33
 -6.59804338e+33 -6.35591345e+33  7.62709652e+33  2.92069236e+33
  1.36954777e+33  3.61681947e+33 -6.80990444e+33  9.26147819e+33
 -6.62831287e+33  9.64736499e+32  5.56899087e+33  6.96123333e+33
 -2.70883130e+33 -7.05204180e+33 -2.31726050e+32 -1.12741815e+33
  1.50726046e+34  5.23606

In [97]:
# iterate until guess close enough

delta = 0.05
score = 0
turns = 0
max_turns = 100
scores = []


while abs(score) < 1 - delta and turns < max_turns:
    turns += 1
    guess = vec2word(model, [guess_vect])[0]
    print("Guess: ", guess)
    score = cossim(model, vocab, guess, target)
    scores.append(score)
    
    # keyboard.wait('space')
    guess_vect = guess_vect #+ guess_vect/2 # update guess according to complex hueristic 

    print("Turn: ", turns)
    print("Guess: ", guess)
    print("Similarity Score: ", round(score * 100, 2))
    print()

print("Close Enough! Target Word: ", target)

Guess:  loved
Turn:  1
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  2
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  3
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  4
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  5
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  6
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  7
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  8
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  9
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  10
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  11
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  12
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  13
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  14
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  15
Guess:  loved
Similarity Score:  -1.71

Guess:  loved
Turn:  16
Guess:  loved
Similarity 

KeyboardInterrupt: 

# Querying Text Database Using Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a pre-trained model
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Your text message
text_message = "This is an example text message."

# Generate embeddings
embeddings = model.encode([text_message])

# Print the embeddings
print("Embeddings:", embeddings)

# If you want to convert the embeddings to a numpy array
embeddings_np = np.array(embeddings)
print("Embeddings as numpy array:", embeddings_np)
