# Load Word2Vec Models

## Helper Functions

In [88]:
from transformers import pipeline
from gensim.models import Word2Vec, Phrases
from gensim.models import KeyedVectors
import random 
import numpy as np
import re
from scipy.spatial import distance
import keyboard

def get_vector(model, word):
    '''
    Model must be gensim object Word2Vec model
    Returns word vector if key present in its vocab
    '''
    try:
        vect = model[word]
    except:
        vect = None
        print("Word not in model vocabulary")
    return vect

def vec2word(model, vectors, top_n=1):
    '''
    Find most similar word in model given vector
    '''
    if isinstance(vectors, float):
        vectors = [vectors]

    # Find the word most similar to the given vector
    ave_vector = np.mean(vectors, axis=0)
    most_similar_words = model.similar_by_vector(ave_vector, topn=top_n+len(vectors))
    most_similar_words = [word for word, _ in most_similar_words]

    return most_similar_words[len(vectors):]

def cossim(model, vocab, word_1, word_2):
    # make sure both words in Word2Vec model
    if word_1 in vocab and word_2 in vocab:
        return (1 - distance.cosine(model[word_1], model[word_2])) 
    else:
        print("At least one word not in model vocab")
        return None
        

def n_most_similar_words(model, vocab, words, neg=None, n=10):
    '''
    negative is a list of words opposite of most similar n words
    '''

    if isinstance(words, str):
        words = [words]

    if (neg is None) and all(w in vocab for w in words):
        return model.most_similar(words, topn=n)
    elif (words is None) and all(n in vocab for n in neg):
        return model.most_similar(negative=neg, topn=n)
    elif all(w in vocab for w in words) and all(n in vocab for n in neg):
        return model.most_similar(positive=words, negative=neg, topn=n)
    else:
        print("Words not in model vocabulary")
        return None
    
def skip_gram(model, vocab, context_words, n):
    # Predict the most similar n words
    
    if isinstance(context_words, str):
        context_words = [context_words]

    if all(w in vocab for w in context_words):
        context_vectors = [model[word] for word in context_words]
        avg_vector = np.mean(context_vectors, axis=0)
        similar_words = model.similar_by_vector(avg_vector, topn=n+len(context_words))
        return similar_words[len(context_words):]

    else:
        print("Words not in model vocabulary")
        return None
    

## Compute Vector Embeddings

In [11]:
#your "sentences" object with the cleaned text data. 

#bigram_transformer = phrases.Phrases(docs)
#bigram = phrases.Phraser(bigram_transformer)

# model = Word2Vec(bigram[docs], workers=4, sg=0, min_count=5, window=5, sample=1e-3) #size=700 # N-dimensions
# model =  Word2Vec.load("./Word2Vec_Models/English_Sample") #name of YOUR model here, or file path to your 

# Load Google News Word2Vec Model

model_path = './Word2Vec_Models/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(model_path, limit=500000, binary=True)
model.init_sims(replace=True) #Precompute L2-normalized vectors. If replace is set to TRUE, forget the original vectors and only keep the normalized ones. Saves lots of memory, but can't continue to train the model.
#vocab = list(model.index_to_key)
vocab = [word for word in model.index_to_key if re.match("^[a-zA-Z.-]+$", word)]

  model.init_sims(replace=True) #Precompute L2-normalized vectors. If replace is set to TRUE, forget the original vectors and only keep the normalized ones. Saves lots of memory, but can't continue to train the model.


In [13]:
with open("./Word2Vec_Models/eng_vocab.txt", "w") as f:
    for word in vocab:
        f.write(word + "\n")
print("Model Complete")

Model Complete


In [83]:
# helper function usage

guess, target = "love", random.choice(vocab)
print(guess, target)

guess_vect = get_vector(model, guess)
#print(guess_vect)

word = vec2word(model, [guess_vect])
print(word)

score = cossim(model, vocab, guess, target)
print(score)

top_n = n_most_similar_words(model, vocab, [guess], None, 5)
print(top_n)

context = skip_gram(model, vocab, [guess], 5)
print(context)


love Barington
[('love', 1.0), ('loved', 0.6907792091369629)]
['loved']
-0.01563189923763275
[('loved', 0.6907792091369629), ('adore', 0.6816874146461487), ('loves', 0.661863386631012), ('passion', 0.6100709438323975), ('hate', 0.600395679473877)]
[('loved', 0.6907792091369629), ('adore', 0.6816874146461487), ('loves', 0.661863386631012), ('passion', 0.6100709438323975), ('hate', 0.600395679473877)]


In [90]:
# iterate until guess close enough

delta = 0.05
score = 0
turns = 0

scores = []


while abs(score) < 1 - delta:
    turns += 1
    guess = vec2word(model, [guess_vect])[0]
    score = cossim(model, vocab, guess, target)
    scores.append(score)
    
    # keyboard.wait('space')
    guess_vect = guess_vect + guess_vect/2 # update guess according to complex hueristic 

    print("Turn: ", turns)
    print("Guess: ", guess)
    print("Similarity Score: ", round(score * 100, 2))
    print()

print("Close Enough! Target Word: ", target)

Similarity Score:  -1.71
Guess:  loved
Turns:  1
Similarity Score:  -1.71
Guess:  loved
Turns:  2
Similarity Score:  -1.71
Guess:  loved
Turns:  3
Similarity Score:  -1.71
Guess:  loved
Turns:  4
Similarity Score:  -1.71
Guess:  loved
Turns:  5
Similarity Score:  -1.71
Guess:  loved
Turns:  6
Similarity Score:  -1.71
Guess:  loved
Turns:  7
Similarity Score:  -1.71
Guess:  loved
Turns:  8
Similarity Score:  -1.71
Guess:  loved
Turns:  9
Similarity Score:  -1.71
Guess:  loved
Turns:  10
Similarity Score:  -1.71
Guess:  loved
Turns:  11
Similarity Score:  -1.71
Guess:  loved
Turns:  12
Similarity Score:  -1.71
Guess:  loved
Turns:  13
Similarity Score:  -1.71
Guess:  loved
Turns:  14
Similarity Score:  -1.71
Guess:  loved
Turns:  15
Similarity Score:  -1.71
Guess:  loved
Turns:  16
Similarity Score:  -1.71
Guess:  loved
Turns:  17
Similarity Score:  -1.71
Guess:  loved
Turns:  18
Similarity Score:  -1.71
Guess:  loved
Turns:  19
Similarity Score:  -1.71
Guess:  loved
Turns:  20
Similarit

KeyboardInterrupt: 

# Querying Text Database Using Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a pre-trained model
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Your text message
text_message = "This is an example text message."

# Generate embeddings
embeddings = model.encode([text_message])

# Print the embeddings
print("Embeddings:", embeddings)

# If you want to convert the embeddings to a numpy array
embeddings_np = np.array(embeddings)
print("Embeddings as numpy array:", embeddings_np)
