In [11]:
import nltk
print(nltk.__version__)
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
nltk.download('words')
nltk.download('punkt')

#stores every word in english dictionary into a list called 'words'. It imports strings so we have to tokenize them
words = nltk.corpus.words.words()

#tokenizing the words - puts them into individual words so we can vectorize them
tokenized_words = [word_tokenize(word) for word in words]

#list comprehension checks if its alphabetic with isalpha(),
#if so it puts it in lowercase and stores it in processed_words
#the preprocessing of the words cleans them so that they can be effectively used in nlp
processed_words = [word.lower() for words in tokenized_words for word in words if word.isalpha()]

#training the model, min_count=1 ensures that each word appear one time only
#wordtovec maps words to their vector representation
#wordtovec is a neural network, each word is a neuron, probability correction improves overtime
wordToVecModel = Word2Vec(processed_words, min_count=1)

print("working cell")

3.8.1


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


working cell


In [12]:
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

def findKNearestNeighbor(wordInput, wordToVecModel, numNeighbors):

    
    #if word is in the wordtovec model...
    if wordInput in wordToVecModel.wv.key_to_index:
    #first get vector representation of the word that we input so that we can...
    #now find the cosine similarities of this and all other words to perform KNN algorithm
    #cosine similarities are essential in natural language processing, they are performed on vectors
        word_vector = wordToVecModel.wv.word_vec(wordInput)
        print("Input word is PRESENT in model vocabulary.")
    # word not found
    else:
        print("Input word NOT found in model vocabulary.")
        
    #this is all the vectors inside the wordToVecModel that we created containing every english word
    all_word_vectors = wordToVecModel.wv.vectors

    #we are going to pass in the number of neighbors below.
    #metric is the distance metric, we are using cosine as the distance metric to find similarities
    #model is being trained on the variable 'all_word_vectors'
    neighbors = NearestNeighbors(numNeighbors, metric='cosine').fit(all_word_vectors)

    # kneighbors returns a 2d array with the distances and the indices. 
    # distances is index 0 / [0]
    # indices is index 1 / [1]
    #storing this info in the variable 'nearest_neighbors_info'
    nearest_neighbors_info = neighbors.kneighbors([word_vector])

    #extracting the information and storing it into two variables 
    # 'distances' variable tells us how far each of the most similar words are from the input word in terms of cosine similarity. 
    #distances = nearest_neighbors_info[0]

    #indices is referring to the closest indices to the input word
    indices = nearest_neighbors_info[1]

    #list comprehension
    # get the most similar words and putting them in list of strings
    #we have to remove the input word because we dont want it as a kneighbor
    similar_words = [wordToVecModel.index_to_key[i] for i in indices[0] if wordToVecModel.index_to_key[i] != wordInput]

    return similar_words


In [7]:
similar_words = findKNearestNeighbor('hello', wordToVecModel, 5)
print(similar_words)

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [None]:
#creating file for all  words

filename = "allwords.txt"

file = open(filename, "a")

#converting all the words from corpus to string format so we can put in file
words_str = str(words)

#saving to file
file.write(words_str)

file.flush()
file.close

In [None]:
#creating file for all processed words

filename = "allprocessedwords.txt"

file = open(filename, "a")

#converting all the tokenized words to string format so we can put in file
processed_words_str = str(processed_words)

#saving to file
file.write(processed_words_str)

file.flush()
file.close

In [None]:
#creating file for all tokenized words

filename = "alltokenizedwords.txt"

file = open(filename, "a")

#converting all the tokenized words to string format so we can put in file
tokenized_words_str = str(tokenized_words)

#saving to file
file.write(tokenized_words_str)

file.flush()
file.close