In [11]:
#FIRST: Install requirements.txt
#we first import Gensim library to use word2vec, and also NLTK to tokenize (could be done with any other library)
#See https://radimrehurek.com/gensim/models/word2vec.html for API details
from gensim.models import Word2Vec
from nltk.tokenize import NLTKWordTokenizer, sent_tokenize
from nltk.corpus import stopwords
import stanza
#Input is a list containing lists of sentences.
#See from gensim.test.utils import common_texts for a sample text

#We will work with the Gutenberg datasets from NLTK as an example
from nltk.corpus import gutenberg

txt = gutenberg.raw("burgess-busterbrown.txt")
#Tokenize sentences first.
sentences = sent_tokenize(txt)
#We remove the title
sentences = sentences[1:]

#Normalize the input text by removing stopwords
word_tokenizer = NLTKWordTokenizer()
stop_words = set(stopwords.words('english'))
normalized_sentences = []
normalized_stanza = []
#We compare NLTK with Stanza to see differences and add POS to lemma.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
for txt in sentences:
    tkns = word_tokenizer.tokenize(txt)
    tkns = [''.join(t.split('-')).lower() for t in tkns if
		        t not in stop_words and t not in '@.,!#$%*:;"' and len(t)>2]
    sent = nlp(' '.join(tkns))
    sent = [s.lemma for s in sent.iter_words()]
    normalized_sentences.append(sent)
    sentence = nlp(txt)
    lemmas = [w.lemma for w in sentence.iter_words() if w.text not in stop_words and w.text not in '@.,!#$%*:;"' and len(w.text)>2]
    normalized_stanza.append(lemmas)
#Print a single sentence to see the result of the word tokenization, normalization, and lemmatization from both libraries
print(f" NLTK result: {normalized_sentences[0]}")
print(f" Stanza result: {normalized_stanza[0]}")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   â€¦

2022-06-13 13:56:07 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2022-06-13 13:56:07 INFO: Use device: cpu
2022-06-13 13:56:07 INFO: Loading: tokenize
2022-06-13 13:56:07 INFO: Loading: pos
2022-06-13 13:56:07 INFO: Loading: lemma
2022-06-13 13:56:07 INFO: Done loading processors!


 NLTK result: ['once', 'yawn', 'slowly', 'get', 'foot', 'shake']
 Stanza result: ['once', 'yawn', 'slowly', 'get', 'foot', 'shake']


In [14]:
#Word2vec parameterization is simple. Just provide list of lists of the sentences, the dimension of the output vector (100), window for the skip-gram
# and the minimum count for words in the corpus (sentences list).
model = Word2Vec(sentences=normalized_stanza, vector_size=100, window=3, min_count=3)
model.wv.similar_by_word('boy', 10)

[('Buster', 0.9837964177131653),
 ('one', 0.9802712202072144),
 ('little', 0.9796777367591858),
 ('could', 0.9788732528686523),
 ('Farmer', 0.9780943393707275),
 ('know', 0.9780401587486267),
 ('Brown', 0.97797691822052),
 ('berry', 0.9773219227790833),
 ('see', 0.9767408967018127),
 ('Green', 0.9766647219657898)]