- https://stackabuse.com/implementing-word2vec-with-gensim-library-in-python/

# Creating Corpus

In [1]:
import bs4 as bs
import urllib.request
import re
import nltk

scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scrapped_data .read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

In [2]:
# Cleaing the text
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )
processed_article = re.sub(r'\s+', ' ', processed_article)

# Preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

In [3]:
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=2)

In [4]:
vocabulary = word2vec.wv.vocab
print(vocabulary)

{'artificial': <gensim.models.keyedvectors.Vocab object at 0x7f56e523ae50>, 'intelligence': <gensim.models.keyedvectors.Vocab object at 0x7f56cfb4fd90>, 'ai': <gensim.models.keyedvectors.Vocab object at 0x7f56e523ac90>, 'sometimes': <gensim.models.keyedvectors.Vocab object at 0x7f56e523a990>, 'called': <gensim.models.keyedvectors.Vocab object at 0x7f56db80f1d0>, 'machine': <gensim.models.keyedvectors.Vocab object at 0x7f56cf298b10>, 'machines': <gensim.models.keyedvectors.Vocab object at 0x7f56cf29cc50>, 'unlike': <gensim.models.keyedvectors.Vocab object at 0x7f56cf23f150>, 'natural': <gensim.models.keyedvectors.Vocab object at 0x7f56e523c7d0>, 'displayed': <gensim.models.keyedvectors.Vocab object at 0x7f56db80f250>, 'humans': <gensim.models.keyedvectors.Vocab object at 0x7f56cf23f1d0>, 'animals': <gensim.models.keyedvectors.Vocab object at 0x7f56cf23f210>, 'leading': <gensim.models.keyedvectors.Vocab object at 0x7f56cf23f250>, 'field': <gensim.models.keyedvectors.Vocab object at 0x7f5

In [6]:
v1 = word2vec.wv['artificial']
v1

array([ 5.0177574e-03,  2.4320385e-03,  1.8794870e-03,  9.2189794e-04,
       -2.1354912e-03,  2.8065590e-03, -4.1626659e-03, -1.5642005e-03,
        1.6652644e-03, -4.3476196e-03, -2.8863670e-03,  5.2541536e-03,
       -6.2183738e-03,  9.1175683e-04, -1.6191517e-03,  2.3051801e-03,
        6.5827179e-03, -2.8607743e-03, -5.3833891e-03,  1.4575451e-03,
       -2.9212818e-03, -2.3323069e-04,  4.0949234e-03,  7.2474005e-03,
       -2.0064372e-03,  8.9203025e-04, -1.6455882e-03, -6.3128369e-03,
       -8.0654027e-05, -3.8618413e-03, -2.6747801e-03,  2.4157778e-05,
        1.0054902e-03,  4.9030292e-03,  2.3725696e-03, -6.2384404e-04,
        3.1903249e-03,  1.3244352e-03,  2.1307177e-03, -3.1478759e-03,
        6.0739316e-05,  2.9931623e-03,  5.4600318e-03,  2.7271262e-03,
        2.4184491e-03, -4.2734416e-03,  6.7854562e-06, -4.3567019e-03,
       -4.0742080e-03,  3.9134310e-03, -1.2742841e-03, -1.8277967e-03,
        3.4407625e-04,  1.2418759e-03,  2.5405851e-03, -3.8736355e-03,
      

In [8]:
sim_words = word2vec.wv.most_similar('intelligence')
sim_words

[('data', 0.5216653347015381),
 ('human', 0.4927040934562683),
 ('machine', 0.4918769598007202),
 ('ai', 0.4840529263019562),
 ('humans', 0.468401700258255),
 ('one', 0.458925724029541),
 ('computer', 0.44762176275253296),
 ('general', 0.44215720891952515),
 ('intelligent', 0.4375338554382324),
 ('even', 0.4344313442707062)]