In [1]:
import pandas as pd
import re
import nltk
import bs4 as bs
import urllib.request 

In [19]:
# Scrape the programming languages wiki page

data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Elon_Musk')

article = data.read()

parsed_article = bs.BeautifulSoup(article, 'lxml')

paragraphs = parsed_article.find_all('p')

# create text corpus from cleaned paragraphs
article_text = ''

for p in paragraphs:
    article_text += p.text

In [20]:
# cleaning text

processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article)
processed_article = re.sub(r'\s+', ' ', processed_article)

# prepare dataset
all_sentences = nltk.sent_tokenize(processed_article)
all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# remove stop words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

In [21]:
# creating word2vec model

from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=3)

In [22]:
vocabulary = word2vec.wv.vocab
print(vocabulary)

{'elon': <gensim.models.keyedvectors.Vocab object at 0x000001F5F61835C0>, 'musk': <gensim.models.keyedvectors.Vocab object at 0x000001F5F61836D8>, 'l': <gensim.models.keyedvectors.Vocab object at 0x000001F5F6183748>, 'n': <gensim.models.keyedvectors.Vocab object at 0x000001F5F6183828>, 'born': <gensim.models.keyedvectors.Vocab object at 0x000001F5F6183860>, 'june': <gensim.models.keyedvectors.Vocab object at 0x000001F5F6183978>, 'engineer': <gensim.models.keyedvectors.Vocab object at 0x000001F5F61839B0>, 'designer': <gensim.models.keyedvectors.Vocab object at 0x000001F5F6183A58>, 'technology': <gensim.models.keyedvectors.Vocab object at 0x000001F5F6183B00>, 'entrepreneur': <gensim.models.keyedvectors.Vocab object at 0x000001F5F6183B70>, 'south': <gensim.models.keyedvectors.Vocab object at 0x000001F5F6183BA8>, 'africa': <gensim.models.keyedvectors.Vocab object at 0x000001F5F6183C50>, 'canada': <gensim.models.keyedvectors.Vocab object at 0x000001F5F6183C88>, 'united': <gensim.models.keye

In [24]:
v1 = word2vec.wv['tesla']

In [25]:
v1

array([ 3.2290092e-04,  9.0718165e-04, -4.0574181e-03,  6.8413331e-03,
        5.5073597e-03,  4.1572661e-03, -4.2189304e-03, -2.0862122e-03,
       -8.0087013e-04,  2.1556870e-03, -1.2840928e-03,  7.4201515e-03,
       -6.4167571e-03,  3.1787360e-03, -1.2865169e-03,  1.3989299e-03,
       -4.3854042e-04, -4.3445588e-03, -2.0573223e-03,  3.8512014e-03,
        1.8826320e-03,  2.8965704e-03, -4.0830807e-03,  5.1398325e-04,
        1.9806325e-03, -8.0483983e-04,  2.0172328e-03, -1.1830048e-03,
       -4.9365632e-04, -1.2476097e-03, -2.3615621e-03,  5.0354032e-03,
       -5.9737911e-04,  3.7226644e-03,  2.8310388e-03,  1.3359358e-04,
        4.5942068e-03,  4.2244559e-03,  1.7745069e-03, -4.0687774e-03,
       -6.7639453e-03,  1.3370616e-03, -5.7311240e-03, -4.7482317e-03,
       -2.5952565e-03,  2.1039208e-03, -4.2042937e-03, -9.4812523e-05,
       -2.0058225e-03, -8.8955462e-03,  6.0227071e-04, -4.0598535e-03,
        5.7038342e-05, -3.1283984e-03,  1.8047943e-04,  1.7744893e-03,
      

In [27]:
# most similar words to python

sim_words = word2vec.wv.most_similar('tesla')
print(sim_words)

[('musk', 0.6153647899627686), ('company', 0.5795050859451294), ('want', 0.4891301393508911), ('july', 0.46000516414642334), ('university', 0.45283210277557373), ('year', 0.45128148794174194), ('us', 0.44260120391845703), ('would', 0.4321960210800171), ('saying', 0.42996811866760254), ('world', 0.4294130802154541)]
