# Implementing Word2Vec with Gensim

In [None]:
# Parsing Imports
import bs4 as bs
import urllib.request
import re
import nltk

# Gensim Library
from gensim.models import Word2Vec


### *Scraping Data off Wikipedia to use as Corpus*

In [None]:
# Download the raw HTML of the Wikipedia article. The `with` block closes the
# HTTP response as soon as the body has been read instead of leaving the
# connection open until garbage collection.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence') as scrapped_data:
    article = scrapped_data.read()

# Parse the page with BeautifulSoup using the lxml parser.
parsed_article = bs.BeautifulSoup(article, 'lxml')

# Only the <p> elements are collected as corpus text.
paragraphs = parsed_article.find_all('p')

# Concatenate the text of every paragraph into one string. str.join avoids
# the repeated string copies of `+=` in a loop.
article_text = "".join(p.text for p in paragraphs)

### *Pre-Processing*

In [None]:
# Cleaning the text
from nltk.corpus import stopwords


def _clean_text(text):
    """Lower-case *text*, replace every non-letter with a space, and collapse whitespace runs."""
    text = re.sub('[^a-zA-Z]', ' ', text.lower())
    return re.sub(r'\s+', ' ', text)


# Cleaned version of the whole article, kept under its original name.
processed_article = _clean_text(article_text)

# Preparing the dataset
# Split into sentences BEFORE cleaning: cleaning replaces the '.', '?' and '!'
# characters with spaces, so sentence-tokenizing the cleaned text would leave
# the tokenizer with no boundaries and return the whole article as a single
# "sentence".
all_sentences = [_clean_text(sent) for sent in nltk.sent_tokenize(article_text)]

# Removing Stop Words
# Build the stop-word collection once, as a set, instead of calling
# stopwords.words('english') again for every single token.
stop_words = set(stopwords.words('english'))

# One list of non-stop-word tokens per sentence.
all_words = [
    [w for w in nltk.word_tokenize(sent) if w not in stop_words]
    for sent in all_sentences
]

### *Creating Word2Vec Model*

In [None]:
# Train a Word2Vec model on the tokenized sentences. min_count=2 makes the
# model ignore words that occur fewer than two times in the corpus.
word2vec = Word2Vec(all_words, min_count=2)

# Gensim 4.0 removed `wv.vocab` in favour of `wv.key_to_index`. Use the new
# attribute when it exists and fall back to `vocab` on Gensim 3.x so the cell
# runs on either version.
keyed_vectors = word2vec.wv
if hasattr(keyed_vectors, 'key_to_index'):
    vocabulary = keyed_vectors.key_to_index
else:
    vocabulary = keyed_vectors.vocab
print(vocabulary)

### *Finding Vectors for a Word*

The vector `v1` holds the vector representation of the word "artificial". By default, Gensim's Word2Vec creates a 100-dimensional vector. This is much smaller than what a bag-of-words approach would produce: if we embedded the article with bag of words, each vector would have length 1206, since there are 1206 unique words with a minimum frequency of 2. If the minimum frequency of occurrence were set to 1, the bag-of-words vectors would grow even larger. Vectors generated by Word2Vec, on the other hand, are not affected by the size of the vocabulary.

In [None]:
# Creating and Getting vectors to our words
v1 = word2vec.wv['artificial']
print(v1)

### *Finding Similar Words*

In [None]:
# Ask the trained model for the words whose vectors are most similar to the
# query word's vector.
query_word = 'intelligence'
sim_words = word2vec.wv.most_similar(query_word)