### Problem Statement

  - Implement Word2Vec using the Gensim library to generate word embeddings for
a given text corpus.

### Importing Modules

In [86]:
import gensim
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec

# Fetch the NLTK resources this notebook relies on; per the log output they are
# stored under the local nltk_data directory and skipped when already up to date.
# 'punkt' is presumably the model behind sent_tokenize / word_tokenize, and
# 'stopwords' is read through nltk.corpus.stopwords further down.
# NOTE(review): 'wordnet' is not used by any cell visible here — confirm it is needed.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Tokenization and Stopwords Removal

In [87]:
# Sample corpus: a three-sentence paragraph about Borussia Dortmund.
# Adjacent string literals are concatenated by the parser, so the resulting
# value is one single-line string with no embedded newlines.
text = (
    "Ballspielverein Borussia 09 e. V. Dortmund, often known simply as "
    "Borussia Dortmund or by its initialism BVB, is a German professional "
    "sports club based in Dortmund, North Rhine-Westphalia. "
    "It is best known for its men's professional football team, which plays "
    "in the Bundesliga, the top tier of the German football league system. "
    "The club have won eight league championships, five DFB-Pokals, one UEFA "
    "Champions League, one Intercontinental Cup, and one UEFA Cup Winners' Cup."
)

In [88]:
# Split the raw paragraph into a list of sentence strings.
sentences = sent_tokenize(text)
# Bare last expression so the notebook displays the resulting list.
sentences

['Ballspielverein Borussia 09 e. V. Dortmund, often known simply as Borussia Dortmund or by its initialism BVB, is a German professional sports club based in Dortmund, North Rhine-Westphalia.',
 "It is best known for its men's professional football team, which plays in the Bundesliga, the top tier of the German football league system.",
 "The club have won eight league championships, five DFB-Pokals, one UEFA Champions League, one Intercontinental Cup, and one UEFA Cup Winners' Cup."]

In [89]:
# English stopword list used for filtering tokens below.
# This cell rebinds the name `stopwords` from the imported corpus reader to a
# plain list, so writing `stopwords.words(...)` here would raise AttributeError
# the second time the cell runs. Reaching the reader through `nltk.corpus`
# keeps the cell idempotent while leaving the downstream name unchanged.
stopwords = nltk.corpus.stopwords.words("english")

In [90]:
# Build the lookup set once: set membership is O(1) per token, whereas scanning
# the stopword list is linear for every token.
stopword_set = set(stopwords)

# One token list per sentence. A token is kept only if it is purely
# alphanumeric (this drops punctuation as well as hyphenated or possessive
# tokens) and is not a stopword. The NLTK English stopword list is lowercase,
# so the token is lowercased for the comparison only — otherwise
# sentence-initial stopwords such as "It" or "The" would slip through. The
# original casing is preserved in the output.
clean_sents = [
    [
        word
        for word in word_tokenize(sent)
        if word.isalnum() and word.lower() not in stopword_set
    ]
    for sent in sentences
]

clean_sents[0]

['Ballspielverein',
 'Borussia',
 '09',
 'Dortmund',
 'often',
 'known',
 'simply',
 'Borussia',
 'Dortmund',
 'initialism',
 'BVB',
 'German',
 'professional',
 'sports',
 'club',
 'based',
 'Dortmund',
 'North']

### Gensim's Word2Vec Model

Generating the Model

In [91]:
# Train Word2Vec embeddings on the tokenised, stopword-filtered sentences.
# NOTE(review): training with several worker threads is presumably not
# bit-for-bit reproducible between runs — confirm against the Gensim docs if
# exact vectors matter.
model = Word2Vec(
    sentences=clean_sents,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # the corpus is tiny, so keep words that occur only once
    workers=4,        # number of training threads
)

Saving the Model

In [92]:
# Persist the trained model to "word2vec.bin", a path relative to the current
# working directory.
# NOTE(review): despite the .bin extension this is presumably Gensim's own
# save format rather than the original word2vec binary format — confirm before
# loading the file with other tools.
model.save("word2vec.bin")

Loading the Model and Looking Up a Word Vector

In [93]:
# Restore the model that was written to disk in the previous step.
load = Word2Vec.load("word2vec.bin")

# Query the reloaded model rather than the in-memory `model`, so this cell
# actually exercises the save/load round trip. The lookup returns the
# embedding vector for the token "Borussia".
load.wv["Borussia"]

array([-8.7285992e-03,  2.1308314e-03, -8.7997003e-04, -9.3094213e-03,
       -9.4442023e-03, -1.4172612e-03,  4.4358550e-03,  3.7120164e-03,
       -6.5057087e-03, -6.8811560e-03, -4.9921693e-03, -2.2841506e-03,
       -7.2565223e-03, -9.5993383e-03, -2.7467555e-03, -8.3703566e-03,
       -6.0418118e-03, -5.6872773e-03, -2.3438581e-03, -1.7123491e-03,
       -8.9622727e-03, -7.3357276e-04,  8.1697078e-03,  7.6938346e-03,
       -7.1973787e-03, -3.6777905e-03,  3.1199115e-03, -9.5713865e-03,
        1.4695568e-03,  6.5334365e-03,  5.7690898e-03, -8.7826597e-03,
       -4.5038909e-03, -8.1659770e-03,  4.6790839e-05,  9.2641562e-03,
        5.9691351e-03,  5.0724158e-03,  5.0668078e-03, -3.2551985e-03,
        9.5397616e-03, -7.3686596e-03, -7.2838929e-03, -2.2712594e-03,
       -7.6921302e-04, -3.2213675e-03, -5.9523829e-04,  7.4973004e-03,
       -6.9841626e-04, -1.6188443e-03,  2.7433424e-03, -8.3556585e-03,
        7.8516593e-03,  8.5396161e-03, -9.5815444e-03,  2.4403918e-03,
      

<hr><hr>