In [1]:
# -------------------------------
# Alternate Word Embedding using Gensim
# -------------------------------
# Trains a small skip-gram Word2Vec model on two sentences, then prints the
# first components of the vector for 'nlp' and two cosine similarities.

import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# word_tokenize needs NLTK's Punkt tokenizer data. Fetch it in this cell so the
# cell also runs on a fresh kernel/machine instead of relying on a download
# performed by a later cell. Older NLTK releases load 'punkt'; recent releases
# load 'punkt_tab' instead, so both are requested.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

corpus = [
    "I am loving the NLP class, but sometimes it feels confusing!",
    "NLP is a fascinating field — it deals with text, speech, and language understanding."
]

# Step 1: Tokenize corpus (lower-cased; punctuation marks remain as their own
# tokens and therefore become vocabulary entries)
tokens = [word_tokenize(sent.lower()) for sent in corpus]

# Step 2: Train Word2Vec model
#   min_count=1 -> keep every token, even those that occur only once
#   sg=1        -> Skip-gram
#   seed=1      -> gensim's default seed, stated explicitly
#   workers=1   -> single training thread, so thread scheduling cannot reorder
#                  updates between runs (for identical results across
#                  interpreter launches PYTHONHASHSEED must be fixed as well)
model = Word2Vec(
    tokens,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

# Step 3: Display word vectors
print("\nWord Embedding for 'nlp':\n", model.wv['nlp'][:5])  # first 5 numbers

# Step 4: Check similarity between words (cosine similarity of the two vectors)
print("\nSimilarity(nlp, language):", model.wv.similarity('nlp', 'language'))
print("Similarity(nlp, class):", model.wv.similarity('nlp', 'class'))



Word Embedding for 'nlp':
 [-0.01630604  0.00899287 -0.00826389  0.00163419  0.0169904 ]

Similarity(nlp, language): -0.31977788
Similarity(nlp, class): -0.119261764


In [3]:
# --------------------------------------
# Q.4: Word2Vec Implementation in Python
# --------------------------------------
# Tokenizes two example sentences, trains a skip-gram Word2Vec model on them,
# and prints the vocabulary, the vector for 'nlp', and its nearest neighbours.

# Step 1: Import libraries
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' resource for word_tokenize instead
# of 'punkt'; requesting both keeps this cell working across NLTK versions.
nltk.download('punkt_tab')

# Step 2: Define the corpus (same two sentences)
doc1 = "I am loving the NLP class, but sometimes it feels confusing!"
doc2 = "NLP is a fascinating field — it deals with text, speech, and language understanding."

# Step 3: Tokenize each document (convert lower-cased text → list of tokens).
# Punctuation marks are kept as separate tokens, so they end up in the
# model vocabulary alongside the words.
tokens_doc1 = word_tokenize(doc1.lower())
tokens_doc2 = word_tokenize(doc2.lower())

# Step 4: Combine all tokenized documents into a single list for training
corpus = [tokens_doc1, tokens_doc2]

# Step 5: Train the Word2Vec model
model = Word2Vec(
    sentences=corpus,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)
# vector_size = dimensionality of word vectors
# window = maximum distance between the current and the predicted word
# min_count = ignore words with total frequency below this (1 keeps every token)
# sg = 1 means use Skip-gram (0 would mean CBOW)
# seed = 1 is gensim's default seed, stated explicitly
# workers = 1 trains on a single thread so thread scheduling cannot change the
#           result between runs (identical results across interpreter launches
#           additionally require a fixed PYTHONHASHSEED)

# Step 6: Display vocabulary words
print("\nVocabulary in Model:")
print(list(model.wv.index_to_key))

# Step 7: Display vector representation of a word
print("\nVector for word 'nlp':")
print(model.wv['nlp'])

# Step 8: Find most similar words to a given word
# (returns (token, cosine similarity) pairs, best match first)
print("\nMost similar words to 'nlp':")
print(model.wv.most_similar('nlp'))



Vocabulary in Model:
[',', 'nlp', 'it', '.', 'understanding', 'am', 'loving', 'the', 'class', 'but', 'sometimes', 'feels', 'confusing', '!', 'is', 'a', 'fascinating', 'field', '—', 'deals', 'with', 'text', 'speech', 'and', 'language', 'i']

Vector for word 'nlp':
[-0.01630604  0.00899287 -0.00826389  0.00163419  0.0169904  -0.00893342
  0.00902868 -0.0135695  -0.00710388  0.01878386 -0.00314644  0.00063232
 -0.00826854 -0.015368   -0.00301742  0.00493723 -0.00176314  0.0110779
 -0.00549505  0.00451258  0.0109021   0.01669286 -0.00289699 -0.01841494
  0.00873734  0.00114928  0.01488332 -0.00162273 -0.00527408 -0.01750709
 -0.00172094  0.00564497  0.0108108   0.0141087  -0.01141115  0.00371566
  0.01218499 -0.00960279 -0.00622708  0.01358758  0.00325974  0.00037814
  0.00694768  0.00043663  0.01923922  0.01011519 -0.01783127 -0.01408236
  0.00179584  0.0127875 ]

Most similar words to 'nlp':
[('fascinating', 0.22991271317005157), ('i', 0.21886946260929108), ('deals', 0.16039878129959106

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
