In [38]:
import nltk
from nltk.corpus import movie_reviews
from gensim.models import Word2Vec
import random

# Download corpus
nltk.download('movie_reviews')

# Load and tokenize documents
documents = [
    list(movie_reviews.words(fileid))
    for fileid in movie_reviews.fileids()
]

# Shuffle words within each doc to prevent overfitting to order
random.shuffle(documents)

# Print sample
print(f"Number of reviews: {len(documents)}")



[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\AHINA\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Number of reviews: 2000


In [39]:
# Train Word2Vec on movie reviews
model = Word2Vec(
    sentences=documents,
    vector_size=100,    # 100-dimensional embeddings
    window=5,           # Context window
    min_count=2,        # Ignore rare words
    workers=4,          # Parallel threads
    sg=1                # Skip-gram (1) or CBOW (0)
)

# Save model if needed
# model.save("movie_reviews_word2vec.model")


In [40]:
result = model.wv.most_similar(positive=["king", "woman"], negative=["man"])
print(result)  # Should return something like "queen"


[('sidney', 0.6609278321266174), ('queen', 0.6380879878997803), ('jane', 0.6356750726699829), ('neve', 0.6116956472396851), ('roman', 0.6087451577186584), ('bud', 0.6077442169189453), ('ricci', 0.6057746410369873), ('rachel', 0.6050881147384644), ('linda', 0.6034192442893982), ('manager', 0.6031489968299866)]


In [42]:
# Most similar to 'good'
print("\nWords similar to 'good':")
for word, sim in model.wv.most_similar('good', topn=5):
    print(f"{word}: {sim:.2f}")




Words similar to 'good':
great: 0.75
decent: 0.74
bad: 0.70
passable: 0.68
terrible: 0.68


In [43]:
from gensim.models import Word2Vec

# List of stop words to remove (articles, prepositions, verbs)
stop_words = set([
    "a", "an", "the", "is", "are", "was", "were", "in", "on", "and", "of", "to", "with", "for", 
    "has", "have", "had", "be", "as", "by", "at", "from", "that", "this", "it", "he", "she", "they",
    "you", "i", "we", "but", "or", "not", "so", "if", "then", "when", "which", "who", "whom",
    "rules", "rides", "drives", "live", "married"
])

# Expanded corpus with more sentences
sentences = [
    ["king", "rules", "a", "kingdom"],
    ["queen", "rules", "a", "kingdom", "with", "wisdom"],
    ["queen", "is", "a", "woman"],
    ["man", "is", "strong"],
    ["woman", "is", "wise"],
    ["prince", "is", "the", "son", "of", "a", "king"],
    ["princess", "is", "the", "daughter", "of", "a", "queen"],
    ["the", "king", "and", "queen", "live", "in", "a", "palace"],
    ["the", "queen", "is", "married", "to", "the", "king"],
    ["the", "man", "drives", "a", "car"],
    ["the", "woman", "rides", "a", "bicycle"],
    ["dogs", "are", "loyal", "animals"],
    ["cats", "are", "independent", "animals"],
    ["paris", "is", "the", "capital", "of", "france"],
    ["london", "is", "the", "capital", "of", "england"],
    ["apple", "is", "a", "fruit"],
    ["carrot", "is", "a", "vegetable"],
    ["python", "is", "a", "programming", "language"],
    ["java", "is", "a", "programming", "language"],
    ["dog", "barks", "loudly"],
    ["cat", "sleeps", "all", "day"],
    ["prince", "wears", "a", "crown"],
    ["princess", "wears", "a", "tiara"],
    ["kingdom", "has", "many", "subjects"],
    ["queen", "loves", "her", "people"],
    ["man", "builds", "a", "house"],
    ["woman", "plants", "a", "garden"],
    ["children", "play", "in", "the", "park"],
]

# Remove stop words from each sentence
filtered_sentences = [
    [word for word in sentence if word not in stop_words]
    for sentence in sentences
]

# Train Word2Vec model on filtered sentences
model = Word2Vec(filtered_sentences, vector_size=20, window=3, min_count=1, workers=2, sg=1)

# Show vector for 'king'
print("Word vector for 'king':")
print(model.wv['king'])

# Similar words to 'king'
print("\nWords similar to 'king':")
for word, sim in model.wv.most_similar('king', topn=5):
    print(f"{word}: {sim:.2f}")

# Word analogy: king - man + woman = ?
print("\nAnalogy: 'king' - 'man' + 'queen' ≈ ?")
result = model.wv.most_similar(positive=['king', 'queen'], negative=['man'], topn=1)
print(f"Result: {result[0][0]} ({result[0][1]:.2f})")


Word vector for 'king':
[ 0.03656607  0.02534573  0.03378296  0.00381261  0.03175386 -0.01702927
 -0.00473246  0.02884854 -0.03760872 -0.01968175 -0.03755583 -0.00465655
  0.04769655 -0.03658936 -0.01167611 -0.00968685  0.04039469 -0.02966029
  0.00022053 -0.02377453]

Words similar to 'king':
her: 0.46
house: 0.45
paris: 0.44
dog: 0.40
crown: 0.40

Analogy: 'king' - 'man' + 'queen' ≈ ?
Result: palace (0.57)
