In [5]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

corpus=["Word embeddings are powerful.", "TThey capture semantic relationships.", "Word2Vec is a popular model."]

tokenized_corpus=[word_tokenize(sentence.lower()) for sentence in corpus]

w2v_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, sg=1, min_count=1)

neg_sampling_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, sg=1, negative=5, min_count=1)

word_to_predict = "word"
next_word_w2v=w2v_model.wv.most_similar(word_to_predict, topn=1)[0][0]
next_word_neg_sampling = neg_sampling_model.wv.most_similar(word_to_predict, topn=1)[0][0]

print("Next Word (Word2Vec) : ", next_word_w2v)
print("Next Word (Negative Sampling) : ", next_word_neg_sampling)

Next Word (Word2Vec) :  is
Next Word (Negative Sampling) :  is


In [17]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity


corpus = ["Word embeddings are powerful.", "They capture semantic relationships.", "Count-based methods are efficient."]

tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]

vocab = set(word for sentence in tokenized_corpus for word in sentence)
word_to_index = {word: i for i, word in enumerate(vocab)}
co_occurance_matrix = np.zeros((len(vocab), len(vocab)))

window_size = 2

for sentence in tokenized_corpus:
    for i, target_word in enumerate(sentence):
        start = max(0, i - window_size)
        end = min(len(sentence), i+window_size + 1)
        context_words = [sentence[j] for j in range(start, end) if j != i ]
        target_index = word_to_index[target_word]
        for context_word in context_words:
            context_index = word_to_index[context_word]
            co_occurance_matrix[target_index, context_index] += 1

print("Co-occurances Matrix : ")
print(co_occurance_matrix)

similarity_matrix = cosine_similarity(co_occurance_matrix)

print("Cosine Similarity Matrix : ")
print(similarity_matrix)

Co-occurances Matrix : 
[[0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 2.]
 [0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1.]
 [1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1.]
 [2. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0.]]
Cosine Similarity Matrix : 
[[1.         0.36514837 0.         0.54772256 0.36514837 0.2236068
  0.         0.31622777 0.36514837 0.2236068  0.54772256 0.2236068 ]
 [0.36514837 1.         0.81649658 0.33333333 0.         0.
  0.33333333 0.57735027 0.         0.         0.33333333 0.20412415]
 [0.         0.81649658 1.         0.         0.         0.
  0.40824829 0.35355339 0.         0.         0.         0.25      ]
 [0.54772256 0.33333333 0.         1.         0.33333333 0.81649658
  0.         0.28