In [14]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
import numpy as np

# Read data from file
# NOTE(review): absolute, machine-specific path — point this at your local copy
# of the corpus before running on another machine.
file_path = "/media/bhushan/Data/EDU/DL/Practicals/LP-IV-datasets/CBOW/CBOW.txt"
# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default locale encoding (assumes the corpus file is UTF-8 — confirm).
with open(file_path, 'r', encoding="utf-8") as file:
    document = file.read()

# Data preparation
# Seed Python's `random`, NumPy and TensorFlow in one call so the sampled
# training pairs are the same on every Restart & Run All.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

tokenizer = Tokenizer()
tokenizer.fit_on_texts([document])
# Tokenizer word ids start at 1, so the embedding table needs one extra row
# to cover id 0.
total_words = len(tokenizer.word_index) + 1

# Generate training data
# `document` is passed as a single text, so take the one resulting id sequence.
sequences = tokenizer.texts_to_sequences([document])[0]
# X is a list of (target, context) id pairs; y holds the matching 1/0 labels
# (1 = real neighbour inside the window, 0 = randomly drawn negative sample).
X, y = skipgrams(
    sequences,
    vocabulary_size=total_words,
    window_size=2,
    negative_samples=1.0,
    seed=SEED,
)

# Fail with a clear message instead of an opaque unpacking error from zip(*X)
# when the corpus is too small to yield any pair.
if not X:
    raise ValueError(
        "skipgrams() produced no training pairs; the corpus needs at least two tokens"
    )

# Split the pairs into two parallel id arrays, one per model input.
X_word_target, X_word_context = zip(*X)
X_word_target = np.array(X_word_target, dtype="int32")
X_word_context = np.array(X_word_context, dtype="int32")
y = np.array(y, dtype="int32")

# Train model
# Fix the RNG state so weight initialisation and batch shuffling are repeatable.
tf.keras.utils.set_random_seed(42)

embedding_dim = 50
EPOCHS = 10
BATCH_SIZE = 16

# Each input carries a single integer word id per sample.
word_target_input = tf.keras.layers.Input(shape=(1,), dtype="int32")
word_context_input = tf.keras.layers.Input(shape=(1,), dtype="int32")

# One embedding table is shared by the target and the context word.
# (`input_length` is deprecated and not needed here, so it is omitted.)
word_embedding_layer = tf.keras.layers.Embedding(total_words, embedding_dim)
word_target_embedding = word_embedding_layer(word_target_input)
word_context_embedding = word_embedding_layer(word_context_input)

# Dot product over the embedding axis gives one score per pair with shape
# (batch, 1, 1); the Reshape flattens it to (batch, 1) to line up with `y`.
merged = tf.keras.layers.dot([word_target_embedding, word_context_embedding], axes=-1)
merged = tf.keras.layers.Reshape((1,))(merged)

model = tf.keras.models.Model(inputs=[word_target_input, word_context_input], outputs=merged)
# The raw dot product is regressed onto the 0/1 pair labels.
model.compile(optimizer='adam', loss='mean_squared_error')

model.fit([X_word_target, X_word_context], y, epochs=EPOCHS, batch_size=BATCH_SIZE)

# Output
# Read the trained weights straight from the shared embedding layer rather
# than through a positional index into model.layers, which silently points at
# the wrong layer if the architecture is ever rearranged.
word_embeddings = word_embedding_layer.get_weights()[0]
print("Word Embeddings Shape:", word_embeddings.shape)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Word Embeddings Shape: (103, 50)


In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to get word embedding
def get_embedding(word):
    """Return the trained embedding vector for ``word``.

    The word is looked up in ``tokenizer.word_index`` and the resulting
    integer id is used as a row index into ``word_embeddings``.

    Args:
        word: Token to look up. If the exact string is not in the vocabulary,
            its lower-cased form is tried as a fallback.

    Returns:
        The row of ``word_embeddings`` belonging to the word.

    Raises:
        KeyError: If neither the word nor its lower-cased form is in the
            tokenizer vocabulary.
    """
    vocabulary = tokenizer.word_index
    if word not in vocabulary:
        # Tokenizer() lower-cases text unless told otherwise, so "Virus"
        # should resolve to the entry stored for "virus".
        fallback = word.lower() if isinstance(word, str) else word
        if fallback not in vocabulary:
            raise KeyError(f"'{word}' is not in the tokenizer vocabulary")
        word = fallback
    return word_embeddings[vocabulary[word]]

# Example similarity task
def evaluate_similarity(word1, word2):
    """Compute the cosine similarity between the embeddings of two words.

    Args:
        word1: First word; resolved to a vector via ``get_embedding``.
        word2: Second word; resolved to a vector via ``get_embedding``.

    Returns:
        The single similarity value for the pair.
    """
    # cosine_similarity works on 2-D inputs, so each vector becomes a one-row
    # matrix before the call.
    row_a, row_b = (get_embedding(token).reshape(1, -1) for token in (word1, word2))

    # The result is a 1x1 matrix; its only entry is the score.
    score_matrix = cosine_similarity(row_a, row_b)
    return score_matrix[0, 0]

# Example evaluation
# Word pairs whose embedding similarity is reported below.
word_pairs_to_evaluate = [
    ("virus", "influenza"),
    ("spread", "transmission"),
    ("virus", "illness"),
]

# Print one line per pair, with the score rounded to four decimals.
for pair in word_pairs_to_evaluate:
    word1, word2 = pair
    similarity_score = evaluate_similarity(*pair)
    print(f"Similarity between '{word1}' and '{word2}': {similarity_score:.4f}")


Similarity between 'virus' and 'influenza': 0.4428
Similarity between 'spread' and 'transmission': 0.2381
Similarity between 'virus' and 'illness': 0.0898
