In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

# Toy corpus: five short sentences about machine learning and word embeddings
sentences = [
    'I love machine learning',
    'Deep learning is a branch of machine learning',
    'Word embeddings are a type of word representation',
    'Machine learning is awesome',
    'Word2Vec is a popular word embedding method',
]

# Fit a tokenizer on the corpus to build the vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=sentences)

# Encode each sentence as a list of integer word ids, then pad at the end
# ('post') so that every row has the same length
sequences = tokenizer.texts_to_sequences(texts=sentences)
padded_sequences = pad_sequences(sequences=sequences, padding='post')

# Mapping word -> integer id learned by the tokenizer
word_index = tokenizer.word_index

# Parameters
vocab_size = len(word_index) + 1  # Adding 1 because index 0 is reserved (never assigned to a word)
embedding_dim = 8  # Dimension of the embedding space
input_length = padded_sequences.shape[1]  # Length of the padded input sequences
random_seed = 42  # Fixed seed so the learned embeddings are reproducible

# Seed Python, NumPy and TensorFlow before any weights are created; without
# this, every Restart & Run All yields different embedding values.
tf.keras.utils.set_random_seed(random_seed)

# Model
# The sequence length is declared through an explicit Input rather than the
# Embedding `input_length` argument, which recent Keras releases deprecate.
# Sequential does not list the input in `model.layers`, so `model.layers[0]`
# is still the Embedding layer.
model = Sequential([
    tf.keras.Input(shape=(input_length,)),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Flatten(),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy')

# Train the model. This demo has no real labels, so all-zero dummy targets are
# passed instead; the run only illustrates how an Embedding layer's weights
# get updated, not how to learn meaningful word vectors.
dummy_labels = np.zeros((len(sentences), 1))
model.fit(padded_sequences, dummy_labels, epochs=10, verbose=0)

# Pull the trained weight matrix out of the first layer (the Embedding layer);
# row k holds the vector for the word whose tokenizer id is k
embedding_layer = model.layers[0]
embeddings = embedding_layer.get_weights()[0]

# Print one line per vocabulary word with its learned vector
for word in word_index:
    i = word_index[word]
    print(f'Word: {word} - Embedding: {embeddings[i]}')




Word: learning - Embedding: [-0.01715189 -0.00602535 -0.05668322  0.04518725 -0.04940545 -0.00952972
  0.00926516  0.02820825]
Word: machine - Embedding: [ 0.01934711 -0.01436297  0.01959456 -0.00916991  0.00155818  0.05679836
  0.01975585 -0.01941023]
Word: is - Embedding: [-0.03567606  0.02820829  0.01609422  0.00899245 -0.03972582  0.04710207
 -0.01695937  0.04641515]
Word: a - Embedding: [-0.01446102 -0.00046867 -0.05311938  0.0120619  -0.01417905 -0.01092705
  0.0262667  -0.02308758]
Word: word - Embedding: [-0.04550141  0.00797239 -0.0342592  -0.04156759 -0.0074445  -0.00364592
  0.03261053 -0.047578  ]
Word: of - Embedding: [-0.02805678 -0.01814794  0.02440405 -0.02318072 -0.03416475 -0.02622538
 -0.03294413 -0.06069958]
Word: i - Embedding: [-0.00944831 -0.04214373 -0.04383896  0.01744683  0.02208924 -0.02028611
  0.04576056 -0.03880486]
Word: love - Embedding: [ 0.01807635  0.05958747 -0.02930271  0.03012785 -0.02237262  0.01292581
 -0.03431427  0.00385551]
Word: deep - Embedd

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Look up the two vectors to compare explicitly so this cell runs on a fresh
# kernel (it needs only `embeddings` and `word_index` from the training cell).
# NOTE(review): the word pair below is an illustrative choice — replace it
# with the words you actually want to compare.
embedding1 = embeddings[word_index['machine']]
embedding2 = embeddings[word_index['learning']]

# cosine_similarity works on 2-D inputs, hence the single-row lists
similarity12 = cosine_similarity([embedding1], [embedding2])