# Word2Vec with my own embedding (from the example notebook)

In [1]:
# Word2Vec
import numpy as np
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D
from tensorflow.keras.models import Model
import gensim
import pandas as pd

# Define parameters for the Word2Vec model
w2v_size = 300       # dimensionality of each word vector
w2v_window = 5       # context window size used during Word2Vec training
w2v_min_count = 1    # minimum word frequency kept in the Word2Vec vocabulary
w2v_epochs = 100     # number of training passes over the corpus
maxlen = 240  # Adjust to the length of your sentences

df_train = pd.read_csv('stackoverflow_questions_cleaned_train.csv')

# Prepare the sentences
# Empty text cells are read back from CSV as NaN (a float), which
# simple_preprocess cannot decode. Replace them with '' so every row stays in
# place (one token list per DataFrame row) instead of raising.
sentences = df_train['sentence_bow_lem'].fillna('').astype(str).to_list()  # Use your specific DataFrame column
# simple_preprocess lowercases and splits each text into a list of tokens.
sentences = [gensim.utils.simple_preprocess(text) for text in sentences]

# Create and train the Word2Vec model
print("Build & train Word2Vec model ...")
# Fixed seed with a single worker thread -- presumably chosen so that runs are
# reproducible (TODO confirm).
w2v_model = gensim.models.Word2Vec(
    vector_size=w2v_size,
    window=w2v_window,
    min_count=w2v_min_count,
    seed=42,
    workers=1,
)
# Two explicit steps: first scan the corpus to build the vocabulary, then train.
w2v_model.build_vocab(sentences)
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=w2v_epochs,
)
# Keep a handle on the trained vectors and on the list of vocabulary words.
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print(f"Vocabulary size: {len(w2v_words)}")
print("Word2Vec trained")

# Prepare the sentences (tokenization)
print("Fit Tokenizer ...")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
# Map every token to its integer id, then pad each sequence with trailing
# zeros (padding='post') to a fixed length of `maxlen`.
encoded_sentences = tokenizer.texts_to_sequences(sentences)
x_sentences = pad_sequences(encoded_sentences, maxlen=maxlen, padding='post')

# One extra slot on top of the tokenizer's vocabulary: index 0 is the padding id.
num_words = 1 + len(tokenizer.word_index)
print(f"Number of unique words: {num_words}")

# Create the embedding matrix
# Row `idx` of the matrix holds the Word2Vec vector of the word whose
# tokenizer id is `idx`; rows for words unknown to Word2Vec stay all-zero.
print("Create Embedding matrix ...")
embedding_matrix = np.zeros((num_words, w2v_size))
# key_to_index is a dict, so membership is O(1) per word; testing against the
# index_to_key list would rescan the whole vocabulary on every iteration.
known_words = model_vectors.key_to_index
i, j = 0, 0  # i: words seen in the tokenizer, j: words that have a vector

for word, idx in tokenizer.word_index.items():
    i += 1
    if word in known_words:
        j += 1
        embedding_matrix[idx] = model_vectors[word]

# Share of tokenizer words covered by Word2Vec; guard the empty-vocabulary case.
word_rate = np.round(j / i, 4) if i else 0.0
print("Word embedding rate: ", word_rate)
print("Embedding matrix: %s" % str(embedding_matrix.shape))

# Create the embedding model
# A sentence embedding is the average, over the `maxlen` positions, of the
# Word2Vec vectors looked up for each token id.
print("Create Embedding model ...")
# Token ids are integers, so declare an integer input rather than float64.
word_input = Input(shape=(maxlen,), dtype='int32')
# Load the pre-trained vectors through a Constant initializer instead of the
# legacy `weights=` / `input_length=` arguments (`input_length` is deprecated
# in Keras 3). The layer is frozen: it is only used for inference here.
word_embedding = Embedding(
    input_dim=num_words,
    output_dim=w2v_size,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)(word_input)
# NOTE(review): padded positions are not masked, so they are included in the
# average and short sentences are divided by `maxlen` rather than by their real
# length. Consider `mask_zero=True` on the Embedding if a true per-token mean
# is wanted -- confirm the intended behavior before changing.
word_vec = GlobalAveragePooling1D()(word_embedding)
embed_model = Model([word_input], word_vec)

embed_model.summary()

# Generate embeddings for the sentences
embeddings = embed_model.predict(x_sentences)
print(embeddings.shape)

Build & train Word2Vec model ...
Vocabulary size: 8965
Word2Vec trained
Fit Tokenizer ...
Number of unique words: 8966
Create Embedding matrix ...
Word embedding rate:  1.0
Embedding matrix: (8966, 300)
Create Embedding model ...


2024-08-07 21:23:55.027657: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-08-07 21:23:55.027879: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-08-07 21:23:55.027910: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-08-07 21:23:55.028666: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-07 21:23:55.029489: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


2024-08-07 21:23:55.601966: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m252/252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
(8036, 300)
