In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Lambda, Dense
from tensorflow.keras.preprocessing.text import Tokenizer

# --- a. Data preparation ---
text = "machine learning is fun when you understand machine learning"
window = 2  # look 2 words left and 2 right
t = Tokenizer()
t.fit_on_texts([text])
word2id = t.word_index
# Tokenizer ids start at 1 (index 0 is reserved and never assigned to a word),
# so the embedding table needs one extra row.
vocab_size = len(word2id) + 1

# Encode the corpus with the tokenizer itself rather than text.split():
# Tokenizer lower-cases and strips punctuation by default, so a raw split()
# token such as "Machine" or "fun," would not be a key of word2id.
token_ids = t.texts_to_sequences([text])[0]
tokens = [t.index_word[i] for i in token_ids]   # normalized words, in corpus order

# A target needs `window` words on each side; fail early with a clear message
# instead of an IndexError on X.shape[1] further down.
if len(token_ids) <= 2 * window:
    raise ValueError(
        f"Need more than {2 * window} tokens to build CBOW pairs with "
        f"window={window}, got {len(token_ids)}."
    )

# --- b. Generate training data: context -> target pairs (CBOW style) ---
contexts = []
targets = []
for i in range(window, len(token_ids) - window):
    # `window` ids to the left of position i, then `window` ids to the right
    ctx = token_ids[i - window:i] + token_ids[i + 1:i + window + 1]
    contexts.append(ctx)             # list of length 2*window
    targets.append(token_ids[i])     # id of the center (target) word

X = np.array(contexts)   # shape (N, 2*window)
y = np.array(targets)    # shape (N,)

# --- c. Build CBOW model (train to predict target from context) ---
# Seed Python, NumPy and TensorFlow so weight initialization and batch
# shuffling are repeatable across "Restart Kernel -> Run All".
# NOTE(review): bit-exact results across machines may additionally require
# tf.config.experimental.enable_op_determinism() — confirm if that matters.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

EMB = 10                    # embedding dimension
context_size = X.shape[1]   # 4 if window=2

context_input = Input(shape=(context_size,), dtype='int32')
emb = Embedding(vocab_size, EMB)         # one table shared by every context position
emb_ctx = emb(context_input)             # (batch, context_size, EMB)

# Average the context embeddings over the context axis. GlobalAveragePooling1D
# computes the same mean over axis 1 as a Lambda(tf.reduce_mean) would, but is a
# built-in layer, so the model can be saved and reloaded without deserializing
# arbitrary Python code.
avg = tf.keras.layers.GlobalAveragePooling1D()(emb_ctx)      # (batch, EMB)
out = Dense(vocab_size, activation='softmax')(avg)           # predict target id

model = Model(inputs=context_input, outputs=out)
# Targets are integer ids (not one-hot), hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# --- d. Train (small epoch count, enough for an exam/demo) ---
model.fit(X, y, epochs=50, verbose=0)   # increase epochs if desired

# Read the matrix from the embedding layer itself instead of relying on it
# being the first entry of model.get_weights().
embeddings = emb.get_weights()[0]       # shape (vocab_size, EMB); row 0 is the unused reserved id

# Report the vector length from the matrix so the header stays correct if EMB changes.
print(f"\nWord Embeddings (Each word → vector of {embeddings.shape[1]} values):")
for word, idx in word2id.items():
    print(f"{word:12s} → {embeddings[idx]}")




2025-11-10 02:32:40.467740: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-10 02:32:40.913736: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-10 02:32:42.602822: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-10 02:32:44.034534: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)



Word Embeddings (Each word → vector of 10 values):
machine      → [ 0.10305393 -0.03375963 -0.09591786  0.02790289  0.10280092  0.1135118
  0.0148653  -0.013227    0.01704875  0.02733277]
learning     → [ 0.06971125 -0.00066066  0.01840196 -0.07477301  0.02450555  0.12276971
 -0.04249259 -0.0898311  -0.01211943  0.0446405 ]
is           → [-0.06788607  0.02967547  0.03171511 -0.05808173 -0.04704909 -0.04400498
 -0.08278476 -0.0422259  -0.05466047  0.04351336]
fun          → [-0.03569328 -0.02147065 -0.04777352  0.06117056  0.03141813  0.03904499
 -0.02141307  0.02974931  0.12766857  0.05886892]
when         → [ 0.11494256 -0.00776421  0.00348982  0.03913676 -0.00489082  0.08962949
 -0.01005294  0.03766618 -0.05047538  0.08024477]
you          → [-0.07031088  0.03625695 -0.04210336 -0.03727319 -0.00629778 -0.00700407
 -0.02820542 -0.04916715 -0.10018494  0.02954241]
understand   → [-0.07156876 -0.07507898 -0.07293874  0.05603063 -0.03222511  0.03583488
  0.00531011  0.05814617  0.03869