In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Reshape
from tensorflow.keras.optimizers import Adam

In [2]:
# 1. Persiapan data (contoh sederhana)
text = "anjing suka makan tulang kucing suka makan ikan burung terbang tinggi"

In [3]:
# Tokenisasi teks
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}
vocab_size = len(word2idx) + 1  # +1 untuk padding


In [4]:
# Konversi teks ke urutan token
sequences = tokenizer.texts_to_sequences([text])[0]

In [5]:
# 2. Membuat pasangan Skip-gram (kata target, konteks)
pairs, labels = skipgrams(sequences, vocab_size, window_size=2)

In [6]:
# 3. Membuat Model Skip-Gram
embedding_dim = 3  # Vektor embedding 3 dimensi

In [8]:
# Model Sequential
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=1))
model.add(Reshape((embedding_dim,)))  # Mengubah output embedding jadi vektor
model.add(Dense(vocab_size, activation='softmax'))  # Output adalah prediksi konteks

In [9]:
# Kompilasi model
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [10]:
# 4. Menyiapkan data untuk pelatihan
target_words = np.array([pair[0] for pair in pairs])
context_words = np.array([pair[1] for pair in pairs])

In [11]:
# Latih model
model.fit(target_words, context_words, epochs=100, verbose=1)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.1990 - loss: 2.3003
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1990 - loss: 2.2998  
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2107 - loss: 2.2977 
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1990 - loss: 2.2954 
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2212 - loss: 2.2939 
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2500 - loss: 2.2925 
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2695 - loss: 2.2910 
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2122 - loss: 2.2923  
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7f734b956c80>

In [12]:
# 5. Lihat hasil embedding (vektor representasi kata)
embeddings = model.layers[0].get_weights()[0]

In [13]:
# Tampilkan vektor embedding untuk setiap kata
for word, i in word2idx.items():
    print(f"Vektor embedding untuk kata '{word}': {embeddings[i]}")

Vektor embedding untuk kata 'suka': [ 0.23315194  0.09371545 -0.3457369 ]
Vektor embedding untuk kata 'makan': [-0.27384168  0.05476398 -0.18892683]
Vektor embedding untuk kata 'anjing': [-0.13610274  0.03547959 -0.04224649]
Vektor embedding untuk kata 'tulang': [-0.15617885  0.01009094  0.06623537]
Vektor embedding untuk kata 'kucing': [ 0.16151963  0.26851556 -0.31127104]
Vektor embedding untuk kata 'ikan': [-0.11812352  0.10711876 -0.08763087]
Vektor embedding untuk kata 'burung': [ 0.22587049 -0.2147045   0.12769897]
Vektor embedding untuk kata 'terbang': [-0.19039162 -0.3829218   0.00814358]
Vektor embedding untuk kata 'tinggi': [-0.06208886 -0.27154014 -0.12983596]
