In [1]:
import numpy as np
import pickle
from gensim.models import Word2Vec
import os

In [2]:
# Load the pickled vocabulary mapping. Later cells iterate it as
# (word, index) pairs and use the index as a row number.
# NOTE: pickle can execute arbitrary code on load; only open trusted artifacts.
word2idx_path = "artifacts/vocab/word2idx.pkl"

with open(word2idx_path, mode="rb") as vocab_file:
    word2idx = pickle.load(vocab_file)

# Display the vocabulary size as the cell output.
len(word2idx)

19560

In [3]:
# Load the pretrained Word2Vec model from disk.
w2v_path = "embeddings/idwiki_word2vec.model"
w2v = Word2Vec.load(w2v_path)

# The model's vector size fixes the width of the embedding matrix built later.
EMBED_DIM = w2v.vector_size
print("Embedding dimension:", EMBED_DIM)

Embedding dimension: 300


In [4]:
# Seed for the out-of-vocabulary initialisation so that Restart & Run All
# regenerates exactly the same matrix (and the same saved .npy artifact).
SEED = 42
rng = np.random.default_rng(SEED)

# Row `idx` of the matrix holds the vector of the word that word2idx maps to
# `idx`. Assumes every index lies in [0, len(word2idx)) — TODO confirm against
# the cell/script that built the vocabulary.
embedding_matrix = np.zeros((len(word2idx), EMBED_DIM))

for word, idx in word2idx.items():
    if word in w2v.wv:
        # Word is known to the pretrained model: copy its vector.
        embedding_matrix[idx] = w2v.wv[word]
    else:
        # Word is missing from Word2Vec: draw a random normal vector
        # (mean 0, std 0.6) from the seeded generator.
        embedding_matrix[idx] = rng.normal(scale=0.6, size=(EMBED_DIM,))

In [5]:
# Persist the embedding matrix as a .npy file, creating its parent
# directory first if it does not exist yet.
save_path = "artifacts/embedding/embedding_matrix.npy"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
np.save(save_path, embedding_matrix)

# Display the output location as the cell output.
save_path

'artifacts/embedding/embedding_matrix.npy'

In [6]:
# Spot-check whether a few sample words are present in the Word2Vec vocabulary.
sample_words = ["politik", "ekonomi", "teknologi", "korupsi"]

for w in sample_words:
    message = f"{w} → OK" if w in w2v.wv else f"{w} → tidak ada di Word2Vec"
    print(message)

politik → OK
ekonomi → OK
teknologi → OK
korupsi → OK


In [7]:
# Final summary: the matrix dimensions and the file it was written to.
for label, value in (
    ("Embedding matrix shape:", embedding_matrix.shape),
    ("Saved to:", save_path),
):
    print(label, value)

Embedding matrix shape: (19560, 300)
Saved to: artifacts/embedding/embedding_matrix.npy
