#### Word2Vec Word Embeddings

In [4]:
# imports

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import stopwordsiso
from stopwordsiso import stopwords
import os
from nltk.tokenize import word_tokenize


In [5]:
directory = "clean-txt"

# all words from files
sentences = []

# populates sentences with all texts
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as file:
            text = file.read()
            words = word_tokenize(text)
            sentences.extend(words)

# remove stop words
stop_words = set(stopwords('tl'))

# preprocessing
processed_sentences = []
for sentence in sentences:
    words = word_tokenize(sentence.lower())
    words = [word for word in words if word.isalpha() and word not in stop_words]
    processed_sentences.append(words)

In [6]:
# training the word2vec model
model = Word2Vec(processed_sentences, vector_size=100, window=5, min_count=1, workers=4)

# save
model.save("filipino_word2vec_model.bin")

In [10]:
# load
model = Word2Vec.load("filipino_word2vec_model.bin")

In [15]:
# uses

# get similar words
similar_words = model.wv.most_similar("hapon", topn=10)
print(similar_words)

# get similarity between two words
cosine_similarity = model.wv.similarity('babae', 'lalaki')
print(cosine_similarity)

[('itinanghal', 0.34606117010116577), ('hinarang', 0.34141793847084045), ('tinitirahang', 0.3247967064380646), ('hanapin', 0.32373079657554626), ('pagsusulit', 0.3082212507724762), ('igagalang', 0.3019391894340515), ('droga', 0.29676660895347595), ('malipol', 0.2953234612941742), ('malalayo', 0.2875189483165741), ('binondo', 0.28147175908088684)]
-0.06799804


#### FastText Word Embeddings

In [2]:
pip install fasttext

Note: you may need to restart the kernel to use updated packages.


In [3]:
# imports

import fasttext


In [4]:
# pre-trained model
# https://fasttext.cc/docs/en/crawl-vectors.html
model = fasttext.load_model('cc.tl.300.bin')



In [10]:
# uses

find = "bahay"
similar = model.get_nearest_neighbors(find, k=100)                      # this is more like a bypass to not get repeated words
filtered = [word for word in similar if find not in word[1]]    # filter
filitered = filtered[:20]                                # gets top 20 based on the filter

# may want to implement for django
# list of words, instead of embedding + word
for item in filtered:
    print(item[1])

# prints out array of embedding_value-word pair
print(filtered)

# example
# reyna : hari = babae : lalaki (etc.)
analogies = model.get_analogies('reyna', 'hari', 'lalaki', k=1) # default k == 1
print(analogies)


kubo-kubo
inuupahang
titirahan
resthouse
gatehouse
boardinghouse
Mansyon
kubo
apartment
pinauupahan
inuupahan
Treehouse
tinutuluyan
townhouse
treehouse
unuupahan
bunkhouse
kubong
kwarto
makisilong
garahe
tahanan
Paguwi
staffhouse
pinapaupahan
safehouse
[(0.6491464972496033, 'kubo-kubo'), (0.6266229748725891, 'inuupahang'), (0.6241036653518677, 'titirahan'), (0.6082715392112732, 'resthouse'), (0.6074135899543762, 'gatehouse'), (0.607253909111023, 'boardinghouse'), (0.5948531627655029, 'Mansyon'), (0.5940391421318054, 'kubo'), (0.5937168598175049, 'apartment'), (0.5917840003967285, 'pinauupahan'), (0.59103924036026, 'inuupahan'), (0.5894445180892944, 'Treehouse'), (0.5871586799621582, 'tinutuluyan'), (0.5829326510429382, 'townhouse'), (0.5817272663116455, 'treehouse'), (0.5778210163116455, 'unuupahan'), (0.5770028829574585, 'bunkhouse'), (0.5768028497695923, 'kubong'), (0.5718590617179871, 'kwarto'), (0.5712212324142456, 'makisilong'), (0.5663864612579346, 'garahe'), (0.5652022957801819,