In [11]:
import re
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import fasttext
import fasttext.util

In [2]:
with open('sinhala_stopwords.txt', encoding="utf8") as stp:
    stopwords = stp.read()
stp.close()

In [3]:
all_sentences = []
text_file = open('sinhala.txt', 'r', encoding="utf8")
for sentence in text_file:
    word_list = re.sub(r'[.,"\'-?:!;\n]', '', sentence)
    word_list = word_tokenize(word_list)
    parsed = []
    for word in word_list:
        if word not in stopwords:
            parsed.append(word)
    all_sentences.append(parsed)
text_file.close()

In [4]:
test_words = ['යුතු','මුදල්','දින','සඳහන්','නිවාඩු','ලේකම්','කවරේද','ජාතික','වැටුප්','ඉදිරිපත්']

In [5]:
word2vec_skipgram = Word2Vec(all_sentences, min_count=2, sg=0)
for word in test_words:
    sim_words = word2vec_skipgram.wv.most_similar(word)
    print(word)
    print(*sim_words, sep="\n")
    print()

යුතු
('යුතුය', 0.9641897678375244)
('හැකි', 0.9415063261985779)
('යුත්තේ', 0.930286705493927)
('භා', 0.9099206328392029)
('යුතුවේ', 0.9034984111785889)
('සහතික', 0.8958048224449158)
('වැය', 0.8953712582588196)
('තීරණය', 0.8946816921234131)
('වාර්තාව', 0.8940847516059875)
('ලිපිවල', 0.8905689716339111)

මුදල්
('ප්\u200dරතිපාදනය', 0.9473763108253479)
('ශ\u200d්\u200dරමය', 0.9105315804481506)
('වියදම්', 0.907948911190033)
('පියවීම', 0.9063883423805237)
('ඉල්ලීම්', 0.9027177691459656)
('ඔබගේ', 0.9021285772323608)
('ජේදය', 0.9013384580612183)
('එකී', 0.9011407494544983)
('දෑ', 0.9009097814559937)
('රෙගුලාසිවලට', 0.8988077044487)

දින
('ලැබ', 0.9714817404747009)
('මස', 0.9707421064376831)
('පළ', 0.9686501026153564)
('අත්සන්', 0.9587963819503784)
('ලැබුවාවූ', 0.9539610147476196)
('පළකරනු', 0.9485043883323669)
('පත\u200d්\u200dරයේ', 0.9448172450065613)
('ඔක්තෝබර්', 0.9430589079856873)
('ඉදිරිපත්කරන', 0.9330236911773682)
('පත්\u200dරයේ', 0.9326760172843933)

සඳහන්
('වගන්තිවල', 0.911608040332794

In [6]:
word2vec_cbow = Word2Vec(all_sentences, min_count=2, sg=1)
for word in test_words:
    sim_words = word2vec_cbow.wv.most_similar(word)
    print(word)
    print(*sim_words, sep="\n")
    print()

යුතු
('යුතුය', 0.8542492985725403)
('යුත්තේ', 0.8040897250175476)
('හැකි', 0.8024328947067261)
('හැක', 0.7615551352500916)
('යුතුව', 0.7563894391059875)
('හැක්කේ', 0.75618976354599)
('නොහැකි', 0.72788006067276)
('හැකිය', 0.7278342843055725)
('යුතුවේ', 0.7111713290214539)
('නියෝගයෙහි', 0.6846619248390198)

මුදල්
('වන්දි', 0.6916292309761047)
('ක්\u200dරම', 0.6839450001716614)
('අත්තිකාරම්', 0.6794115900993347)
('ප්\u200dරවාහන', 0.6750259399414062)
('වියදම්', 0.6693071126937866)
('ගෙවීම', 0.6632765531539917)
('භාවිතය', 0.6531163454055786)
('සම්පාදන', 0.652567446231842)
('මධ්\u200dයසාර', 0.6470654010772705)
('ක්\u200dරමසම්පාදන', 0.6448792219161987)

දින
('මාර්තු', 0.8301911354064941)
('තිබුණ', 0.823740541934967)
('අද', 0.8234764933586121)
('සඳහන්ව', 0.8156938552856445)
('ජුනි', 0.7879504561424255)
('මස', 0.7849934697151184)
('සිකුරාදා', 0.7846826910972595)
('යෝජනාවේ', 0.7833022475242615)
('රැස්වීම්', 0.7799726128578186)
('දෙසැම්බර්', 0.77949059009552)

සඳහන්
('දැක්වෙන', 0.8741230964660645

In [7]:
ft = fasttext.load_model("cc.si.300.bin")



In [8]:
for word in test_words:
    sim_words = ft.get_nearest_neighbors(word)
    print(word)
    print(*sim_words, sep="\n")
    print()

යුතු
(0.7585144639015198, 'හැකි')
(0.7250702381134033, 'යුතුය')
(0.6748450398445129, 'යුතුමුත්')
(0.6698734760284424, 'යුතුම')
(0.659508466720581, 'යුතුමව')
(0.6580422520637512, 'යුතුයි.අවශ්\u200dය')
(0.6556801795959473, 'යුතුමද')
(0.6507406830787659, 'විය')
(0.6501041054725647, 'නොහැකි')
(0.6489912271499634, 'යුතුයැ’යි')

මුදල්
(0.6813880205154419, 'මුදල්හදල්')
(0.670252799987793, 'මුදල්ගෙවා')
(0.658108651638031, 'මුදල්ය')
(0.6521779298782349, 'මුදල්ම')
(0.6497461795806885, 'මුදල්මත')
(0.6361136436462402, 'මුදලය')
(0.6310645937919617, 'මුදල්වලට')
(0.6301226615905762, 'මුදල්දී')
(0.6195958852767944, 'මුදල්ද')
(0.6072321534156799, 'මුදල්වත්')

දින
(0.5975547432899475, 'අගෝස්තු')
(0.5897132754325867, 'මාර්තු')
(0.5823441743850708, 'ජනවාරි')
(0.5803649425506592, 'දිනයේයි')
(0.5787533521652222, 'ජූලි')
(0.5731720924377441, 'දිනයවූ')
(0.5692654848098755, 'මැයි')
(0.5682485699653625, 'දිනට')
(0.5620603561401367, 'නොවැම්බර්')
(0.5605639815330505, '1962.02.04')

සඳහන්
(0.6882578134536743, 'සඳහ

In [13]:
ftvec = KeyedVectors.load_word2vec_format('cc.si.300.vec', binary=False)

In [15]:
for word in test_words:
    sim_words = ftvec.most_similar(word)
    print(word)
    print(*sim_words, sep="\n")
    print()

යුතු
('හැකි', 0.758513331413269)
('යුතුය', 0.7250239849090576)
('යුතුමුත්', 0.6748719811439514)
('යුතුම', 0.669881284236908)
('යුතුමව', 0.6594510674476624)
('යුතුයි.අවශ්\u200dය', 0.6580855250358582)
('යුතුමද', 0.6557757258415222)
('විය', 0.65071702003479)
('නොහැකි', 0.6500844359397888)
('යුතුයැ’යි', 0.6491255164146423)

මුදල්
('මුදල්හදල්', 0.6814319491386414)
('මුදල්ගෙවා', 0.6704550385475159)
('මුදල්ය', 0.6580941677093506)
('මුදල්ම', 0.6521395444869995)
('මුදල්මත', 0.6496994495391846)
('මුදලය', 0.6362110376358032)
('මුදල්වලට', 0.631045937538147)
('මුදල්දී', 0.6302124261856079)
('මුදල්ද', 0.6196855902671814)
('මුදල්වත්', 0.6071982979774475)

දින
('අගෝස්තු', 0.5975399017333984)
('මාර්තු', 0.58966463804245)
('ජනවාරි', 0.5822768807411194)
('දිනයේයි', 0.5804850459098816)
('ජූලි', 0.5787363648414612)
('දිනයවූ', 0.5730636119842529)
('මැයි', 0.5692448616027832)
('දිනට', 0.5682713389396667)
('නොවැම්බර්', 0.5620465874671936)
('1962.02.04', 0.56041419506073)

සඳහන්
('සඳහන්ඉඟි', 0.6883292198181152