In [1]:
!pip install gensim==4.3.2



In [3]:
from gensim.models import FastText, KeyedVectors
import string
import os
import pandas as pd
punctuations = set(string.punctuation)

In [25]:
def create_dir(output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)


def preprocess(myfile):
    with open(myfile, encoding='utf-8') as f:
        text_lines  = f.readlines()
    n_tokens = 0
    new_text_lines = []
    token_set = []
    for text in text_lines:
        text_tokens = text.split()
        new_text_tokens = [word for word in text_tokens if word not in punctuations]
        new_text_lines.append(new_text_tokens)
        n_tokens+=len(new_text_tokens)
        token_set+=new_text_tokens

    print("# sentences", len(new_text_lines))
    print("# Tokens ", n_tokens)
    print("# Vocabulary ", len(set(token_set)))

    return new_text_lines


#https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#
#https://radimrehurek.com/gensim/models/word2vec.html#module-gensim.models.word2vec
#https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html
def train_fastText(data_path, lang):
    model_full = FastText(preprocess('/content/drive/MyDrive/Text Representation/yoruba.txt'), vector_size=300, window=5, min_count=3, workers=4, sg=1, epochs=10, negative=10)
    output_dir = data_path+"embeddings/"+lang+"/"
    create_dir(output_dir)
    model_full.wv.save(output_dir+lang+".bin")
    print("embedding training Done")


In [26]:
train_fastText(data_path, 'yoruba')

# sentences 3397
# Tokens  111670
# Vocabulary  14372
embedding training Done


In [27]:
data_path = "/content/drive/MyDrive/Text Representation/datasets/wordSim353.csv"
wordsim = pd.read_csv(data_path)
wordsim.head()

Unnamed: 0,S/N,English1,English2,Yoruba1,Yoruba2,EngSim
0,1,tiger,cat,ẹkùn,ológbò,7.35
1,2,tiger,tiger,ẹkùn,ẹkùn,10.0
2,3,book,paper,ìwé,bébà,7.46
3,4,computer,keyboard,kọ̀mpútà,pátákó ìtẹ̀wé,7.62
4,5,computer,internet,kọ̀mpútà,ayélujára,7.58


In [28]:
model = KeyedVectors.load("/content/drive/MyDrive/Text Representation/datasets/wordSim353.csvembeddings/yoruba/yoruba.bin")

In [29]:

found = []
for i in range(0,353):
    p=None
    try:
        yor1 = wordSim353.at[i,'Yoruba1'].strip()
        yor2 = wordSim353.at[i,'Yoruba2'].strip()
        p=round(model.similarity(yor1, yor2) * 10,2)
        found.append(i)
    except:
        pass
    wordsim.at[i,'model']=p
print("# pairs of words found: ", len(found))
#print(found)
#print(len(set(found) & set(found_pairs)))


# pairs of words found:  0


In [18]:
from scipy.stats import spearmanr
ourcorr1, p_value1 = spearmanr(wordsim.Yoruba,wordsim.model, nan_policy="omit")

In [30]:
ourcorr1

0.15765993421369978

wv = model.word_vec


model['ọdọ̀']

array([-4.62786295e-02, -4.39219363e-02, -1.47762774e-02,  7.12207425e-03,
       -1.83496311e-01, -3.50847766e-02, -1.14363581e-02, -4.80968520e-05,
        1.17513299e-01,  1.47885289e-02,  1.07411787e-01, -9.50299203e-02,
        2.51120001e-01, -1.02194466e-01, -2.01771572e-01, -7.67978206e-02,
        1.85966805e-01, -1.89607173e-01,  2.98526827e-02, -1.50551468e-01,
       -6.17945269e-02, -9.85621959e-02,  7.31451288e-02, -7.37788677e-02,
        1.99896246e-02, -7.38658085e-02,  1.16045237e-01, -2.29634911e-01,
        7.49319345e-02, -1.50093630e-01,  1.14465699e-01, -1.91878200e-01,
       -1.52376145e-01, -1.57559112e-01,  2.12592147e-02, -7.97777325e-02,
        6.51537403e-02, -1.01624466e-01,  1.60018116e-01,  2.36706972e-01,
        2.04797871e-02, -9.58587304e-02, -5.25653064e-02, -2.68158615e-01,
        2.91449279e-02,  1.38766199e-01,  3.16993624e-01,  2.46165082e-01,
       -2.65524149e-01,  1.02484249e-01, -1.19676232e-01,  2.24283054e-01,
        1.66099951e-01,  

In [31]:
model.most_similar("igi")

[('igi.', 0.8765624761581421),
 ('igi,', 0.8478281497955322),
 ('ọ̀pẹ', 0.8230783343315125),
 ('lulẹ̀', 0.7904080152511597),
 ('ọ̀pẹ.', 0.7889252305030823),
 ('Igi', 0.7784389853477478),
 ('abẹ́', 0.7695708274841309),
 ('balẹ̀', 0.7681936025619507),
 ('foríbalẹ̀', 0.7681863307952881),
 ('lulẹ̀,', 0.7598339319229126)]

In [34]:
model.most_similar("ẹkùn")

[('ikùn', 0.9774208664894104),
 ('rọrùn', 0.9766912460327148),
 ('ìlẹ̀kùn', 0.9756035804748535),
 ('Àrùn', 0.9745671153068542),
 ('okùn', 0.9725813865661621),
 ('sóòrùn', 0.9715046286582947),
 ('fọhùn', 0.9704322218894958),
 ('Àṣé', 0.9689681529998779),
 ('gígùn', 0.9678296446800232),
 ('Oòrùn', 0.9676233530044556)]

In [32]:
model.most_similar("bata")

[('pẹlẹbẹ', 0.9776225686073303),
 ('wẹ', 0.9638885259628296),
 ('tete', 0.9627532958984375),
 ('10.', 0.9611928462982178),
 ('2023.', 0.9599006175994873),
 ('dẹ́kun', 0.958572268486023),
 ('fọ', 0.9574018716812134),
 ('po', 0.9568259716033936),
 ('ata', 0.9564055800437927),
 ('adẹ́tẹ̀', 0.9551315903663635)]

In [22]:
model.most_similar("ogun")

[('ologun', 0.9999558925628662),
 ('oogun', 0.9999520778656006),
 ('Balogun', 0.9999364018440247),
 ('Hamas', 0.9999244213104248),
 ('ikọlu', 0.999923825263977),
 ('Israel.', 0.9999210238456726),
 ('Israel', 0.9999207258224487),
 ('farapa', 0.9999129772186279),
 ('atawọn', 0.9999082088470459),
 ('Palestine', 0.9999076724052429)]

In [35]:
model.most_similar("ògùn")

[('òògùn', 0.9798166155815125),
 ('gígùn', 0.9792511463165283),
 ('sóòrùn', 0.9778701663017273),
 ('Oòrùn', 0.9778334498405457),
 ('Àrùn', 0.9775603413581848),
 ('rọrùn', 0.9749994874000549),
 ('oògùn', 0.9745840430259705),
 ('tààrà', 0.9732843041419983),
 ('òòrùn', 0.9714354872703552),
 ('ọrùn', 0.9711557626724243)]

In [None]:
model.most_similar("ògùn")

[('òògùn', 0.9798166155815125),
 ('gígùn', 0.9792511463165283),
 ('sóòrùn', 0.9778701663017273),
 ('Oòrùn', 0.9778334498405457),
 ('Àrùn', 0.9775603413581848),
 ('rọrùn', 0.9749994874000549),
 ('oògùn', 0.9745840430259705),
 ('tààrà', 0.9732843041419983),
 ('òòrùn', 0.9714354872703552),
 ('ọrùn', 0.9711557626724243)]

In [36]:
model.most_similar("ijọba")

[('agba', 0.9751817584037781),
 ('ija', 0.9750121831893921),
 ('eto', 0.970425009727478),
 ('Eko', 0.9679886698722839),
 ('ilu', 0.9671522378921509),
 ('ileẹjọ', 0.9668627977371216),
 ('yoo', 0.9666933417320251),
 ('Ọlọpaa', 0.9661507606506348),
 ('ọlọpaa', 0.9656313061714172),
 ('nigba', 0.9654848575592041)]