In [1]:
import ipywidgets as widgets
import gensim
import os

## Word embedding models for Arabic
This app lets you do arithmetic with word embedding models for Arabic.

The current model is the Wikipedia CBOW model from [AraVec](https://github.com/bakrianoo/aravec), see: Abu Bakr Soliman, Kareem Eisa, and Samhaa R. El-Beltagy, “AraVec: A set of Arabic Word Embedding Models for use in Arabic NLP”, in proceedings of the 3rd International Conference on Arabic Computational Linguistics (ACLing 2017), Dubai, UAE, 2017.

In [2]:
# Directory holding the model files: downloads are written here and the
# loading cell lists this directory to find the models.
model_path = "data"

In [None]:
import zipfile
from urllib.request import urlopen
from io import BytesIO

def download_model(name, download_url):
    """Download a zipped model and store its first file as ``model_path/name``.

    Nothing is downloaded when ``model_path/name`` already exists.

    Parameters
    ----------
    name : str
        File name (inside ``model_path``) to store the extracted file under.
    download_url : str
        URL of a zip archive. The archive is read fully into memory.

    Raises
    ------
    ValueError
        If the downloaded archive does not contain any regular file.
    """
    outname = os.path.join(model_path, name)
    if os.path.exists(outname):
        return

    # The target directory may not exist yet (e.g. on a fresh checkout).
    target_dir = os.path.dirname(outname)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Context managers make sure the response and the archive get closed.
    with urlopen(download_url) as resp:
        payload = resp.read()

    with zipfile.ZipFile(BytesIO(payload), 'r') as zf:
        # Skip directory entries: reading one would produce an empty file.
        members = [info for info in zf.infolist() if not info.is_dir()]
        if not members:
            raise ValueError(
                'No file found in archive downloaded from {}'.format(download_url))
        data = zf.read(members[0])

    # Write under a temporary name and rename afterwards, so an interrupted
    # write never leaves a truncated file that passes the exists() check above.
    partname = outname + '.part'
    with open(partname, 'wb') as fout:
        fout.write(data)
    os.replace(partname, outname)
            
            
# Download URL of each zipped model, keyed by the file name the model is
# stored under inside `model_path`.
# NOTE(review): these keys differ from the file names listed in
# `nicer_names_dict` ('fiqh-i10-s100-w5-sg0_wv', ...), so files saved under
# these names do not look like they are picked up by the loading cell -- confirm
# the intended names.
model_urls = {
    'fiqh-norm': 'https://surfdrive.surf.nl/files/index.php/s/MpDn5ckasu33LuT/download',
    'fiqh': 'https://surfdrive.surf.nl/files/index.php/s/7RvP2iYCOXkcWRp/download',
    'fiqh-stemmed': 'https://surfdrive.surf.nl/files/index.php/s/Ah9HeEg8vDMzPIo/download',
    # Currently disabled:
    #'wiki_cbow_100': 'https://archive.org/download/aravec2.0/wiki_cbow_100.zip'
}

# Fetch every configured model; the name is printed first to show progress.
for name, url in model_urls.items():
    print(name)
    download_model(name, url)

In [21]:
# Maps a model file name (as found in `model_path`) to the display name under
# which the loaded model is stored and shown.
nicer_names_dict = {
    'fiqh-i10-s100-w5-sg0_wv': 'Fiqh',
    'stemmed-fiqh-i10-s100-w5-sg0_wv': 'Stemmed Fiqh',
}

In [22]:
# To do: list multiple available models and only download them when needed
# Load every known model file found in `model_path`, keyed by its display name.
models = {}

# A missing directory simply means that no model is available yet.
filenames = sorted(os.listdir(model_path)) if os.path.isdir(model_path) else []

for fn in filenames:
    if fn not in nicer_names_dict:
        continue
    try:
        # NOTE(review): gensim's load() unpickles the file -- only load model
        # files obtained from a trusted source.
        models[nicer_names_dict[fn]] = gensim.models.KeyedVectors.load(
            os.path.join(model_path, fn))
    except Exception as err:
        # Best effort: one broken file must not prevent the others from
        # loading, but report it instead of hiding the failure.
        print('Could not load model {!r}: {}'.format(fn, err))

In [24]:
def arithmetics(word1, word2, word3, model, topn=10):
    """Return the words closest to ``word1 - word2 + word3`` as printable text.

    Parameters
    ----------
    word1, word3 : str
        Words passed as positive terms to ``most_similar``.
    word2 : str
        Word passed as the negative term to ``most_similar``.
    model
        Either an object that offers ``most_similar`` itself (keyed vectors)
        or a full model exposing such an object as ``model.wv``.
    topn : int, optional
        Number of results to request (default 10).

    Returns
    -------
    str
        One line per result: the word, a space and a tab, then the similarity
        score with three decimals.

    Errors raised by ``most_similar`` (e.g. the ``KeyError`` that the calling
    code handles for unknown words) propagate unchanged.
    """
    # Objects returned by KeyedVectors.load() are the vectors themselves; their
    # deprecated `.wv` alias is not available in newer gensim releases, so only
    # fall back to `.wv` for full models that lack `most_similar`.
    vectors = model if hasattr(model, 'most_similar') else model.wv
    res = vectors.most_similar(positive=[word1, word3], negative=[word2], topn=topn)
    output = [u'{} \t{:.3f}'.format(w, s) for w, s in res]
    return '\n'.join(output)

The output word is semantically related to word3 in the same way that word1 is related to word2. For example:
king (*word1*) - man (*word2*) + woman (*word3*) = queen (*output*)

In [25]:
# One read-only text area per loaded model, labelled with the model's name.
outputs = {
    label: widgets.Textarea(description=label, rows=30, disabled=True)
    for label in models
}

def print_related_words(button):
    """Button callback: run the word arithmetic on every loaded model.

    Reads the three input words (stripped of surrounding whitespace) and the
    requested number of results from the input widgets, and writes the result
    for each model into that model's output box. When the lookup raises
    ``KeyError``, an error message is shown in the box instead.

    ``button`` is the clicked widget; it is not used.
    """
    first = input_word1.value.strip()
    second = input_word2.value.strip()
    third = input_word3.value.strip()
    count = input_number.value

    for label, vectors in models.items():
        try:
            text = arithmetics(first, second, third, vectors, count)
        except KeyError:
            text = 'Error: word does not exist in vocabulary'
        outputs[label].value = text

# Descriptions longer than the default label width get cut off in the rendered
# widget; 'initial' lets each label take the width of its text.
label_style = {'description_width': 'initial'}

# Inputs for the analogy: word1 - word2 + word3.
input_word1 = widgets.Text(description='Word 1 (king):', style=label_style)
input_word2 = widgets.Text(description='Word 2 (man):', style=label_style)
input_word3 = widgets.Text(description='Word 3 (woman):', style=label_style)

# Number of similar words to request per model.
input_number = widgets.IntSlider(
    value=50,
    min=5,
    max=100,
    step=5,
    description='Number of results:',
    style=label_style,
)

button_submit = widgets.Button(description='Submit')
button_submit.on_click(print_related_words)

# Output boxes side by side, ordered by model name.
output_boxes = tuple(outputs[m] for m in sorted(outputs))

# Last expression of the cell, so the assembled form is displayed.
widgets.VBox((input_word1, input_word2, input_word3,
              input_number, button_submit, widgets.HBox(output_boxes)))


VBox(children=(Text(value='', description='Word 1 (king):'), Text(value='', description='Word 2 (man):'), Text…

  from ipykernel import kernelapp as app
