In [1]:
import ipywidgets as widgets
import gensim
import os

## Word embedding model for Arabic
This app allows you to find related words in a word embedding model for Arabic.

The current model is the Wikipedia CBOW model from [AraVec](https://github.com/bakrianoo/aravec), see: Abu Bakr Soliman, Kareem Eisa, and Samhaa R. El-Beltagy, “AraVec: A set of Arabic Word Embedding Models for use in Arabic NLP”, in proceedings of the 3rd International Conference on Arabic Computational Linguistics (ACLing 2017), Dubai, UAE, 2017.

In [2]:
# Directory whose entries are listed and loaded as saved word-embedding models.
model_path = 'data'

In [3]:
# TODO: list multiple available models and only download them when needed
# Try to load every entry of `model_path` as a gensim KeyedVectors file.
# Entries that cannot be loaded are skipped on purpose (best effort), but the
# reason is kept in `load_errors` so a missing model can be diagnosed.
# NOTE(review): gensim's load() unpickles the file -- only keep trusted files
# in `model_path`.
models = {}
load_errors = {}
filenames = os.listdir(model_path)
for fn in filenames:
    try:
        models[fn] = gensim.models.KeyedVectors.load(os.path.join(model_path, fn))
    except Exception as err:
        # `Exception` rather than a bare `except`, so that interrupting a slow
        # load (KeyboardInterrupt) or a SystemExit is not silently swallowed.
        load_errors[fn] = err

In [4]:
# Display labels for known model keys; a key without an entry here has no
# nicer label.
nicer_names_dict = {
    'wikipedia_cbow_100': 'Wikipedia',
    'cbow-fiqh-100-wikipedia-finetuned-wv': 'Wikipedia+Fiqh',
    'fiqh-i10-s100-w5-sg0_wv': 'Fiqh',
    'stemmed-fiqh-i10-s100-w5-sg0_wv': 'Stemmed Fiqh',
}

In [5]:
# Re-key the loaded models by their display label; a key with no entry in
# `nicer_names_dict` is kept unchanged.
models = {nicer_names_dict.get(key, key): vectors for key, vectors in models.items()}

In [6]:
def most_similar(word, model, topn=10):
    """Return the `topn` nearest neighbours of `word` as printable text.

    Parameters
    ----------
    word : str
        Query word; passed unchanged to the model's ``most_similar``.
    model : object
        Either an object exposing the vectors through a ``.wv`` attribute or
        an object that provides ``most_similar`` itself.
    topn : int, optional
        Number of neighbours to request (default 10).

    Returns
    -------
    str
        One line per neighbour, formatted as ``"<word> \\t<score>"`` with the
        score rounded to three decimals, joined by newlines.

    Whatever ``most_similar`` raises (the caller expects ``KeyError`` for an
    unknown word) is propagated unchanged.
    """
    # Not every vectors object has a `.wv` attribute (the alias was dropped
    # from KeyedVectors in gensim 4), so fall back to the object itself.
    vectors = getattr(model, 'wv', model)
    res = vectors.most_similar(word, topn=topn)
    output = [u'{} \t{:.3f}'.format(w, s) for w, s in res]
    return '\n'.join(output)

In [7]:
# One read-only text area per loaded model, labelled with the model's key.
outputs = {
    label: widgets.Textarea(description=label, rows=30, disabled=True)
    for label in models
}

def print_related_words(button):
    """Button callback: fill every model's text area with related words.

    Reads the query from `input_word` (surrounding whitespace stripped) and
    the number of results from `input_number`, then writes the formatted
    neighbours of each model into its text area in `outputs`. When a model
    raises KeyError for the query, an error message is shown in that model's
    text area instead. The `button` argument is not used.
    """
    query = input_word.value.strip()
    n_results = input_number.value
    for label, vectors in models.items():
        box = outputs[label]
        try:
            box.value = most_similar(query, vectors, n_results)
        except KeyError:
            box.value = 'Error: word does not exist in vocabulary'

# Query controls: free-text word plus the number of neighbours to show.
input_word = widgets.Text(description='Word:')
input_number = widgets.IntSlider(
    description='Number of results:',
    min=5, max=100, step=5,
    value=50,
)

# Clicking "Submit" runs the lookup and refreshes every output box.
button_submit = widgets.Button(description='Submit')
button_submit.on_click(print_related_words)

# Output boxes side by side, ordered alphabetically by their key; the VBox is
# the cell's last expression so the notebook displays it.
output_boxes = tuple(outputs[label] for label in sorted(outputs))
widgets.VBox((input_word, input_number, button_submit, widgets.HBox(output_boxes)))


VBox(children=(Text(value='', description='Word:'), IntSlider(value=50, description='Number of results:', min=…