In [1]:
import ipywidgets as widgets
import gensim
import os

## Word embedding model for Arabic
This app allows you to find related words in a word embedding model for Arabic.


In [2]:
# Directory the model files are downloaded into and later loaded from.
model_path = 'data'

In [3]:
import zipfile
from urllib.request import urlopen
from io import BytesIO

def download_model(download_url, model_path):
    """Download a zipped model and unpack it, flattened, into ``model_path``.

    The archive at ``download_url`` is read completely into memory. Every
    file entry is written to ``model_path`` under its base name (directory
    structure inside the archive is dropped). Files that already exist in
    ``model_path`` are left untouched.

    Parameters
    ----------
    download_url : str
        URL of the zip archive.
    model_path : str
        Directory that receives the extracted files; created when missing.

    Returns
    -------
    str
        Base name of the last file in the archive, cut at its first ``.``.

    Raises
    ------
    ValueError
        If the archive does not contain any file.
    """
    # A fresh checkout may not have the target directory yet.
    os.makedirs(model_path, exist_ok=True)

    # Close the connection as soon as the payload has been read.
    with urlopen(download_url) as resp:
        payload = BytesIO(resp.read())

    last_file = None
    with zipfile.ZipFile(payload, 'r') as zf:
        for fname in zf.namelist():
            base = os.path.basename(fname)
            if not base:
                # Directory entries end in '/' and carry no data.
                continue
            last_file = base
            out_file = os.path.join(model_path, base)
            if not os.path.exists(out_file):
                with open(out_file, 'wb') as fout:
                    fout.write(zf.read(fname))

    if last_file is None:
        raise ValueError(
            'Archive at {} does not contain any files'.format(download_url))

    # Derived from the flattened name so it matches what was written to disk.
    return last_file.split('.')[0]


# Download link of the zipped model files, keyed by model label.
model_urls = {
    'fiqh-stemmed':
        'https://surfdrive.surf.nl/files/index.php/s/7ZhTWqjTLRaNf2M/download',
    'fiqh-norm':
        'https://surfdrive.surf.nl/files/index.php/s/JZKLrkmVRP202T4/download',
}

# Fetch each archive and remember the model name reported for it.
model_names = {}
for name, url in model_urls.items():
    model_names[name] = download_model(url, model_path)

In [4]:
# To do: list multiple available models and only download them when needed
# Load every downloaded model. A model that cannot be loaded is reported and
# skipped, so the remaining models stay usable.
models = {}
for name in model_names:
    model_file = os.path.join(model_path, model_names[name])
    try:
        models[name] = gensim.models.KeyedVectors.load(model_file)
    except Exception as err:
        # Best effort: keep going, but do not hide why a model is missing.
        print('Could not load model {!r} from {}: {}'.format(name, model_file, err))

In [5]:
def most_similar(word, model, topn=10):
    """Return the words most similar to ``word`` as tab-separated text.

    Parameters
    ----------
    word : str
        Query word.
    model : object
        Either an object exposing the vectors as ``model.wv`` or an object
        that offers ``most_similar`` directly.
    topn : int, optional
        Number of results requested from ``most_similar`` (default 10).

    Returns
    -------
    str
        One line per result, formatted as ``'<word> \\t<score>'`` with the
        score rounded to three decimals.
    """
    # Use `.wv` when present, otherwise query the object itself.
    vectors = getattr(model, 'wv', model)
    res = vectors.most_similar(word, topn=topn)
    return '\n'.join(u'{} \t{:.3f}'.format(w, s) for w, s in res)

In [6]:
# One read-only text area per loaded model, labelled with the model's name.
outputs = {
    label: widgets.Textarea(description=label, rows=30, disabled=True)
    for label in models
}

def print_related_words(button):
    """Button callback: fill each model's text area with the related words.

    The query is taken from ``input_word`` (surrounding whitespace removed)
    and the number of results from ``input_number``. A ``KeyError`` raised
    during the lookup is shown as an error message in that model's text area.
    """
    query = input_word.value.strip()
    topn = input_number.value
    for label, vectors in models.items():
        try:
            outputs[label].value = most_similar(query, vectors, topn)
        except KeyError:
            outputs[label].value = 'Error: word does not exist in vocabulary'

# Query controls.
input_word = widgets.Text(description='Word:')
input_number = widgets.IntSlider(
    value=50,
    min=5,
    max=100,
    step=5,
    description='Number of results:',
    # Let the label take its natural width; at the default width this
    # description is cut off.
    style={'description_width': 'initial'},
)

button_submit = widgets.Button(description='Submit')
button_submit.on_click(print_related_words)

# One result box per model, side by side, ordered by model label.
output_boxes = tuple(outputs[m] for m in sorted(outputs))
widgets.VBox((input_word, input_number, button_submit, widgets.HBox(output_boxes)))


VBox(children=(Text(value='', description='Word:'), IntSlider(value=50, description='Number of results:', min=…