In [6]:
%matplotlib inline
import ipywidgets as widgets
import gensim
import os
import matplotlib.pyplot as plt
import bidi.algorithm
import arabic_reshaper

## Word embedding model for Arabic
This app allows you to visualize a list of words in a word embedding model for Arabic, mapped to two dimensions using Principal Component Analysis (PCA).

The current model is the Wikipedia CBOW model from [AraVec](https://github.com/bakrianoo/aravec), see: Abu Bakr Soliman, Kareem Eisa, and Samhaa R. El-Beltagy, “AraVec: A set of Arabic Word Embedding Models for use in Arabic NLP”, in Proceedings of the 3rd International Conference on Arabic Computational Linguistics (ACLing 2017), Dubai, UAE, 2017.

In [7]:
# Directory holding the model files: downloaded archives are unpacked into it
# and the models are later loaded from it.
model_path = 'data'

In [41]:
import zipfile
from urllib.request import urlopen
from io import BytesIO

def download_model(download_url, model_path):
    """Download a zipped model and unpack its files into ``model_path``.

    The archive is read fully into memory. Every file it contains is written
    directly into ``model_path``; directory structure inside the archive is
    flattened. Files that already exist on disk are not overwritten.

    Parameters
    ----------
    download_url : str
        URL of the zip archive.
    model_path : str
        Directory that receives the extracted files. Created if missing.

    Returns
    -------
    str
        Base name of the last file in the archive, cut off at its first dot.

    Raises
    ------
    ValueError
        If the archive does not contain any file.
    """
    os.makedirs(model_path, exist_ok=True)

    # Close the connection as soon as the payload has been read.
    with urlopen(download_url) as resp:
        payload = resp.read()

    last_file = None
    with zipfile.ZipFile(BytesIO(payload), 'r') as zf:
        for fname in zf.namelist():
            base = os.path.basename(fname)
            if not base:
                # Directory entries end in '/' and have no file content.
                continue
            out_file = os.path.join(model_path, base)
            if not os.path.exists(out_file):
                with open(out_file, 'wb') as fout:
                    fout.write(zf.read(fname))
            last_file = base

    if last_file is None:
        raise ValueError('No files found in archive: {}'.format(download_url))

    # Use the flattened base name so the result matches what was written to disk.
    return last_file.split('.')[0]

In [43]:
# Download location of each zipped model, keyed by a short model identifier.
model_urls = {
    'fiqh-norm': 'https://surfdrive.surf.nl/files/index.php/s/VxVIHxUzUuFyonc/download',
    'fiqh': 'https://surfdrive.surf.nl/files/index.php/s/7RvP2iYCOXkcWRp/download',
    'fiqh-stemmed': 'https://surfdrive.surf.nl/files/index.php/s/Ah9HeEg8vDMzPIo/download',
    #'wiki_cbow_100': 'https://archive.org/download/aravec2.0/wiki_cbow_100.zip'
}

# Fetch every archive and remember the model name that download_model returns
# for it, under the same short identifier.
model_names = {}
for name, url in model_urls.items():
    model_names[name] = download_model(url, model_path)

In [44]:
# Show the model name that was returned for each short model identifier.
model_names

{'fiqh-norm': 'fiqh-norm-i10-s100-w5-sg0_wv',
 'fiqh': 'fiqh-i10-s100-w5-sg0_wv',
 'fiqh-stemmed': 'stemmed-fiqh-i10-s100-w5-sg0_wv'}

In [45]:
# To do: list multiple available models and only download them when needed
# Load each model from disk. Loading is best effort: a model that cannot be
# loaded is reported and skipped, so the remaining models stay usable.
models = {}
for name, file_name in model_names.items():
    file_path = os.path.join(model_path, file_name)
    try:
        models[name] = gensim.models.KeyedVectors.load(file_path)
    except Exception as err:
        # Report the failure instead of dropping the model silently; catching
        # Exception (not a bare except) keeps KeyboardInterrupt working.
        print('Could not load model {!r} from {}: {}'.format(name, file_path, err))

In [46]:
# Show which models are currently held in the models dictionary.
models

{'fiqh-norm': <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fcec97e5080>,
 'fiqh': <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fcec9bc2240>,
 'fiqh-stemmed': <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fceacc24a58>}

In [21]:
# Re-key the models: a key that has an entry in nicer_names_dict is replaced by
# that entry, every other key is kept unchanged.
# NOTE(review): nicer_names_dict is not defined in the visible cells; confirm
# it exists before this cell runs.
models = {
    nicer_names_dict.get(key, key): vectors
    for key, vectors in models.items()
}

In [39]:
# Show the current contents of the models dictionary.
models

{'fiqh-norm-i10-s100-w5-sg0_wv': <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fcec97e54a8>,
 'stemmed-fiqh-i10-s100-w5-sg0_wv': <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fcec97e5470>,
 'fiqh-i10-s100-w5-sg0_wv': <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fcebe532d30>}

In [22]:
def most_similar(word, model, topn=10):
    """Return the words most similar to ``word`` as printable text.

    Parameters
    ----------
    word : str
        Query word.
    model : object
        Either an object exposing the vectors as ``model.wv`` or the vectors
        object itself; ``most_similar(word, topn=...)`` is called on it.
    topn : int, optional
        Number of neighbours to request (default 10).

    Returns
    -------
    str
        One line per neighbour: the word, a space and a tab, then the score
        with three decimals.
    """
    # Objects loaded as KeyedVectors need no ``.wv`` hop (and may not have
    # one), so fall back to the object itself.
    vectors = getattr(model, 'wv', model)
    res = vectors.most_similar(word, topn=topn)
    return '\n'.join(u'{} \t{:.3f}'.format(w, s) for w, s in res)

In [23]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

def plot_embedding(word_list, model, method='pca', learning_rate=60, colors=None):
    """Plot the vectors of ``word_list`` as labelled points in two dimensions.

    Parameters
    ----------
    word_list : list of str
        Words to plot; their vectors are fetched with ``model[word_list]``.
    model : object
        Embedding model supporting lookup by a list of words.
        Presumably raises KeyError for unknown words (the widget callback
        catches that exception) -- confirm against the model class.
    method : str, optional
        ``'pca'`` (default) projects with PCA. Any other value reduces the
        vectors with PCA first (to at most 30 dimensions) and then applies
        t-SNE.
    learning_rate : float, optional
        Learning rate passed to t-SNE; unused for ``'pca'``.
    colors : sequence or None, optional
        Passed to ``plt.scatter`` as ``c``.
    """
    X = model[word_list]
    # Compare by value: ``is`` tests object identity and is not reliable for
    # strings.
    if method == 'pca':
        X_embedded = PCA(n_components=2).fit_transform(X)
    else:
        # PCA cannot return more components than there are samples or
        # features, so cap the pre-reduction for short word lists.
        n_pre = min(30, len(X), len(X[0]))
        X_pre = PCA(n_components=n_pre).fit_transform(X)
        X_embedded = TSNE(n_components=2, learning_rate=learning_rate, random_state=0).fit_transform(X_pre)

    plt.figure(figsize=(15,15))
    plt.scatter(X_embedded[:,0], X_embedded[:,1], c=colors)
    plt.axis('off')

    for i, w in enumerate(word_list):
        # Reshape the Arabic letters and apply the bidi algorithm before
        # handing the label to matplotlib.
        display_word = bidi.algorithm.get_display(arabic_reshaper.reshape(w))
        plt.annotate(display_word, xy=(X_embedded[i, 0], X_embedded[i, 1]), fontsize=20)
    plt.show()

In [24]:
def plot_closest_words(word, model, method='pca', topn=30, learning_rate=60, colors=None):
    """Plot ``word`` together with its ``topn`` most similar words.

    Parameters
    ----------
    word : str
        Query word; it is appended after its neighbours in the plotted list.
    model : object
        Either an object exposing the vectors as ``model.wv`` or the vectors
        object itself. The same object is passed on to ``plot_embedding``.
    method, learning_rate, colors
        Forwarded unchanged to ``plot_embedding``.
    topn : int, optional
        Number of neighbours to request (default 30).
    """
    # Objects loaded as KeyedVectors need no ``.wv`` hop (and may not have
    # one), so fall back to the object itself.
    vectors = getattr(model, 'wv', model)
    word_list = [w for w, _ in vectors.most_similar(word, topn=topn)]
    word_list.append(word)
    plot_embedding(word_list, model, method, learning_rate, colors)

In [25]:
def plot_widget(model_name, word_list_input, color_list_input):
    """Parse the widget text inputs and plot the requested words.

    ``word_list_input`` and ``color_list_input`` are newline-separated; each
    line is stripped and blank lines are dropped. With exactly one word a hint
    is printed, with no words nothing happens. Colors are only used when there
    is exactly one per word; a non-empty list of any other length prints a
    notice and the plot is drawn without colors. A KeyError raised while
    plotting is printed instead of propagated.
    """
    model = models[model_name]

    words = [line.strip() for line in word_list_input.split('\n') if line.strip() != '']
    palette = [line.strip() for line in color_list_input.split('\n') if line.strip() != '']

    if len(words) == 1:
        print('Need at least two words')
        return
    if len(words) < 2:
        return

    try:
        colors = palette if len(palette) == len(words) else None
        if colors is None and len(palette) > 0:
            print('nr of colors should match nr of words')
        plot_embedding(words, model, colors=colors)
    except KeyError as err:
        print(err)

In [26]:
from IPython.display import display, clear_output

# Input controls: two free-text areas (one entry per line) and a dropdown
# offering the keys of the models dictionary.
input_word_list = widgets.Textarea(rows=10, description='Word list:')
input_colors = widgets.Textarea(rows=10, description='colors:')
input_model = widgets.Dropdown(options=models.keys(), description='Model:')

def update_plot(button):
    """Button callback: redraw the plot inside the ``out`` output widget.

    ``button`` is supplied by the click event and is not used. The previous
    output is cleared, the plot is rebuilt from the current widget values and
    pending matplotlib figures are shown inline.
    """
    with out:
        clear_output()
        plot_widget(
            input_model.value,
            input_word_list.value,
            input_colors.value,
        )
        widgets.interaction.show_inline_matplotlib_plots()

# Submit button: a click re-runs update_plot.
button_submit = widgets.Button(description='Submit')
button_submit.on_click(update_plot)

# Output area that update_plot draws into.
out = widgets.Output()

# Layout: model selector on top, the two text areas side by side, then the
# button and the output. As the last expression it is displayed by the cell.
widgets.VBox(children=[
    input_model,
    widgets.HBox(children=[input_word_list, input_colors]),
    button_submit,
    out,
])

VBox(children=(Dropdown(description='Model:', options=('fiqh-stemmed',), value='fiqh-stemmed'), HBox(children=…