In [1]:
%matplotlib inline
import ipywidgets as widgets
import gensim
import os
import matplotlib.pyplot as plt
import bidi.algorithm
import arabic_reshaper

## Word embedding model for Arabic
This app allows you to visualize a list of words in a word embedding model for Arabic, mapped to two dimensions using Principal Component Analysis (PCA).

The current model is the Wikipedia CBOW model from [AraVec](https://github.com/bakrianoo/aravec), see: Abu Bakr Soliman, Kareem Eisa, and Samhaa R. El-Beltagy, “AraVec: A set of Arabic Word Embedding Models for use in Arabic NLP”, in proceedings of the 3rd International Conference on Arabic Computational Linguistics (ACLing 2017), Dubai, UAE, 2017.

In [2]:
# Directory scanned for saved gensim models; a relative path is resolved
# against the notebook's working directory.
model_path = 'data'

In [3]:
# To do: list multiple available models and only download them when needed
# Try to load every file in `model_path` as gensim KeyedVectors, keyed by file name.
# NOTE(review): gensim's load() is pickle-based, so `model_path` must only
# contain files from a trusted source.
models = {}
skipped_files = {}  # file name -> reason it could not be loaded
filenames = os.listdir(model_path)
for fn in filenames:
    try:
        models[fn] = gensim.models.KeyedVectors.load(os.path.join(model_path, fn))
    except Exception as err:
        # Best-effort: the directory may also hold files that are not loadable
        # models, so a failure is recorded instead of aborting the cell.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate.
        skipped_files[fn] = repr(err)

if skipped_files:
    print('Skipped {} file(s) that could not be loaded as KeyedVectors: {}'.format(
        len(skipped_files), ', '.join(sorted(skipped_files))))

In [4]:
# Human-readable labels for known model file names.
nicer_names_dict = {
    'wikipedia_cbow_100': 'Wikipedia',
    'cbow-fiqh-100-wikipedia-finetuned-wv': 'Wikipedia+Fiqh',
    'fiqh-i10-s100-w5-sg0_wv': 'Fiqh',
    'stemmed-fiqh-i10-s100-w5-sg0_wv': 'Stemmed Fiqh',
}

In [5]:
# Re-key the loaded models by display label; names without an entry in
# nicer_names_dict keep their original key.
models = {
    nicer_names_dict.get(name, name): loaded
    for name, loaded in models.items()
}

In [6]:
def most_similar(word, model, topn=10):
    """Return the ``topn`` nearest neighbours of ``word`` as printable text.

    Parameters
    ----------
    word : str
        Query word, passed to ``most_similar`` of the underlying vectors.
    model
        Either an object exposing the vectors as ``model.wv`` or the keyed
        vectors themselves (anything with a ``most_similar`` method).
    topn : int
        Number of neighbours requested.

    Returns
    -------
    str
        One line per neighbour: the word, a space and a tab, then the
        similarity formatted with three decimals.

    Any exception raised by the underlying ``most_similar`` call propagates.
    """
    # The notebook loads bare KeyedVectors, which do not have a `.wv`
    # attribute in gensim >= 4; fall back to the object itself in that case.
    vectors = getattr(model, 'wv', model)
    res = vectors.most_similar(word, topn=topn)
    output = [u'{} \t{:.3f}'.format(w, s) for w, s in res]
    return '\n'.join(output)

In [30]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

def plot_embedding(word_list, model, method='pca', learning_rate=60, colors=None):
    """Scatter-plot the vectors of ``word_list`` after reducing them to 2-D.

    Parameters
    ----------
    word_list : list of str
        Words to plot; each one is also used as the label of its point.
    model
        Object indexed as ``model[word_list]`` to obtain one vector per word.
        Lookup errors raised by that indexing propagate to the caller.
    method : str
        ``'pca'`` projects straight onto two principal components. Any other
        value runs a PCA pre-reduction followed by t-SNE.
    learning_rate : float
        Passed to ``TSNE``; not used when ``method == 'pca'``.
    colors
        Passed unchanged as ``c`` to ``plt.scatter``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    X = model[word_list]
    # Compare by value: `is` tests object identity, which is only true for
    # strings that happen to be the same interned object.
    if method == 'pca':
        X_embedded = PCA(n_components=2).fit_transform(X)
    else:
        n_samples, n_features = X.shape
        # PCA cannot extract more components than min(n_samples, n_features),
        # so cap the 30-component pre-reduction for short word lists.
        n_pre = min(30, n_samples, n_features)
        X_pre = PCA(n_components=n_pre).fit_transform(X)
        # 30 is TSNE's default perplexity; recent scikit-learn versions reject
        # a perplexity that is not smaller than the number of samples.
        perplexity = min(30, max(n_samples - 1, 1))
        X_embedded = TSNE(n_components=2, learning_rate=learning_rate,
                          perplexity=perplexity, random_state=0).fit_transform(X_pre)

    plt.figure(figsize=(15,15))
    plt.scatter(X_embedded[:,0], X_embedded[:,1], c=colors)
    plt.axis('off')

    for i, w in enumerate(word_list):
        # Reshape the letters into their contextual forms and reorder them for
        # right-to-left display before handing the label to matplotlib.
        display_word = bidi.algorithm.get_display(arabic_reshaper.reshape(w))
        plt.annotate(display_word, xy=(X_embedded[i, 0], X_embedded[i, 1]), fontsize=20)
    plt.show()

In [31]:
def plot_closest_words(word, model, method='pca', topn=30, learning_rate=60, colors=None):
    """Plot ``word`` together with its ``topn`` nearest neighbours.

    Parameters
    ----------
    word : str
        Query word; it is appended after its neighbours in the plotted list.
    model
        Either an object exposing the vectors as ``model.wv`` or the keyed
        vectors themselves (anything with a ``most_similar`` method).
    method, learning_rate, colors
        Forwarded unchanged to ``plot_embedding``.
    topn : int
        Number of neighbours requested from ``most_similar``.

    Any exception raised by ``most_similar`` or ``plot_embedding`` propagates.
    """
    # Bare KeyedVectors have no `.wv` attribute in gensim >= 4; fall back to
    # the object itself so both kinds of model are accepted.
    vectors = getattr(model, 'wv', model)
    word_list = [neighbour for neighbour, _score in vectors.most_similar(word, topn=topn)]
    word_list.append(word)
    # Hand over the resolved vectors: that is the object that supports
    # word lookup in both cases.
    plot_embedding(word_list, vectors, method, learning_rate, colors)

In [39]:
def plot_widget(model_name, word_list_input):
    """Widget callback: plot the words typed into the text area.

    ``model_name`` selects an entry of the module-level ``models`` dict.
    ``word_list_input`` is split on newlines; each line is stripped and
    empty lines are dropped. Two or more words are passed to
    ``plot_embedding``, and a ``KeyError`` raised there is printed instead
    of propagating. Exactly one word prints a hint; no words does nothing.
    """
    model = models[model_name]

    words = []
    for raw_line in word_list_input.split('\n'):
        token = raw_line.strip()
        if token != '':
            words.append(token)

    if not words:
        return
    if len(words) == 1:
        print('Need at least two words')
        return

    try:
        plot_embedding(words, model)
    except KeyError as err:
        print(err)


# Controls: a text area for the word list and a selector over the loaded models.
input_word_list = widgets.Textarea(
    description='Word list:',
    rows=10,
)
input_model = widgets.Dropdown(
    options=models.keys(),
)

# Bind the two controls to plot_widget's parameters.
interactive_plot = widgets.interactive(
    plot_widget,
    model_name=input_model,
    word_list_input=input_word_list,
)

# Bare trailing expression so the notebook renders the widget.
interactive_plot

interactive(children=(Dropdown(description='model_name', options=('Wikipedia', 'Wikipedia+Fiqh', 'Stemmed Fiqh…