# Visualizing FastText Model

This notebook develops functions for visualizing semantic relationships preserved in a FastText model of the Sumerian corpus using t-SNE and Bokeh.

Inspired by [*LDA visualized using t-SNE and Bokeh*](https://www.kaggle.com/yohanb/lda-visualized-using-t-sne-and-bokeh) by Yohan, and [*Visualizing Word Vectors with t-SNE*](https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne) by Jeff Delaney.

Note: for visualization Bokeh is preferred over Matplotlib primarily because of the difficulties in using a custom font in Matplotlib. This becomes a major obstacle when trying to represent tokens in cuneiform.

In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.filterwarnings(action='ignore', category=FutureWarning, module='gensim' )
import gensim
import numpy as np
import pickle
from sklearn.manifold import TSNE
from gensim.models.fasttext import FastText as FT_gensim
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, LabelSet #, HoverTool, CustomJS, , Slider
from bokeh.layouts import column
from bokeh.palettes import all_palettes
output_notebook()
# note bokeh 1.0 requires notebook 5

# Load the Model
There are three models: 
- model_cuneiform.model: the Sumerian corpus in Unicode cuneiform
- model_tl.model: the Sumerian corpus in transliteration
- model_lemm.model: the Sumerian corpus in lemmatization

In [2]:
# Load the three saved FastText models in one assignment:
#   model_c - cuneiform, model_l - lemmatization, model_t - transliteration.
model_c, model_l, model_t = (
    FT_gensim.load("model/model_cuneiform.model"),
    FT_gensim.load("model/model_lemm.model"),
    FT_gensim.load("model/model_tl.model"),
)

# Cuneify
Create a function that allows input in transliteration, with output in cuneiform.

In [3]:
# Build the lookup table used by cun(): each entry of o["value"] becomes a key
# whose value is the entry at the same position in o["utf8"].
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a file that was produced locally or comes from a trusted source.
with open("output/ogsl.p", "rb") as p:
    o = pickle.load(p)
d = {reading: glyph for reading, glyph in zip(o["value"], o["utf8"])}

In [4]:
def cun(text):
    """Convert a transliterated line into cuneiform.

    Write words separated by blanks and the signs of a word separated by
    hyphens, with index numbers as Unicode subscripts, for example
    'ma-an-gi₄' or 'd-en-lil₂ nibru-ki'.

    The input is lower-cased first, so capitalization does not matter. Every
    sign is looked up in the notebook-level dictionary ``d``; any reading that
    is a key of ``d`` is accepted, and a sign that is not found is passed
    through unchanged. The results for one word are concatenated without a
    separator and the words are joined with single blanks.
    """
    def _word_to_glyphs(word):
        # Replace each hyphen-separated sign by its entry in d (or keep it).
        return ''.join(d.get(sign, sign) for sign in word.split('-'))

    return ' '.join(_word_to_glyphs(word) for word in text.lower().split())

# Create Lists of Semantically Related Words

In [5]:
def word_categories(model, words, topn=10, shared_category=5):
    """Group seed words and their nearest neighbours into numbered categories.

    For every seed word in ``words`` the ``topn`` most similar items are
    fetched with ``model.wv.most_similar``. The neighbours and the seed word
    itself are labelled with the position of the seed in ``words``
    (0, 1, 2, ...). An item that is encountered again after it has already
    been labelled (for instance a neighbour of two different seeds) is
    relabelled ``shared_category``.

    Parameters
    ----------
    model : object
        Embedding model exposing ``wv.most_similar(word, topn=...)`` that
        returns a sequence of ``(word, similarity)`` pairs.
    words : iterable of str
        One or more seed words from the vocabulary of ``model``.
    topn : int, optional
        Number of neighbours requested per seed word (default 10).
    shared_category : int, optional
        Label given to items that belong to more than one category. The
        default, 5, is the value that used to be hard-coded. With more than
        five seed words, pass a value outside ``range(len(words))`` so the
        label cannot be confused with a seed index.

    Returns
    -------
    dict
        Maps each word to its integer category label.
    """
    word_d = {}
    for idx, word in enumerate(words):
        neighbours = [match[0] for match in model.wv.most_similar(word, topn=topn)]
        # The seed word is categorised together with its neighbours.
        for item in neighbours + [word]:
            word_d[item] = shared_category if item in word_d else idx
    return word_d

In [6]:
def tsne_bokeh(model, words, fontsize="12pt", perplexity=40):
    """Project word vectors to 2-D with t-SNE and show a labelled Bokeh plot.

    Parameters
    ----------
    model : object
        Embedding model; ``model.wv[word]`` must return the vector of ``word``.
    words : dict
        Maps every word to plot to an integer category between 0 and 5 (the
        kind of dictionary returned by ``word_categories``). The category
        selects the marker colour; any other value raises ``KeyError``.
    fontsize : str, optional
        Font size of the word labels, e.g. ``"12pt"``.
    perplexity : int or float, optional
        Perplexity passed to ``TSNE``. The default, 40, is the value that used
        to be hard-coded.
        NOTE(review): recent scikit-learn releases require the perplexity to be
        smaller than the number of words plotted - pass a lower value for
        small word sets.

    Returns
    -------
    None
        The plot is displayed with ``bokeh.plotting.show``.
    """
    labels = list(words)
    tokens = [model.wv[word] for word in labels]
    categories = [words[word] for word in labels]

    # One fixed colour per category label.
    color_d = {0: "black", 1: "red", 2: "yellow", 3: "green", 4: "brown", 5: "blue"}
    colors = [color_d[category] for category in categories]

    # random_state is fixed so that repeated runs give the same layout.
    tsne = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=2500, random_state=23)
    tsne_embedding = tsne.fit_transform(tokens)
    tsne_embedding = pd.DataFrame(tsne_embedding, columns=['x','y'])
    tsne_embedding["color"] = colors
    tsne_embedding["labels"] = labels
    source = ColumnDataSource(
        data=dict(
        x = tsne_embedding.x,
        y = tsne_embedding.y,
        colors = tsne_embedding.color,
        labels=tsne_embedding.labels
        )
    )

    # Word labels, drawn slightly offset from their markers in the cuneiform font.
    l = LabelSet(x='x', y='y', text='labels', level='glyph',
              x_offset=5, y_offset=5, source=source, render_mode='canvas', 
             text_font_size=fontsize, text_font="CuneiformComposite")

    plot_tsne = figure(plot_width=900, plot_height=900) #, tools=tools_tsne, title='Papers')
    plot_tsne.circle('x', 'y', size=7, fill_color='colors', 
                  line_alpha=0, line_width=0.01, source=source)
    plot_tsne.add_layout(l)
    layout = column(plot_tsne)
    show(layout)

In [8]:
# Two seed words are entered in transliteration and converted with cun(); the
# 35 nearest neighbours of each in the cuneiform model are collected and
# plotted with 20pt labels.
words_d = word_categories(
    model_c,
    [cun(translit) for translit in ("šim-gig", "hi-sar")],
    topn=35,
)
tsne_bokeh(model_c, words_d, fontsize="20pt")

In [14]:
# Same plot for two other seed words, with 20 neighbours each.
# NOTE(review): the variable name `animals` does not obviously describe these
# seed words - confirm or rename.
# NOTE(review): the second seed is spelled with "uru" here but with "iri" in
# the neighbour listing that follows; presumably both readings map to the
# same sign in d - verify.
animals = word_categories(
    model_c,
    [cun(translit) for translit in ("ab-ba-sa₆-ga", "d-šul-gi-uru-mu")],
    topn=20,
)
tsne_bokeh(model_c, animals, fontsize="20pt")

In [16]:
# Display the 20 nearest neighbours (with similarity scores) of both seed
# words as a pair of lists.
tuple(
    model_c.wv.most_similar(cun(translit), topn=20)
    for translit in ("ab-ba-sa₆-ga", "d-šul-gi-iri-mu")
)

([('𒉌𒆪', 0.932532548904419),
  ('𒇽𒀭𒊏', 0.9131841659545898),
  ('𒌨𒀭𒁹𒀭', 0.9126473069190979),
  ('𒌨𒀭𒍂𒇽𒄭', 0.9120600819587708),
  ('𒈾𒊷', 0.9108825922012329),
  ('𒌨𒈩', 0.9093538522720337),
  ('𒀭𒇷𒉌', 0.9093157052993774),
  ('𒁀𒀭𒍣', 0.9069061875343323),
  ('𒇽𒀭𒋫', 0.9059919118881226),
  ('𒌨𒀭𒊩𒌆𒂯', 0.9053530693054199),
  ('𒌨𒀭𒀭𒁹𒀭', 0.9049001932144165),
  ('𒌨𒌉', 0.9048446416854858),
  ('𒌨𒀭𒂗𒃲𒁺𒁺', 0.9034087657928467),
  ('𒌨𒀭𒀹𒀭', 0.9028141498565674),
  ('𒁕𒆷𒀀', 0.9024139642715454),
  ('𒌨𒊷𒂵', 0.9018524885177612),
  ('𒌨𒀭𒅎', 0.9017372131347656),
  ('𒇽𒀭𒋀𒆠', 0.9016850590705872),
  ('𒌨𒄯', 0.9016162157058716),
  ('𒁀𒂵𒀀', 0.9013009071350098)],
 [('𒋗𒈠𒈠𒋫', 0.886979341506958),
  ('𒀀𒄷𒉿𒅕', 0.880461573600769),
  ('𒉡𒌫𒀭𒂗𒍪', 0.8799625635147095),
  ('𒌨𒀭𒋀𒆠𒋫', 0.8770174980163574),
  ('𒅆𒀭𒂗𒆤𒂠𒋫', 0.8766659498214722),
  ('𒍪𒁀𒂵𒋫', 0.8730865120887756),
  ('𒀀𒄷𒉿𒅕𒋫', 0.8717504739761353),
  ('𒂗𒀭𒈬𒋫', 0.8670489192008972),
  ('𒀭𒂄𒄀𒀀𒀀𒈬𒋫', 0.8625446557998657),
  ('𒀭𒂄𒄀𒈪𒊬', 0.8622329235076904),
  ('𒐌𒄰', 0.8607234954833984),
  ('𒌋𒌋𒄰', 0.8601

# Plot TSNE in Matplotlib (deprecated)

def tsne_plot(model, words, cun=False, fontsize=12):
    """Create a t-SNE projection of word vectors and plot it with Matplotlib.

    Adapted from
    https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne

    Parameters
    ----------
    model : object
        Word embedding model; ``model.wv[word]`` must return the vector of
        ``word``.
    words : dict
        Dictionary of words taken from the model; the value of each word is a
        category label, an integer between 0 and 5, which selects the colour
        of the word's label. Any other value raises ``KeyError``.
    cun : bool, optional
        If true, labels are drawn with the 'CuneiformOB' font, otherwise with
        'DejaVu'. Inside this function the parameter hides the notebook-level
        ``cun`` function.
    fontsize : int, optional
        Font size of the labels.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Imported here because the notebook's import cell does not import
    # Matplotlib; without this, `plt` is undefined when the function is called.
    import matplotlib.pyplot as plt

    cunfont = {'fontname': 'CuneiformOB'} if cun else {'fontname': 'DejaVu'}

    labels = list(words)
    tokens = [model.wv[word] for word in labels]
    categories = [words[word] for word in labels]

    # random_state is fixed so that repeated runs give the same layout.
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    color_d = {0: "black", 1: "green", 2: "blue", 3: "red", 4: "yellow", 5: "brown"}

    plt.figure(figsize=(16, 16))
    for (x, y), label, category in zip(new_values, labels, categories):
        # Every point is a black dot; only the label carries the category colour.
        plt.plot(x, y, 'o', color="black")
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom',
                     **cunfont,
                     fontsize=fontsize,
                     color=color_d[category])
    plt.show()