# FastText 

In [3]:
# imports
from cltk.tokenize.sentence import TokenizeSentence
from pprint import pprint as print
import gensim
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath
from gensim import utils

## Preprocessing

In [10]:
# load data
with open("../data/raw_data/HomerGesamt_deaccented.txt", 'r', encoding='utf-8') as src:
    data = src.read()

In [7]:
# Tokenization
sentences = TokenizeSentence('greek').tokenize(data)

### Generate sentences

In [19]:
class Homer_Iterator(object):
    def __init__(self, sentences):
        self.sentences = sentences

    def __iter__(self):
        for sent in self.sentences:
            yield utils.simple_preprocess(sent)
            
corpus = Homer_Iterator(sentences)

# Initialize model
model = FastText(size=100,
                 alpha=0.001,
                 min_count=1,
                 negative=50
                )

In [7]:
model = FastText.load("../data/models/homer_FastText030220.model")

In [21]:
# Build vocab
model.build_vocab(sentences=corpus)

In [46]:
# train the model
model.train(sentences=corpus,
            epochs=150,
            min_count=1,
            total_words=model.corpus_total_words,
            alpha=0.01,
            window=10,
            negative=50
           )

In [56]:
model.save('../data/models/homer_FastText030220.model')


# Vector lookup

In [11]:
data[1500:1600]

'α μηριʼ εκηα ταυρων ηδʼ αιγων μοι κρηηνον εελδωρ τισειαν Δαναοι εμα δακρυα σοισι βελεσσιν ως εφατʼ ε'

In [12]:
model.wv.most_similar('βελεσσιν',topn=25)

[('νεφεσσιν', 0.9999997615814209),
 ('τεκεσσιν', 0.9999997615814209),
 ('μελεσσιν', 0.9999997615814209),
 ('επεσσιν', 0.9999997615814209),
 ('επεισιν', 0.9999997615814209),
 ('επεεσσιν', 0.9999997615814209),
 ('τρωεσσιν', 0.9999997615814209),
 ('πολεσσιν', 0.9999997019767761),
 ('ιπποισιν', 0.9999997019767761),
 ('εισιν', 0.9999997019767761),
 ('συεσσιν', 0.9999997019767761),
 ('παντεσσιν', 0.9999997019767761),
 ('νεεσσιν', 0.9999997019767761),
 ('μυρομενοισιν', 0.9999997019767761),
 ('βοεσσιν', 0.9999997019767761),
 ('οιεσσιν', 0.9999996423721313),
 ('μεταλλωσιν', 0.9999996423721313),
 ('βενθεσσιν', 0.9999996423721313),
 ('τεκεεσσιν', 0.9999996423721313),
 ('παρημενον', 0.9999996423721313),
 ('ερμισιν', 0.9999996423721313),
 ('υπερφιαλοισιν', 0.9999996423721313),
 ('τειχεσσιν', 0.9999996423721313),
 ('μεσοισιν', 0.9999996423721313),
 ('αυλοισιν', 0.9999996423721313)]

In [13]:
vectors = model.wv.vectors

## Decomposition and Normalization

In [18]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

In [19]:
svd = TruncatedSVD(n_components=2, n_iter=10)
scaler = StandardScaler()

In [35]:
# Dimensionality reduction and normalization
vectors_svd = svd.fit_transform(vectors)
vectors_svd_normalized = scaler.fit_transform(vectors_svd)

## Data Visualization

In [14]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
output_notebook()

In [15]:
# Draw function
def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """ draws an interactive plot for data points with auxilirary info on hover """
    if isinstance(color, str): color = [color] * len(x)
    data_source = bm.ColumnDataSource({ 'x' : x, 'y' : y, 'color': color, **kwargs })

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs.keys()]))
    if show: pl.show(fig)
    return fig

In [16]:
# Extract vocabulary
words = sorted(model.wv.vocab.keys(),
              key = lambda w : model.wv.vocab[w].count, 
              reverse = True)

In [None]:
from sklearn.manifold import TSNE
word_tsne = TSNE(n_components=2).fit_transform(vectors)


In [20]:
words_tsne_scaled = scaler.fit_transform(word_tsne)

In [None]:
# Uncomment this to draw your plot
#draw_vectors(words_tsne_scaled[:, 0], words_tsne_scaled[:, 1], token=words)

![Homer visualized](./homer_tsne.png)