# FastText 

In [6]:
# imports
from cltk.tokenize.sentence import TokenizeSentence
from pprint import pprint as print
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath
from gensim import utils

## Preprocessing

In [3]:
# load data
with open("../data/raw_data/HomerGesamt_deaccented.txt", 'r', encoding='utf-8') as src:
    data = src.read()

In [7]:
# Tokenization
sentences = TokenizeSentence('greek').tokenize(data)

### Generate sentences

In [19]:
class Homer_Iterator(object):
    def __init__(self, sentences):
        self.sentences = sentences

    def __iter__(self):
        for sent in self.sentences:
            yield utils.simple_preprocess(sent)
            
corpus = Homer_Iterator(sentences)

In [20]:
# Initialize model
model = FastText(size=100,
                 alpha=0.001,
                 min_count=1,
                 negative=50
                )

In [21]:
# Build vocab
model.build_vocab(sentences=corpus)

In [46]:
# train the model
model.train(sentences=corpus,
            epochs=150,
            min_count=1,
            total_words=model.corpus_total_words,
            alpha=0.01,
            window=10,
            negative=50
           )

In [56]:
model.save('../data/models/homer_FastText030220.model')


# Vector lookup

In [57]:
model.wv.most_similar('εθηκε')

[('μεθηκε', 0.9999969601631165),
 ('παρεθηκε', 0.999996542930603),
 ('εθηκεν', 0.9999963045120239),
 ('επεθηκε', 0.9999963045120239),
 ('κατεθηκε', 0.9999954700469971),
 ('εθηκας', 0.9999954104423523),
 ('μετεθηκε', 0.9999953508377075),
 ('παρεθηκεν', 0.999995231628418),
 ('μεθηκεν', 0.9999951720237732),
 ('αναπρησας', 0.9999950528144836)]

In [58]:
data[:250]

'μηνιν αειδε θεα Πηληϊαδεω Αχιληος ουλομενην μυριʼ Αχαιοις αλγεʼ εθηκε πολλας ιφθιμους ψυχας Αϊδι προϊαψεν ηρωων αυτους ελωρια τευχε κυνεσσιν οιωνοισι πασι Διος ετελειετο βουλη εξ ου δη πρωτα διαστητην ερισαντε Ατρεϊδης αναξ ανδρων διος Αχιλλευς αρ σφ'

In [65]:
model.wv.most_similar('φατο',topn=25)

[('εποιατο', 0.9999993443489075),
 ('εινατο', 0.9999993443489075),
 ('επιστατο', 0.9999992847442627),
 ('ειποντα', 0.9999992847442627),
 ('περονατο', 0.9999992847442627),
 ('πεισονται', 0.9999992251396179),
 ('πελονται', 0.9999992251396179),
 ('απεσσονται', 0.9999992251396179),
 ('αριστηεσσιν', 0.9999992251396179),
 ('πολυρρηνος', 0.9999992251396179),
 ('ονονται', 0.9999992251396179),
 ('κοτεσσατο', 0.9999992251396179),
 ('επιστησονται', 0.9999992251396179),
 ('φραξαντο', 0.9999992251396179),
 ('αλατο', 0.9999992251396179),
 ('εφατο', 0.9999992251396179),
 ('παυσατο', 0.9999992251396179),
 ('μαχεσσατο', 0.9999992251396179),
 ('αντομεναι', 0.9999992251396179),
 ('κοτεσσαμενος', 0.9999991655349731),
 ('ανειρετο', 0.9999991655349731),
 ('ορνιθεσσιν', 0.9999991655349731),
 ('δασονται', 0.9999991655349731),
 ('εινασθαι', 0.9999991655349731),
 ('φιλτατον', 0.9999991655349731)]

In [60]:
vectors = model.wv.vectors

## Decomposition and Normalization

In [31]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

In [32]:
svd = TruncatedSVD(n_components=2, n_iter=10)
scaler = StandardScaler()

In [35]:
# Dimensionality reduction and normalization
vectors_svd = svd.fit_transform(vectors)
vectors_svd_normalized = scaler.fit_transform(vectors_svd)

## Data Visualization

In [36]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
output_notebook()

In [37]:
# Draw function
def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """ draws an interactive plot for data points with auxilirary info on hover """
    if isinstance(color, str): color = [color] * len(x)
    data_source = bm.ColumnDataSource({ 'x' : x, 'y' : y, 'color': color, **kwargs })

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs.keys()]))
    if show: pl.show(fig)
    return fig

In [61]:
# Extract vocabulary
words = sorted(model.wv.vocab.keys(),
              key = lambda w : model.wv.vocab[w].count, 
              reverse = True)

In [62]:
from sklearn.manifold import TSNE
word_tsne = TSNE(n_components=2).fit_transform(vectors)
words_tsne_scaled = scaler.fit_transform(word_tsne)

In [63]:
draw_vectors(words_tsne_scaled[:, 0], words_tsne_scaled[:, 1], token=words)