# FastText 

In [1]:
# imports
from cltk.tokenize.sentence import TokenizeSentence
from pprint import pprint as print
import gensim
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath
from gensim import utils

## Preprocessing

In [2]:
# Read the whole corpus file into memory as one string.
with open(
    "../data/raw_data/HomerGesamt_deaccented.txt",
    mode='r',
    encoding='utf-8',
) as src:
    data = src.read()

In [3]:
# Split the raw text into sentences with the CLTK sentence tokenizer for Greek.
greek_sentence_tokenizer = TokenizeSentence('greek')
sentences = greek_sentence_tokenizer.tokenize(data)

### Generate sentences

In [4]:
class Homer_Iterator:
    """Re-iterable corpus wrapper that tokenizes sentences on demand.

    Every call to ``iter()`` starts a fresh pass over ``sentences`` and
    yields, for each sentence, the result of ``utils.simple_preprocess``.
    Nothing is tokenized until the iterator is actually consumed.
    """

    def __init__(self, sentences):
        # Only keep a reference; the tokenization happens lazily in __iter__.
        self.sentences = sentences

    def __iter__(self):
        for raw_sentence in self.sentences:
            tokens = utils.simple_preprocess(raw_sentence)
            yield tokens
            
corpus = Homer_Iterator(sentences)

In [None]:
# Initialize model
#model = FastText(size=100,alpha=0.001,min_count=1,negative=50)

In [170]:
# Build vocab
# model.build_vocab(sentences=corpus)

In [171]:
model = FastText.load("../data/models/homer_FastText0402-alpha.model")

In [172]:
# Continue training the loaded model on the corpus.
#
# NOTE: `min_count`, `window` and `negative` used to be passed here. In the
# gensim 3.x API this notebook uses, `train()` does not consume them (unknown
# keyword arguments are swallowed), so the model kept the values it was
# constructed with. They are removed so the call no longer suggests otherwise;
# to change them, set them in the `FastText(...)` constructor before
# `build_vocab`.
#
# NOTE(review): end_alpha (0.09) is larger than start_alpha (0.01), so the
# learning rate rises during training instead of decaying. Values are kept
# unchanged here -- confirm this schedule is intended.
model.train(sentences=corpus,
            epochs=250,
            total_words=model.corpus_total_words,
            start_alpha=0.01,
            end_alpha=0.09
           )

In [173]:
model.save('../data/models/homer_FastText0402-alpha.model')


# Vector lookup

In [174]:
data[1500:2000]

'α μηριʼ εκηα ταυρων ηδʼ αιγων μοι κρηηνον εελδωρ τισειαν Δαναοι εμα δακρυα σοισι βελεσσιν ως εφατʼ ευχομενος εκλυε Φοιβος Απολλων βη κατʼ Ουλυμποιο καρηνων χωομενος κηρ τοξʼ ωμοισιν εχων αμφηρεφεα φαρετρην εκλαγξαν αρʼ οϊστοι επʼ ωμων χωομενοιο αυτου κινηθεντος ηϊε νυκτι εοικως εζετʼ επειτʼ απανευθε νεων μετα ιον εηκε δεινη κλαγγη γενετʼ αργυρεοιο βιοιο ουρηας πρωτον επῳχετο κυνας αργους επειτʼ αυτοισι βελος εχεπευκες εφιεις βαλλʼ αιει πυραι νεκυων καιοντο θαμειαι εννημαρ ανα στρατον ῳχετο κηλα '

In [200]:
model.wv.most_similar(positive='εφατʼ',topn=25)

[('πεφατʼ', 0.9190710783004761),
 ('φατʼ', 0.8086668252944946),
 ('στρωφατʼ', 0.7787216901779175),
 ('θεσφατʼ', 0.7118062376976013),
 ('φθεγξατʼ', 0.7074302434921265),
 ('υατʼ', 0.7016330361366272),
 ('ατʼ', 0.6847632527351379),
 ('ηατʼ', 0.6824649572372437),
 ('ιατʼ', 0.6741114854812622),
 ('ρυατʼ', 0.6652892231941223),
 ('ρατʼ', 0.6612401008605957),
 ('σβεσατʼ', 0.6612303256988525),
 ('υδατʼ', 0.6557314991950989),
 ('εκχευατʼ', 0.6525974869728088),
 ('ακηχεδατʼ', 0.6484414339065552),
 ('εατʼ', 0.6466086506843567),
 ('οσφισατʼ', 0.6222934722900391),
 ('ηκεσατʼ', 0.6186704635620117),
 ('ῃδεσατʼ', 0.6090571880340576),
 ('δεατʼ', 0.6015806198120117),
 ('οιματʼ', 0.5981293320655823),
 ('ουατʼ', 0.5962398052215576),
 ('σευατʼ', 0.5956606864929199),
 ('χωσατʼ', 0.5944528579711914),
 ('ορεξατʼ', 0.5929105877876282)]

In [201]:
# extract vectors: one row per vocabulary entry, in vocabulary-index order
vectors = model.wv.vectors
# Extract vocabulary in that same index order, so that words[i] labels vectors[i].
# (The keys used to be re-sorted by corpus count here, which only lines up with
# the rows of `vectors` while ties between equally frequent words happen to
# resolve in the same order as the model's internal index.)
words = list(model.wv.index2word)

## Decomposition and Normalization

In [202]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

In [203]:
# TruncatedSVD's default solver is randomized; pin the seed so the 2-D
# projection is the same on every run.
svd = TruncatedSVD(n_components=2, n_iter=10, random_state=42)
scaler = StandardScaler()

In [204]:
# Dimensionality reduction and normalization:
# project the word vectors onto 2 SVD components, then standardize the result.
vectors_svd = svd.fit_transform(vectors)
vectors_svd_normalized = scaler.fit_transform(vectors_svd)

## t-SNE Decomposition and Normalization

In [219]:
from sklearn.manifold import TSNE

In [220]:
# reduce dimensionality
# t-SNE is stochastic; pin the seed so the embedding (and everything derived
# from it further down) can be reproduced.
word_tsne = TSNE(n_components=2, random_state=42).fit_transform(vectors)

In [221]:
# scale to zero mean and unit variance
# (fit_transform refits the shared `scaler`, replacing its fit on the SVD projection)
words_tsne_scaled = scaler.fit_transform(word_tsne)

## Build pandas DataFrame

In [190]:
import pandas as pd
import re

In [222]:
# Scaled 2-D t-SNE coordinates, one row per word vector.
df = pd.DataFrame(data=words_tsne_scaled, columns=["x", "y"])
# Alternative: use the SVD projection instead.
#df = pd.DataFrame(data=vectors_svd_normalized, columns=["x", "y"])

In [223]:
df['token'] = words

In [225]:
# Label every token as 'medium' or 'non_medium' with a suffix heuristic.
# A token is 'medium' when it starts with at least one word character followed
# by one of the listed endings at a word boundary (presumably middle-voice
# verb endings -- confirm with the author).

# Tokens that are always labelled 'non_medium', whatever the pattern says.
# (The misspelled name is kept because other cells may refer to it.)
faslse_positives = ["που", "αυτου","και","ειναι","κατʼ"]
regex = r"\w+(μαι|σαι|σο|ται|το|μεθα|μεθʼ|σθε|σθʼ|σθ|νται|ντʼ|ντο|σθαι|μην|ου|ιο|σθω|σθων|μεν\w{1,3})\b"

# Compile the pattern and build the exclusion set once, instead of redoing
# both for every token inside the loop.
_medium_pattern = re.compile(regex)
_excluded_tokens = frozenset(faslse_positives)

classe = [
    'medium'
    if tok not in _excluded_tokens and _medium_pattern.match(tok)
    else 'non_medium'
    for tok in df['token']
]
        
        

In [226]:
df['label'] = classe

In [227]:
df.head()

Unnamed: 0,x,y,token,label
0,0.594039,0.191381,ως,non_medium
1,-0.764052,-0.623022,εν,non_medium
2,0.505259,-0.199548,γαρ,non_medium
3,0.312153,-0.434983,ου,non_medium
4,0.360167,-0.793898,δη,non_medium


In [228]:
df.to_csv("../data/gensim_vectors/media_vectors_tsne.csv")

## Data Visualization

In [229]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark
output_notebook()

In [230]:
def draw_groups(data, radius=10, alpha=0.25,
                 width=600, height=400, show=True, markers=['circle_x', 'triangle'], colorstyle=['#2ca02c', '#aec7e8'], name="plot.png",
                 title="fastText w=15, negative=10"):
    """Draw an interactive scatter plot of labelled tokens and save it as a PNG.

    Parameters
    ----------
    data : anything accepted by ``bokeh.models.ColumnDataSource``
        Must provide the columns ``x``, ``y``, ``label`` and ``token``.
    radius : int
        Marker size passed to the scatter glyph.
    alpha : float
        Marker opacity passed to the scatter glyph.
    width, height : int
        Figure dimensions.
    show : bool
        If true, display the figure after exporting it.
    markers : list of str
        Marker per label, in the order ``['medium', 'non_medium']``.
    colorstyle : list of str
        Colour per label, in the same order as ``markers``.
    name : str
        File name of the PNG written below ``../data/plots/``.
    title : str
        Figure title; defaults to the previously hard-coded text.

    Returns
    -------
    The bokeh figure.
    """
    # Fixed factor order that `markers` and `colorstyle` are mapped onto.
    labels = ['medium', 'non_medium']

    source = bm.ColumnDataSource(data)

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title=title)

    # `radius` and `alpha` were accepted but ignored before (size was hard-coded
    # to 10 and no alpha was passed); they are now honoured, so with the
    # defaults the markers are drawn at 25% opacity.
    fig.scatter('x', 'y',
                size=radius,
                alpha=alpha,
                legend_field="label",
                marker=factor_mark('label', markers, labels),
                color=factor_cmap('label', colorstyle, labels),
                source=source)

    # Show the token text when hovering over a point.
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@token")]))

    # The export runs before the optional display, so the PNG is written either way.
    export_png(fig, filename="../data/plots/" + name)
    if show:
        pl.show(fig)
    return fig

In [231]:
draw_groups(data=df,name="fastText_0402_2.png")