# FastText 

In [24]:
# imports
from cltk.tokenize.sentence import TokenizeSentence
from pprint import pprint as print
import gensim
import re
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath
from gensim import utils

## Preprocessing

In [29]:
# load data
# Read the whole corpus file into memory as one UTF-8 decoded string; the
# `with` block closes the file handle as soon as the read is done.
with open("../data/raw_data/HomerGesamt_deaccented.txt", 'r', encoding='utf-8') as src:
    data = src.read()

In [30]:
# Tokenization
# Split the raw corpus string into sentences with CLTK's sentence tokenizer
# configured for 'greek' (presumably returns a list of strings — TODO confirm).
sentences = TokenizeSentence('greek').tokenize(data)

In [31]:
def clean_sentences(sentences):
    """Strip numbering, chapter markers and punctuation from each sentence.

    Parameters
    ----------
    sentences : iterable of str
        Raw sentences as produced by the sentence tokenizer.

    Returns
    -------
    list of str
        One cleaned string per input sentence, in the same order. Runs of
        whitespace left behind by the final punctuation removal are NOT
        collapsed again, so the result may contain double spaces.

    Notes
    -----
    Every substitution is applied to *all* matches. ``re.sub`` takes
    ``count`` as its fourth positional argument, so passing ``re.MULTILINE``
    there (value 8) silently caps the substitution at eight replacements.
    None of the patterns use ``^`` or ``$``, so the MULTILINE flag itself is
    not needed and is omitted.
    """
    cleaned = []
    for text in sentences:
        # Delete everything which is not text: line/verse numbers such as
        # "12" or "1.23" become a single space. This removes every digit, so
        # no further digit-specific pass is required afterwards.
        text = re.sub(r"\d+(\.\d+)?", " ", text)
        # Add whitespace between a word character and trailing punctuation.
        text = re.sub(r"(\w)(\.|\,|\?|·|;)", r"\1 \2", text)
        # Delete chapter notation ("ch", "sec", optionally followed by a
        # whitespace character and/or a full stop).
        text = re.sub(r"(ch|sec)\s?\.?", "", text)
        # Collapse extra whitespace: first mixed whitespace runs (newlines,
        # tabs, ...), then whatever runs of plain spaces are left over.
        text = re.sub(r"\s{2,5}", " ", text)
        text = re.sub(r"  +", " ", text)
        # Collapse doubled full stops.
        text = re.sub(r"\s\.\s\.\s", " . ", text)
        # Drop a middle dot directly followed by a full stop. The dot is
        # escaped so that ordinary one-letter words after a middle dot are
        # kept, and a space is left behind so the neighbouring words are not
        # glued together.
        text = re.sub(r"\s·\s\.\s", " ", text)
        # Delete punctuation.
        text = re.sub(r"(\.|\?|\;|,|:|-|·)", "", text)
        cleaned.append(text)
    return cleaned

In [32]:
# Clean every tokenized sentence. NOTE: this rebinds `sentences`, so running
# the cell a second time cleans the already-cleaned text; re-run the
# tokenization cell first if the raw sentences are needed again.
sentences = clean_sentences(sentences)

### Generate sentences

In [34]:
class Homer_Iterator:
    """Re-iterable view over a collection of sentences.

    Each pass over the object yields, for every stored sentence, the result
    of ``utils.simple_preprocess`` applied to it. Because ``__iter__`` is a
    generator function, every new iteration starts again from the first
    sentence, which allows the corpus to be consumed more than once.
    """

    def __init__(self, sentences):
        # Keep a reference only; the sentences are preprocessed lazily.
        self.sentences = sentences

    def __iter__(self):
        yield from map(utils.simple_preprocess, self.sentences)
            
# Wrap the cleaned sentences so they can be iterated over repeatedly
# (each pass re-tokenizes the sentences on the fly).
corpus = Homer_Iterator(sentences)

In [35]:
# Initialize model
#model = FastText(size=100,alpha=0.001,min_count=1,negative=50)

In [170]:
# Build vocab
# model.build_vocab(sentences=corpus)

In [36]:
# Load a previously saved FastText model from disk; the cells that would
# initialise a fresh model and build its vocabulary are commented out.
model = FastText.load("../data/models/homer_FastText0402-alpha.model")

In [39]:
# train the model
# NOTE(review): `window`, `min_count`, `sg`, `negative` and `hs` are normally
# FastText *constructor* hyperparameters. `train()` probably just accepts them
# as extra keyword arguments without applying them, in which case the loaded
# model keeps the settings it was created with — TODO confirm against the
# gensim documentation and move them to the model construction if so.
model.train(sentences=corpus, # re-iterable corpus of token lists
            epochs=30,
            window=10, # context-window
            min_count=1, # also words that appears once can be interesting in the case of Homer
            sg=1, # 1 = Skip-Gram, 0 = CBOW
            negative=20,
            hs=1, # presumably switches on hierarchical softmax — TODO confirm
            workers=2,
            total_words = model.corpus_total_words
           )

In [40]:
# Persist the further-trained model under a new file name ("-beta"), so the
# "-alpha" model file that was loaded earlier is not overwritten.
model.save('../data/models/homer_FastText0402-beta.model')

In [41]:
# extract vectors
# One row per vocabulary word, in the model's internal index order.
vectors = model.wv.vectors
# Extract vocabulary
# Take the words in that same index order so that words[i] is guaranteed to
# label vectors[i]. Re-sorting the vocabulary by count only lines up with the
# rows of `vectors` as long as ties between equally frequent words happen to
# be broken the same way as inside gensim.
words = list(model.wv.index2word)

## Vector lookup

In [54]:
# Peek at a 500-character slice of the raw corpus string.
data[1500:2000]

'ιφι ανασσεις , Σμινθευ ει ποτε χαριεντʼ επι νηον ερεψα , η δη ποτε κατα πιονα μηριʼ εκηα ταυρων ηδʼ αιγων , μοι κρηηνον εελδωρ · τισειαν Δαναοι εμα δακρυα σοισι βελεσσιν . ως εφατʼ ευχομενος , εκλυε Φοιβος Απολλων , βη κατʼ Ουλυμποιο καρηνων χωομενος κηρ , τοξʼ ωμοισιν εχων αμφηρεφεα φαρετρην · εκλαγξαν αρʼ οϊστοι επʼ ωμων χωομενοιο , αυτου κινηθεντος · ηϊε νυκτι εοικως . εζετʼ επειτʼ απανευθε νεων , μετα ιον εηκε · δεινη κλαγγη γενετʼ αργυρεοιο βιοιο · ουρηας πρωτον επῳχετο κυνας αργους , επειτ'

In [55]:
# The 25 vocabulary entries closest to 'δυναται', as (word, similarity score) pairs.
model.wv.most_similar(positive='δυναται',topn=25)

[('μναται', 0.8304828405380249),
 ('ααται', 0.8053926825523376),
 ('πιλναται', 0.8048871159553528),
 ('δαμναται', 0.7976365685462952),
 ('εαται', 0.7803430557250977),
 ('νεαται', 0.7642230987548828),
 ('βεβληαται', 0.7562119364738464),
 ('κεαται', 0.7555913329124451),
 ('ραται', 0.7555294036865234),
 ('εληλαται', 0.7402574419975281),
 ('δεχαται', 0.7344808578491211),
 ('ειλυαται', 0.7307053208351135),
 ('ιρυαται', 0.7193729877471924),
 ('ερραδαται', 0.7192108631134033),
 ('επταται', 0.7165117263793945),
 ('καταιβαται', 0.716057300567627),
 ('επιπιλναται', 0.7151651382446289),
 ('κιδναται', 0.6993271112442017),
 ('πεφαται', 0.6983848810195923),
 ('δυνανται', 0.6903455257415771),
 ('νενιπται', 0.6828632354736328),
 ('μαρναται', 0.6813768744468689),
 ('ανιαται', 0.6811359524726868),
 ('ρωται', 0.6802849769592285),
 ('εεδνωται', 0.6797809600830078)]

## Decomposition and Normalization

In [56]:
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

In [57]:
# Fixed seed so that the 2-D embedding (and every plot/CSV derived from it)
# can be reproduced on a fresh kernel.
TSNE_SEED = 42

# Standardises the t-SNE output columns before plotting.
scaler = StandardScaler()
# 2-D t-SNE on cosine distances, initialised from a PCA projection.
tsne = TSNE(n_components=2, metric='cosine', init='pca', verbose=2, n_jobs=8,
            random_state=TSNE_SEED)

In [58]:
# reduce dimensionality
# Fit t-SNE on the word vectors and keep the resulting low-dimensional
# coordinates, one row per row of `vectors`.
word_tsne = tsne.fit_transform(vectors)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 30453 samples in 0.000s...
[t-SNE] Computed neighbors for 30453 samples in 20.019s...
[t-SNE] Computed conditional probabilities for sample 1000 / 30453
[t-SNE] Computed conditional probabilities for sample 2000 / 30453
[t-SNE] Computed conditional probabilities for sample 3000 / 30453
[t-SNE] Computed conditional probabilities for sample 4000 / 30453
[t-SNE] Computed conditional probabilities for sample 5000 / 30453
[t-SNE] Computed conditional probabilities for sample 6000 / 30453
[t-SNE] Computed conditional probabilities for sample 7000 / 30453
[t-SNE] Computed conditional probabilities for sample 8000 / 30453
[t-SNE] Computed conditional probabilities for sample 9000 / 30453
[t-SNE] Computed conditional probabilities for sample 10000 / 30453
[t-SNE] Computed conditional probabilities for sample 11000 / 30453
[t-SNE] Computed conditional probabilities for sample 12000 / 30453
[t-SNE] Computed conditional probabilities for sa

In [60]:
# scale to zero mean and unit variance
# NOTE: the scaled result is `words_tsne` (with an "s"); the unscaled t-SNE
# output it is computed from is `word_tsne`.
words_tsne = scaler.fit_transform(word_tsne)

## Build pandas DataFrame

In [61]:
import pandas as pd
import re

In [63]:
# Two-column frame ("x", "y") holding the scaled t-SNE coordinates.
df = pd.DataFrame(data=words_tsne, columns=["x", "y"])
# Earlier variant kept for reference; `vectors_svd_normalized` is not defined
# anywhere in the visible part of this notebook.
#df = pd.DataFrame(data=vectors_svd_normalized, columns=["x", "y"])

In [64]:
# Attach the vocabulary words to the coordinates. This is a purely positional
# assignment: `words` must be in the same order as the rows of `vectors`.
df['token'] = words

In [65]:
# Tokens that end like one of the suffixes below but must nevertheless be
# labelled 'non_medium'.
faslse_positives = ["που", "αυτου","και","ειναι","κατʼ"]
# At least one word character, then one of the listed endings (presumably
# middle / medio-passive verb endings — TODO confirm), then a word boundary.
# The pattern does not change between tokens, so it is defined once up front.
regex = r"\w+(μαι|σαι|σο|ται|το|μεθα|μεθʼ|σθε|σθʼ|σθ|νται|ντʼ|ντο|σθαι|μην|ου|ιο|σθω|σθων|μεν\w{1,3})\b"

# One label per token, in row order: 'medium' when the token is not a listed
# false positive and the pattern matches from its start, else 'non_medium'.
classe = [
    'medium' if tok not in faslse_positives and re.match(regex, tok) else 'non_medium'
    for tok in df['token']
]
        
        

In [66]:
# Store the per-token labels ('medium' / 'non_medium') next to the tokens.
df['label'] = classe

In [67]:
# Sanity check: show the first rows (coordinates, token, label).
df.head()

Unnamed: 0,x,y,token,label
0,-0.543308,0.286993,ως,non_medium
1,0.45697,0.144438,εν,non_medium
2,-0.309913,-0.498592,γαρ,non_medium
3,-0.575061,-0.264054,ου,non_medium
4,-0.591362,0.419302,δη,non_medium


In [68]:
# Export coordinates, tokens and labels to CSV. No `index=` argument is given,
# so pandas' default applies and the row index is written as well.
df.to_csv("../data/gensim_vectors/media_vectors_tsne.csv")

## Data Visualization

In [69]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark
output_notebook()

In [70]:
def draw_groups(data, radius=10, alpha=0.25,
                width=600, height=400, show=True,
                markers=['circle_x', 'triangle'],
                colorstyle=['#2ca02c', '#aec7e8'],
                name="plot.png",
                title="fastText w=15, negative=10"):
    """Draw an interactive scatter plot of the tokens, grouped by label.

    Parameters
    ----------
    data : table-like accepted by ``bokeh.models.ColumnDataSource``
        Must provide the columns ``x``, ``y``, ``label`` and ``token``.
    radius : int
        Marker size passed to the scatter glyph.
    alpha : float
        Marker opacity passed to the scatter glyph.
    width, height : int
        Figure dimensions.
    show : bool
        If true, display the figure after exporting it.
    markers : sequence of str
        Marker names for the labels 'medium' and 'non_medium', in that order.
        The default list is only read, never modified.
    colorstyle : sequence of str
        Colours for the labels 'medium' and 'non_medium', in that order.
        The default list is only read, never modified.
    name : str
        File name of the PNG export, written below ``../data/plots/``.
    title : str
        Figure title. Defaults to the title that used to be hard-coded.

    Returns
    -------
    The bokeh figure that was drawn.
    """
    # Fixed label order: markers[i] and colorstyle[i] belong to labels[i].
    labels = ['medium', 'non_medium']

    src = bm.ColumnDataSource(data)

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title=title)

    # `radius` and `alpha` are now honoured instead of being silently ignored
    # in favour of a hard-coded size and full opacity.
    fig.scatter('x', 'y',
                size=radius,
                alpha=alpha,
                legend_field="label",
                marker=factor_mark('label', markers, labels),
                color=factor_cmap('label', colorstyle, labels),
                source=src)

    # Show the token itself when hovering over a point.
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@token")]))
    # The PNG is written before (and regardless of) the interactive display.
    export_png(fig, filename="../data/plots/" + name)
    if show:
        pl.show(fig)
    return fig

In [71]:
# Plot all tokens with the default styling and export the figure as a PNG
# with the given file name.
draw_groups(data=df,name="fastText_0402_2.png")