# Word2Vec 

In [1]:
import pandas as pd 
import re
from argparse import Namespace
import gensim
from gensim import utils
from gensim.models import Word2Vec
from cltk.tokenize.sentence import TokenizeSentence
tokenizer = TokenizeSentence('greek')


##  Preprocessing

In [2]:
args = Namespace(
    raw_data = "../data/raw_data/HomerGesamt_deaccented.txt")

# load data
with open(args.raw_data, 'r', encoding='utf-8') as src:
    data = src.read()

In [3]:
# Extract sentences
sentences = tokenizer.tokenize(data)

### Prepare for Word2Vec

In [4]:
# generator

class HomerCorpus(object):
    def __init__(self, sentencesList):
        self.sentences = sentencesList
    
    def __iter__(self):
        for sentence in self.sentences:
            yield utils.simple_preprocess(sentence)

In [5]:
sentences = HomerCorpus(sentences)

### Initialize model

In [6]:
# print some log while training
from gensim.models.callbacks import CallbackAny2Vec
class EpochLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0
    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))
        
    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1

    def on_train_begin(self, model):
        print("Training start")
        
    def on_train_end(self, model):
        print("Training end")
        self.trEpoch += 1
        
logEpoch = EpochLogger()

In [7]:
# Once the model is initialised and trained the 1. time, we can load it, see below:

model = gensim.models.word2vec.Word2Vec(sentences=sentences, # corpus
                               window=5, # context-window
                               min_count=1, # also words that appears once can be interesting in the case of Homer
                               sg=1, # 1 = Skip-Gram, 0 = CBOW
                               negative=20,
                               hs=1, # use softmax
                               workers=2,
                               callbacks=[logEpoch]
                              )

In [9]:
model = gensim.models.word2vec.Word2Vec.load('../data/models/w2vec-homer-0402-skipgram-softmax.model')

In [10]:
# train model
model.train(
    sentences=sentences,
    epochs=10,
    total_words=10000,
    word_count=0,
    queue_factor=2,
    report_delay=1.0
)

Training start
Epoch #220 start
Epoch #220 end
Epoch #221 start
Epoch #221 end
Epoch #222 start
Epoch #222 end
Epoch #223 start
Epoch #223 end
Epoch #224 start
Epoch #224 end
Epoch #225 start
Epoch #225 end
Epoch #226 start
Epoch #226 end
Epoch #227 start
Epoch #227 end
Epoch #228 start
Epoch #228 end
Epoch #229 start
Epoch #229 end
Training end


(3822434, 3977140)

In [11]:
model.save('../data/models/w2vec-homer-0402-skipgram-softmax.model')

In [12]:
model.wv.most_similar('ευχεται',topn=25)

[('πυγμῃ', 0.5719588994979858),
 ('νικησαντʼ', 0.5564693212509155),
 ('ευχομαι', 0.5128200650215149),
 ('αμεινων', 0.5083506107330322),
 ('ταφιοισι', 0.5026240348815918),
 ('ειναι', 0.49997133016586304),
 ('παγχαλκεος', 0.4906706213951111),
 ('λυκιῃ', 0.48938143253326416),
 ('αριστος', 0.48750296235084534),
 ('ῃδη', 0.4853050410747528),
 ('εντης', 0.4837966561317444),
 ('αγχισῃ', 0.47767677903175354),
 ('ιμι', 0.4760376811027527),
 ('σεο', 0.4735616147518158),
 ('φησι', 0.4681958556175232),
 ('εκγεγαμεν', 0.46025270223617554),
 ('ολωλε', 0.45778679847717285),
 ('ινειας', 0.45111557841300964),
 ('ευπηγης', 0.4477945864200592),
 ('εβληται', 0.4471311867237091),
 ('μναται', 0.4443034827709198),
 ('νικησειʼ', 0.44404086470603943),
 ('αγχισαο', 0.4439655840396881),
 ('αχαιων', 0.4422879219055176),
 ('εσσεσθʼ', 0.4402849078178406)]

In [13]:
# extract vectors
vectors = model.wv.vectors    


# Extract vocabulary
words = sorted(model.wv.vocab.keys(),
              key = lambda w : model.wv.vocab[w].count, 
              reverse = True)


In [14]:
model.predict_output_word(["εριδι"],topn=7)

[('εριδι', 0.99999964),
 ('ξυνεηκε', 1.3415844e-07),
 ('λητους', 4.627762e-08),
 ('θεους', 3.9049986e-08),
 ('ξυνελαυνεις', 1.33630556e-08),
 ('σφωε', 8.3357135e-09),
 ('θυμοβορῳ', 6.343769e-09)]

## SVD decomposition

In [15]:
# import scaler and dimensionality reducer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
scaler = StandardScaler()

## TSNE decomposition

In [44]:
vectors_tsne = TSNE(n_components=2,
                    perplexity = 5,
                    learning_rate = 300.0,
                    metric='cosine',
                    n_iter=500,
                    verbose=3,
                    n_jobs=6).fit_transform(vectors)

[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 30453 samples in 0.000s...
[t-SNE] Computed neighbors for 30453 samples in 19.810s...
[t-SNE] Computed conditional probabilities for sample 1000 / 30453
[t-SNE] Computed conditional probabilities for sample 2000 / 30453
[t-SNE] Computed conditional probabilities for sample 3000 / 30453
[t-SNE] Computed conditional probabilities for sample 4000 / 30453
[t-SNE] Computed conditional probabilities for sample 5000 / 30453
[t-SNE] Computed conditional probabilities for sample 6000 / 30453
[t-SNE] Computed conditional probabilities for sample 7000 / 30453
[t-SNE] Computed conditional probabilities for sample 8000 / 30453
[t-SNE] Computed conditional probabilities for sample 9000 / 30453
[t-SNE] Computed conditional probabilities for sample 10000 / 30453
[t-SNE] Computed conditional probabilities for sample 11000 / 30453
[t-SNE] Computed conditional probabilities for sample 12000 / 30453
[t-SNE] Computed conditional probabilities for sa

In [45]:
vectors_tsne = scaler.fit_transform(vectors_tsne)

## Build Dataframe

In [46]:
# TSNE version
df = pd.DataFrame(data=vectors_tsne, columns=["x", "y"])
# SVD version
#df = pd.DataFrame(data=vectors_svd_normalized, columns=["x", "y"])

In [47]:
# add tokens
df['token'] = words

In [48]:
classe = []
faslse_positives = ["που", "αυτου","και","ειναι","κατʼ"]
for tok in df['token']:
    if tok in faslse_positives:
        classe.append('non_medium')
        continue
    regex = r"\w+(μαι|σαι|σο|ται|το|μεθα|μεθʼ|σθε|σθʼ|σθ|νται|ντʼ|ντο|σθαι|μην|ου|ιο|σθω|σθων|μεν\w{1,3})\b"
    match = re.match(regex, tok)
    if match:
        classe.append('medium')
    else:
        classe.append('non_medium')

In [49]:
# add labels
df['label'] = classe

In [50]:
df.head()

Unnamed: 0,x,y,token,label
0,0.581939,0.756496,ως,non_medium
1,0.841819,0.870704,εν,non_medium
2,1.435843,-0.245549,γαρ,non_medium
3,1.433206,-0.245737,ου,non_medium
4,0.871191,-0.588509,δη,non_medium


In [51]:
df.to_csv("../data/gensim_vectors/media_vectors_tsne.csv")
# df.to_csv("../data/gensim_vectors/media_vectors_svd.csv")

## Visualization

In [52]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark
output_notebook()

In [55]:
def draw_groups(data, radius=10, alpha=0.25,
                 width=600, height=400, show=True, markers=['circle_x', 'triangle'], colorstyle=['#2ca02c', '#aec7e8'], name="plot.png"):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    MARKERS = markers
    LABEL = ['medium', 'non_medium']
    
    src = bm.ColumnDataSource(data)
    
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Skip Gram w=15, negative=10, softmax")
    
    fig.scatter('x','y',
                size=3,
                legend_field="label",
                marker=factor_mark('label', MARKERS, LABEL),
                color=factor_cmap('label', colorstyle, LABEL),
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@token")]))
    export_png(fig, filename="../data/plots/"+name)
    if show: pl.show(fig)
    return fig

In [56]:
draw_groups(data=df,name="fastText_0402_2034_2.png")