# Word2Vec 

In [196]:
import pandas as pd 
import re
import gensim
from gensim import utils
from gensim.models import Word2Vec
from cltk.tokenize.sentence import TokenizeSentence
tokenizer = TokenizeSentence('greek')


##  Preprocessing

In [197]:
args = Namespace(
    raw_data = "../data/raw_data/HomerGesamt_deaccented.txt")

# load data
with open(args.raw_data, 'r', encoding='utf-8') as src:
    data = src.read()

In [198]:
# Extract sentences
sentences = tokenizer.tokenize(data)

### Prepare for Word2Vec

In [202]:
# generator

class HomerCorpus(object):
    def __init__(self, sentencesList):
        self.sentences = sentencesList
    
    def __iter__(self):
        for sentence in self.sentences:
            yield utils.simple_preprocess(sentence)

In [203]:
sentences = HomerCorpus(sentences)

### Initialize model

In [225]:
model = gensim.models.word2vec.Word2Vec(sentences=sentences, # corpus
                               window=10, # context-window
                               min_count=1, # also words that appears once can be interesting in the case of Homer
                               sg=1, # 1 = Skip-Gram, 0 = CBOW
                               alpha = 0.001,
                               negative=20,
                                hs=1, # use softmax
                                workers=2                                
                              )

In [226]:
# train model
model.train(
    sentences=sentences,
    epochs=155,
    total_words=10000,
    start_alpha=0.01,
    end_alpha=0.003,
    word_count=0,
    queue_factor=2,
    report_delay=1.0
)

(59256120, 61645670)

In [227]:
model.save('../data/models/w2vec-homer-0402-skipgram-softmax.model')

In [228]:
model.wv.most_similar('εθηκε')

[('αποφθιμενον', 0.5828766226768494),
 ('εμπτῳ', 0.5780500173568726),
 ('κοσμητορι', 0.5486245155334473),
 ('νημιδας', 0.547937273979187),
 ('αυδηεντα', 0.5406896471977234),
 ('επισφυριοις', 0.5106483101844788),
 ('ελαφρον', 0.5101377367973328),
 ('ποιητην', 0.5083999633789062),
 ('κνημῃσιν', 0.5057922601699829),
 ('λοισθηϊʼ', 0.5036720037460327)]

In [229]:
# extract vectors
vectors = model.wv.vectors    


# Extract vocabulary
words = sorted(model.wv.vocab.keys(),
              key = lambda w : model.wv.vocab[w].count, 
              reverse = True)


## SVD decomposition

In [230]:
# import scaler and dimensionality reducer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
scaler = StandardScaler()

In [None]:
svd = TruncatedSVD(n_components=2, n_iter=10)
vectors = svd.fit_transform(vectors)

In [None]:
vectors_normalized = scaler.fit_transform(vectors)

## TSNE decomposition

In [231]:
vectors_tsne = scaler.fit_transform(vectors)
vectors_tsne = TSNE(n_components=2, metric='cosine', init='pca', verbose=2,n_jobs=8).fit_transform(vectors_tsne)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 30453 samples in 0.000s...
[t-SNE] Computed neighbors for 30453 samples in 19.234s...
[t-SNE] Computed conditional probabilities for sample 1000 / 30453
[t-SNE] Computed conditional probabilities for sample 2000 / 30453
[t-SNE] Computed conditional probabilities for sample 3000 / 30453
[t-SNE] Computed conditional probabilities for sample 4000 / 30453
[t-SNE] Computed conditional probabilities for sample 5000 / 30453
[t-SNE] Computed conditional probabilities for sample 6000 / 30453
[t-SNE] Computed conditional probabilities for sample 7000 / 30453
[t-SNE] Computed conditional probabilities for sample 8000 / 30453
[t-SNE] Computed conditional probabilities for sample 9000 / 30453
[t-SNE] Computed conditional probabilities for sample 10000 / 30453
[t-SNE] Computed conditional probabilities for sample 11000 / 30453
[t-SNE] Computed conditional probabilities for sample 12000 / 30453
[t-SNE] Computed conditional probabilities for sa

In [233]:
vectors_tsne = scaler.fit_transform(vectors_tsne)

## Build Dataframe

In [234]:
# TSNE version
df = pd.DataFrame(data=words_tsne_scaled, columns=["x", "y"])
# SVD version
#df = pd.DataFrame(data=vectors_svd_normalized, columns=["x", "y"])

In [235]:
# add tokens
df['token'] = words

In [236]:
classe = []
faslse_positives = ["που", "αυτου","και","ειναι","κατʼ"]
for tok in df['token']:
    if tok in faslse_positives:
        classe.append('non_medium')
        continue
    regex = r"\w+(μαι|σαι|σο|ται|το|μεθα|μεθʼ|σθε|σθʼ|σθ|νται|ντʼ|ντο|σθαι|μην|ου|ιο|σθω|σθων|μεν\w{1,3})\b"
    match = re.match(regex, tok)
    if match:
        classe.append('medium')
    else:
        classe.append('non_medium')

In [237]:
# add labels
df['label'] = classe

In [238]:
df.head()

Unnamed: 0,x,y,token,label
0,2.033425,-0.54543,ως,non_medium
1,1.928565,-1.080328,εν,non_medium
2,0.199721,-2.188579,γαρ,non_medium
3,0.680102,-1.566368,ου,non_medium
4,1.823148,-1.16384,δη,non_medium


In [239]:
df.to_csv("../data/gensim_vectors/media_vectors_tsne.csv")
# df.to_csv("../data/gensim_vectors/media_vectors_svd.csv")

## Visualization

In [240]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark
output_notebook()

In [241]:
def draw_groups(data, radius=10, alpha=0.25,
                 width=600, height=400, show=True, markers=['circle_x', 'triangle'], colorstyle=['#2ca02c', '#aec7e8'], name="plot.png"):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    MARKERS = markers
    LABEL = ['medium', 'non_medium']
    
    src = bm.ColumnDataSource(data)
    
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Skip Gram w=15, negative=10, softmax")
    
    fig.scatter('x','y',
                size=7,
                legend_field="label",
                marker=factor_mark('label', MARKERS, LABEL),
                color=factor_cmap('label', colorstyle, LABEL),
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@token")]))
    export_png(fig, filename="../data/plots/"+name)
    if show: pl.show(fig)
    return fig

In [242]:
draw_groups(data=df,name="fastText_0402_1434.png")