# Word2Vec 

In [1]:
from argparse import Namespace
from greek_accentuation.characters import strip_accents, strip_breathing
import gensim
from gensim import utils
import pandas as pd 
import re
from gensim.models import Word2Vec
from cltk.tokenize.word import WordTokenizer
from cltk.tokenize.sentence import TokenizeSentence
word_tokenizer = WordTokenizer('greek')


##  Preprocessing

In [2]:
args = Namespace(
    raw_data = "../data/raw_data/HomerGesamt_cleaned.txt",
    stopwd_path = "../data/stopwords.txt"
)

# load data
with open(args.raw_data, 'r', encoding='utf-8') as src:
    data = src.read()

In [3]:
# load stopwords
with open(args.stopwd_path, 'r', encoding="utf-8") as src:
    stopwords = src.read()
    
stopwords = stopwords.split()

In [4]:
# delete stopwords (I know, a kind of gross and clumsy solution)
tokens_raw = data.split()
tokens_raw_cleaned = [w for w in tokens_raw if w not in stopwords]
data = " ".join(w for w in tokens_raw_cleaned)

In [5]:
# remove accents and breathings
data = strip_accents(strip_breathing(data))

In [6]:
# save (I will need this cleaned file for sure later on)
with open("../data/raw_data/HomerGesamt_deaccented.txt", 'w', encoding="utf-8") as fp:
    fp.write(data)

In [7]:
# Extract sentences
sentences = TokenizeSentence('greek').tokenize(data)

### Prepare for Word2Vec

In [8]:
# generator

class HomerCorpus(object):
    def __init__(self, sentencesList):
        self.sentences = sentencesList
    
    def __iter__(self):
        for sentence in self.sentences:
            yield utils.simple_preprocess(sentence)

In [9]:
sentences = HomerCorpus(sentences)

### Initialize model

In [10]:
model = gensim.models.word2vec.Word2Vec(sentences=sentences, # corpus
                               window=10, # context-window
                               min_count=1, # also words that appears once can be interesting in the case of Homer
                               sg=0, # 1 = Skip-Gram, 0 = CBOW
                               alpha = 0.001                            
                              )

In [52]:
# train model
model.train(
    sentences=sentences,
    epochs=155,
    total_words=10000,
    start_alpha=0.01,
    end_alpha=0.06,
    word_count=0,
    queue_factor=2,
    report_delay=1.0,
    compute_loss=False
)

(1550000, 61645670)

In [53]:
model.save('../data/models/w2vec-homer-0402.model')

In [18]:
# quick look at the size of the vocab
model.wv.vectors.shape

(30453, 100)

In [19]:
data[:150]

'μηνιν αειδε θεα Πηληϊαδεω Αχιληος ουλομενην μυριʼ Αχαιοις αλγεʼ εθηκε πολλας ιφθιμους ψυχας Αϊδι προϊαψεν ηρωων αυτους ελωρια τευχε κυνεσσιν οιωνοισι '

In [51]:
model.wv.most_similar('εθηκε')

[('κρονου', 0.6951189041137695),
 ('αχαιοις', 0.672841489315033),
 ('ιφθιμους', 0.6721858978271484),
 ('πολλας', 0.6702628135681152),
 ('εφηνε', 0.6683825850486755),
 ('εσταοτες', 0.6348026394844055),
 ('ψυχας', 0.6307708024978638),
 ('ουλομενην', 0.629317581653595),
 ('αϊδι', 0.6168668270111084),
 ('θηκεν', 0.6061471700668335)]

In [54]:
model.wv.most_similar('εθηκε')

[('κρονου', 0.693800151348114),
 ('εσταοτες', 0.6759641170501709),
 ('πολλας', 0.6672139167785645),
 ('αχαιοις', 0.6632511019706726),
 ('ιφθιμους', 0.6569918990135193),
 ('εφηνε', 0.6417087316513062),
 ('ουλομενην', 0.62326979637146),
 ('θηκεν', 0.6109983921051025),
 ('ψυχας', 0.6089295148849487),
 ('αγκυλομητεω', 0.5999419689178467)]

In [55]:
vectors = model.wv.vectors    

## SVD decomposition

In [18]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=2, n_iter=10)

In [20]:
vectors = svd.fit_transform(vectors)

In [21]:
print(svd.explained_variance_ratio_)

[0.36827844 0.02314621]


In [22]:
print(svd.singular_values_)

[254.14343   58.650124]


### Scaler

In [56]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [57]:
vectors_normalized = scaler.fit_transform(vectors)

## Visualization

In [84]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark
output_notebook()

In [26]:
# Draw function
def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """ draws an interactive plot for data points with auxilirary info on hover """
    if isinstance(color, str): color = [color] * len(x)
    data_source = bm.ColumnDataSource({ 'x' : x, 'y' : y, 'color': color, **kwargs })

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs.keys()]))
    if show: pl.show(fig)
    return fig

In [159]:
# Extract vocabulary
words = sorted(model.wv.vocab.keys(),
              key = lambda w : model.wv.vocab[w].count, 
              reverse = True)

In [28]:
draw_vectors(vectors_normalized[:, 0], vectors_normalized[:, 1], token=words)

# Visualization with TSNE

In [60]:
from sklearn.manifold import TSNE
word_tsne = TSNE(n_components=2).fit_transform(vectors)
words_tsne_scaled = scaler.fit_transform(word_tsne)

In [61]:
draw_vectors(words_tsne_scaled[:, 0], words_tsne_scaled[:, 1], token=words)

## Build DataFrame

*for later Analysis*

In [62]:
# build dataframe from vectors
df = pd.DataFrame(data=words_tsne_scaled, columns=["x", "y"])
df['tokens'] = words

In [63]:
# check endings
label = []
for tok in df['tokens']:
    regex = r"\w+(μαι|αι|ται|τʼ|το|μεθα|μεθʼ|σθε|σθʼ|σθ|νται|ντʼ|ντο|σθαι|μην|σο|ου|ιο|σθω|σθων|μεν\w{1,3})\b"
    match = re.match(regex, tok)
    if match:
        label.append('medium')
    else:
        label.append('non_medium')

In [64]:
df['label'] = label

In [110]:
df.head(15)

Unnamed: 0,x,y,tokens,label
0,2.033425,-0.54543,ως,non_medium
1,1.928565,-1.080328,εν,non_medium
2,0.199721,-2.188579,γαρ,non_medium
3,0.680102,-1.566368,ου,non_medium
4,1.823148,-1.16384,δη,non_medium
5,1.447033,-1.737826,επι,non_medium
6,1.600468,-0.485331,μοι,non_medium
7,1.628549,-2.059772,επει,non_medium
8,1.929719,-1.11588,μιν,non_medium
9,1.012881,-2.212163,γε,non_medium


In [66]:
df.to_csv('../data/media_w2vec.csv')

In [182]:
# Draw function
def draw_groups(data, radius=10, alpha=0.25,
                 width=600, height=400, show=True, markers=['circle_x', 'triangle'], colorstyle=['#2ca02c', '#aec7e8'], **kwargs):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    MARKERS = markers
    LABEL = ['medium', 'non_medium']
    
    src = bm.ColumnDataSource(data)
    
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    
    fig.scatter('x','y',
                legend_field="label",
                marker=factor_mark('label', MARKERS, LABEL),
                color=factor_cmap('label', colorstyle, LABEL),
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@tokens")]))
    
    if show: pl.show(fig)
    return fig

In [183]:
draw_groups(data=df, colorstyle=['#d62728', '#aec7e8'], token=words)