# Word2Vec 

In [68]:
from argparse import Namespace
from greek_accentuation.characters import strip_accents, strip_breathing
import gensim
from gensim import utils
from gensim.models import Word2Vec
from cltk.tokenize.word import WordTokenizer
from cltk.tokenize.sentence import TokenizeSentence
word_tokenizer = WordTokenizer('greek')


In [55]:
args = Namespace(
    raw_data = "../data/raw_data/HomerGesamt_cleaned.txt",
    stopwd_path = "../data/stopwords.txt"
)

# load data
with open(args.raw_data, 'r', encoding='utf-8') as src:
    data = src.read()

In [56]:
# load stopwords
with open(args.stopwd_path, 'r', encoding="utf-8") as src:
    stopwords = src.read()
    
stopwords = stopwords.split()

In [57]:
# delete stopwords (I know, a kind of gross and clumsy solution)
tokens_raw = data.split()
tokens_raw_cleaned = [w for w in tokens_raw if w not in stopwords]
data = " ".join(w for w in tokens_raw_cleaned)

In [65]:
# remove accents and breathings
data = strip_accents(strip_breathing(data))

In [69]:
# Extract sentences
sentences = TokenizeSentence('greek').tokenize(data)

In [70]:
# generator

class HomerCorpus(object):
    def __init__(self, sentencesList):
        self.sentences = sentencesList
    
    def __iter__(self):
        for sentence in self.sentences:
            yield utils.simple_preprocess(sentence)

In [71]:
sentences = HomerCorpus(sentences)

In [72]:
model = gensim.models.word2vec.Word2Vec(sentences=sentences, # corpus
                               window=10, # context-window
                               min_count=1, # also words that appears once can be interesting in the case of Homer
                               sg=1, # 1 = Skip-Gram, 0 = CBOW
                               alpha = 0.001
                              )

In [79]:
model.train(
    sentences=sentences,
    epochs=125,
    total_words=10000,
    start_alpha=0.001,
    end_alpha=0.05,
    word_count=0,
    queue_factor=2,
    report_delay=1.0,
    compute_loss=True
)

(1250000, 49714250)

In [83]:
model.save('../data/models/gensim-homer03022001.model')

In [82]:
# quick look at the size of the vocab
model.wv.vectors.shape

(30453, 100)

In [84]:
data[:150]

'μηνιν αειδε θεα Πηληϊαδεω Αχιληος ουλομενην μυριʼ Αχαιοις αλγεʼ εθηκε πολλας ιφθιμους ψυχας Αϊδι προϊαψεν ηρωων αυτους ελωρια τευχε κυνεσσιν οιωνοισι '

In [85]:
model.most_similar('ουλομενην')

  """Entry point for launching an IPython kernel.


[('αχαιοις', 0.9820338487625122),
 ('αειδε', 0.9775828719139099),
 ('πολλας', 0.9637151956558228),
 ('ιφθιμους', 0.9418951272964478),
 ('ψυχας', 0.9213254451751709),
 ('αϊδι', 0.8919933438301086),
 ('πηληϊαδεω', 0.8798472881317139),
 ('αχιληος', 0.871590793132782),
 ('προϊαψεν', 0.8544961214065552),
 ('ηρωων', 0.8381833434104919)]

## SVD Dimensionality reduction

In [89]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=2, n_iter=10)

In [90]:
vectors = model.wv.vectors    

In [91]:
vectors = svd.fit_transform(vectors)

In [92]:
print(svd.explained_variance_ratio_)

[0.33798635 0.0197518 ]


In [93]:
print(svd.singular_values_)

[260.41373   57.982033]


### Scaler

In [94]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [95]:
vectors_normalized = scaler.fit_transform(vectors)

## Visualization

In [96]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
output_notebook()

In [97]:
# Draw function
def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """ draws an interactive plot for data points with auxilirary info on hover """
    if isinstance(color, str): color = [color] * len(x)
    data_source = bm.ColumnDataSource({ 'x' : x, 'y' : y, 'color': color, **kwargs })

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs.keys()]))
    if show: pl.show(fig)
    return fig

In [98]:
# Extract vocabulary
words = sorted(model.wv.vocab.keys(),
              key = lambda w : model.wv.vocab[w].count, 
              reverse = True)

In [99]:
draw_vectors(vectors_normalized[:, 0], vectors_normalized[:, 1], token=words)

# Visualization with TSNE

In [None]:
from sklearn.manifold import TSNE
word_tsne = TSNE(n_components=2).fit_transform(vectors)
words_tsne_scaled = scaler.fit_transform(word_tsne)

In [None]:
draw_vectors(words_tsne_scaled[:, 0], words_tsne_scaled[:, 1], token=words)