# Word2Vec 

In [1]:
from argparse import Namespace
from greek_accentuation.characters import strip_accents, strip_breathing
import gensim
from gensim import utils
from gensim.models import Word2Vec
from cltk.tokenize.word import WordTokenizer
from cltk.tokenize.sentence import TokenizeSentence
word_tokenizer = WordTokenizer('greek')


##  Preprocessing

In [2]:
args = Namespace(
    raw_data = "../data/raw_data/HomerGesamt_cleaned.txt",
    stopwd_path = "../data/stopwords.txt"
)

# load data
with open(args.raw_data, 'r', encoding='utf-8') as src:
    data = src.read()

In [3]:
# load stopwords
with open(args.stopwd_path, 'r', encoding="utf-8") as src:
    stopwords = src.read()
    
stopwords = stopwords.split()

In [4]:
# delete stopwords (I know, a kind of gross and clumsy solution)
tokens_raw = data.split()
tokens_raw_cleaned = [w for w in tokens_raw if w not in stopwords]
data = " ".join(w for w in tokens_raw_cleaned)

In [5]:
# remove accents and breathings
data = strip_accents(strip_breathing(data))

In [31]:
# save (I will need this cleaned file for sure later on)
with open("../data/raw_data/HomerGesamt_deaccented.txt", 'w', encoding="utf-8") as fp:
    fp.write(data)

In [6]:
# Extract sentences
sentences = TokenizeSentence('greek').tokenize(data)

### Prepare for Word2Vec

In [7]:
# generator

class HomerCorpus(object):
    def __init__(self, sentencesList):
        self.sentences = sentencesList
    
    def __iter__(self):
        for sentence in self.sentences:
            yield utils.simple_preprocess(sentence)

In [8]:
sentences = HomerCorpus(sentences)

### Initialize model

In [9]:
model = gensim.models.word2vec.Word2Vec(sentences=sentences, # corpus
                               window=10, # context-window
                               min_count=1, # also words that appears once can be interesting in the case of Homer
                               sg=1, # 1 = Skip-Gram, 0 = CBOW
                               alpha = 0.001
                              )

In [10]:
# train model
model.train(
    sentences=sentences,
    epochs=125,
    total_words=10000,
    start_alpha=0.001,
    end_alpha=0.05,
    word_count=0,
    queue_factor=2,
    report_delay=1.0,
    compute_loss=True
)

(1250000, 49714250)

In [11]:
model.save('../data/models/gensim-homer03022001.model')

In [12]:
# quick look at the size of the vocab
model.wv.vectors.shape

(30453, 100)

In [13]:
data[:150]

'μηνιν αειδε θεα Πηληϊαδεω Αχιληος ουλομενην μυριʼ Αχαιοις αλγεʼ εθηκε πολλας ιφθιμους ψυχας Αϊδι προϊαψεν ηρωων αυτους ελωρια τευχε κυνεσσιν οιωνοισι '

In [17]:
model.most_similar('εθηκε')

  """Entry point for launching an IPython kernel.


[('αποφθιμενον', 0.7316854000091553),
 ('αχαιοις', 0.7166543006896973),
 ('θηκεν', 0.7134474515914917),
 ('πολλας', 0.7082372307777405),
 ('ιφθιμους', 0.7053426504135132),
 ('θαυμαζομεν', 0.7000839710235596),
 ('επισφυριοις', 0.6980096697807312),
 ('αραρυιας', 0.6973046064376831),
 ('καλας', 0.6957942247390747),
 ('αειδε', 0.6932476162910461)]

## SVD Dimensionality reduction

In [18]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=2, n_iter=10)

In [19]:
vectors = model.wv.vectors    

In [20]:
vectors = svd.fit_transform(vectors)

In [21]:
print(svd.explained_variance_ratio_)

[0.36827844 0.02314621]


In [22]:
print(svd.singular_values_)

[254.14343   58.650124]


### Scaler

In [23]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [24]:
vectors_normalized = scaler.fit_transform(vectors)

## Visualization

In [25]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
output_notebook()

In [26]:
# Draw function
def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """ draws an interactive plot for data points with auxilirary info on hover """
    if isinstance(color, str): color = [color] * len(x)
    data_source = bm.ColumnDataSource({ 'x' : x, 'y' : y, 'color': color, **kwargs })

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs.keys()]))
    if show: pl.show(fig)
    return fig

In [27]:
# Extract vocabulary
words = sorted(model.wv.vocab.keys(),
              key = lambda w : model.wv.vocab[w].count, 
              reverse = True)

In [28]:
draw_vectors(vectors_normalized[:, 0], vectors_normalized[:, 1], token=words)

# Visualization with TSNE

In [29]:
from sklearn.manifold import TSNE
word_tsne = TSNE(n_components=2).fit_transform(vectors)
words_tsne_scaled = scaler.fit_transform(word_tsne)

In [30]:
draw_vectors(words_tsne_scaled[:, 0], words_tsne_scaled[:, 1], token=words)