In [None]:
## Installing fasttext package if needed
!pip -q install -U fasttext seaborn gensim scikit-learn numpy pandas

#https://radimrehurek.com/gensim/models/fasttext.html
#https://colab.research.google.com/drive/1_jZOV8G-zr11aHYcgV9IFpCgObA4Br1E#scrollTo=62Ui7K--RZry
#https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

In [None]:
# -*- coding: utf-8 -*-

In [1]:
import os
import sys
import re
import pickle
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import gensim
from gensim.models import KeyedVectors
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec
#from gensim.scripts.glove2word2vec import glove2word2vec
#from gensim.test.utils import datapath, get_tmpfile

#import fasttext
#import fasttext.util

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import parser

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def timing(f):
    @wraps(f)
    def wrap(*args, **kw):
        ts = time()
        result = f(*args, **kw)
        te = time()
        print('func:{} took: {:2.4f} sec'.format(f.__name__, te-ts))
        return result
    return wrap

### Retrieving pickle

In [3]:
df_files = pd.read_pickle('../outputs/df_files.pkl')

In [4]:
folder = '/home/jovyan/shared/C_amc_141/R_amc_3.1_12921/203_vert_spacy_rftt/'

In [5]:
responses = []
for filepath in df_files[0:25].path:
    path = os.path.join(folder, filepath)
    response = parser.parse(path)
    response = list(response)
    responses.extend(response)
print(len(responses))

5706


In [6]:
# Defining values for parameters
embedding_size = 100
window_size = 5
min_word = 2
down_sampling = 1e-2


model_ft = FastText(responses,
                    size=embedding_size,
                    window=window_size,
                    min_count=min_word,
                    sample=down_sampling,
                    workers = -1,
                    sg=1,
                    iter=100)

model_wv = Word2Vec(sentences=responses, 
                    size=embedding_size, 
                    window=window_size, 
                    min_count=min_word,
                    sample=down_sampling,
                    workers=-1, 
                    sg=0)

In [7]:
word = "Politiker"

In [8]:
model_ft.wv.most_similar(word, topn=10)

[('Politikern', 0.8484600186347961),
 ('Politikers', 0.840560793876648),
 ('Politikerin', 0.7620358467102051),
 ('politiker', 0.7289317846298218),
 ('Politiken', 0.6909029483795166),
 ('Politik', 0.6865997910499573),
 ('Politika', 0.6759346723556519),
 ('Politi', 0.6442322731018066),
 ('politikers', 0.618487536907196),
 ('Politikerinnen', 0.6086919903755188)]

In [9]:
model_wv.wv.most_similar(word, topn=10)

[('Bravo', 0.37358352541923523),
 ('kommentarlos', 0.3728218972682953),
 ('Arena', 0.3688957691192627),
 ('Gilligan', 0.36179256439208984),
 ('Meyer', 0.35840052366256714),
 ('prominente', 0.3579070568084717),
 ('ÖHV', 0.3571491539478302),
 ('entscheidende', 0.3562243580818176),
 ('bezichtigten', 0.3510328233242035),
 ('Berühmtheiten', 0.3474709987640381)]

In [10]:
responses = []
for filepath in df_files[25:50].path:
    path = os.path.join(folder, filepath)
    response = parser.parse(path)
    response = list(response)
    responses.extend(response)
    
print(len(responses))

6661


In [12]:
model_ft.build_vocab(responses, update=True)  # Update the vocabulary
model_ft.train(responses, total_examples=len(responses), epochs=model_ft.epochs)

model_wv.build_vocab(responses, update=True)  # Update the vocabulary
model_wv.train(responses, total_examples=len(responses), epochs=model_wv.epochs)

(0, 0)

In [13]:
model_ft.wv.most_similar(word, topn=10)

[('Politikern', 0.8484600186347961),
 ('Politikers', 0.840560793876648),
 ('Politikerin', 0.7620358467102051),
 ('politiker', 0.7289317846298218),
 ('Politiken', 0.6909029483795166),
 ('Politik', 0.6865997910499573),
 ('Politika', 0.6759346723556519),
 ('litiker', 0.6655194759368896),
 ('Politi', 0.6442322731018066),
 ('Exilpolitiker', 0.6252126693725586)]

In [14]:
model_wv.wv.most_similar(word, topn=10)

[('Warenhaus', 0.43683958053588867),
 ('Peiritsch', 0.4329608082771301),
 ('2150', 0.3990057706832886),
 ('Bravo', 0.37358352541923523),
 ('kommentarlos', 0.3728218972682953),
 ('Arena', 0.3688957691192627),
 ('Gilligan', 0.36179256439208984),
 ('Meyer', 0.35840052366256714),
 ('prominente', 0.3579070568084717),
 ('ÖHV', 0.3571491539478302)]

In [16]:
responses = []
for filepath in df_files[50:75].path:
    path = os.path.join(folder, filepath)
    response = parser.parse(path)
    response = list(response)
    responses.extend(response)

print(len(responses))

6269


In [17]:
model_ft.build_vocab(responses, update=True)  # Update the vocabulary
model_ft.train(responses, total_examples=len(responses), epochs=model_ft.epochs)

model_wv.build_vocab(responses, update=True)  # Update the vocabulary
model_wv.train(responses, total_examples=len(responses), epochs=model_wv.epochs)

(0, 0)

In [18]:
model_ft.wv.most_similar(word, topn=10)

[('Politikern', 0.8484600186347961),
 ('Politikers', 0.840560793876648),
 ('Politikerin', 0.7620358467102051),
 ('politiker', 0.7289317846298218),
 ('Politiken', 0.6909029483795166),
 ('Politik', 0.6865997910499573),
 ('Politika', 0.6759346723556519),
 ('litiker', 0.6655194759368896),
 ('Berufspolitiker', 0.65301114320755),
 ('Politi', 0.6442322731018066)]

In [19]:
model_wv.wv.most_similar(word, topn=10)

[('Warenhaus', 0.43683958053588867),
 ('Peiritsch', 0.4329608082771301),
 ('2150', 0.3990057706832886),
 ('überdecken', 0.3787200450897217),
 ('Bravo', 0.37358352541923523),
 ('kommentarlos', 0.3728218972682953),
 ('Arena', 0.3688957691192627),
 ('Gilligan', 0.36179256439208984),
 ('Meyer', 0.35840052366256714),
 ('prominente', 0.3579070568084717)]

In [20]:
def analogy(model, worda, wordb, wordc):
    result = model.wv.most_similar(negative=[worda], positive=[wordb, wordc])
    return result[0][0]

In [21]:
analogy(model_ft, 'Mann','Politiker','Frau')

'litiker'

In [22]:
analogy(model_wv, 'Mann','Politiker','Frau')

'hübsch'

In [None]:
# Save gensim models
model_ft.save("../outputs/amc.fasttext.300.toy.model")
model_wv.save("../outputs/amc.word2vec.300.toy.model")

In [None]:
# Load saved gensim fastText model
#model_ft = Word2Vec.load("../outputs/amc.fasttext.300.toy.model")

In [23]:
def tsne_plot(for_word, w2v_model):
    # trained fastText model dimention
    dim_size = w2v_model.wv.vectors.shape[1]
    arrays = np.empty((0, dim_size), dtype='f')
    word_labels = [for_word]
    color_list  = ['red']
    # adds the vector of the query word
    arrays = np.append(arrays, w2v_model.wv.__getitem__([for_word]), axis=0)
    # gets list of most similar words
    sim_words = w2v_model.wv.most_similar(for_word, topn=10)
    # adds the vector for each of the closest words to the array
    for wrd_score in sim_words:
        wrd_vector = w2v_model.wv.__getitem__([wrd_score[0]])
        word_labels.append(wrd_score[0])
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)
    #---------------------- Apply PCA and tsne to reduce dimention --------------
    # fit 2d PCA model to the similar word vectors
    model_pca = PCA(n_components = 10).fit_transform(arrays)
    # Finds 2d coordinates t-SNE
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(model_pca)
    # Sets everything up to plot
    df_plot = pd.DataFrame({'x': [x for x in Y[:, 0]],
                            'y': [y for y in Y[:, 1]],
                            'words_name': word_labels,
                            'words_color': color_list})
    #------------------------- tsne plot Python -----------------------------------
    # plot dots with color and position
    fig, ax = plt.subplots(figsize=(16,10))
    plot_dot = sns.regplot(data=df_plot,
                           x="x",
                           y="y",
                           fit_reg=False,
                           marker="o",
                           ax=ax,
                           scatter_kws={'s': 40,
                                        'facecolors': df_plot['words_color']
                                        }
                           )
 
    # Adds annotations with color one by one with a loop
    for line in range(0, df_plot.shape[0]):
         plot_dot.text(df_plot["x"][line],
                 df_plot['y'][line],
                 '  ' + df_plot["words_name"][line].title(),
                 horizontalalignment='left',
                 verticalalignment='bottom', size='medium',
                 color=df_plot['words_color'][line],
                 weight='normal'
                ).set_size(15)
 
    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)
    plt.title('t-SNE visualization for word "{}'.format(for_word.title()) +'"')

In [None]:
# tsne plot for top 10 similar word to 'chicken'
tsne_plot(for_word=word, w2v_model=model)