In [21]:
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import re
import matplotlib

In [22]:
import matplotlib.pyplot as plt
%matplotlib inline
stopword_file ='long_stopwords'

In [23]:
stop_words = []

with open(stopword_file,'r') as inpFile:
    lines = inpFile.readlines()
    stop_words_temp = map(lambda x : re.sub('\n','',x),lines)
    stop_words = list(map(lambda x:  re.sub('[^A-Za-z0-9]+', '',x), stop_words_temp))

In [24]:
def clean(word):
    word = word.strip()
    word = word.lower()
    word = re.sub('[^A-Za-z0-9]+', '', word)
    if word not in stop_words:
        return word
    else:
        return ''

In [25]:
line_count = 0
sentences = []

with open('dataset','r') as inpFile:
    x = inpFile.readlines()
    for line in x:
         if line is not None or line != '\n':
            words = line.split()
            words = map(lambda x: clean(x), words)
            words = list(filter(lambda x:True if len(x) > 0 else False, words))
            sentences.append(words)
            

In [26]:
sentences[100:110]

[['anbarivan', 'nl'],
 [],
 ['ii'],
 [],
 [],
 ['contents'],
 [],
 ['contents', 'iv'],
 [],
 ['list', 'figures', 'ix']]

In [27]:
model = Word2Vec(sentences, window=5, size=300, workers=4, min_count=3,sg = 1,sorted_vocab= 1)

# summarize the loaded model
print(model)

Word2Vec(vocab=282, size=300, alpha=0.025)


In [28]:
# summarize vocabulary
words = list(model.wv.vocab)
print(words[:10])

['local', 'location', 'slope', 'institute', 'gratitude', 'reason', 'regression', 'prediction', '2013', 'aggregation']


In [29]:
model.save('testmodel')
model=gensim.models.Word2Vec.load('testmodel')

In [30]:
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
import numpy as np
import gensim

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

In [31]:
fname = "testmodel"
model = gensim.models.keyedvectors.KeyedVectors.load('testmodel')

In [32]:
max_size = len(model.wv.vocab)

In [33]:
w2v = np.zeros((max_size,model.layer1_size))

  """Entry point for launching an IPython kernel.


In [34]:
w2v

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [35]:
with open("tensorboard/metadata.tsv", 'w+') as file_metadata:
    for i,word in enumerate(model.wv.index2word[:max_size]):
        w2v[i] = model.wv[word]
        file_metadata.write(word + '\n')


In [36]:
w2v

array([[ 0.00044317,  0.00014877, -0.00018831, ...,  0.00145819,
         0.00120087, -0.00533671],
       [ 0.00364124, -0.00079064, -0.00081195, ...,  0.00066612,
        -0.00040527, -0.00541985],
       [ 0.00339293, -0.0005191 ,  0.00030497, ...,  0.00024385,
        -0.00094939, -0.00636967],
       ...,
       [ 0.00162614, -0.00107125,  0.00049592, ...,  0.00103954,
        -0.00072271, -0.0033562 ],
       [-0.00092026,  0.00030447,  0.00150038, ..., -0.00159269,
        -0.00109916, -0.00078331],
       [-0.00033032,  0.00057825,  0.00094793, ..., -0.00177892,
         0.0006027 , -0.00078415]])

In [37]:
labels = []
tokens = []

for word in model.wv.vocab:
    tokens.append(model[word])
    labels.append(word)
    
tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)



  """


In [38]:
import pandas as pd
df_pca = pd.DataFrame(tsne_model.fit_transform(tokens))
df_pca.values

array([[-3.33484858e-01, -1.84872460e+00],
       [-4.57739019e+00, -3.65143156e+00],
       [ 5.07144308e+00,  3.46598059e-01],
       [-5.09462214e+00, -6.37110376e+00],
       [ 4.71983767e+00,  7.60334635e+00],
       [ 6.07437944e+00,  9.27426052e+00],
       [ 7.83287907e+00,  1.23519278e+01],
       [ 7.00581455e+00,  5.14096498e+00],
       [-1.13868399e+01, -4.74464464e+00],
       [ 5.83107090e+00,  1.12451906e+01],
       [-1.77651453e+00, -1.25647879e+00],
       [ 4.93155813e+00,  3.67151022e+00],
       [-3.11204672e+00, -4.06665373e+00],
       [ 6.83760071e+00,  6.50033140e+00],
       [ 5.26630938e-01, -1.00120568e+00],
       [ 5.60234594e+00,  3.69100595e+00],
       [-1.20410967e+01, -1.13387804e+01],
       [ 5.66920614e+00,  8.68757343e+00],
       [ 4.34440994e+00, -2.20460251e-01],
       [ 9.03541374e+00,  7.72725773e+00],
       [-9.52120113e+00, -7.30560255e+00],
       [-1.87268329e+00, -7.22386694e+00],
       [ 8.32278633e+00,  1.28201418e+01],
       [-6.

In [39]:
## TensorFlow Variable from data
tf_data = tf.Variable(df_pca)

In [41]:
import os

PATH = os.getcwd()




## Running TensorFlow Session

LOG_DIR = PATH + '/tensorboard/'

metadata = os.path.join(LOG_DIR, 'metadata.tsv')

with tf.Session() as sess:
    saver = tf.train.Saver([tf_data])
    sess.run(tf_data.initializer)
    saver.save(sess, os.path.join(LOG_DIR, 'tf_data.ckpt'))
    config = projector.ProjectorConfig()
    
    
# One can add multiple embeddings.
embedding = config.embeddings.add()
embedding.tensor_name = tf_data.name
# Link this tensor to its metadata(Labels) file
embedding.metadata_path = metadata
# Saves a config file that TensorBoard will read during startup.
projector.visualize_embeddings(tf.summary.FileWriter(LOG_DIR), config)


In [None]:
#-------------------------------------------------------#

In [None]:

#!pip install tfp-nightly

In [None]:
#runn terminal

In [None]:
#python -m tensorboard.main --logdir=/home/abu/Desktop/nlp/tensorboard