In [1]:
import time
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from collections import  Counter
from string import punctuation
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups

In [13]:
#nltk.download()

In [50]:
data = fetch_20newsgroups()

In [51]:
text = ' '.join(data.data).lower()

In [39]:
type(text)
len(text)

22065807

In [75]:
text2 = text[0:1000000]

In [76]:
sentences_text = nltk.sent_tokenize(text2)
len(sentences_text)

7657

In [77]:
sentences = [nltk.word_tokenize(s) for s in sentences_text]
print(sentences[10])

['please', 'send', 'a', 'brief', 'message', 'detailing', 'your', 'experiences', 'with', 'the', 'procedure', '.']


In [78]:
min_count = 5
puncs = set(punctuation)
stops = set(stopwords.words('english'))

flat_words = []
for sentence in sentences:
    flat_words += sentence
    
counts = Counter(list(flat_words))
counts = pd.DataFrame(counts.most_common())
counts.columns = ['word', 'count']

counts = counts[counts['count'] >= min_count]
counts = counts[~counts['word'].isin(puncs)]
counts = counts[~counts['word'].isin(stops)]


vocab = pd.Series(range(len(counts)), index=counts['word']).sort_index()

print('The vocabulary has:', len(vocab), 'words')


The vocabulary has: 3588 words


In [79]:
filtered_sentences = []

for sentence in sentences:
    sentence = [word for word in sentence if word in vocab.index]
    if len(sentence):
        filtered_sentences.append(sentence)
sentences = filtered_sentences


In [80]:
for i, sentence in enumerate(sentences):
    sentences[i] = [vocab.loc[word] for word in sentence]

In [81]:
from nltk.util import skipgrams

window_size = 10

data = []
for sentance in sentences:
    data += skipgrams(sentance, 2, window_size)

data = pd.DataFrame(data, columns=['x', 'y'])
data.head()

Unnamed: 0,x,y
0,4,158
1,4,6
2,4,162
3,158,6
4,158,162


In [82]:
validation_size = 5000

data_valid = data.iloc[-validation_size:]
data_train = data.iloc[:-validation_size]
print('Train size:', len(data_train), 'Validation size:', len(data_valid))


Train size: 474569 Validation size: 5000


In [87]:
learning_rate = .01
embed_size = 300
batch_size = 64
steps = 10000

In [88]:
inputs = tf.placeholder(tf.int32, [None])
targets = tf.placeholder(tf.int32, [None])

In [89]:
embeddings = tf.Variable(tf.random_uniform((len(vocab), embed_size), -1, 1))
embed = tf.nn.embedding_lookup(embeddings, inputs)

In [90]:
logits = tf.layers.dense(embed, len(vocab), activation=None,
    kernel_initializer=tf.random_normal_initializer())

In [91]:
labels = tf.one_hot(targets, len(vocab))
loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
loss = tf.reduce_mean(loss)

train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)


In [92]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [94]:
def get_batches(x, y, batch_size, n=None):
    if n:
        # cheap way to add some randomization
        rand_start = np.random.randint(0, len(x) - batch_size * n)
        x = x[rand_start:]
        y = y[rand_start:]

    for start in range(len(x))[::batch_size][:n]:
        end = start + batch_size
        yield x[start:end], y[start:end]

step = 0
while step < steps:
    start = time.time()
    
    # shuffle train data once in while
    if step % 100000 == 0:
        data_train = data_train.sample(frac=1.)
    
    # train part
    train_loss = []
    for x, y in get_batches(
        data_train['x'].values, data_train['x'].values, batch_size, n=1000):
        step += 1
        _, batch_loss = sess.run([train_op, loss], {inputs: x, targets: y})
        train_loss.append(batch_loss)

    # validation prat (one batch of "validation_size")
    feed_dict = {inputs: data_valid['x'].values, targets: data_valid['x'].values}
    valid_loss, x_vectors = sess.run([loss, embed], feed_dict)
    y_vectors = sess.run(embed, {inputs: data_valid['x'].values})

    # outputs
    print('Step:', step, 'TLoss:', np.mean(train_loss), 'VLoss:', np.mean(valid_loss),
          'Similarity: %.3f' % cosine_similarity(x_vectors, y_vectors).mean(),
          'Seconds %.1f' % (time.time() - start))

Step: 1000 TLoss: 2.49755 VLoss: 3.08729 Similarity: 0.021 Seconds 12.1
Step: 2000 TLoss: 0.0418947 VLoss: 1.69547 Similarity: 0.021 Seconds 12.1
Step: 3000 TLoss: 0.00826811 VLoss: 1.19409 Similarity: 0.021 Seconds 12.2
Step: 4000 TLoss: 0.000519004 VLoss: 1.19396 Similarity: 0.021 Seconds 12.1
Step: 5000 TLoss: 5.79719e-06 VLoss: 1.19379 Similarity: 0.021 Seconds 12.2
Step: 6000 TLoss: 0.000514915 VLoss: 1.19366 Similarity: 0.021 Seconds 12.2
Step: 7000 TLoss: 2.3306e-06 VLoss: 1.19357 Similarity: 0.021 Seconds 12.2
Step: 8000 TLoss: 1.47615e-06 VLoss: 1.19349 Similarity: 0.021 Seconds 12.2
Step: 9000 TLoss: 9.33616e-07 VLoss: 1.19341 Similarity: 0.022 Seconds 12.1
Step: 10000 TLoss: 5.98537e-07 VLoss: 1.19334 Similarity: 0.022 Seconds 12.3


In [95]:
vectors = sess.run(embeddings)
vectors = pd.DataFrame(vectors, index=vocab.index)

In [97]:
print('Similarity:')
print('   computer to mouse =', cosine_similarity(vectors.loc[['computer']], vectors.loc[['mouse']])[0][0])
print('   dog to mouse =', cosine_similarity(vectors.loc[['dog']], vectors.loc[['mouse']])[0][0])


Similarity:
   computer to mouse = 0.123813
   dog to mouse = 0.0155194


In [190]:
word = "computer"

In [191]:
a = cosine_similarity(vectors.loc[[word]],vectors)
top_20 = a.argsort()[::-1][:20]
top_20 = top_20[0][-20:]
tmp = vectors[0].to_dict()
tmp2 = list(tmp.keys())
for i in top_20:
    print(tmp2[i])

access.digex.com
acts
soviet
violence
finally
roby
motif
strong
supra
hearings
rod
atmosphere
seven
atl
filioque
believe
bell-northern
worse
.sl
computer


In [192]:
word = "mouse"
a = cosine_similarity(vectors.loc[[word]],vectors)
top_20 = a.argsort()[::-1][:20]
top_20 = top_20[0][-20:]
tmp = vectors[0].to_dict()
tmp2 = list(tmp.keys())
for i in top_20:
    print(tmp2[i])

39
approval
bnr.ca
carson.u.washington.edu
argued
somehow
`p
omran
apple
internet
francisco
calls
'as
15.00
minor
greece
dale
concealed
watch
mouse


In [193]:
word = "apple"
a = cosine_similarity(vectors.loc[[word]],vectors)
top_20 = a.argsort()[::-1][:20]
top_20 = top_20[0][-20:]
tmp = vectors[0].to_dict()
tmp2 = list(tmp.keys())
for i in top_20:
    print(tmp2[i])

registered
burst
dangerous
mouse
dale
os
across
graham
monitor
approval
requires
mac
public
wear
demand
near
speaking
fred
moment
apple


In [194]:
word = "mac"
a = cosine_similarity(vectors.loc[[word]],vectors)
top_20 = a.argsort()[::-1][:20]
top_20 = top_20[0][-20:]
tmp = vectors[0].to_dict()
tmp2 = list(tmp.keys())
for i in top_20:
    print(tmp2[i])

costs
become
homosexuals
asked
u
survey
0.375
missing
orbital
flyers
un
long
corn
user
western
jet
apple
obviously
depends
mac


In [134]:
len(a[0])

3588

In [106]:
len(vectors)

3588

In [136]:
top_20 = a.argsort()[::-1][:20]
top_20 = top_20[0][-20:]

In [138]:
len(top_20)

20

In [142]:
top_20

array([ 325,  348, 3056, 3436, 1377, 2822, 2207, 3134, 3170, 1600, 2827,
        526, 2956,  524, 1375,  610,  614, 3544,   42,  898])

In [177]:
tmp = vectors[0].to_dict()
tmp2 = list(tmp.keys())

In [178]:
tmp2[325]

'access.digex.com'

In [179]:
tmp2[348]

'acts'

In [181]:
for i in top_20:
    print(tmp2[i])

access.digex.com
acts
soviet
violence
finally
roby
motif
strong
supra
hearings
rod
atmosphere
seven
atl
filioque
believe
bell-northern
worse
.sl
computer


In [182]:
vectors

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'',0.219020,0.327594,-0.026279,0.849900,1.049641,0.106119,-1.036108,-0.844456,-0.931968,-0.517253,...,0.262925,-0.048303,-0.795332,-0.719295,-0.454516,0.098882,-0.886429,-0.469470,-0.659400,-0.575264
"',3",-0.845126,0.659842,-0.691344,-0.051600,0.726842,0.644064,0.081962,0.986629,0.547500,1.073118,...,-0.055287,0.002428,-0.245105,-0.609643,0.205721,0.169478,-0.370806,-0.752446,0.013805,0.288330
'1t,0.277531,-0.060102,0.048275,-0.017766,-0.972611,0.773291,-0.371148,0.117678,-0.400326,0.631382,...,-0.160468,0.701483,0.299333,-0.963652,-0.416046,-0.548308,-0.384386,-0.354478,1.005568,0.256684
'a,-0.073268,-0.577153,0.282718,-0.407436,-0.105576,-0.362498,-0.098031,-0.353301,0.137164,-1.055875,...,-0.484186,-0.228497,0.071181,-0.813474,0.460967,0.137250,-0.218550,0.867151,0.353875,0.490211
'as,0.731154,-0.832164,0.142006,-0.425607,0.082140,-0.550713,-0.215837,0.017684,0.827606,-0.540193,...,0.664619,-0.594159,-0.744524,0.087807,-0.193515,0.491032,0.514991,-0.888147,0.738542,-0.387292
'ax,-0.334812,0.145703,-0.196292,0.429087,1.089558,0.158960,0.618779,-0.364027,0.102972,0.337003,...,0.601719,-0.417940,-0.553440,-0.350729,-0.480615,0.009201,-0.106892,0.760248,-0.073919,-0.834981
'd,-1.072956,-0.407988,0.066262,-0.660072,-0.629947,-0.824782,0.280640,-0.141655,0.293983,-0.061961,...,-0.037557,0.040960,0.865561,-0.702513,0.024941,-0.343607,-0.310505,0.857963,0.459802,-0.823663
'in,-0.876235,-0.495948,-1.007374,0.238954,0.689588,-0.275466,0.694720,-0.031236,-0.169276,0.707978,...,0.423352,-0.248421,0.179555,-0.160231,-1.038044,0.529418,-0.186064,-0.474740,0.252410,0.692550
'll,0.665801,-0.862596,0.036773,-0.015098,-1.069746,0.487695,0.248091,0.458088,0.995888,-0.512104,...,0.660056,-0.217025,0.375482,-1.084263,0.107010,0.431116,-0.565049,-0.293653,0.627092,0.659780
'm,1.111691,0.905766,0.732034,-0.801996,-0.956032,-0.415996,0.483600,-1.133537,0.064842,0.692898,...,-0.855811,-0.327444,-0.103619,0.394662,0.840765,-0.555867,-0.330666,-0.115458,-0.558625,-0.837108


In [185]:
import os

In [None]:
embedding_var = tf.Variable(....)

In [188]:
from tensorflow.contrib.tensorboard.plugins import projector

# Create randomly initialized embedding weights which will be trained.
#N = 10000 # Number of items (vocab size).
#D = 300 # Dimensionality of the embedding.
embedding_var = tf.Variable(vectors, name='word_embedding')

# Format: tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
config = projector.ProjectorConfig()

# You can add multiple embeddings. Here we add only one.
embedding = config.embeddings.add()
embedding.tensor_name = embedding_var.name
# Link this tensor to its metadata file (e.g. labels).
embedding.metadata_path = os.path.join("/home/ec2-user/dm/tf", 'metadata.tsv')

# Use the same LOG_DIR where you stored your checkpoint.
summary_writer = tf.summary.FileWriter("/home/ec2-user/dm/tf")

# The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will
# read this file during startup.
projector.visualize_embeddings(summary_writer, config)

In [None]:
embedding.sprite.image_path = 
# Specify the width and height of a single thumbnail.
embedding.sprite.single_image_dim.extend([10, 10])