In [29]:
import time
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn
import nltk
import re

In [30]:
from sklearn.datasets import fetch_20newsgroups
data = fetch_20newsgroups()

print("Fecthing data..................................")
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
newsgroups_data = newsgroups_train.data + newsgroups_test.data
newsgroups_labels = list(newsgroups_train.target) + list(newsgroups_test.target)

print("Data fetched successfully")
newsgroups_data_downsampled = []
newsgroups_labels_downsampled = []
for index in range(len(newsgroups_data)):
    if newsgroups_labels[index] == 1 or newsgroups_labels[index] == 16:
        newsgroups_data_downsampled.append(newsgroups_data[index])
        newsgroups_labels_downsampled.append(newsgroups_labels[index])
print("Data downsampled")

text = ' '.join(newsgroups_data_downsampled).lower()
text[100:350]

Fecthing data..................................
Data fetched successfully
Data downsampled


'ision\nlines: 14\ndistribution: world\nnntp-posting-host: amber.ssd.csd.harris.com\nx-newsreader: tin [version 1.1 pl9]\n\nrobert j.c. kyanko (rob@rjck.uucp) wrote:\n> abraxis@iastate.edu writes in article <abraxis.734340159@class1.iastate.edu>:\n> > anyone '

In [31]:
sentences_text = nltk.sent_tokenize(text)
sentences = [nltk.word_tokenize(s) for s in sentences_text]
temp_sentences = []
for sentence in sentences:
    temp_words = []
    for word in sentence:
        temp_word = re.sub('[^A-Za-z]+', '', word)
        if temp_word != '':
            temp_words.append(temp_word)
    temp_sentences.append(temp_words)

sentences = temp_sentences
print(sentences[10])

['this', 'makes', 'the', 'right', 'of', 'the', 'people', 'to', 'keep', 'and', 'bear', 'many', 'modern', 'weapons', 'nonexistant']


In [32]:
from collections import  Counter
from string import punctuation

min_count = 12
puncs = set(punctuation)


flat_words = []
for sentence in sentences:
    flat_words += sentence
    
counts = Counter(list(flat_words))
counts = pd.DataFrame(counts.most_common())
counts.columns = ['word', 'count']

counts = counts[counts['count'] >= min_count]
counts = counts[~counts['word'].isin(puncs)]


vocab = pd.Series(range(len(counts)), index=counts['word']).sort_index()

print('The vocabulary has:', len(vocab), 'words')

The vocabulary has: 4494 words


In [33]:
filtered_sentences = []

for sentence in sentences:
    sentence = [word for word in sentence if word in vocab.index]
    if len(sentence):
        filtered_sentences.append(sentence)
sentences = filtered_sentences

In [34]:
for i, sentence in enumerate(sentences):
    sentences[i] = [vocab.loc[word] for word in sentence]

In [35]:
from nltk.util import skipgrams

window_size = 1

data = []
for sentance in sentences:
    data += skipgrams(sentance, 2, window_size)

data = pd.DataFrame(data, columns=['x', 'y'])
data.head()

Unnamed: 0,x,y
0,12,981
1,12,1405
2,981,1405
3,981,30
4,1405,30


In [36]:
data[:20]

Unnamed: 0,x,y
0,12,981
1,12,1405
2,981,1405
3,981,30
4,1405,30
5,1405,37
6,30,37
7,30,197
8,37,197
9,35,1949


In [37]:
len(data)

880287

In [38]:
validation_size = 10000

data_valid = data.iloc[-validation_size:]
data_train = data.iloc[:-validation_size]
print('Train size:', len(data_train), 'Validation size:', len(data_valid))

Train size: 870287 Validation size: 10000


In [49]:
learning_rate = 0.001
embed_size = 300
batch_size = 64
steps = 1000

In [40]:
inputs = tf.placeholder(tf.int32, [None])
targets = tf.placeholder(tf.int32, [None])

In [41]:
embeddings = tf.Variable(tf.random_uniform((len(vocab), embed_size), -1, 1))
embed = tf.nn.embedding_lookup(embeddings, inputs)

In [42]:
logits = tf.layers.dense(embed, len(vocab), activation=None,
    kernel_initializer=tf.random_normal_initializer())

In [43]:
labels = tf.one_hot(targets, len(vocab))
loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
loss = tf.reduce_mean(loss)

train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [44]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [45]:
from sklearn.metrics.pairwise import cosine_similarity

def get_batches(x, y, batch_size, n=None):
    if n:
        rand_start = np.random.randint(0, len(x) - batch_size * n)
        x = x[rand_start:]
        y = y[rand_start:]

    for start in range(len(x))[::batch_size][:n]:
        end = start + batch_size
        yield x[start:end], y[start:end]


In [79]:
step = 0
while step < 20000:
    start = time.time()
    
    # shuffle train data once in while
    if step % 1000 == 0:
        data_train = data_train.sample(frac=1.)
    # train part
    train_loss = []
    
    for x, y in get_batches(
        data_train['x'].values, data_train['x'].values, batch_size, n=100):
        step += 1
        _, batch_loss = sess.run([train_op, loss], {inputs: x, targets: y})
        train_loss.append(batch_loss)
            

    # validation prat (one batch of "validation_size")
    feed_dict = {inputs: data_valid['x'].values, targets: data_valid['x'].values}
    valid_loss, x_vectors = sess.run([loss, embed], feed_dict)
    y_vectors = sess.run(embed, {inputs: data_valid['x'].values})

    # outputs
    print('Step:', step, 'TLoss:', np.mean(train_loss), 'VLoss:', np.mean(valid_loss),
          'Seconds %.1f' % (time.time() - start))

Step: 100 TLoss: 0.0 VLoss: 0.0 Seconds 7.3
Step: 200 TLoss: 0.0 VLoss: 0.0 Seconds 6.6
Step: 300 TLoss: 0.0 VLoss: 0.0 Seconds 6.5
Step: 400 TLoss: 0.0 VLoss: 0.0 Seconds 6.3
Step: 500 TLoss: 0.0 VLoss: 0.0 Seconds 6.2
Step: 600 TLoss: 0.0 VLoss: 0.0 Seconds 6.2
Step: 700 TLoss: 0.0 VLoss: 0.0 Seconds 6.1
Step: 800 TLoss: 0.0 VLoss: 0.0 Seconds 6.1
Step: 900 TLoss: 0.0 VLoss: 0.0 Seconds 6.2
Step: 1000 TLoss: 0.0 VLoss: 0.0 Seconds 6.3
Step: 1100 TLoss: 0.0 VLoss: 0.0 Seconds 6.3
Step: 1200 TLoss: 0.0 VLoss: 0.0 Seconds 6.1
Step: 1300 TLoss: 0.0 VLoss: 0.0 Seconds 6.1
Step: 1400 TLoss: 0.0 VLoss: 0.0 Seconds 6.0
Step: 1500 TLoss: 0.0 VLoss: 0.0 Seconds 6.0
Step: 1600 TLoss: 0.0 VLoss: 0.0 Seconds 5.9
Step: 1700 TLoss: 0.0 VLoss: 0.0 Seconds 6.0
Step: 1800 TLoss: 0.0 VLoss: 0.0 Seconds 6.0
Step: 1900 TLoss: 0.0 VLoss: 0.0 Seconds 6.0
Step: 2000 TLoss: 0.0 VLoss: 0.0 Seconds 6.0
Step: 2100 TLoss: 0.0 VLoss: 0.0 Seconds 6.1
Step: 2200 TLoss: 0.0 VLoss: 0.0 Seconds 5.9
Step: 2300 TLoss: 0

Step: 16900 TLoss: 0.0 VLoss: 0.0 Seconds 6.8
Step: 17000 TLoss: 0.0 VLoss: 0.0 Seconds 6.5
Step: 17100 TLoss: 0.0 VLoss: 0.0 Seconds 7.0
Step: 17200 TLoss: 0.0 VLoss: 0.0 Seconds 6.8
Step: 17300 TLoss: 0.0 VLoss: 0.0 Seconds 6.5
Step: 17400 TLoss: 0.0 VLoss: 0.0 Seconds 6.5
Step: 17500 TLoss: 0.0 VLoss: 0.0 Seconds 7.2
Step: 17600 TLoss: 0.0 VLoss: 0.0 Seconds 6.8
Step: 17700 TLoss: 0.0 VLoss: 0.0 Seconds 6.5
Step: 17800 TLoss: 0.0 VLoss: 0.0 Seconds 6.7
Step: 17900 TLoss: 0.0 VLoss: 0.0 Seconds 6.8
Step: 18000 TLoss: 0.0 VLoss: 0.0 Seconds 6.9
Step: 18100 TLoss: 0.0 VLoss: 0.0 Seconds 7.0
Step: 18200 TLoss: 0.0 VLoss: 0.0 Seconds 6.4
Step: 18300 TLoss: 0.0 VLoss: 0.0 Seconds 6.9
Step: 18400 TLoss: 0.0 VLoss: 0.0 Seconds 6.5
Step: 18500 TLoss: 0.0 VLoss: 0.0 Seconds 6.2
Step: 18600 TLoss: 0.0 VLoss: 0.0 Seconds 6.2
Step: 18700 TLoss: 0.0 VLoss: 0.0 Seconds 6.7
Step: 18800 TLoss: 11.075033 VLoss: 0.0 Seconds 6.6
Step: 18900 TLoss: 7.2516384 VLoss: 3.1763575 Seconds 6.6
Step: 19000 TLos

In [51]:
vectors = sess.run(embeddings)
vectors = pd.DataFrame(vectors, index=vocab.index)

In [52]:
len(vectors)

4494

In [53]:
word_list = vocab.index.tolist()

In [80]:
from sklearn.metrics.pairwise import cosine_similarity
test_word = "then"

similarities = []
for word in word_list:
    item = (cosine_similarity(vectors.loc[[test_word]], vectors.loc[[word]])[0][0], word)
    similarities.append(item)
similarities = sorted(similarities)
similarities[-20:]

[(0.23115137, 'tried'),
 (0.23250142, 'statistic'),
 (0.23361155, 'firing'),
 (0.23450589, 'wearing'),
 (0.23545003, 'assume'),
 (0.23781925, 'controller'),
 (0.24547061, 'significantly'),
 (0.24866837, 'tax'),
 (0.25343916, 'matrix'),
 (0.25458717, 'habit'),
 (0.25981885, 'when'),
 (0.26332042, 'animated'),
 (0.26457113, 'blowing'),
 (0.2848406, 'kids'),
 (0.2853072, 'positive'),
 (0.28860277, 'glenn'),
 (0.2981456, 'fractals'),
 (0.40886068, 'detroit'),
 (0.42984682, 'glenns'),
 (1.0, 'then')]

In [82]:
from sklearn.metrics.pairwise import cosine_similarity
test_word = "four"

similarities = []
for word in word_list:
    item = (cosine_similarity(vectors.loc[[test_word]], vectors.loc[[word]])[0][0], word)
    similarities.append(item)
similarities = sorted(similarities)
similarities[-20:]

[(0.24393196, 'use'),
 (0.24490309, 'microwaves'),
 (0.24665128, 'photographs'),
 (0.24775606, 'fewer'),
 (0.2480158, 'specification'),
 (0.25069916, 'ascii'),
 (0.25761282, 'bound'),
 (0.26392493, 'attached'),
 (0.2674393, 'al'),
 (0.27037475, 'additionally'),
 (0.27110022, 'rle'),
 (0.27259976, 'branch'),
 (0.27281642, 'mine'),
 (0.27500162, 'religion'),
 (0.2776731, 'tromsoe'),
 (0.28250164, 'seems'),
 (0.28782457, 'random'),
 (0.2947551, 'lift'),
 (0.3184655, 'pull'),
 (0.9999999, 'four')]

In [81]:
import os
def plot_with_labels(low_dim_embs, labels, filename):
  assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
  plt.figure(figsize=(18, 18))  # in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom')

  plt.savefig(filename)

try:
  from sklearn.manifold import TSNE
  import matplotlib.pyplot as plt

  tsne = TSNE(
      perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
  plot_only = 500
  low_dim_embs = tsne.fit_transform(vectors)
#   labels = [reverse_dictionary[i] for i in xrange(plot_only)]
  plot_with_labels(low_dim_embs, word_list, os.path.join('tsne.png'))

except ImportError as ex:
  print('Please install sklearn, matplotlib, and scipy to show embeddings.')
  print(ex)

KeyboardInterrupt: 