In [None]:
import time
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn
import nltk
import re

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Load both official splits and merge them into one corpus; the notebook
# trains unsupervised embeddings, so the train/test distinction is not needed.
# (The former extra `fetch_20newsgroups()` call was dropped: it loaded the
# training split a second time into a name nothing read.)
print("Fecthing data..................................")
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# Documents and labels are concatenated in the same order (train first, then
# test) so that position i in one list matches position i in the other.
newsgroups_data = newsgroups_train.data + newsgroups_test.data
newsgroups_labels = list(newsgroups_train.target) + list(newsgroups_test.target)

print("Data fetched successfully")
# Keep only the documents whose label is one of the two selected groups.
# NOTE(review): 1 and 16 presumably index into newsgroups_train.target_names;
# confirm which newsgroups they stand for.
kept_labels = (1, 16)
selected_pairs = [
    (document, label)
    for document, label in zip(newsgroups_data, newsgroups_labels)
    if label in kept_labels
]
newsgroups_data_downsampled = [document for document, _ in selected_pairs]
newsgroups_labels_downsampled = [label for _, label in selected_pairs]
print("Data downsampled")

# Collapse the kept documents into one lower-cased string and preview a slice.
joined_documents = ' '.join(newsgroups_data_downsampled)
text = joined_documents.lower()
text[100:350]

In [None]:
# Split the corpus into sentences, then each sentence into word tokens.
sentences_text = nltk.sent_tokenize(text)
sentences = [nltk.word_tokenize(sentence_text) for sentence_text in sentences_text]

# Strip every non-letter character from each token and drop tokens that end up
# empty. Sentences are kept even when all of their tokens are dropped.
temp_sentences = [
    [
        cleaned
        for cleaned in (re.sub('[^A-Za-z]+', '', token) for token in tokens)
        if cleaned != ''
    ]
    for tokens in sentences
]

sentences = temp_sentences
print(sentences[10])

In [None]:
from collections import  Counter
from string import punctuation

min_count = 5
puncs = set(punctuation)

# Flatten the tokenised sentences into a single list of words.
flat_words = [token for tokens in sentences for token in tokens]

# Frequency table, most common word first.
word_frequencies = Counter(flat_words).most_common()
counts = pd.DataFrame(word_frequencies)
counts.columns = ['word', 'count']

# Keep words seen at least `min_count` times that are not punctuation marks.
frequent_enough = counts['count'] >= min_count
not_punctuation = ~counts['word'].isin(puncs)
counts = counts[frequent_enough & not_punctuation]

# Map each surviving word to an integer id. Ids follow frequency order
# (0 = most common); the Series itself is then sorted alphabetically by word.
word_ids = range(len(counts))
vocab = pd.Series(word_ids, index=counts['word']).sort_index()

print('The vocabulary has:', len(vocab), 'words')

In [None]:
# Drop out-of-vocabulary words from every sentence, then discard sentences
# that have no words left.
in_vocab_sentences = (
    [token for token in tokens if token in vocab.index]
    for tokens in sentences
)
filtered_sentences = [tokens for tokens in in_vocab_sentences if len(tokens)]
sentences = filtered_sentences

In [None]:
# Replace every word with its integer id from `vocab`, updating the existing
# list object in place (slice assignment) rather than rebinding the name.
sentences[:] = [
    [vocab.loc[token] for token in tokens]
    for tokens in sentences
]

In [None]:
from nltk.util import skipgrams

window_size = 1

# Collect the (x, y) id pairs that nltk's skipgrams yields for each sentence
# (2-grams, with `window_size` passed as the skip distance).
pairs = []
for token_ids in sentences:
    pairs.extend(skipgrams(token_ids, 2, window_size))

# One row per pair: 'x' is the first id of the pair, 'y' the second.
data = pd.DataFrame(pairs, columns=['x', 'y'])
data.head()

In [None]:
# Show the first 20 (x, y) pairs.
data.head(20)

In [None]:
# Total number of (x, y) pairs.
data.shape[0]

In [None]:
validation_size = 100000

# Hold out the last `validation_size` pairs for validation; everything before
# them is used for training.
data_train, data_valid = data.iloc[:-validation_size], data.iloc[-validation_size:]
print('Train size:', len(data_train), 'Validation size:', len(data_valid))

In [None]:
# Training hyperparameters.
learning_rate = 1e-2  # handed to the Adam optimizer
embed_size = 300      # number of columns in the embedding matrix
batch_size = 64       # pairs per training batch
steps = 10 ** 6       # NOTE(review): not referenced by any visible cell

In [None]:
# 1-D int32 placeholders of unspecified length: ids fed in as `inputs` and the
# ids the network is trained to predict as `targets`.
inputs = tf.placeholder(dtype=tf.int32, shape=[None])
targets = tf.placeholder(dtype=tf.int32, shape=[None])

In [None]:
# Trainable embedding matrix with one row per vocabulary entry, initialised
# uniformly in [-1, 1); `embed` selects the rows for the ids fed as `inputs`.
initial_embeddings = tf.random_uniform(
    shape=(len(vocab), embed_size), minval=-1, maxval=1)
embeddings = tf.Variable(initial_embeddings)
embed = tf.nn.embedding_lookup(params=embeddings, ids=inputs)

In [None]:
# Linear projection (no activation) from the embedding to one score per
# vocabulary word.
logits = tf.layers.dense(
    inputs=embed,
    units=len(vocab),
    activation=None,
    kernel_initializer=tf.random_normal_initializer(),
)

In [None]:
# Softmax cross-entropy between the predicted word scores and the target ids.
# The sparse variant consumes the integer ids directly, so no dense
# [batch, vocab] one-hot tensor has to be materialised (the validation pass
# feeds `validation_size` rows at once), and it replaces the deprecated
# dense `softmax_cross_entropy_with_logits` op. The loss value is the same.
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets, logits=logits)
loss = tf.reduce_mean(loss)  # average over the batch

train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [None]:
# Open a session and initialise every variable defined so far.
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_batches(x, y, batch_size, n=None):
    """Yield aligned ``(x_batch, y_batch)`` slices of ``batch_size`` items.

    Args:
        x, y: Sliceable sequences of equal length (e.g. NumPy arrays).
        batch_size: Number of items per batch; the final batch may be shorter.
        n: Optional cap on the number of batches. When given (and non-zero),
            batching starts at a random offset chosen so that ``n`` full
            batches fit after it. If the data holds no more than ``n``
            batches, batching starts at offset 0 and yields whatever is
            available.

    Yields:
        Tuples ``(x[start:end], y[start:end])``.
    """
    if n:
        # np.random.randint requires high > low; with too little data the
        # upper bound would be <= 0 and it would raise ValueError, so fall
        # back to the beginning of the data in that case.
        slack = len(x) - batch_size * n
        rand_start = np.random.randint(0, slack) if slack > 0 else 0
        x = x[rand_start:]
        y = y[rand_start:]

    # Slicing the range with [:n] caps the batch count; n=None keeps them all.
    for start in range(0, len(x), batch_size)[:n]:
        end = start + batch_size
        yield x[start:end], y[start:end]


In [None]:
step = 0
max_steps = 20000        # stop once this many batches have been trained on
shuffle_every = 1000     # reshuffle the training pairs every this many steps
batches_per_round = 100  # training batches between two validation passes

while step < max_steps:
    start = time.time()

    # Shuffle the training data once in a while so the contiguous windows
    # picked by get_batches contain different pairs.
    if step % shuffle_every == 0:
        data_train = data_train.sample(frac=1.)

    # Training part: inputs come from column 'x', targets from column 'y'.
    # (Feeding 'x' as the target too would only teach the identity mapping.)
    train_loss = []
    for x, y in get_batches(
            data_train['x'].values, data_train['y'].values, batch_size,
            n=batches_per_round):
        step += 1
        _, batch_loss = sess.run([train_op, loss], {inputs: x, targets: y})
        train_loss.append(batch_loss)

    # Without at least one batch `step` never advances and the loop would
    # spin forever, so fail loudly instead.
    if not train_loss:
        raise ValueError('No training batches were produced; is data_train empty?')

    # Validation part (one batch holding the whole validation set).
    feed_dict = {inputs: data_valid['x'].values, targets: data_valid['y'].values}
    valid_loss, x_vectors = sess.run([loss, embed], feed_dict)
    # Embeddings of the target words; neither x_vectors nor y_vectors is read
    # by any visible cell.
    y_vectors = sess.run(embed, {inputs: data_valid['y'].values})

    # Outputs: one summary line per round instead of one line per batch.
    print('Step:', step, 'TLoss:', np.mean(train_loss), 'VLoss:', np.mean(valid_loss),
          'Seconds %.1f' % (time.time() - start))

In [None]:
# Fetch the trained embedding matrix. Row i belongs to the word whose id is i,
# and ids are the *values* of `vocab`, whose index is sorted alphabetically.
# Labelling the raw matrix with `vocab.index` would therefore attach each
# word to another word's vector, so reorder the rows to match the index first.
embedding_matrix = sess.run(embeddings)
vectors = pd.DataFrame(embedding_matrix[vocab.values], index=vocab.index)

In [None]:
# Number of embedding rows.
vectors.shape[0]

In [None]:
# Vocabulary words as a plain Python list, in the order of `vocab`'s index.
word_list = list(vocab.index)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
test_word = "surveillance"

if test_word not in vectors.index:
    raise KeyError('%r is not in the vocabulary' % test_word)

# Cosine similarity between the query word and every vocabulary word, computed
# in a single call. Rows are selected by label in `word_list` order so that
# position i in `similarities` corresponds to `word_list[i]`.
similarities = cosine_similarity(
    vectors.loc[[test_word]], vectors.loc[word_list])[0].tolist()

# argsort is ascending; reverse it so the MOST similar words come first.
# The query word itself (similarity 1) is expected to lead the ranking.
indices = np.array(similarities).argsort()[::-1]

In [None]:
# Print the words at the first 20 positions of `indices`, one per line.
top_count = 20
for position in indices[:top_count]:
    print(word_list[position])

In [None]:
import os
def plot_with_labels(low_dim_embs, labels, filename):
    """Scatter 2-D points, annotate each with its label and save the figure.

    Args:
        low_dim_embs: Array with one row per point and two columns (x, y);
            it must have at least as many rows as there are labels.
        labels: Text for each point; label i is drawn next to row i.
        filename: Destination passed to ``plt.savefig``.

    Relies on a module-level ``plt`` (matplotlib.pyplot) being available at
    call time.
    """
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches

    # Shared placement settings for every annotation.
    annotation_style = dict(
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
    # zip stops after the last label, so surplus rows are simply not drawn.
    for label, point in zip(labels, low_dim_embs):
        x, y = point
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), **annotation_style)

    plt.savefig(filename)

try:
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(
        perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')

    # Only project and plot the `plot_only` most frequent words: exact t-SNE
    # scales quadratically with the number of points, and annotating the whole
    # vocabulary makes the figure unreadable.
    plot_only = 500
    # `vocab` maps word -> id, where id 0 is the most common word, so sorting
    # by value orders the words by decreasing frequency.
    plot_words = vocab.sort_values().index[:plot_only].tolist()

    # Select rows by label so that points and annotations stay aligned.
    low_dim_embs = tsne.fit_transform(vectors.loc[plot_words])
    plot_with_labels(low_dim_embs, plot_words, 'tsne.png')

except ImportError as ex:
    print('Please install sklearn, matplotlib, and scipy to show embeddings.')
    print(ex)