In [1]:
import re
import random
import math

from functools import partial
import itertools

from collections import Counter, deque

import pandas as pd
import numpy as np
import tensorflow as tf

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import PunktSentenceTokenizer

from bs4 import BeautifulSoup

Word2Vec Implementation on Tensor Flow

- Data from https://www.kaggle.com/c/word2vec-nlp-tutorial
- Code from https://www.tensorflow.org/versions/r0.7/tutorials/word2vec/index.html

In [2]:
data_folder = '/home/agrigorev/tmp/data/bagofpopcorn'

In [4]:
train = pd.read_csv(data_folder + '/labeledTrainData.tsv', delimiter="\t", quoting=3)
train_unlab = pd.read_csv(data_folder + '/unlabeledTrainData.tsv', delimiter="\t", quoting=3)
test = pd.read_csv(data_folder + '/testData.tsv', delimiter="\t", quoting=3)

In [3]:
def review_to_wordlist(review, remove_stopwords=True):
    review_text = BeautifulSoup(review, "lxml").get_text()

    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
   
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    
    return words

def clean_review(review, remove_stopwords=True):
    return ' '.join(review_to_wordlist(review, remove_stopwords))

def review_to_sentences(review, tokenizer, remove_stopwords=True):
    raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())

    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) == 0:
            continue
        sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))

    return sentences

In [5]:
tokenizer = PunktSentenceTokenizer()
stokenizer = partial(review_to_sentences, tokenizer=tokenizer)

In [6]:
train['review_clean'] = train.review.apply(stokenizer)
train_unlab['review_clean'] = train.review.apply(stokenizer)
test['review_clean'] = test.review.apply(stokenizer)

  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)


In [7]:
sentences = []
for ss in train.review_clean:
    if isinstance(ss, list):
        sentences.extend(ss)

for ss in train_unlab.review_clean:
    if isinstance(ss, list):
        sentences.extend(ss)

In [8]:
def build_dataset(sentences, min_word_count):
    count = Counter()
    for s in sentences:
        count.update(s)

    UNK = 0
    dictionary = {u'UNK': UNK}

    for idx, (w, c) in enumerate(count.most_common(), start=1):
        if c < min_word_count:
            break
        dictionary[w] = idx

    data = []
    for s in sentences:
        data.append([dictionary.get(w, 0) for w in s])
    
    reverse_dict = {idx: w for (w, idx) in dictionary.items()}
    return data, count, dictionary, reverse_dict

In [9]:
min_word_count = 30
data, count, dictionary, reverse_dict = build_dataset(sentences, min_word_count)

In [10]:
# http://stackoverflow.com/questions/3190706/nonlocal-keyword-in-python-2-x
class Namespace(object): pass

In [11]:
def batch_generator(data, batch_size, num_skips, skip_window, start_sentence=0):
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    window_center = skip_window
    span = 2 * skip_window + 1

    len_data = len(data)

    ns = Namespace()
    ns.current_sentence = start_sentence

    def overlapping_chunks(n):
        while 1:
            alist = data[ns.current_sentence]
            list_len = len(alist)
            for i in xrange(0, list_len-n+1):
                yield alist[i:i+n]
            ns.current_sentence = (ns.current_sentence + 1) % len_data

    def positive_samples(chunks):
        for window in chunks:
            target = window[window_center]
            for i, context in enumerate(window):
                if i == window_center:
                    continue
                yield [target, context]

    chunk_gen = overlapping_chunks(n=span)
    data_gen = positive_samples(chunk_gen)

    while 1:
        target_context = list(itertools.islice(data_gen, batch_size))
        target_context = np.array(target_context, dtype=np.int32)

        target = target_context[:, 0]
        context = target_context[:, 1].reshape(-1, 1)

        yield target, context, ns.current_sentence

In [12]:
vocabulary_size = len(reverse_dict)
vocabulary_size

15558

In [13]:
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. 
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

In [14]:
graph = tf.Graph()

with graph.as_default():
    # input data
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # variables
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                  stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # model

    # look up embeddings for inputs
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                                                     train_labels, num_sampled, vocabulary_size))


    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

In [15]:
num_steps = 500001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    average_loss = 0

    gen = batch_generator(data, batch_size=batch_size, num_skips=num_skips, skip_window=skip_window)

    for step in xrange(num_steps):
        batch_inputs, batch_labels, sentence = next(gen)
        feed_dict = {train_dataset : batch_inputs, train_labels : batch_labels}

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000

            print "Average loss at step %d: %0.4f (sentence %d)" % (step, average_loss, sentence)
            average_loss = 0

        if step % 50000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dict[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                
                print "Nearest to %s:" % valid_word.ljust(12),
                print ', '.join(reverse_dict[nearest[k]] for k in xrange(top_k))

    final_embeddings = normalized_embeddings.eval()

Average loss at step 0: 6.0981 (sentence 4)
Nearest to love        : mayhem, diagnosed, upper, activists, schneider, completist, genetically, heist
Nearest to many        : depressing, tend, trey, stepmother, knife, punish, prettier, nails
Nearest to end         : blocks, outrageous, uttered, idiots, mattered, silently, privilege, lifting
Nearest to years       : town, curtis, novelty, volunteers, pleasing, discovering, paralyzed, shaved
Nearest to think       : hostel, isabelle, plotline, distributors, kicking, spawned, competitors, abbot
Nearest to something   : personas, roosevelt, hellman, lance, misfire, exhibition, ouch, resident
Nearest to one         : bel, appropriately, ignores, elegant, bone, wears, baker, project
Nearest to acting      : doggie, bickering, bike, struggle, colonies, suburban, glib, babble
Nearest to UNK         : cringed, object, denholm, welfare, kitamura, flower, freddy, crothers
Nearest to way         : morbius, introspection, forced, dumps, depicting, le

In [16]:
words = ['emotion', 'feeling', 'thriller', 'action', 'horror', 'good', 'comedy', 'bad', 'love']
X = final_embeddings[[dictionary[w] for w in words]]

In [17]:
w2v_sim = pd.DataFrame(X.dot(X.T), columns=words, index=words)
w2v_sim = w2v_sim.unstack().reset_index()
w2v_sim = w2v_sim[w2v_sim.level_0 < w2v_sim.level_1]
w2v_sim.columns = ['word1', 'word2', 'w2v']
w2v_sim.sort_values(by='w2v', ascending=0)

Unnamed: 0,word1,word2,w2v
68,bad,good,0.479032
31,action,horror,0.339351
38,horror,thriller,0.33143
1,emotion,feeling,0.290072
56,comedy,thriller,0.287289
29,action,thriller,0.276372
33,action,comedy,0.174267
58,comedy,horror,0.161291
59,comedy,good,0.093321
11,feeling,thriller,0.080254
