# Word2vec

In [7]:
import collections
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import re
%matplotlib inline

## Configuration

In [13]:
batch_size = 20
embedding_size = 2
num_sampled = 15

## Movie Review Text data

In [14]:
sentences = pd.read_csv("review_text_rev1.txt", header=None)
sentences = sentences.values.ravel().tolist()

## Preprocessing

In [11]:
sentences = [re.sub('[^a-z ]+',' ',x.lower()) for x in sentences]

AttributeError: 'float' object has no attribute 'lower'

In [15]:
sentences_rev = []
for sentence in sentences:
    if type(sentence) == str:
        sentences_rev.append(sentence)
sentences = sentences_rev

## Sentences to words and word count

In [16]:
words = " ".join(sentences).split()
count = collections.Counter(words)

In [17]:
print ("Word count", count.most_common()[:5])

Word count [('the', 189581), ('a', 101833), ('and', 99831), ('of', 94359), ('to', 86688)]


## Build Dictionary

In [18]:
idx_to_word = {idx:w for idx, w in enumerate(count.keys())}
word_to_idx = {w: i for i, w in idx_to_word.items()} 
voc_size = len(word_to_idx)
data = [[word_to_idx[word] for word in sentence.split()] for sentence in sentences]

print('Sample data', data[0], [idx_to_word[t] for t in data[0]])

Sample data [1885, 149418, 71618, 203449, 149337, 23139, 131390, 91427, 30629, 96707] ['\ufeff"Naturally', 'in', 'a', 'film', "who's", 'main', 'themes', 'are', 'of', 'mortality']


## Training data: CBOW

In [19]:
cbow_pairs = []
for sentence in data:
    for i in range(1, len(sentence)-1):
        cbow_pairs.append([[sentence[i-1], sentence[i+1]], sentence[i]])
print("CBOW pairs :" , cbow_pairs[:10])

CBOW pairs : [[[1885, 71618], 149418], [[149418, 203449], 71618], [[71618, 149337], 203449], [[203449, 23139], 149337], [[149337, 131390], 23139], [[23139, 91427], 131390], [[131390, 30629], 91427], [[91427, 96707], 30629], [[189027, 30629], 151302], [[151302, 13641], 30629]]


## Training data: Skip gram

In [None]:
skip_gram_pairs = []
for sentence in data:
    for i in range(1, len(sentence)-1):
        skip_gram_pairs.append([sentence[i], sentence[i-1]])
        skip_gram_pairs.append([sentence[i], sentence[i+1]])
print("Skip gram pairs :" , skip_gram_pairs[:10])

## Generate Batch

In [None]:
def generate_batch(size):
    assert size < len(skip_gram_pairs)
    x_data=[]
    y_data = []
    r = np.random.choice(range(len(skip_gram_pairs)), size, replace=False)
    for i in r:
        x_data.append(skip_gram_pairs[i][0])  # n dim
        y_data.append([skip_gram_pairs[i][1]])  # n, 1 dim
    return x_data, y_data

In [None]:
print ('Batches (x, y)', generate_batch(3))

## Input, Label Variable 만들기

In [None]:
# Input data
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
# need to shape [batch_size, 1] for nn.nce_loss
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

In [None]:
# Look up embeddings for inputs.
embeddings = tf.Variable(
    tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs) # lookup table

In [None]:
# Construct the variables for the NCE loss
nce_weights = tf.Variable(
    tf.random_uniform([voc_size, embedding_size],-1.0, 1.0))
nce_biases = tf.Variable(tf.zeros([voc_size]))

## loss function과 optimizer

In [None]:
loss = tf.reduce_mean(
  tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                 num_sampled, voc_size))

train_op = tf.train.AdamOptimizer(1e-1).minimize(loss)

In [None]:
loss_trn = list()
with tf.Session() as sess:
    # Initializing all variables
    sess.run(tf.initialize_all_variables())

    for step in range(200):
        batch_inputs, batch_labels = generate_batch(batch_size)
        _, loss_val = sess.run([train_op, loss],
                feed_dict={train_inputs: batch_inputs, train_labels: batch_labels})
        loss_trn.append(loss_val)
        if step % 10 == 0:
            print("Loss at ", step, loss_val) # Report the loss

    # Final embeddings are ready for you to use. Need to normalize for practical use
    trained_embeddings = sess.run(embeddings)

## Training loss plot

In [None]:
plt.plot(loss_trn)

## Distributed Representations of words

In [None]:
labels = ["queen", "king", "boy", "girl", "uncle", "aunt", "female", "male"] # Show top 10 words
for i, label in enumerate(labels):
    x, y = trained_embeddings[i,:]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2),
        textcoords='offset points', ha='right', va='bottom')