# Ejemplo sencillo de `word2vec`

En esta libreta se presenta básicamente el ejemplo sencillo de `word2vec` que viene dentro de los tutoriales de Tensorflow, el cual completo se encuentra [aquí](https://github.com/tensorflow/tensorflow/blob/r0.10/tensorflow/examples/tutorials/word2vec/word2vec_basic.py). 

El documento cuento con licencia Apache, y puede ser utilizado de forma libre, por lo que se recomienda fuertemente utilizar el código original en un proyecto más grande. En este caso solamente vamos a mostrar su uso, tratando de explicarlo paso a paso en español.

In [1]:
from __future__ import absolute_import
from __future__ import division

import collections  #  Para usar count
import math         #
import os           #  Para checar si el archivo ya existe en el sistema o no
import random       #
import zipfile      #

# Six es para cargar funciones en Python 2.X compatibles con Python 3.X
from six.moves import urllib
from six.moves import xrange

# Las de siempre
import numpy as np
import tensorflow as tf

# Graficación de resultados
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png')
plt.style.use('ggplot')

## Paso1: Descargar los datos

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """
    Download a file if not present, and make sure it's the right size.
    """
    
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)

    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print 'Encontrado y verificado el archivo ' + filename
    else:
        print statinfo.st_size
        raise Exception('No se pudo cargar ' + filename + 'descargalo con el navegador')

    return filename

archivo = maybe_download('text8.zip', 31344016)


def lee_datos(filename):
    """
    Extract the first file enclosed in a zip file as a list of words
    
    """
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

palabras = lee_datos(archivo)

print 'Tamaño del texto en palabras: ', len(palabras)


Encontrado y verificado el archivo text8.zip
Tamaño del texto en palabras:  17005207


## Paso 2: Construir el diccionario 

Las palabras myt raras se reemplazan por la palabra UNK (desconocida)

In [3]:
t_vocabulario = 50000

def arma_dataset(palabras, t_vocabulario):
    """
    Construye a partir de la lista de strings:
        - Una lista donde las palabras se sustituyen por indices
        - Una lista de pares ordenados de palabra y apariciones
        - Un diccionario de palabra e indice
        - Un diccionario de indice y palabra
    """
    # Lista de las t_vocabulario parabras más comunes 
    count = [['UNK', -1]]
    count.extend(collections.Counter(palabras).most_common(t_vocabulario - 1))

    # A cada palabra se le asigna un indice
    dictionario = dict()
    for palabra, _ in count:
        dictionario[palabra] = len(dictionario)
    
    # Se genera una lista donde se remplazan las palabras por sus indices
    data = list()
    unk_count = 0
    for palabra in palabras:
        if palabra in dictionario:
            index = dictionario[palabra]
        else:
            index = 0  # dictionario['UNK']
            unk_count += 1
        data.append(index)
        
    # Se le agrega a las cuentas el número de palabras desconocidas
    count[0][1] = unk_count
    
    # Se cambian keys por valores en un nuevo diccionario al revés
    reverse_dictionary = dict(zip(dictionario.values(), dictionario.keys()))
    
    return data, count, dictionario, reverse_dictionary

data, count, dictionary, reverse_dictionary = arma_dataset(palabras, t_vocabulario)

del palabras  # Liberar memoria.

print 'Palabras más comunes: ', count[:5]
print 'Muestra', data[:10], [reverse_dictionary[i] for i in data[:10]]




Palabras más comunes:  [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Muestra [5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


## Paso 3: Generar un lote de entrenamiento para el modelo de skip-grama

In [4]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    """
    Genera un minibatch de un tamaño chico con 
    
    """

    # Esta variable se puede modificar por fuera
    global data_index
    
    # Nos aseguramos de no hacer alguna pendejada
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    
    # En batch es la entrada y labels la salida para entrenamiento
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    # Cuantos hacia la izquiera, Cuandos a la derecha, y el suyo propio
    span = 2 * skip_window + 1 # [ skip_window target skip_window ]
    
    # Genera un buffer de tamaño máximo span, y carga datos iniciales
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    
    # Genera los pares ordenados con la palabra del centro con las adyacentes
    # Medio feo y asegurandose que sea aleatorio el orden.
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [ skip_window ]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        
        # Avanza cargando otra serie de datos en el buffer
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

batch, labels = generate_batch(batch_size=12, num_skips=4, skip_window=2)
for i in range(12):
    print batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]]


12 as -> 5239 anarchism
12 as -> 3084 originated
12 as -> 195 term
12 as -> 6 a
6 a -> 2 of
6 a -> 12 as
6 a -> 3084 originated
6 a -> 195 term
195 term -> 6 a
195 term -> 3137 abuse
195 term -> 2 of
195 term -> 12 as


## Paso 4: Construye y entrena el modelo en TensorFlow

In [5]:
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():
    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        embeddings = tf.Variable(
            tf.random_uniform([t_vocabulario, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(tf.truncated_normal([t_vocabulario, embedding_size],
                                   stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([t_vocabulario]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                       num_sampled, t_vocabulario))

    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
    init = tf.initialize_all_variables()


## Paso 5: Entrenamiento

In [6]:
num_steps = 100001

with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print "Inicializado, listo para entrenar"

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print "Average loss en el paso ", step, ": ", average_loss
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                log_str = "%s %s," % (log_str, close_word)
                print log_str
  
    final_embeddings = normalized_embeddings.eval()



Inicializado, listo para entrenar
Average loss en el paso  0 :  288.454650879
Nearest to it: been,
Nearest to these: godf,
Nearest to all: blockage,
Nearest to i: conformations,
Nearest to on: berwick,
Nearest to at: goggles,
Nearest to states: novelists,
Nearest to would: haircut,
Nearest to for: lard,
Nearest to but: philological,
Nearest to if: phony,
Nearest to however: alfredo,
Nearest to who: stocking,
Nearest to and: niue,
Nearest to d: severly,
Nearest to are: bellini,
Average loss en el paso  2000 :  113.599165546
Average loss en el paso  4000 :  52.0493506567
Average loss en el paso  6000 :  33.5687247963
Average loss en el paso  8000 :  23.7237274334
Average loss en el paso  10000 :  18.0879159422
Nearest to it: constituted,
Nearest to these: proclamation,
Nearest to all: newsgroup,
Nearest to i: agave,
Nearest to on: of,
Nearest to at: georgia,
Nearest to states: methadone,
Nearest to would: reginae,
Nearest to for: is,
Nearest to but: competition,
Nearest to if: sadler,
Ne

In [13]:

# Step 6: Visualize the embeddings.
set_matplotlib_formats('png')
%matplotlib qt

def plot_with_labels(low_dim_embs, labels, filename="ejemplo.png"):

  plt.figure(figsize=(18, 18))  #in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i,:]
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')

  plt.savefig(filename)

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)

plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])

labels = [reverse_dictionary[i] for i in xrange(plot_only)]

plot_with_labels(low_dim_embs, labels)



ImportError: cannot import name _macosx