In [1]:
import collections
import os
import random
import requests
import shutil
import zipfile

import tensorflow as tf
import numpy as np
import pandas as pd
import string

tf.logging.set_verbosity(tf.logging.ERROR)

tf.VERSION

'1.4.0'

## Uso do word2vec para medir a similaridade entre termos

## Dados

In [2]:
# Local layout: the corpus archive is stored under HOME_DIR/data.
HOME_DIR = 'wikipedia'
DATA_DIR = os.path.join(HOME_DIR, 'data')

# exist_ok makes the cell safe to re-run without a separate isdir() check.
os.makedirs(DATA_DIR, exist_ok=True)

TEXT_URL = 'http://mattmahoney.net/dc/text8.zip'
TEXT_FILENAME = TEXT_URL.split('/')[-1]
TEXT_FILE = os.path.join(DATA_DIR, TEXT_FILENAME)

text_missing = not os.path.isfile(TEXT_FILE)

if text_missing:
    print('Downloading {}...'.format(TEXT_FILENAME))
    r = requests.get(TEXT_URL, stream=True)
    # Fail loudly on an HTTP error instead of saving the error body as the archive.
    r.raise_for_status()
    # Stream into a temporary name and rename only after the last chunk was
    # written: an interrupted download must not leave a truncated TEXT_FILE
    # behind, because the isfile() check above would then skip the download
    # on every later run.
    partial_file = TEXT_FILE + '.part'
    with open(partial_file, 'wb') as f:
        for chunk in r.iter_content(chunk_size=32768):
            if chunk:
                f.write(chunk)
    os.replace(partial_file, TEXT_FILE)
    print('Done!')

## Definindo vocabulário

In [3]:
def load_raw_text_from_zip(file):
    '''Return the first member of the zip archive `file`, decoded as UTF-8.'''
    with zipfile.ZipFile(file) as archive:
        first_member = archive.namelist()[0]
        payload = archive.read(first_member)
        return payload.decode('utf-8')

# Read the whole corpus into memory as a single string.
raw_text = load_raw_text_from_zip(TEXT_FILE)

# Preview: the first and the last 1,000 characters; the figure printed in
# between is the number of characters left out between the two excerpts.
print('{}...\n\n({:,d} chars)\n\n...{}'.format(
    raw_text[:1000], len(raw_text) - 2000, raw_text[-1000:]))

 anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive economic instituti

O vocabulário é formado a partir das palavras mais frequentes do texto.

In [4]:
# Tokenise on whitespace and rank every distinct word, most frequent first.
words = raw_text.split()
words_freq = collections.Counter(words).most_common()

print('Words (total):\n\n{:,d}\n'.format(len(words)))
print('Words (unique):\n\n{:,d}\n'.format(len(words_freq)))

# Show both ends of the ranking with the same formatting.
for heading, ranked in (('Most common:\n', words_freq[:20]),
                        ('\nLeast common:\n', words_freq[-20:])):
    print(heading)
    for word, freq in ranked:
        print('{} ({:,d})'.format(word, freq))

Words (total):

17,005,207

Words (unique):

253,854

Most common:

the (1,061,396)
of (593,677)
and (416,629)
one (411,764)
in (372,201)
a (325,873)
to (316,376)
zero (264,975)
nine (250,430)
two (192,644)
is (183,153)
as (131,815)
eight (125,285)
for (118,445)
s (116,710)
five (115,789)
three (114,775)
was (112,807)
by (111,831)
that (109,510)

Least common:

triconodonts (1)
katsumoto (1)
spontainous (1)
shpayder (1)
operamusical (1)
malbono (1)
biomacla (1)
cacher (1)
wahlkapitulation (1)
carousers (1)
masn (1)
pash (1)
gwladgarwyr (1)
insatiably (1)
ldbp (1)
higby (1)
hemippus (1)
ramzy (1)
meserii (1)
oogl (1)


Neste exemplo, foram consideradas as palavras com frequência igual ou superior a 10.

In [5]:
# Number of distinct words that occur at least 10 times in the corpus
# (each True in the sum counts as 1).
words_10plus = sum(freq >= 10 for _, freq in words_freq)

print('Words 10+: {:,d}'.format(words_10plus))

Words 10+: 47,134


In [6]:
# Derive the vocabulary size from the count computed in the previous cell
# instead of repeating its printed value (47,134) as a magic number, so the
# cell stays correct if the corpus or the frequency threshold changes.
vocabulary_size = words_10plus

# Entry at rank vocabulary_size - 1: the first word that is left out of the
# vocabulary, because one of the vocabulary_size ids is reserved for 'UNK'.
words_freq[vocabulary_size - 1]

('severance', 10)

In [7]:
# Keep the vocabulary_size - 1 most frequent (word, frequency) pairs.
words_vocab = words_freq[:(vocabulary_size-1)]

print('Words for the vocabulary: {:,d}'.format(len(words_vocab)))

Words for the vocabulary: 47,133


In [8]:
# Id 0 is the 'UNK' placeholder; vocabulary words get ids 1, 2, ... in
# decreasing order of frequency.
UNK_ID = 0
word_to_id = {word: word_id
              for word_id, (word, _) in enumerate(words_vocab, UNK_ID+1)}
word_to_id['UNK'] = UNK_ID
# Reverse mapping: id -> word.
word_from_id = {word_id: word for word, word_id in word_to_id.items()}

print('Vocabulary size: {:d}'.format(len(word_to_id)))

Vocabulary size: 47134


In [9]:
# Every word ranked after the vocabulary cut-off, and the total number of
# occurrences of those words in the text.
words_to_unk = words_freq[(vocabulary_size-1):]
unk_freq = sum(freq for _, freq in words_to_unk)

print('UNK words: {:,d}'.format(len(words_to_unk)))
print('UNK frequency: {:,d}'.format(unk_freq))

UNK words: 206,721
UNK frequency: 444,186


A seguir, o vocabulário é salvo em um arquivo .txt e este é carregado para definição de dicionários para as palavras

In [10]:
VOCABULARY_FILE = os.path.join(HOME_DIR, 'vocabulary.txt')

# One word per line, written in id order, so the (0-based) line number of a
# word is its id.
with open(VOCABULARY_FILE, 'w') as f:
    for word_id in range(vocabulary_size):
        f.write(word_from_id[word_id] + '\n')

print('Vocabulary file size: {:,d} bytes'.format(os.stat(VOCABULARY_FILE).st_size))

Vocabulary file size: 394,227 bytes


In [11]:
# Round-trip check: rebuild both dictionaries from the file (line number =
# id) and verify that they equal the in-memory ones.
with open(VOCABULARY_FILE, newline='') as f:
    word_from_id_ = dict((word_id, word.strip()) for word_id, word in enumerate(f))
    word_to_id_ = dict((word, word_id) for word_id, word in word_from_id_.items())

print('Vocabulary size: {:,d}'.format(len(word_to_id_)))
assert word_to_id_ == word_to_id
assert word_from_id_ == word_from_id
# The temporary copies were only needed for the comparison.
del word_to_id_, word_from_id_

Vocabulary size: 47,134


As palavras a partir de agora serão tratadas como índices que serão utilizados para os cálculos do modelo.

In [12]:
# Encode the text as a list of ids; words outside the vocabulary map to UNK_ID.
data = [word_to_id.get(word, UNK_ID) for word in words]

print('Size:\n\n{:,d}\n'.format(len(data)))
print('Text (IDs):\n\n{}\n'.format(data[:10]))
print('Text (Words):\n\n{}'.format([word_from_id[word_id] for word_id in data[:10]]))

Size:

17,005,207

Text (IDs):

[5242, 3082, 12, 6, 195, 2, 3137, 46, 59, 156]

Text (Words):

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


A variável data representa o texto inicial codificado através dos índices que representam as palavras.

## Modelo CBOW

### Preparação dos dados

Até o momento o texto está representado por uma lista de índices. Aqui, será definido o dataset que será utilizado no
treinamento do modelo. A variável resposta y é definida como um vetor de uma coluna e n linhas, onde o número de linhas corresponde ao
número de janelas de contexto extraídas do texto (uma para cada palavra-alvo). Já as variáveis preditoras serão definidas pela matriz X, que tem o mesmo
número de linhas de y e cujo número de colunas é o número de palavras no contexto (window_size - 1).

Um fato interessante sobre essa abordagem é o número reduzido de colunas do dataset, o que, quando comparado a abordagens de contagem, representa uma grande vantagem em custo computacional.

In [13]:
def context_window(window_words, target_index):
    '''Return the words of the window as a list, without the target word.

    Args:
        window_words: iterable with the word ids currently in the window.
        target_index: position of the target word inside the window.

    Returns:
        A new list with every element of `window_words` except the one at
        `target_index`.
    '''
    words = list(window_words)
    del words[target_index]
    return words

def input_cbow(data, batch_size, window_size):
    '''Generate (context, target) batches for CBOW training.

    A window of `window_size` ids slides over `data` one position at a time.
    For each position the middle id is the label and the remaining ids are
    the context.

    Args:
        data: list of word ids (must support len(), slicing and indexing).
        batch_size: number of windows per batch; must be at least 1.
        window_size: odd number >= 3 of ids per window.

    Yields:
        Tuples (batch, labels) of int32 arrays with shapes
        (batch_size, window_size - 1) and (batch_size, 1). Windows that do
        not fill a complete batch at the end of `data` are dropped.

    Raises:
        ValueError: if `batch_size` or `window_size` is invalid. As this is
            a generator, the error is raised when iteration starts.
    '''
    if batch_size < 1:
        raise ValueError(
            'Invalid parameters: batch_size must be a positive integer')
    if window_size % 2 == 0 or window_size < 3 \
        or window_size > (len(data) - batch_size) / 2:
        # {window_size} must be odd: (n words left) target (n words right)
        raise ValueError(
            'Invalid parameters: window_size must be a small odd number')

    num_words = len(data)
    num_windows = num_words - window_size + 1
    num_batches = num_windows // batch_size
    target_index = window_size // 2

    # The bounded deque drops its oldest id whenever a new one is appended.
    window_words = collections.deque(data[:window_size], maxlen=window_size)
    # Index of the next id to enter the window. Indexing into `data` avoids
    # copying the whole corpus into a second container.
    next_word = window_size

    for _ in range(num_batches):
        batch = np.empty(shape=(batch_size, window_size-1), dtype=np.int32)
        labels = np.empty(shape=(batch_size, 1), dtype=np.int32)

        for i in range(batch_size):
            batch[i, :] = context_window(window_words, target_index)
            labels[i, 0] = window_words[target_index]
            # There is one window more than there are ids left to append:
            # after the very last window nothing remains to slide onto.
            # Advancing unconditionally failed with IndexError whenever
            # num_windows was an exact multiple of batch_size.
            if next_word < num_words:
                window_words.append(data[next_word])
                next_word += 1

        yield batch, labels

### Modelo

O modelo implementado retorna uma matriz **embeddings** normalizada com dimensões (vocabulary_size, embedding_size), 
que representa as palavras como vetores em um espaço n-dimensional. O tamanho do espaço vetorial (embedding_size) 
é um hiperparâmetro que deve ser ajustado. É a partir dessa matriz que se torna possível medir a distância entre as
palavras.

In [14]:
def model_cbow(vocabulary_size, embedding_size, num_sampled):
    '''Build the CBOW graph and return its placeholders, embeddings and loss.

    Args:
        vocabulary_size: number of rows of the embedding matrix and of the
            softmax weights; also the number of classes of the loss.
        embedding_size: number of columns of the embedding matrix and of the
            softmax weights.
        num_sampled: forwarded to tf.nn.sampled_softmax_loss as `num_sampled`.

    Returns:
        Tuple (X, y, normalized_embeddings, loss): the two input
        placeholders, the embedding matrix with every row divided by its
        norm, and the mean of the sampled softmax loss.
    '''
    # Inputs: X holds the ids that are looked up and averaged, y the label
    # ids with one column. Both fall back to [[0]] when nothing is fed.
    X = tf.placeholder_with_default([[0]], shape=(None, None), name='X')
    y = tf.placeholder_with_default([[0]], shape=(None, 1), name='y')

    # Trainable embedding matrix, initialised uniformly in [-1, 1).
    embeddings = tf.Variable(
        tf.random_uniform(shape=(vocabulary_size, embedding_size),
                          minval=-1.0, maxval=1.0),
        name='embeddings')
    print('EMBEDDINGS: ')
    print(embeddings)

    # Look up the embedding of every id in X and average over axis 1, which
    # gives one vector per row of X.
    X_embed = tf.nn.embedding_lookup(embeddings, X)
    X_avg = tf.reduce_mean(X_embed, axis=1)

    # Parameters of the output (softmax) layer.
    softmax_weights = tf.Variable(
        tf.truncated_normal(shape=(vocabulary_size, embedding_size),
                            stddev=1.0 / np.sqrt(embedding_size)),
        name='W')
    softmax_biases = tf.Variable(
        tf.zeros(shape=(vocabulary_size,)),
        name='b')

    with tf.name_scope('loss'):
        sampled_loss = tf.nn.sampled_softmax_loss(weights=softmax_weights,
                                                  biases=softmax_biases,
                                                  inputs=X_avg,
                                                  labels=y,
                                                  num_sampled=num_sampled,
                                                  num_classes=vocabulary_size)
        loss = tf.reduce_mean(sampled_loss, name='mean')

    # Divide every row by its norm; keep_dims keeps the norm as a column so
    # that the division broadcasts over the columns.
    norm = tf.norm(embeddings, axis=1, keep_dims=True)
    normalized_embeddings = embeddings / norm

    return X, y, normalized_embeddings, loss

In [15]:
def opt_adagrad(loss, learning_rate=1.0):
    '''Return a training op that minimises `loss` with the Adagrad optimizer.

    The op is built by tf.contrib.layers.optimize_loss and uses the graph's
    global step, which is created if it does not exist yet.
    '''
    global_step = tf.train.get_or_create_global_step()
    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=global_step,
        learning_rate=learning_rate,
        optimizer='Adagrad')
    return train_op

In [16]:
def train(model_fn, input_fn, opt_fn, query,
          num_epochs=1, model_dir='/tmp/embedding_model', remove_model=True):
    '''Train a model in a fresh graph and return the evaluated embeddings.

    Args:
        model_fn: zero-argument callable returning (X, y, embeddings, loss_op).
        input_fn: zero-argument callable returning an iterable of
            (X_batch, y_batch) pairs; it is called once per epoch.
        opt_fn: callable that receives the loss tensor and returns the
            training op.
        query: object with `build_graph(embeddings)` and `run(session)`;
            `run` is called after every epoch.
        num_epochs: number of passes over the batches of `input_fn`.
        model_dir: checkpoint directory given to MonitoredTrainingSession.
        remove_model: when true, an existing `model_dir` is deleted first.

    Returns:
        The value of `embeddings` evaluated in the session after the last
        epoch.
    '''
    if remove_model and os.path.isdir(model_dir):
        shutil.rmtree(model_dir)

    with tf.Graph().as_default():
        X, y, embeddings, loss_op = model_fn()
        train_op = opt_fn(loss_op)

        # The query ops are added before the session is created.
        query.build_graph(embeddings)

        with tf.train.MonitoredTrainingSession(
            checkpoint_dir=model_dir) as session:

            for epoch in range(1, num_epochs+1):
                print('Epoch {}\n'.format(epoch))

                avg_loss = 0
                for step, (X_batch, y_batch) in enumerate(input_fn()):
                    _, loss = session.run([train_op, loss_op],
                                          feed_dict={X: X_batch, y: y_batch})

                    # Running mean of the losses seen so far in this epoch.
                    avg_loss = (loss + step * avg_loss) / (step + 1)
                    if step % 10000 == 0:
                        print('...{:,d} Average loss: {:.3f}'.format(
                            step, avg_loss))

                print('\nAverage loss: {:.3f}\n'.format(avg_loss))
                query.run(session)
                print()

            # Evaluated while the session is still open.
            return session.run(embeddings)

In [17]:
def save_embeddings(file, embeddings):
    '''Write the embedding matrix to a text file, one row per line.

    Each value is formatted with five decimal places and the values of a row
    are separated by single spaces.
    '''
    with open(file, 'w') as out:
        for row_index in range(embeddings.shape[0]):
            values = ' '.join(map('{:.5f}'.format, embeddings[row_index]))
            out.write(values + '\n')

## Consulta de Palavras mais próximas

A classe **NearestWordsQuery** é utilizada para medir a similaridade entre os termos e listar as palavras mais próximas de cada um.

In [18]:
class NearestWordsQuery:
    '''Prints, for a fixed list of word ids, the k most similar words.

    Similarity is the product between the embedding of a query word and the
    embeddings of all words. `build_graph` must be called before `run`.
    '''

    def __init__(self, word_from_id, words, k=4):
        '''Store the id -> word mapping, the query word ids and k.'''
        self.word_from_id = word_from_id
        self.words = words
        self.k = k

    def build_graph(self, embeddings, name=None):
        '''Add the similarity and top-k ops for `embeddings` to the graph.'''
        with tf.name_scope(name, "nearest_words", [self.words, self.k]):
            query_ids = tf.placeholder(tf.int32, shape=(None,))

            query_vectors = tf.nn.embedding_lookup(embeddings, query_ids)
            scores = tf.matmul(query_vectors, embeddings, transpose_b=True)
            # One extra result, because the query word itself is discarded
            # from the matches later on.
            top_matches = tf.nn.top_k(scores, self.k+1)

        self.input_words = {query_ids: self.words}
        self.nearest = top_matches

    def nearest_words(self, target_id, nearest_indices, nearest_values):
        '''Return up to k (word, value) pairs, leaving out `target_id`.'''
        neighbours = []
        for word_id, value in zip(nearest_indices, nearest_values):
            if word_id != target_id:
                neighbours.append((self.word_from_id[word_id], value))
        return neighbours[:self.k]

    def format_words(self, word_pairs):
        '''Return a generator of "word (value)" strings, one per pair.'''
        return ('{} ({:,.3f})'.format(name, score)
                for name, score in word_pairs)

    def run(self, session):
        '''Evaluate the top-k op and print one line per query word.'''
        values, indices = session.run(self.nearest,
                                      feed_dict=self.input_words)
        for row, query_id in enumerate(self.words):
            matches = self.nearest_words(query_id, indices[row], values[row])
            line = ', '.join(self.format_words(matches))
            print('{}: {}'.format(self.word_from_id[query_id], line))

# Experimento

O experimento, a princípio, é realizado a partir do treinamento do modelo CBOW com dados da Wikipedia, onde a avaliação
é realizada a partir da média da perda (average loss) à medida que o modelo evolui.

Para se ter uma ‘percepção qualitativa’ do resultado, são amostradas 8 palavras do intervalo das 1000 mais comuns - no final de cada época, essa amostra é usada para gerar a lista de similaridade. Essa lista pode ser observada para ver como o aprendizado evolui.

In [86]:
# Disabled alternative: sample 8 random ids from the range 1-999 (ids are
# assigned in order of frequency, so these are the most frequent words).
# valid_num_words = 8
# valid_range_words = 1000
# valid_words = random.sample(range(1, valid_range_words), valid_num_words)

# Fixed probe words. Each one must be in the vocabulary: the direct lookup
# below raises KeyError for a word that is missing from word_to_id.
words_to_test = ['ronaldo', 'jesus', 'music', 'world', 'computing']
valid_words = [word_to_id[word] for word in words_to_test]
print(valid_words)


43387
936
166
71
2170


A amostra de palavras é encapsulada no objeto **nearest_words**, que consulta palavras similares a partir da representação vetorial e é usado durante o treinamento para comparação.

In [20]:
# Query object that lists the 4 nearest words of every id in valid_words.
nearest_words = NearestWordsQuery(word_from_id, valid_words, 4)

In [21]:
%%time

# Output locations: checkpoint directory and text file for the embeddings.
MODEL_DIR = os.path.join('word2vec', 'cbow')
EMBEDDINGS_FILE = os.path.join('word2vec', 'cbow.txt')

# Model hyperparameters.
vocabulary_size = len(word_to_id)
embedding_size = 128
num_sampled = 64

# Input hyperparameters: window_size = 3 means one context word on each
# side of the target word.
batch_size = 128
window_size = 3

# train() expects callables, so the hyperparameters are bound here.
model_fn = lambda: model_cbow(vocabulary_size, embedding_size, num_sampled)
input_fn = lambda: input_cbow(data, batch_size, window_size)
opt_fn = lambda loss: opt_adagrad(loss, learning_rate=1.0)

cbow_embeddings = train(model_fn,
                        input_fn,
                        opt_fn,
                        nearest_words,
                        num_epochs=1,
                        model_dir=MODEL_DIR)

save_embeddings(EMBEDDINGS_FILE, cbow_embeddings)

EMBEDDINGS: 
<tf.Variable 'embeddings:0' shape=(47134, 128) dtype=float32_ref>
Epoch 1

...0 Average loss: 7.854
...10,000 Average loss: 3.436
...20,000 Average loss: 3.266
...30,000 Average loss: 3.181
...40,000 Average loss: 3.114
...50,000 Average loss: 3.072
...60,000 Average loss: 3.031
...70,000 Average loss: 2.995
...80,000 Average loss: 2.966
...90,000 Average loss: 2.941
...100,000 Average loss: 2.913
...110,000 Average loss: 2.884
...120,000 Average loss: 2.867
...130,000 Average loss: 2.845

Average loss: 2.842

chemical: physical (0.413), simplest (0.384), crude (0.369), mathematical (0.352)
royal: yunnan (0.361), auld (0.337), alma (0.334), porvoo (0.327)
video: digital (0.390), cooh (0.383), computer (0.373), radio (0.350)
paul: thomas (0.452), peter (0.445), charles (0.422), james (0.393)
alexander: frederick (0.373), leo (0.360), boltzmann (0.349), clement (0.347)
that: which (0.525), however (0.473), what (0.398), permits (0.371)
an: rankin (0.339), classless (0.335), 

A partir dos resultados acima, é possível observar que o **average loss** decresce lentamente se comparado ao número de passos (130 mil). Os exemplos observados parecem fazer sentido.

### Mais um exemplo: Treinamento com dados de Inquéritos

Como o propósito do estudo é utilizar o word2vec em documentos de inquéritos policiais, foi realizado o treinamento de um modelo com resumos de Inquéritos Policiais e foram observados os resultados a seguir:

In [None]:
df = pd.read_csv("../data/casos.csv")
# Concatenate all case summaries (separated by ', ') and delete every
# character listed in string.punctuation before splitting on whitespace.
summaries = df['DS_RESUMO'].str.cat(sep=', ').translate(
    str.maketrans('', '', string.punctuation))
words = summaries.split()

In [None]:
# Rank every distinct word of the summaries, most frequent first.
words_freq = collections.Counter(words).most_common()

In [None]:
# Number of distinct words that occur at least 3 times in the summaries
# (each True in the sum counts as 1).
words_3plus = sum(freq >= 3 for _, freq in words_freq)

In [None]:
# NOTE(review): hard-coded value; presumably the result of words_3plus from
# the previous cell (its output is not shown) - confirm, or assign
# words_3plus directly so the value follows the data.
vocabulary_size = 4131

In [None]:
# Keep the vocabulary_size - 1 most frequent (word, frequency) pairs.
words_vocab = words_freq[:(vocabulary_size-1)]

In [None]:
# Id 0 is the 'UNK' placeholder; vocabulary words get ids 1, 2, ... in
# decreasing order of frequency.
UNK_ID = 0
word_to_id = {word: word_id
              for word_id, (word, _) in enumerate(words_vocab, UNK_ID+1)}
word_to_id['UNK'] = UNK_ID
# Reverse mapping: id -> word.
word_from_id = {word_id: word for word, word_id in word_to_id.items()}

print('Vocabulary size: {:d}'.format(len(word_to_id)))

In [None]:
# Every word ranked after the vocabulary cut-off, and the total number of
# occurrences of those words in the summaries.
words_to_unk = words_freq[(vocabulary_size-1):]
unk_freq = sum(freq for _, freq in words_to_unk)

print('UNK words: {:,d}'.format(len(words_to_unk)))
print('UNK frequency: {:,d}'.format(unk_freq))

In [None]:
VOCABULARY_FILE = '../data/vocabulary.txt'

# One word per line, written in id order, so the (0-based) line number of a
# word is its id. The file is opened with the platform's default encoding.
with open(VOCABULARY_FILE, 'w') as f:
    for word_id in range(vocabulary_size):
        f.write(word_from_id[word_id] + '\n')

print('Vocabulary file size: {:,d} bytes'.format(os.stat(VOCABULARY_FILE).st_size))

In [None]:
# Round-trip check: rebuild both dictionaries from the file (line number =
# id) and verify that they equal the in-memory ones.
with open(VOCABULARY_FILE, newline='') as f:
    word_from_id_ = dict((word_id, word.strip()) for word_id, word in enumerate(f))
    word_to_id_ = dict((word, word_id) for word_id, word in word_from_id_.items())

# Uncomment to inspect the mapping that was read back:
# print(word_from_id_)
print('Vocabulary size: {:,d}'.format(len(word_to_id_)))
assert word_to_id_ == word_to_id
assert word_from_id_ == word_from_id
# The temporary copies were only needed for the comparison.
del word_to_id_, word_from_id_

In [None]:
# Encode the summaries as a list of ids; words outside the vocabulary map
# to UNK_ID.
data = [word_to_id.get(word, UNK_ID) for word in words]

print('Size:\n\n{:,d}\n'.format(len(data)))
print('Text (IDs):\n\n{}\n'.format(data[:10]))
print('Text (Words):\n\n{}'.format([word_from_id[word_id] for word_id in data[:10]]))

In [None]:
# Sample 8 distinct ids from the range 1-999; id 0 ('UNK') is excluded and,
# as ids follow frequency order, these are among the most frequent words.
# NOTE(review): no random seed is set in the visible cells, so the sample
# changes on every run.
valid_num_words = 8
valid_range_words = 1000
valid_words = random.sample(range(1, valid_range_words), valid_num_words)

for word_id in valid_words:
    print(word_from_id[word_id])

In [None]:
# Query object that lists the 4 nearest words of every id in valid_words.
nearest_words = NearestWordsQuery(word_from_id, valid_words, 4)

In [None]:
%%time

# NOTE(review): these are the same paths used by the Wikipedia experiment
# above. train() deletes an existing MODEL_DIR by default (remove_model=True)
# and save_embeddings() overwrites EMBEDDINGS_FILE, so running this cell
# replaces the earlier model and embeddings - confirm that this is intended.
MODEL_DIR = os.path.join('word2vec', 'cbow')
EMBEDDINGS_FILE = os.path.join('word2vec', 'cbow.txt')

# Model hyperparameters.
vocabulary_size = len(word_to_id)
embedding_size = 128
num_sampled = 64

# Input hyperparameters: window_size = 3 means one context word on each
# side of the target word.
batch_size = 256
window_size = 3

# train() expects callables, so the hyperparameters are bound here.
model_fn = lambda: model_cbow(vocabulary_size, embedding_size, num_sampled)
input_fn = lambda: input_cbow(data, batch_size, window_size)
opt_fn = lambda loss: opt_adagrad(loss, learning_rate=1.0)

cbow_embeddings = train(model_fn,
                        input_fn,
                        opt_fn,
                        nearest_words,
                        num_epochs=1,
                        model_dir=MODEL_DIR)

save_embeddings(EMBEDDINGS_FILE, cbow_embeddings)