In [18]:
import collections
import os
import random
import requests
import shutil
import zipfile

import tensorflow as tf
import numpy as np
import pandas as pd
import string

# Only surface TensorFlow log messages of severity ERROR or higher.
tf.logging.set_verbosity(tf.logging.ERROR)

# Display the installed TensorFlow version (tf.logging / tf.VERSION are TF 1.x APIs).
tf.VERSION

'1.4.0'

In [22]:
# Load the cases CSV, concatenate every summary in the DS_RESUMO column into a single
# string, drop ASCII punctuation characters, and split on whitespace to get the word list.
df = pd.read_csv("../data/casos.csv")
summaries = df['DS_RESUMO'].str.cat(sep=', ').translate(
    str.maketrans('', '', string.punctuation))
words = summaries.split()

In [38]:
# Rank the distinct words from most to least frequent as (word, count) pairs.
words_freq = collections.Counter(words).most_common()

print("words: {0}".format(len(words)),
      "unique words: {0}".format(len(words_freq)),
      sep="\n")

words: 110280
unique words: 16603


In [50]:
def print_word_counts(entries):
    '''Print one "word (count)" line per (word, count) pair, with thousands separators.'''
    for word, freq in entries:
        print('{} ({:,d})'.format(word, freq))

# The 20 most frequent words first, then the 20 entries at the tail of the ranking.
print_word_counts(words_freq[:20])
print('\nLeast common:\n')
print_word_counts(words_freq[-20:])

de (8,603)
do (2,542)
DE (2,048)
da (1,843)
a (1,740)
em (1,694)
e (1,425)
no (1,347)
o (1,204)
nº (969)
Apurar (940)
na (894)
que (881)
por (828)
DA (756)
crime (751)
com (659)
para (629)
ao (553)
dos (489)

Least common:

sito (1)
má (1)
Dom (1)
Notitia (1)
226228 (1)
CARMEM (1)
ISANA (1)
noticiam (1)
Amanda (1)
27062006 (1)
28022011 (1)
RETRATANDO (1)
Remanso (1)
IKEZIRI (1)
128000001117201640 (1)
320763968811 (1)
DELZUITA (1)
indicio (1)
SUBSCREVEU (1)
128000001242201731 (1)


In [66]:
# Number of distinct words that occur at least five times in the corpus.
words_5plus = len([word for word, freq in words_freq if freq >= 5])

print('Words 5+: {:,d}'.format(words_5plus))

Words 5+: 2,584


In [79]:
# Total number of ids in the vocabulary, including the id reserved for unknown words.
vocabulary_size = 2600

# Peek at the entry at index vocabulary_size - 1: this is the first word that falls
# outside the vocabulary, because the slice in the next cell stops just before it.
words_freq[vocabulary_size - 1]

('Marinha', 4)

In [80]:
# Keep the (vocabulary_size - 1) most frequent entries; the remaining id is
# left free for the unknown-word token added when the lookup tables are built.
words_vocab = words_freq[:vocabulary_size - 1]

print('Words for the vocabulary: {:,d}'.format(len(words_vocab)))

Words for the vocabulary: 2,599


In [81]:
# Id 0 is reserved for out-of-vocabulary words; known words are numbered from 1
# in decreasing order of frequency.
UNK_ID = 0
word_to_id = {
    word: word_id
    for word_id, (word, _) in enumerate(words_vocab, start=UNK_ID + 1)
}
# NOTE: if the corpus itself contains the token 'UNK' inside the vocabulary,
# this assignment overwrites its id and leaves a gap in the id range.
word_to_id['UNK'] = UNK_ID

# Reverse lookup: id -> word.
word_from_id = {word_id: word for word, word_id in word_to_id.items()}

print('Vocabulary size: {:d}'.format(len(word_to_id)))

Vocabulary size: 2600


In [82]:
VOCABULARY_FILE = '../data/vocabulary.txt'

# One word per line; the (zero-based) line number is the word's id.
# NOTE: the file is written with the platform's default text encoding, so it
# must be read back with the same default.
with open(VOCABULARY_FILE, 'w') as f:
    for word_id in range(vocabulary_size):
        print(word_from_id[word_id], file=f)

print('Vocabulary file size: {:,d} bytes'.format(os.path.getsize(VOCABULARY_FILE)))

Vocabulary file size: 21,968 bytes


In [77]:
# Round-trip check: rebuild both lookup tables from the vocabulary file
# (line number -> id) and verify that they match the in-memory tables.
with open(VOCABULARY_FILE, newline='') as f:
    word_from_id_ = {line_no: line.strip() for line_no, line in enumerate(f)}
word_to_id_ = {word: word_id for word_id, word in word_from_id_.items()}

print('Vocabulary size: {:,d}'.format(len(word_to_id_)))
assert word_to_id_ == word_to_id
assert word_from_id_ == word_from_id
# The temporary tables were only needed for the comparison above.
del word_to_id_, word_from_id_

Vocabulary size: 3,000


In [78]:
# Encode the whole corpus as ids; words outside the vocabulary map to UNK_ID.
data = [word_to_id.get(word, UNK_ID) for word in words]

print('Size:\n\n{:,d}\n'.format(len(data)))
print('Text (IDs):\n\n{}\n'.format(data[:10]))
# Decode the first ten ids back to words as a sanity check.
print('Text (Words):\n\n{}'.format([word_from_id[word_id] for word_id in data[:10]]))

Size:

110,280

Text (IDs):

[0, 729, 15, 22, 72, 399, 959, 1, 368, 13]

Text (Words):

['UNK', 'GONÇALVES', 'DA', 'SILVA', 'teria', 'apresentado', 'comprovante', 'de', 'residência', 'que']


In [85]:
# Total number of tokens in the corpus (same count as the "words:" line printed earlier).
len(words)

110280

## CBOW

**Input**

In [87]:
def context_window(window_words, target_index):
    '''Return the window's words as a new list with the target word left out.

    window_words: any iterable holding the words of the current window; it is
        copied, never modified.
    target_index: position of the target word inside the window.
    '''
    context = [*window_words]
    context.pop(target_index)
    return context

def input_cbow(data, batch_size, window_size):
    '''Generate CBOW training batches by sliding a window over `data`.

    Each sample is one window of `window_size` consecutive ids: the id in the
    middle of the window is the label and the remaining ids are the context.

    data: sliceable sequence of word ids (e.g. a list of ints).
    batch_size: number of samples per batch; must be at least 1.
    window_size: odd number >= 3: (n words left) target (n words right).

    Yields (batch, labels) pairs of int32 arrays with shapes
    (batch_size, window_size - 1) and (batch_size, 1). Trailing windows that
    do not fill a whole batch are dropped.

    Raises ValueError when the parameters are invalid. Because this is a
    generator, the check runs when iteration starts, not at call time.
    '''
    if batch_size < 1:
        raise ValueError(
            'Invalid parameters: batch_size must be a positive integer')
    if window_size % 2 == 0 or window_size < 3 \
        or window_size > (len(data) - batch_size) / 2:
        # {window_size} must be odd: (n words left) target (n words right)
        raise ValueError(
            'Invalid parameters: window_size must be a small odd number')

    num_windows = len(data) - window_size + 1
    num_batches = num_windows // batch_size
    target_index = window_size // 2

    # Ids that have not entered the window yet, and the current window itself;
    # maxlen makes each append drop the oldest id, sliding the window by one.
    pending = collections.deque(data[window_size:])
    window = collections.deque(data[:window_size], maxlen=window_size)

    for _ in range(num_batches):
        batch = np.empty((batch_size, window_size - 1), dtype=np.int32)
        labels = np.empty((batch_size, 1), dtype=np.int32)

        for i in range(batch_size):
            current = list(window)
            batch[i, :] = current[:target_index] + current[target_index + 1:]
            labels[i, 0] = current[target_index]
            # There is one fewer pending id than there are windows, so the
            # very last window has nothing left to slide onto.
            if pending:
                window.append(pending.popleft())

        yield batch, labels