# Predição de texto com LSTM 

## Carregando as bibliotecas

In [1]:
import regex as re
import string
# Dependência gensim > pip install --upgrade gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import Word2Vec

## Limpeza dos dados

In [2]:
# Carregando os dados
file = open('republic.txt', 'r')
text = file.read()
file.close()

In [3]:
print(text[:2000])

BOOK I.
I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in what manner they would
celebrate the festival, which was a new thing. I was delighted with the
procession of the inhabitants; but that of the Thracians was equally,
if not more, beautiful. When we had finished our prayers and viewed the
spectacle, we turned in the direction of the city; and at that instant
Polemarchus the son of Cephalus chanced to catch sight of us from a
distance as we were starting on our way home, and told his servant to
run and bid us wait for him. The servant took hold of me by the cloak
behind, and said: Polemarchus desires you to wait.

I turned round, and asked him where his master was.

There he is, said the youth, coming after you, if you will only wait.

Certainly we will, said Glaucon; and in a few minutes Polemarchus
appeared, and with him Adeimantus, Glaucon’s br

In [4]:
def data_cleaning(doc):

    doc = doc.replace('--', ' ')
    tokens = doc.split()
    
    print('--Antes da limpeza--')
    print('Número de Tokens: %d' % len(tokens))
    print('Tokens únicos: %d' % len(set(tokens)))
    

    tokens = [w.lower() for w in tokens]
    tokens = [remove_stopwords(word) for word in tokens]
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', word) for word in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    
    print('--Depois da limpeza--')
    print('Número de Tokens: %d' % len(tokens))
    print('Tokens únicos: %d' % len(set(tokens)))
    
    return tokens

In [5]:
def sentences_building(tokens):
    length = 50 + 1
    sequences = list()
    for i in range(length, len(tokens)):
        seq = tokens[i-length:i]
        line = ' '.join(seq)
        sequences.append(line)
    return sequences

In [6]:
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [7]:
token = data_cleaning(text)
print(token[:100])

sentences = sentences_building(token)
print('--Sentenças--')
print(sentences[0:3])
print('Número de sentenças: %d' % len(sentences))

save_doc(sentences, 'republic_sentences.txt')

--Antes da limpeza--
Número de Tokens: 118284
Tokens únicos: 13051
--Depois da limpeza--
Número de Tokens: 44396
Tokens únicos: 7251
['book', 'i', 'went', 'yesterday', 'piraeus', 'glaucon', 'son', 'ariston', 'offer', 'prayers', 'goddess', 'bendis', 'thracian', 'artemis', 'wanted', 'manner', 'celebrate', 'festival', 'new', 'thing', 'delighted', 'procession', 'inhabitants', 'thracians', 'equally', 'more', 'beautiful', 'finished', 'prayers', 'viewed', 'spectacle', 'turned', 'direction', 'city', 'instant', 'polemarchus', 'son', 'cephalus', 'chanced', 'catch', 'sight', 'distance', 'starting', 'way', 'home', 'told', 'servant', 'run', 'bid', 'wait', 'him', 'servant', 'took', 'hold', 'cloak', 'behind', 'said', 'polemarchus', 'desires', 'wait', 'turned', 'round', 'asked', 'master', 'was', 'is', 'said', 'youth', 'coming', 'you', 'wait', 'certainly', 'will', 'said', 'glaucon', 'minutes', 'polemarchus', 'appeared', 'adeimantus', 'brother', 'niceratus', 'son', 'nicias', 'procession', 'polemarchus',

## Word Embedding: Word2Vec

In [8]:
def word_embedding(sent, vector_size):
    w2v_model = Word2Vec(sent, vector_size=vector_size, min_count=1, workers=8)
    w2v_weights = w2v_model.wv.vectors
    return w2v_model, w2v_weights

In [9]:
def word2idx(word):
  return w2v_model.wv.key_to_index[word]

In [10]:
def idx2word(idx):
  return w2v_model.wv.index_to_key[idx]

In [11]:
# Carregando as sentenças
file = open('republic_sentences.txt', 'r')
republic_senteces = file.read()
file.close()
lines = republic_senteces.split('\n')

# Create the list of list format of the custom corpus for gensim modeling 
sent = [line.split(' ') for line in lines]

w2v_model, w2v_weights = word_embedding(sent, 100)
vocab_size, embedding_size = w2v_weights.shape
print(vocab_size, embedding_size)

7251 100
