In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [3]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

### Define Text Sources

In [9]:
filenames = ['datasets/el_pais/all_news.txt', 'datasets/el_pais/all_news2.txt', 
                  'datasets/el_pais/all_news3.txt', 'datasets/el_pais/all_news4.txt',
                 'datasets/folha/all_news.txt']

In [10]:
corpus_raw = u""
for filenames in filenames:
    print("Reading '{0}'...".format(filenames))
    with codecs.open(filenames, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()
    print("Corpus is now {0} characters long".format(len(corpus_raw)))

Reading 'datasets/el_pais/all_news.txt'...
Corpus is now 2217835 characters long
Reading 'datasets/el_pais/all_news2.txt'...
Corpus is now 4407384 characters long
Reading 'datasets/el_pais/all_news3.txt'...
Corpus is now 5466038 characters long
Reading 'datasets/el_pais/all_news4.txt'...
Corpus is now 6173125 characters long
Reading 'datasets/folha/all_news.txt'...
Corpus is now 6554289 characters long


### Pre Proccessing

1) Lower Case Letters

In [11]:
corpus_raw = corpus_raw.lower()

2) Tokenize

In [12]:
tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')

In [13]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [14]:
print(len(raw_sentences),'Frases')
print(raw_sentences[0])

49801 Frases
a nova pesquisa datafolha, divulgada na noite desta sexta-feira, confirma as tendências apontadas por outras pesquisas: fernando haddad (pt) segue em sua ascensão, desta vez com 6 pontos além do último levantamento.


3) Remove Ponctuations

In [15]:
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Zçáíúéóàêôãõ]"," ", raw)
    words = clean.split()
    return words

In [16]:
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [17]:
token_count = sum([len(s) for s in sentences])
print("The book corpus contains {0:,} words".format(token_count))

The book corpus contains 1,065,301 words


In [18]:
print(raw_sentences[0])
print(sentence_to_wordlist(raw_sentences[0]))

a nova pesquisa datafolha, divulgada na noite desta sexta-feira, confirma as tendências apontadas por outras pesquisas: fernando haddad (pt) segue em sua ascensão, desta vez com 6 pontos além do último levantamento.
['a', 'nova', 'pesquisa', 'datafolha', 'divulgada', 'na', 'noite', 'desta', 'sexta', 'feira', 'confirma', 'as', 'tendências', 'apontadas', 'por', 'outras', 'pesquisas', 'fernando', 'haddad', 'pt', 'segue', 'em', 'sua', 'ascensão', 'desta', 'vez', 'com', 'pontos', 'além', 'do', 'último', 'levantamento']


4) Remove Stopwords

In [19]:
text_file = open("datasets/stopwords.txt", "r")
lines = text_file.readlines()
lines = list(map(lambda x: x.replace('\n','').replace(' ',''), lines))

In [20]:
new_sentences = []

In [21]:
for s in sentences:
    new_sentences.append(list(filter(lambda x: x not in lines , s)))

In [22]:
print(new_sentences[0])

['nova', 'pesquisa', 'datafolha', 'divulgada', 'noite', 'desta', 'sexta', 'feira', 'confirma', 'tendências', 'apontadas', 'outras', 'pesquisas', 'fernando', 'haddad', 'pt', 'segue', 'ascensão', 'desta', 'vez', 'pontos', 'além', 'último', 'levantamento']


In [23]:
token_count = sum([len(sentence) for sentence in new_sentences])
print("The book corpus contains {0:,} words".format(token_count))

The book corpus contains 596,840 words


### Train CBOW Model

### Parameters

num_features  
min_word_count  
num_workers  
context_size  
downsampling  
seed  

In [24]:
#ONCE we have vectors
#step 3 - build model
#3 main tasks that vectors help with
#DISTANCE, SIMILARITY, RANKING

# Dimensionality of Word Vectors
# More dimensions, more computationally expensive to train but also more accurate
num_features = 300

# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel.
# More workers == More faster
num_workers = multiprocessing.cpu_count()
print(num_workers)

# Context window length.
context_size = 7

# Downsample setting for frequent words.
#0 - 1e-5 is good for this
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
# random number generator
# deterministic, good for debugging
seed = 1

4


sg=1
skip-gram is used; 

sg=0
CBOW is used;

In [27]:
model_cbow2vec = w2v.Word2Vec(
    sg=0,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [28]:
model_cbow2vec.build_vocab(new_sentences)

2019-01-12 15:02:38,694 : INFO : collecting all words and their counts
2019-01-12 15:02:38,695 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-12 15:02:38,748 : INFO : PROGRESS: at sentence #10000, processed 116703 words, keeping 16607 word types
2019-01-12 15:02:38,813 : INFO : PROGRESS: at sentence #20000, processed 235511 words, keeping 24550 word types
2019-01-12 15:02:38,892 : INFO : PROGRESS: at sentence #30000, processed 351021 words, keeping 29974 word types
2019-01-12 15:02:38,960 : INFO : PROGRESS: at sentence #40000, processed 469446 words, keeping 34182 word types
2019-01-12 15:02:39,013 : INFO : collected 38727 word types from a corpus of 596840 raw words and 49801 sentences
2019-01-12 15:02:39,014 : INFO : Loading a fresh vocabulary
2019-01-12 15:02:39,189 : INFO : effective_min_count=3 retains 17786 unique words (45% of original 38727, drops 20941)
2019-01-12 15:02:39,190 : INFO : effective_min_count=3 leaves 570101 word corpus (95% of 

In [29]:
print("Word2Vec vocabulary length:", len(model_cbow2vec.wv.vocab))

Word2Vec vocabulary length: 17786


In [31]:
model_cbow2vec.corpus_count

49801

In [33]:
model_cbow2vec.train(new_sentences,epochs=5, total_examples=model_cbow2vec.corpus_count)

2019-01-12 15:03:44,400 : INFO : training model with 4 workers on 17786 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=7
2019-01-12 15:03:45,430 : INFO : EPOCH 1 - PROGRESS: at 88.29% examples, 486147 words/s, in_qsize 7, out_qsize 0
2019-01-12 15:03:45,524 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-01-12 15:03:45,568 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-01-12 15:03:45,581 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-01-12 15:03:45,583 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-01-12 15:03:45,584 : INFO : EPOCH - 1 : training on 596840 raw words (565258 effective words) took 1.2s, 484047 effective words/s
2019-01-12 15:03:46,615 : INFO : EPOCH 2 - PROGRESS: at 64.60% examples, 358687 words/s, in_qsize 7, out_qsize 0
2019-01-12 15:03:47,064 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-01-12 15:03:47,079 : INFO : w

(2826086, 2984200)

In [34]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [35]:
model_cbow2vec.save(os.path.join("trained", "model_cbow2vec.w2v"))

2019-01-12 15:04:33,332 : INFO : saving Word2Vec object under trained/model_cbow2vec.w2v, separately None
2019-01-12 15:04:33,335 : INFO : not storing attribute vectors_norm
2019-01-12 15:04:33,337 : INFO : not storing attribute cum_table
2019-01-12 15:04:34,125 : INFO : saved trained/model_cbow2vec.w2v


In [36]:
model2vec = w2v.Word2Vec.load(os.path.join("trained", "model_cbow2vec.w2v"))

2019-01-12 15:04:57,675 : INFO : loading Word2Vec object from trained/model_cbow2vec.w2v
2019-01-12 15:04:58,061 : INFO : loading wv recursively from trained/model_cbow2vec.w2v.wv.* with mmap=None
2019-01-12 15:04:58,062 : INFO : setting ignored attribute vectors_norm to None
2019-01-12 15:04:58,064 : INFO : loading vocabulary recursively from trained/model_cbow2vec.w2v.vocabulary.* with mmap=None
2019-01-12 15:04:58,065 : INFO : loading trainables recursively from trained/model_cbow2vec.w2v.trainables.* with mmap=None
2019-01-12 15:04:58,070 : INFO : setting ignored attribute cum_table to None
2019-01-12 15:04:58,073 : INFO : loaded trained/model_cbow2vec.w2v


In [37]:
model2vec.most_similar("lula")

  """Entry point for launching an IPython kernel.
2019-01-12 15:10:58,338 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('inácio', 0.9295151233673096),
 ('petista', 0.9198875427246094),
 ('impugnado', 0.915873646736145),
 ('inelegível', 0.8941935300827026),
 ('candidatura', 0.8923618197441101),
 ('preso', 0.8914204835891724),
 ('concorrer', 0.8861750960350037),
 ('indique', 0.8834351897239685),
 ('impeachment', 0.8762634992599487),
 ('indeferir', 0.876238226890564)]

In [46]:
t = ['a', 'nova', 'pesquisa', 'datafolha', 'divulgada', 'na', 'noite', 'desta', 'sexta', 'feira', 'confirma', 'as', 'tendências', 'apontadas', 'por', 'outras', 'pesquisas', 'fernando', 'haddad', 'pt', 'segue', 'em', 'sua', 'ascensão', 'desta', 'vez', 'com', 'pontos', 'além', 'do', 'último', 'levantamento']
s = ' '.join(t)