In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [3]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
!pip install -q keras

In [6]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [7]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/carolinesilva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carolinesilva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Define Text Sources

In [10]:
book_filenames = ['datasets/el_pais/all_news.txt', 'datasets/el_pais/all_news2.txt', 
                  'datasets/el_pais/all_news3.txt', 'datasets/el_pais/all_news4.txt',
                 'datasets/folha/all_news.txt']

### Group all text into one string

In [11]:
corpus_raw = u""
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()
    print("Corpus is now {0} characters long".format(len(corpus_raw)))

Reading 'datasets/el_pais/all_news.txt'...
Corpus is now 2217835 characters long
Reading 'datasets/el_pais/all_news2.txt'...
Corpus is now 4407384 characters long
Reading 'datasets/el_pais/all_news3.txt'...
Corpus is now 5466038 characters long
Reading 'datasets/el_pais/all_news4.txt'...
Corpus is now 6173125 characters long
Reading 'datasets/folha/all_news.txt'...
Corpus is now 6554289 characters long


## Pre Proccessing

### 1) Lower case letters

In [12]:
corpus_raw = corpus_raw.lower()

### 2) Tokenize

In [13]:
tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')

In [14]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [15]:
print(len(raw_sentences),'Frases')
print(raw_sentences[0])

49801 Frases
a nova pesquisa datafolha, divulgada na noite desta sexta-feira, confirma as tendências apontadas por outras pesquisas: fernando haddad (pt) segue em sua ascensão, desta vez com 6 pontos além do último levantamento.


### 3) Remove ponctuations

In [16]:
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Zçáíúéóàêôãõ]"," ", raw)
    words = clean.split()
    return words

In [17]:
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [25]:
token_count = sum([len(s) for s in sentences])
print("The book corpus contains {0:,} words".format(token_count))

The book corpus contains 1,065,301 words


In [19]:
print(raw_sentences[0])
print(sentence_to_wordlist(raw_sentences[0]))

a nova pesquisa datafolha, divulgada na noite desta sexta-feira, confirma as tendências apontadas por outras pesquisas: fernando haddad (pt) segue em sua ascensão, desta vez com 6 pontos além do último levantamento.
['a', 'nova', 'pesquisa', 'datafolha', 'divulgada', 'na', 'noite', 'desta', 'sexta', 'feira', 'confirma', 'as', 'tendências', 'apontadas', 'por', 'outras', 'pesquisas', 'fernando', 'haddad', 'pt', 'segue', 'em', 'sua', 'ascensão', 'desta', 'vez', 'com', 'pontos', 'além', 'do', 'último', 'levantamento']


### 4) Remove stopwords

In [21]:
text_file = open("datasets/stopwords.txt", "r")
lines = text_file.readlines()
lines = list(map(lambda x: x.replace('\n','').replace(' ',''), lines))

In [22]:
new_sentences = []

In [23]:
for s in sentences:
    new_sentences.append(list(filter(lambda x: x not in lines , s)))

In [24]:
print(new_sentences[0])

['nova', 'pesquisa', 'datafolha', 'divulgada', 'noite', 'desta', 'sexta', 'feira', 'confirma', 'tendências', 'apontadas', 'outras', 'pesquisas', 'fernando', 'haddad', 'pt', 'segue', 'ascensão', 'desta', 'vez', 'pontos', 'além', 'último', 'levantamento']


In [22]:
token_count = sum([len(sentence) for sentence in new_sentences])
print("The book corpus contains {0:,} words".format(token_count))

The book corpus contains 596,840 tokens/words


## Train Skip Gram Model

### Parameters

num_features  
min_word_count  
num_workers  
context_size  
downsampling  
seed  

In [31]:
#ONCE we have vectors
#step 3 - build model
#3 main tasks that vectors help with
#DISTANCE, SIMILARITY, RANKING

# Dimensionality of Word Vectors
# More dimensions, more computationally expensive to train but also more accurate
num_features = 300

# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel.
# More workers == More faster
num_workers = multiprocessing.cpu_count()
print(num_workers)

# Context window length.
context_size = 7

# Downsample setting for frequent words.
#0 - 1e-5 is good for this
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
# random number generator
# deterministic, good for debugging
seed = 1

4


sg=1
skip-gram is used; 

sg=0
CBOW is used;

In [32]:
model2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

### Build vocabulary from a sequence of sentences 

In [33]:
model2vec.build_vocab(new_sentences)

2019-01-12 14:29:17,585 : INFO : collecting all words and their counts
2019-01-12 14:29:17,587 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-12 14:29:17,663 : INFO : PROGRESS: at sentence #10000, processed 116703 words, keeping 16607 word types
2019-01-12 14:29:17,752 : INFO : PROGRESS: at sentence #20000, processed 235511 words, keeping 24550 word types
2019-01-12 14:29:17,828 : INFO : PROGRESS: at sentence #30000, processed 351021 words, keeping 29974 word types
2019-01-12 14:29:17,923 : INFO : PROGRESS: at sentence #40000, processed 469446 words, keeping 34182 word types
2019-01-12 14:29:18,030 : INFO : collected 38727 word types from a corpus of 596840 raw words and 49801 sentences
2019-01-12 14:29:18,031 : INFO : Loading a fresh vocabulary
2019-01-12 14:29:18,437 : INFO : effective_min_count=3 retains 17786 unique words (45% of original 38727, drops 20941)
2019-01-12 14:29:18,438 : INFO : effective_min_count=3 leaves 570101 word corpus (95% of 

### Length of Vocab

In [34]:
print("Word2Vec vocabulary length:", len(model2vec.wv.vocab))

Word2Vec vocabulary length: 17786


### Train Model

In [36]:
model2vec.corpus_count

49801

In [38]:
model2vec.train(new_sentences,epochs=5, total_examples=model2vec.corpus_count)

2019-01-12 14:42:17,815 : INFO : training model with 4 workers on 17786 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=7
2019-01-12 14:42:18,984 : INFO : EPOCH 1 - PROGRESS: at 22.24% examples, 107020 words/s, in_qsize 7, out_qsize 0
2019-01-12 14:42:19,990 : INFO : EPOCH 1 - PROGRESS: at 49.60% examples, 127282 words/s, in_qsize 7, out_qsize 0
2019-01-12 14:42:21,064 : INFO : EPOCH 1 - PROGRESS: at 76.60% examples, 131937 words/s, in_qsize 7, out_qsize 0
2019-01-12 14:42:21,941 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-01-12 14:42:21,988 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-01-12 14:42:21,999 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-01-12 14:42:22,043 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-01-12 14:42:22,044 : INFO : EPOCH - 1 : training on 596840 raw words (565305 effective words) took 4.2s, 134144 effective words/s
2019-01-12 1

(2826257, 2984200)

### Save model

In [39]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [40]:
model2vec.save(os.path.join("trained", "model2vec.w2v"))

2019-01-12 14:42:47,279 : INFO : saving Word2Vec object under trained/model2vec.w2v, separately None
2019-01-12 14:42:47,282 : INFO : not storing attribute vectors_norm
2019-01-12 14:42:47,287 : INFO : not storing attribute cum_table
2019-01-12 14:42:48,121 : INFO : saved trained/model2vec.w2v


### Load Model

In [41]:
model2vec = w2v.Word2Vec.load(os.path.join("trained", "model2vec.w2v"))

2019-01-12 14:42:50,554 : INFO : loading Word2Vec object from trained/model2vec.w2v
2019-01-12 14:42:50,912 : INFO : loading wv recursively from trained/model2vec.w2v.wv.* with mmap=None
2019-01-12 14:42:50,912 : INFO : setting ignored attribute vectors_norm to None
2019-01-12 14:42:50,913 : INFO : loading vocabulary recursively from trained/model2vec.w2v.vocabulary.* with mmap=None
2019-01-12 14:42:50,915 : INFO : loading trainables recursively from trained/model2vec.w2v.trainables.* with mmap=None
2019-01-12 14:42:50,919 : INFO : setting ignored attribute cum_table to None
2019-01-12 14:42:50,920 : INFO : loaded trained/model2vec.w2v


### Reduce Dimensionality

t-distributed Stochastic Neighbor Embedding.

t-SNE [1] is a tool to visualize high-dimensional data. It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data. t-SNE has a cost function that is not convex, i.e. with different initializations we can get different results.

It is highly recommended to use another dimensionality reduction method (e.g. PCA for dense data or TruncatedSVD for sparse data) to reduce the number of dimensions to a reasonable amount (e.g. 50) if the number of features is very high. This will suppress some noise and speed up the computation of pairwise distances between samples.

In [42]:
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

### Get Raw Vectors to Represent - wv.syn0

The raw vectors array of words in a Word2Vec or Doc2Vec model is available in model.wv.syn0.

In [43]:
all_word_vectors_matrix = model2vec.wv.syn0

  """Entry point for launching an IPython kernel.


In [44]:
len(new_sentences)

49801

In [45]:
len(list(model2vec.wv.vocab))

17786

In [46]:
len(all_word_vectors_matrix)

17786

### Plot Graph and Relations


In [47]:
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [48]:
len(all_word_vectors_matrix)

17786

In [49]:
vocab = list(model2vec.wv.vocab)
X = model2vec[vocab]
df = pd.DataFrame(all_word_vectors_matrix_2d, index=vocab, columns=['x', 'y'])

  


In [50]:
df.head(10)

Unnamed: 0,x,y
nova,55.469185,28.590986
pesquisa,47.235996,-12.171621
datafolha,20.867083,48.584522
divulgada,-55.27663,11.864432
noite,41.461628,14.854955
desta,-10.946574,-57.402309
sexta,50.063782,-21.332224
feira,9.637318,-51.344612
confirma,11.616053,-28.768379
tendências,54.119797,-14.424649


In [51]:
sns.set_context("poster")

In [54]:
# def plot_region(x_bounds, y_bounds):
#     slice = points
#     ax = slice.plot.scatter("x", "y", s=35, figsize=(10, 8))
#     for i, point in slice.iterrows():
#         ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11)

In [55]:
# plot_region(x_bounds=(1, 2), y_bounds=(-1, -2))

In [56]:
model2vec.most_similar("lula")

  """Entry point for launching an IPython kernel.
2019-01-12 15:17:31,489 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('petista', 0.7718538045883179),
 ('inácio', 0.6719338893890381),
 ('virtualmente', 0.5995986461639404),
 ('petistas', 0.597693145275116),
 ('inscrição', 0.5877524614334106),
 ('aceitação', 0.5820702910423279),
 ('impediria', 0.5805680751800537),
 ('contradições', 0.5790199041366577),
 ('substituto', 0.5783867239952087),
 ('judice', 0.5773203372955322)]

#### Palavra x esta para y, assim como a esta para b

In [None]:
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = model2vec.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(**locals()))
    return start2

In [None]:
nearest_similarity_cosmul("doria", "paulo", "haddad")