In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [3]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
!pip install -q keras

In [6]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [7]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/carolinesilva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carolinesilva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Define Text Sources

In [8]:
book_filenames = ['el_pais/all_news.txt', 'el_pais/all_news2.txt', 
                  'el_pais/all_news3.txt', 'el_pais/all_news4.txt',
                 'folha/all_news.txt']

### Group all text into one string

In [9]:
corpus_raw = u""
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()
    print("Corpus is now {0} characters long".format(len(corpus_raw)))

Reading 'el_pais/all_news.txt'...


FileNotFoundError: [Errno 2] No such file or directory: 'el_pais/all_news.txt'

## Pre Proccessing

### 1) Lower case letters

In [10]:
corpus_raw = corpus_raw.lower()

### 2) Tokenize

In [11]:
tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')

In [12]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [13]:
print(len(raw_sentences),'Frases')
print(raw_sentences[0])

49801 Frases
a nova pesquisa datafolha, divulgada na noite desta sexta-feira, confirma as tendências apontadas por outras pesquisas: fernando haddad (pt) segue em sua ascensão, desta vez com 6 pontos além do último levantamento.


### 3) Remove ponctuations

In [14]:
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Zçáíúéóàêôãõ]"," ", raw)
    words = clean.split()
    return words

In [15]:
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [16]:
len(sentences)

49801

In [17]:
print(raw_sentences[0])
print(sentence_to_wordlist(raw_sentences[0]))

a nova pesquisa datafolha, divulgada na noite desta sexta-feira, confirma as tendências apontadas por outras pesquisas: fernando haddad (pt) segue em sua ascensão, desta vez com 6 pontos além do último levantamento.
['a', 'nova', 'pesquisa', 'datafolha', 'divulgada', 'na', 'noite', 'desta', 'sexta', 'feira', 'confirma', 'as', 'tendências', 'apontadas', 'por', 'outras', 'pesquisas', 'fernando', 'haddad', 'pt', 'segue', 'em', 'sua', 'ascensão', 'desta', 'vez', 'com', 'pontos', 'além', 'do', 'último', 'levantamento']


### 4) Remove stopwords

In [18]:
text_file = open("stopwords.txt", "r")
lines = text_file.readlines()
lines = list(map(lambda x: x.replace('\n','').replace(' ',''), lines))

In [19]:
new_sentences = []

In [20]:
for s in sentences:
    new_sentences.append(list(filter(lambda x: x not in lines , s)))

In [21]:
print(new_sentences[0])

['nova', 'pesquisa', 'datafolha', 'divulgada', 'noite', 'desta', 'sexta', 'feira', 'confirma', 'tendências', 'apontadas', 'outras', 'pesquisas', 'fernando', 'haddad', 'pt', 'segue', 'ascensão', 'desta', 'vez', 'pontos', 'além', 'último', 'levantamento']


In [22]:
token_count = sum([len(sentence) for sentence in new_sentences])
print("The book corpus contains {0:,} words".format(token_count))

The book corpus contains 596,840 tokens/words


## Train Word2Vec Model

### Parameters

num_features  
min_word_count  
num_workers  
context_size  
downsampling  
seed  

In [23]:
#ONCE we have vectors
#step 3 - build model
#3 main tasks that vectors help with
#DISTANCE, SIMILARITY, RANKING

# Dimensionality of the resulting word vectors.
#more dimensions, more computationally expensive to train
#but also more accurate
#more dimensions = more generalized
num_features = 300
# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel.
# more workers, faster we train
num_workers = multiprocessing.cpu_count()
# Context window length.
context_size = 7

# Downsample setting for frequent words.
#0 - 1e-5 is good for this
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
# random number generator
# deterministic, good for debugging
seed = 1

sg=1
skip-gram is used; 

sg=0
CBOW is used;

In [24]:
model2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

### Build vocabulary from a sequence of sentences 

In [25]:
model2vec.build_vocab(new_sentences)

2018-12-21 16:16:15,247 : INFO : collecting all words and their counts
2018-12-21 16:16:15,249 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-12-21 16:16:15,306 : INFO : PROGRESS: at sentence #10000, processed 116703 words, keeping 16607 word types
2018-12-21 16:16:15,374 : INFO : PROGRESS: at sentence #20000, processed 235511 words, keeping 24550 word types
2018-12-21 16:16:15,449 : INFO : PROGRESS: at sentence #30000, processed 351021 words, keeping 29974 word types
2018-12-21 16:16:15,500 : INFO : PROGRESS: at sentence #40000, processed 469446 words, keeping 34182 word types
2018-12-21 16:16:15,565 : INFO : collected 38727 word types from a corpus of 596840 raw words and 49801 sentences
2018-12-21 16:16:15,566 : INFO : Loading a fresh vocabulary
2018-12-21 16:16:15,626 : INFO : effective_min_count=3 retains 17786 unique words (45% of original 38727, drops 20941)
2018-12-21 16:16:15,627 : INFO : effective_min_count=3 leaves 570101 word corpus (95% of 

### Length of Vocab

In [26]:
print("Word2Vec vocabulary length:", len(model2vec.wv.vocab))

Word2Vec vocabulary length: 17786


### Train Model

In [27]:
model2vec.train(new_sentences,epochs=2, total_examples=model2vec.corpus_count)

2018-12-21 16:16:16,053 : INFO : training model with 4 workers on 17786 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=7
2018-12-21 16:16:17,106 : INFO : EPOCH 1 - PROGRESS: at 29.05% examples, 156953 words/s, in_qsize 7, out_qsize 0
2018-12-21 16:16:18,126 : INFO : EPOCH 1 - PROGRESS: at 63.09% examples, 171173 words/s, in_qsize 7, out_qsize 0
2018-12-21 16:16:19,157 : INFO : EPOCH 1 - PROGRESS: at 96.82% examples, 175236 words/s, in_qsize 3, out_qsize 1
2018-12-21 16:16:19,158 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-12-21 16:16:19,169 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-12-21 16:16:19,179 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-12-21 16:16:19,195 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-12-21 16:16:19,197 : INFO : EPOCH - 1 : training on 596840 raw words (565225 effective words) took 3.1s, 181216 effective words/s
2018-12-21 1

(1130479, 1193680)

### Save model

In [28]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [29]:
model2vec.save(os.path.join("trained", "model2vec.w2v"))

2018-12-21 16:16:22,331 : INFO : saving Word2Vec object under trained/model2vec.w2v, separately None
2018-12-21 16:16:22,333 : INFO : not storing attribute vectors_norm
2018-12-21 16:16:22,336 : INFO : not storing attribute cum_table
2018-12-21 16:16:23,012 : INFO : saved trained/model2vec.w2v


### Load Model

In [30]:
model2vec = w2v.Word2Vec.load(os.path.join("trained", "model2vec.w2v"))

2018-12-21 16:16:23,022 : INFO : loading Word2Vec object from trained/model2vec.w2v
2018-12-21 16:16:23,437 : INFO : loading wv recursively from trained/model2vec.w2v.wv.* with mmap=None
2018-12-21 16:16:23,438 : INFO : setting ignored attribute vectors_norm to None
2018-12-21 16:16:23,439 : INFO : loading vocabulary recursively from trained/model2vec.w2v.vocabulary.* with mmap=None
2018-12-21 16:16:23,440 : INFO : loading trainables recursively from trained/model2vec.w2v.trainables.* with mmap=None
2018-12-21 16:16:23,441 : INFO : setting ignored attribute cum_table to None
2018-12-21 16:16:23,442 : INFO : loaded trained/model2vec.w2v


### Reduce Dimensionality

t-distributed Stochastic Neighbor Embedding.

t-SNE [1] is a tool to visualize high-dimensional data. It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data. t-SNE has a cost function that is not convex, i.e. with different initializations we can get different results.

It is highly recommended to use another dimensionality reduction method (e.g. PCA for dense data or TruncatedSVD for sparse data) to reduce the number of dimensions to a reasonable amount (e.g. 50) if the number of features is very high. This will suppress some noise and speed up the computation of pairwise distances between samples.

In [31]:

tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

### Get Raw Vectors to Represent - wv.syn0

The raw vectors array of words in a Word2Vec or Doc2Vec model is available in model.wv.syn0.

In [32]:
all_word_vectors_matrix = model2vec.wv.syn0

  """Entry point for launching an IPython kernel.


In [33]:
len(new_sentences)

49801

In [34]:
len(list(model2vec.wv.vocab))

17786

In [35]:
len(all_word_vectors_matrix)

17786

### Plot Graph and Relations


In [43]:
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [None]:
vocab = list(model2vec.wv.vocab)
X = model2vec[vocab]
df = pd.DataFrame(all_word_vectors_matrix_2d, index=vocab, columns=['x', 'y'])

In [None]:
df.head(10)

In [None]:
sns.set_context("poster")

In [None]:
def plot_region(x_bounds, y_bounds):
    slice = points
    ax = slice.plot.scatter("x", "y", s=35, figsize=(10, 8))
    for i, point in slice.iterrows():
        ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11)

In [None]:
plot_region(x_bounds=(1, 2), y_bounds=(-1, -2))

In [None]:
model2vec.most_similar("lula")

#### Palavra x esta para y, assim como a esta para b

In [None]:
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = model2vec.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(**locals()))
    return start2

In [None]:
nearest_similarity_cosmul("doria", "paulo", "haddad")