In [48]:
import codecs
import re
import time # uso apenas para evitar o cache das imagens
from IPython.core.display import display, HTML

# Read, filter, clean and structure

In [2]:
# Matches a line that holds nothing but whitespace and/or "•" bullet characters.
# Raw string: keeps `\s` from being read as an (invalid) string escape.
# `*` instead of `+`: a zero-length string is treated as empty as well.
re_empty_line = re.compile(r"^[\s•]*$")
def is_empty_line(line):
    """Tell whether `line` carries no content.

    Args:
        line: a text line, with or without its trailing newline.

    Returns:
        True when the line is zero-length or consists only of whitespace
        and/or "•" characters, False otherwise.
    """
    return re_empty_line.match(line) is not None

# A title line is written entirely with upper-case letters, dots and spaces.
# - Raw string: `\s` is handed to the regex engine instead of being parsed as a string escape.
# - The letter class covers the accented capitals used in Portuguese (including Ç, Â, Ê, Ô, À, Ü).
# - At least one letter is required, so a line made only of dots/spaces ("...") is not a title.
re_chapter_title = re.compile(
    r"^[. ]*[A-ZÁÀÂÃÉÊÍÓÔÕÚÜÇ][A-ZÁÀÂÃÉÊÍÓÔÕÚÜÇ. ]*\s*$"
)
def is_chapter_title(line):
    """Tell whether `line` looks like a chapter title.

    Args:
        line: a text line (trailing whitespace is tolerated).

    Returns:
        True when the line contains at least one upper-case letter and nothing
        other than upper-case letters, dots and spaces; False otherwise.
    """
    return re_chapter_title.match(line) is not None

# Any run of carriage-return / line-feed characters.
re_newline = re.compile("[\r\n]+")

def filter_and_clean(lines):
    """Drop empty lines and tidy up the remaining ones.

    Args:
        lines: iterable of raw text lines.

    Returns:
        A list with one entry per line that `is_empty_line` does not reject,
        with line-break characters removed and surrounding whitespace stripped.
    """
    cleaned = []
    for raw_line in lines:
        if is_empty_line(raw_line):
            continue
        without_breaks = re_newline.sub("", raw_line)
        cleaned.append(without_breaks.strip())
    return cleaned

In [3]:
# Split the book into chapters: a title line opens a new chapter and every
# other cleaned line is attached to the chapter opened most recently.
chapters = []

with codecs.open("perto_do_coracao_selvagem.txt", "r", "utf-8") as f:

    # Placeholder that collects any text found before the first title.
    # It is never added to `chapters`, so that leading text is left out.
    current_chapter = {'title': '__NOT DEFINED__', 'lines': []}

    for line in filter_and_clean(f.readlines()):

        if not is_chapter_title(line):
            current_chapter['lines'].append(line)
            continue

        # Title found: start a fresh chapter and register it.
        current_chapter = {'title': line, 'lines': []}
        chapters.append(current_chapter)

# Join each chapter's lines into a single space-separated string.
for chapter in chapters:
    joined_lines = " ".join(chapter['lines'])
    chapter['text'] = joined_lines

# Tokenize sentences

In [4]:
import nltk.data

# Sentence tokenizer for Portuguese, loaded from the NLTK data directory.
# NOTE(review): recent NLTK releases distribute Punkt differently — confirm this
# pickle resource is available in the target environment.
punkt_resource = "tokenizers/punkt/portuguese.pickle"
sent_tokenizer = nltk.data.load(punkt_resource)

In [5]:
# Split every chapter's text into sentences and store them on the chapter.
for chapter in chapters:
    chapter_text = chapter['text']
    chapter['sentences'] = sent_tokenizer.tokenize(chapter_text)

# Word cloud for chapters

Baseado em https://github.com/GaelVaroquaux/my_topics/blob/master/topics_extraction.py
    
Não, eu não sei o que estou fazendo

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.stem import SnowballStemmer

In [68]:
# Upper bound on the TF-IDF vocabulary size (passed as `max_features`).
n_features = 20
# Word tokens are runs of word characters and apostrophes.
# Raw string: `\w` must reach the regex engine untouched; in a plain string it
# is an invalid escape sequence that newer Python versions warn about.
re_is_word_tokenizer = RegexpTokenizer(r"[\w']+")

stemmer = SnowballStemmer("portuguese")

def tokenize_and_filter(sent):
    """Split `sent` into word tokens and keep those longer than two characters.

    Args:
        sent: the text to tokenize.

    Returns:
        A list of the tokens produced by `re_is_word_tokenizer` whose length
        is greater than 2, in their original order.
    """
    tokens = re_is_word_tokenizer.tokenize(sent)
    return [token for token in tokens if len(token) > 2]

# TODO: revisit the value chosen for n_features.
# NOTE(review): the vectorizer applies `preprocessor` to each whole document
# (here, a sentence), while `stemmer.stem` is meant for a single word — confirm
# whether per-token stemming was the intent.
tfidf_vec = TfidfVectorizer(max_df = .5, min_df = 4, max_features=n_features, norm='l1',
                            stop_words=nltk.corpus.stopwords.words('portuguese'),
                            tokenizer=tokenize_and_filter,
                            preprocessor=stemmer.stem)

# Every distinct sentence of the book, each kept once, in first-seen order.
# A plain `set` would also de-duplicate, but its iteration order for strings
# changes from one interpreter run to the next, which would shuffle the rows
# of the TF-IDF matrix and make the downstream results non-reproducible.
all_sents = []
_seen_sents = set()
for chapter in chapters:
    for sentence in chapter['sentences']:
        if sentence not in _seen_sents:
            _seen_sents.add(sentence)
            all_sents.append(sentence)

tfidf = tfidf_vec.fit_transform(all_sents)

In [69]:
# Factorize the TF-IDF matrix into 3 topics; the fixed seed keeps runs repeatable.
nmf = NMF(n_components=3, random_state=0).fit(tfidf)

# Vocabulary terms, aligned with the columns of `nmf.components_`.
# Newer scikit-learn releases expose `get_feature_names_out()` and no longer
# provide `get_feature_names()`; older releases only have the latter.
if hasattr(tfidf_vec, "get_feature_names_out"):
    feature_names = list(tfidf_vec.get_feature_names_out())
else:
    feature_names = tfidf_vec.get_feature_names()

# Per-sentence topic weights.
doc_loadings = nmf.transform(tfidf)

In [70]:
import os
import numpy as np

from wordcloud import WordCloud

In [71]:
def my_color_func(word=None, font_size=None, position=None, orientation=None, font_path=None, random_state=None):
    """Pick a word's colour from its font size.

    Only `font_size` influences the result; the remaining parameters are
    accepted and ignored.

    Args:
        font_size: numeric size of the word being drawn.

    Returns:
        An "hsl(H, 90%, 20%)" string whose hue H is 110 + 3 * font_size,
        formatted as an integer.
    """
    hue = 110 + 3 * font_size
    return "hsl(%d, 90%%, 20%%)" % hue

In [72]:
# 250 x 450 grid of coordinates spanning [-1, 1] on both axes.
x, y = np.ogrid[-1:1:250j, -1:1:450j]
# Elliptical mask: 255 ("masked out" for WordCloud) on and outside the unit
# circle of the normalized grid, 0 inside it. Scaling the squared radius by 255
# instead would give a 0..510 gradient in which only a thin ring equals 255,
# leaving the corners available for words.
mask = 255 * (x ** 2 + y ** 2 >= 1).astype(int)

In [73]:
# Make sure the output folder exists before any image is saved, so the cell
# also works on a fresh checkout.
output_dir = 'word_cloud'
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# One word cloud image per NMF topic.
for topic_idx, topic in enumerate(nmf.components_):

    freq_cloud = WordCloud(max_font_size=100, relative_scaling=.5,
                           background_color='white', mode='RGBA',
                           mask=mask, color_func=my_color_func, scale=2)

    # word -> weight mapping, leaving out terms with zero weight in this topic.
    # Current wordcloud releases take a mapping here (older releases took a
    # list of pairs — confirm against the installed version).
    frequencies = {word: weight
                   for word, weight in zip(feature_names, topic)
                   if weight != 0}

    freq_cloud.generate_from_frequencies(frequencies)

    freq_cloud.to_file(os.path.join(output_dir, 'teste_%02i.png' % topic_idx))

In [77]:
# The current timestamp is appended as a query string to each image path so
# that a changed value is requested every run (avoids cached images).
random_string = str(time.time())

img_1, img_2, img_3 = [
    template.format(random_string)
    for template in (
        "word_cloud/teste_00.png?{}",
        "word_cloud/teste_01.png?{}",
        "word_cloud/teste_02.png?{}",
    )
]

In [75]:
# Show the image referenced by `img_1` inline.
img_1_tag = '<img src={} />'.format(img_1)
display(HTML(img_1_tag))

In [76]:
# Show the image referenced by `img_2` inline.
img_2_tag = '<img src={} />'.format(img_2)
display(HTML(img_2_tag))

In [78]:
# Show the image referenced by `img_3` inline.
img_3_tag = '<img src={} />'.format(img_3)
display(HTML(img_3_tag))