### Como Preparar Dados de Texto para Machine Learning

#### Formato Bag of Words

In [0]:
# Importando o CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
# Importando o NLTK
import nltk

In [0]:
# Carregando a base de dados
from nltk.corpus import webtext

In [0]:
# Fazendo download do dataset
nltk.download('webtext')

[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Package webtext is already up-to-date!


True

In [0]:
# Arquivos de dados da web
from nltk.corpus import webtext
webtext.fileids()

['firefox.txt',
 'grail.txt',
 'overheard.txt',
 'pirates.txt',
 'singles.txt',
 'wine.txt']

In [0]:
# Carregando o dataset firefox.txt
firefox = webtext.raw('firefox.txt')

In [0]:
# Visualisando o dataset firefox
firefox



In [0]:
# Separando em frases..
frases = firefox.split('\n')

In [0]:
# Visualizando as frases
frases

['Cookie Manager: "Don\'t allow sites that set removed cookies to set future cookies" should stay checked\r',
 'When in full screen mode\r',
 'Pressing Ctrl-N should open a new browser when only download dialog is left open\r',
 'add icons to context menu\r',
 'So called "tab bar" should be made a proper toolbar or given the ability collapse / expand.\r',
 '[XUL] Implement Cocoa-style toolbar customization.\r',
 '#ifdefs for MOZ_PHOENIX\r',
 "customize dialog's toolbar has small icons when small icons is not checked\r",
 'nightly builds and tinderboxen for Phoenix\r',
 'finish tearing prefs UI to pieces and then make it not suck\r',
 '"mozbrowser" script doesn\'t start correct binary\r',
 'Need bookmark groups icon\r',
 'Dropping at top of palette box horks things\r',
 'keyboard shortcut for Increase Text Size is broken\r',
 'default phoenix bookmarks\r',
 '[cust] need a toolbar spacer and spring spacer for customize\r',
 "Can't launch phoenix while Mozilla is running (or vice versa)\r

In [0]:
# Cria o objeto vetorizador
vectorizer = CountVectorizer()

In [0]:
# Aplica o vetorizador nos dados
vectorizer.fit(frases)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [0]:
# Imprime o vocabulário
print(vectorizer.vocabulary_)



In [0]:
# Aplicando o transformador
matrix = vectorizer.transform(frases)

In [0]:
# Visualizando a forma da matriz gerada
print(matrix.shape)

(10002, 6213)


In [0]:
# Imprimindo a matriz binária
print(matrix.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


#### Formato Tf-idf

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
# Instanciar o objeto TFIDF 
vectorizer = TfidfVectorizer()

In [0]:
# Constroi o vetorizador nos dados
vectorizer.fit(frases)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [0]:
# Imprime o vocabulario
print(vectorizer.vocabulary_)



In [0]:
print(vectorizer.idf_)

[8.82434597 8.41888086 9.11202804 ... 9.51749315 9.51749315 9.51749315]


In [0]:
# Aplicando aos dados
matrix = vectorizer.transform(frases)

In [0]:
# Imprime a forma da matrix
print(matrix.shape)

(10002, 6213)


In [0]:
print(matrix.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [0]:
frases[0]

'Cookie Manager: "Don\'t allow sites that set removed cookies to set future cookies" should stay checked\r'

In [0]:
# Aplicando a uma única frase
matrix = vectorizer.transform([frases[0]])

In [0]:
print(matrix.toarray())

[[0. 0. 0. ... 0. 0. 0.]]
