In [4]:
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import stem, FreqDist
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, SGDClassifier, RidgeClassifier
from sklearn import metrics

pd.set_option('display.max_rows', 10)

In [6]:
documents=pd.read_csv("Dilma_1turno_limpo.txt",delimiter="\t",encoding='latin-1')

In [8]:
train_set, test_set = train_test_split(documents, test_size=0.2, random_state=37)

In [23]:

def tokenize(text):
    return RegexpTokenizer(r'\w+').tokenize(text)

tokenize(documents['text'][2])[:5]

['dilma', 'rousseff', 'pt', 'e', 'aecio']

In [25]:
nltk.download('stopwords')
stop_words = set(stopwords.words("portuguese"))

# remoção de algumas palavras
stop_words.difference_update(set(['não', 'nem']))

def remove_stop_words(words):
    words = [w for w in words if w not in stop_words]
    return words

list(stop_words)[:10]

[nltk_data] Downloading package stopwords to /home/bruno/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['este',
 'tiver',
 'estes',
 'estas',
 'teriam',
 'tuas',
 'temos',
 'somos',
 'seja',
 'vocês']

In [27]:
text = tokenize(documents['text'][0])
print('Original: ', len(text))

text_cleaned = remove_stop_words(text)
print('Sem as stop words:', len(text_cleaned))

Original:  22
Sem as stop words: 18


In [28]:
text_cleaned[:6]

['geraldo', 'alckmin', 'reeleito', 'governador', 'sao', 'paulo']

In [29]:
def stem_words(words):
    stemmer = nltk.stem.RSLPStemmer()
    words = [stemmer.stem(w) for w in words]
    return words

In [32]:
nltk.download('rslp')
  
stem_words(remove_stop_words(tokenize(documents['text'][0])))[:6]

[nltk_data] Downloading package rslp to /home/bruno/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.


['gerald', 'alckmin', 'reeleit', 'govern', 'sao', 'paul']

In [33]:
all_words = train_set['text'].apply(lambda x: stem_words(remove_stop_words(tokenize(x))))
all_words = [word for doc in all_words for word in doc]

print("All words:", len(all_words))

# Words Most Common
DataFrame(FreqDist(all_words).most_common(10))

All words: 331350


Unnamed: 0,0,1
0,dilm,37291
1,aeci,16055
2,vot,10338
3,nao,6873
4,marin,5677
5,turn,5458
6,segund,4652
7,ganh,4574
8,vai,4188
9,pra,3961


In [34]:
def clean_and_tokenize(text):
    tokens = tokenize(text)
    tokens = remove_stop_words(tokens)
    tokens = stem_words(tokens)
    return tokens

In [39]:
vectorizer = TfidfVectorizer(tokenizer=clean_and_tokenize, max_features=10000)


In [40]:
features = vectorizer.get_feature_names()

NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.

In [42]:
train_X = vectorizer.fit_transform(train_set['text'].values)

In [43]:
train_X

<35804x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 312411 stored elements in Compressed Sparse Row format>

In [44]:
DataFrame(train_X.toarray()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
