In [1]:
import glob
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
import nltk.collocations
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

Lendo cada um dos artigos e salvando em um dicionário

In [2]:
files = glob.glob("DocCol2/*")

docs = {}
for fname in files:
    with open(fname,'r') as f:
        key = fname.split('/')[-1]
        docs[key] = f.read()

Fazendo a limpeza das palavras em cada arquivo

In [3]:
sw = stopwords.words('english')

docs_ready = {}
for doc_name, words in docs.items():
    words = word_tokenize(words)
    words = [w.lower() for w in words if w.isalpha() and len(w) > 1]
    words = [w for w in words if w not in sw]
    words = [PorterStemmer().stem(w) for w in words]
    
    # Adicionando 20 bigramas mais comuns
    bigram_measures = nltk.collocations.BigramAssocMeasures() 
    finder = nltk.collocations.BigramCollocationFinder.from_words(words)
    for i in finder.nbest(bigram_measures.raw_freq, 20):
        words.append(i[0].lower()+'_'+i[1].lower())
    
    docs_ready[doc_name] = words

A função do sklearn espera uma lista de strings onde cada string é um documento diferente. Para isso vamos transformar cada valor do nosso dicionario (que é uma lista de palavras) em uma string juntando cada palavra dessa lista com um espaço.

Também vamos pegar os nomes de cada documentos e salvar em uma lista

In [4]:
docs_names = list(docs_ready.keys())
docs_as_list = [' '.join(words) for words in docs_ready.values()]

Agora podemos usar o CountVectorizer() para gerar nosso bag of words!

O comando *todense()* é utilizado para tranformar o objeto retornado em uma matrix padrão no numpy.

In [5]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs_as_list).todense()

Agora vamos salvar em um dataframe.

Estamos fazendo um dictionary comprehension porque temos o mapeamento de nome para numero de coluna, e precisamos justamento do contrário para renomear nossas colunas.

In [6]:
df = pd.DataFrame(X)
df = df.rename(columns={v: c for c, v in vectorizer.vocabulary_.items()})
df.index = docs_names
df

Unnamed: 0,aaai,aacc,aaron,aaron_bryce,aaron_faith,aaron_god,aaron_messag,aaron_origin,aaron_roman,aaronc,...,zf,zhao,zilch,zone,zoom,zopfi,zorg,zorn,zur,zwingli
gr17,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
au8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ch13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ch12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
gr9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ch26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
gr4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
gr20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
gr5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


O processo para gerar o TF-IDF é completamente analogo, porem faz uso do TfidfVectorizer()

In [7]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs_as_list).todense()

df_tfidf = pd.DataFrame(X)
df_tfidf = df_tfidf.rename(columns={v: c for c, v in vectorizer.vocabulary_.items()})
df_tfidf.index = docs_names
df_tfidf

Unnamed: 0,aaai,aacc,aaron,aaron_bryce,aaron_faith,aaron_god,aaron_messag,aaron_origin,aaron_roman,aaronc,...,zf,zhao,zilch,zone,zoom,zopfi,zorg,zorn,zur,zwingli
gr17,0.034736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034736,0.0
au8,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
ch13,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
ch12,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
gr9,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ch26,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
gr4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
gr20,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
gr5,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
