In [58]:
import numpy as np
import textwrap
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [59]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/andre/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [60]:
def wrap(x):
  return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

In [61]:
def get_sentence_score(tfidf_row):
  # return the average of the non-zero values
  # of the tf-idf vector representation of a sentence
  x = tfidf_row[tfidf_row != 0]
  return x.mean()

In [62]:
featurizer = TfidfVectorizer(
    stop_words=stopwords.words('portuguese'),
    norm='l1',
)

In [67]:
def summarize(text, file):
  # extract sentences
  sents = nltk.sent_tokenize(text)

  # perform tf-idf
  X = featurizer.fit_transform(sents)

  # compute scores for each sentence
  scores = np.zeros(len(sents))
  for i in range(len(sents)):
    score = get_sentence_score(X[i,:])
    scores[i] = score
  
  # sort the scores
  sort_idx = np.argsort(-scores)

  # open file.txt for include summarization
  file = file.split('.')[0][:-8] + 'sumarizado.txt'
  print("FILE", file)
  file_sumary = open(file, 'w')

  print('SORT_IDX', sort_idx[:5])

  # print summary
  for i in sort_idx[:5]:
    file_sumary.writelines(f'{sents[i]}\n')

  file_sumary.close()

In [69]:
files = [
    'farrapos.txt',
    'texto_entretenimento_original.txt',
    'texto_esporte_original.txt',
    'texto_negocio_original.txt',
    'texto_politica_original.txt',
    'texto_tecnologia_original.txt',
]

for file in files:
    with open(file) as f:
        noticia = f.read()

    print('='*50)
    summarize(noticia, file)

FILE sumarizado.txt
SORT_IDX [202 147 261 225 201]
FILE texto_entretenimento_sumarizado.txt
SORT_IDX [1 5 6 4 3]
FILE texto_esporte_sumarizado.txt
SORT_IDX [21 12 20  9 19]
FILE texto_negocio_sumarizado.txt
SORT_IDX [29  5 33 27 17]
FILE texto_politica_sumarizado.txt
SORT_IDX [6 7 4 3 5]
FILE texto_tecnologia_sumarizado.txt
SORT_IDX [23 28 16 24 11]
