In [81]:
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [82]:
!pip install goose3



In [83]:
from goose3 import Goose

In [84]:
g = Goose()
url = 'https://en.wikipedia.org/wiki/Automatic_summarization'
article = g.extract(url)

In [85]:
article.infos

{'meta': {'description': '',
  'lang': 'en',
  'keywords': '',
  'favicon': '/static/apple-touch/wikipedia.png',
  'canonical': 'https://en.wikipedia.org/wiki/Automatic_summarization',
  'encoding': 'UTF-8'},
 'image': None,
 'domain': 'en.wikipedia.org',
 'title': 'Automatic summarization - Wikipedia',
 'cleaned_text': 'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content. Artificial intelligence algorithms are commonly developed and employed to achieve this, specialized for different types of data.\n\nText summarization is usually implemented by natural language processing methods, designed to locate the most informative sentences in a given document.[1] On the other hand, visual content can be summarized using computer vision algorithms. Image summarization is the subject of ongoing research; existing approaches typically attempt to dis

In [86]:
original_text=article.cleaned_text

In [87]:
# # I added the word machine at the end of the last sentence
# original_text = """Artificial intelligence is human like intelligence. 
#                    It is the study of intelligent artificial agents. 
#                    Science and engineering to produce intelligent machines. 
#                    Solve problems and have intelligence. 
#                    Related to intelligent behavior. 
#                    Developing of reasoning machines. 
#                    Learn from mistakes and successes. 
#                    Artificial intelligence is related to reasoning in everyday situations."""

In [88]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back into a single string
    return ' '.join(tokens)


In [89]:
cleand_text=preprocess_text(original_text)

In [90]:
len(cleand_text)

25670

In [91]:
word_frequency = nltk.FreqDist(nltk.word_tokenize(cleand_text))

In [92]:
len(word_frequency.keys())

1100

In [93]:
highest_frequency=max(word_frequency.values())
highest_frequency

79

In [94]:
for word in word_frequency.keys():
    word_frequency[word]=(word_frequency[word]/highest_frequency)

In [95]:
word_frequency

FreqDist({'summarization': 1.0, 'summary': 0.7088607594936709, 'text': 0.6455696202531646, 'sentence': 0.5443037974683544, 'document': 0.5063291139240507, 'keyphrases': 0.4177215189873418, 'example': 0.3924050632911392, 'algorithm': 0.379746835443038, 'system': 0.31645569620253167, 'submodular': 0.31645569620253167, ...})

In [96]:
sentence_list = nltk.sent_tokenize(original_text)
sentence_list

['Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.',
 'Artificial intelligence algorithms are commonly developed and employed to achieve this, specialized for different types of data.',
 'Text summarization is usually implemented by natural language processing methods, designed to locate the most informative sentences in a given document.',
 '[1] On the other hand, visual content can be summarized using computer vision algorithms.',
 'Image summarization is the subject of ongoing research; existing approaches typically attempt to display the most representative images from a given image collection, or generate a video that only includes the most important content from the entire collection.',
 '[2][3][4] Video summarization algorithms identify and extract from the original video content the most important frames (key-frames), and/or t

In [97]:
len(sentence_list)

304

In [98]:
score_sentence={}
for sentence in sentence_list:
    for word in nltk.word_tokenize(sentence):
        if sentence not in score_sentence.keys():
            score_sentence[sentence]=word_frequency[word]
        else:
            score_sentence[sentence]+=word_frequency[word]

In [99]:
score_sentence

{'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.': 3.227848101265823,
 'Artificial intelligence algorithms are commonly developed and employed to achieve this, specialized for different types of data.': 0.4050632911392405,
 'Text summarization is usually implemented by natural language processing methods, designed to locate the most informative sentences in a given document.': 1.9367088607594938,
 '[1] On the other hand, visual content can be summarized using computer vision algorithms.': 0.4936708860759494,
 'Image summarization is the subject of ongoing research; existing approaches typically attempt to display the most representative images from a given image collection, or generate a video that only includes the most important content from the entire collection.': 2.430379746835444,
 '[2][3][4] Video summarization algorithms ide

In [100]:
import heapq
best_sentences = heapq.nlargest(100, score_sentence, key = score_sentence.get)

In [101]:
best_sentences

['It is worth noting that TextRank was applied to summarization exactly as described here, while LexRank was used as part of a larger summarization system (MEAD) that combines the LexRank score (stationary probability) with other features like sentence position and length using a linear combination with either user-specified or automatically tuned weights.',
 'An example of a summarization problem is document summarization, which attempts to automatically produce an abstract from a given document.',
 'For example, in document summarization, one would like the summary to cover all important and relevant concepts in the document.',
 'The main difficulty in supervised extractive summarization is that the known summaries must be manually created by extracting sentences so the sentences in an original training document can be labeled as "in summary" or "not in summary".',
 'They can enable document browsing by providing a short summary, improve information retrieval (if documents have keyph

In [102]:
summary=' '.join(best_sentences)

In [103]:
len(summary)

16825

In [104]:
len(original_text)

35369

In [76]:
def summarize(text, number_of_sentences, percentage = 0):
  original_text = text
  formatted_text = preprocess_text(original_text)

  word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
  highest_frequency = max(word_frequency.values())
  for word in word_frequency.keys():
    word_frequency[word] = (word_frequency[word] / highest_frequency)
  sentence_list = nltk.sent_tokenize(original_text)
  
  score_sentences = {}
  for sentence in sentence_list:
    for word in nltk.word_tokenize(sentence):
      if word in word_frequency.keys():
        if sentence not in score_sentences.keys():
          score_sentences[sentence] = word_frequency[word]
        else:
          score_sentences[sentence] += word_frequency[word]

  import heapq
  if percentage > 0:
    best_sentences = heapq.nlargest(int(len(sentence_list) * percentage), score_sentences, key=score_sentences.get)
  else:
    best_sentences = heapq.nlargest(number_of_sentences, score_sentences, key=score_sentences.get)

  return sentence_list, best_sentences, word_frequency, score_sentences

In [77]:
sentence_list, best_sentences, word_frequency, score_sentences = summarize(article.cleaned_text, 100)

In [78]:
from IPython.core.display import HTML

In [105]:
def visualize(title, sentence_list, best_sentences):
  from IPython.core.display import HTML
  text = ''

  display(HTML(f'<h1>Summary - {title}</h1>'))
  for sentence in sentence_list:
    if sentence in best_sentences:
      text += ' ' + str(sentence).replace(sentence, f"<mark>{sentence}</mark>")
    else:
      text += ' ' + sentence
  display(HTML(f""" {text} """))

In [106]:
visualize(article.title, sentence_list, best_sentences)