# TEXT SUMMARIZATION - FREQUENCY-BASED ALGORITHM

**1. Preprocessing the texts**

In [188]:
import re # regular expressions
import nltk # natural language toolkit
import string # for string operations
import heapq # for finding n largest elements
from IPython.core.display import HTML # for displaying HTML in Jupyter Notebook
from goose3 import Goose # for extracting text from web pages

In [189]:
# Small sample text used to walk through each step of the algorithm.
# The indentation and newlines inside the literal are part of the string.
original_text = """Artificial intelligence is human like intelligence.
                   It is the study of intelligent artificial agents.
                   Science and engineering to produce intelligent machines.
                   Solve problems and have intelligence.
                   Related to intelligent behavior.
                   Developing of reasoning machines.
                   Learn from mistakes and successes.
                   Artificial intelligence is related to reasoning in everyday situations."""

In [190]:
# Collapse every run of whitespace (spaces, newlines, indentation) into a single space
original_text = re.sub(r'\s+', ' ', original_text)  # result replaces the raw sample text
original_text

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial intelligence is related to reasoning in everyday situations.'

In [191]:
stopwords = nltk.corpus.stopwords.words('english')  # list of English stopwords
print(stopwords)
# A bare expression in the middle of a cell is never echoed, so print the count explicitly
print(len(stopwords))  # number of stopwords
print(string.punctuation)  # characters treated as punctuation during preprocessing

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [192]:
def preprocess(text): 
    """Lowercase ``text`` and drop stopwords and punctuation tokens.

    The text is lowercased and split with ``nltk.word_tokenize``. A token is
    discarded when it appears in the module-level ``stopwords`` list or when
    it consists solely of ``string.punctuation`` characters.

    Args:
        text: Raw input text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    formatted_text = text.lower()
    tokens = nltk.word_tokenize(formatted_text, language="english", preserve_line=False)

    stopword_set = set(stopwords)  # set lookup instead of scanning the list per token
    punctuation_chars = set(string.punctuation)

    # A token counts as punctuation when *every* character is a punctuation
    # mark. The previous substring test (`word not in string.punctuation`)
    # only caught tokens that happen to be a contiguous slice of that string,
    # so multi-character tokens such as "``", "''", "..." or "--" were kept.
    kept_tokens = [
        word for word in tokens
        if word not in stopword_set
        and not all(char in punctuation_chars for char in word)
    ]

    return " ".join(kept_tokens)  # join the tokens back to string

In [193]:
# Lowercased sample text with stopwords and punctuation removed
formatted_text = preprocess(original_text)
formatted_text

'artificial intelligence human like intelligence study intelligent artificial agents science engineering produce intelligent machines solve problems intelligence related intelligent behavior developing reasoning machines learn mistakes successes artificial intelligence related reasoning everyday situations'

**2. Word frequency**

In [194]:
# Count how often each token of the preprocessed text occurs
word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
word_frequency

FreqDist({'intelligence': 4, 'artificial': 3, 'intelligent': 3, 'machines': 2, 'related': 2, 'reasoning': 2, 'human': 1, 'like': 1, 'study': 1, 'agents': 1, ...})

**3. Weighted word frequency**

In [195]:
# Largest count in the distribution; the next cell divides every count by it
highest_frequency = max(word_frequency.values()) # get the highest frequency
highest_frequency

4

In [196]:
# Scale every count by the largest value currently stored in word_frequency.
# Reading the maximum here (instead of reusing `highest_frequency` from the
# previous cell) makes this cell safe to re-run: after the first pass the
# maximum is 1.0, so a second pass divides by 1.0 and leaves the weights as is.
# On the first run the maximum equals `highest_frequency`, so the result is unchanged.
current_highest = max(word_frequency.values())
for word in list(word_frequency.keys()):
    word_frequency[word] = word_frequency[word] / current_highest

In [197]:
# Inspect the frequencies after scaling
word_frequency

FreqDist({'intelligence': 1.0, 'artificial': 0.75, 'intelligent': 0.75, 'machines': 0.5, 'related': 0.5, 'reasoning': 0.5, 'human': 0.25, 'like': 0.25, 'study': 0.25, 'agents': 0.25, ...})

**4. Sentence tokenization**

In [198]:
# Split the whitespace-normalized (not the stopword-stripped) text into sentences
sentence_list = nltk.sent_tokenize(original_text) # tokenize the original text into sentences
sentence_list

['Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.',
 'Science and engineering to produce intelligent machines.',
 'Solve problems and have intelligence.',
 'Related to intelligent behavior.',
 'Developing of reasoning machines.',
 'Learn from mistakes and successes.',
 'Artificial intelligence is related to reasoning in everyday situations.']

**5. Generate the summary (score the sentences)**

In [199]:
# Score each sentence: the sum of the weighted frequencies of its words.
# Sentences without any scored word get no entry.
score_sentences = {}
for sentence in sentence_list:
    for word in nltk.word_tokenize(sentence.lower()):
        if word in word_frequency:
            score_sentences[sentence] = score_sentences.get(sentence, 0) + word_frequency[word]
# Only the last expression of a cell is echoed, so intermediate results
# have to be passed to display() to actually show up.
display(score_sentences)  # show the score for each sentence

# Order the sentences
best_sentences = heapq.nlargest(3, score_sentences, key=score_sentences.get)  # the 3 highest-scoring sentences
display(best_sentences)

# Generate the summary
summary = " ".join(best_sentences)
summary  # show the summary

'Artificial intelligence is human like intelligence. Artificial intelligence is related to reasoning in everyday situations. It is the study of intelligent artificial agents.'

**6. Visualize the summary in HTML**

In [200]:
# Show the heading, then the full text with the selected sentences highlighted
display(HTML('<h2>Summary</h2>'))
parts = [" "]
for sentence in sentence_list:
    # Sentences chosen for the summary are wrapped in <mark> tags
    marked = f"<mark>{sentence}</mark>" if sentence in best_sentences else sentence
    parts.append(" " + marked)
text = "".join(parts)
display(HTML(f'<p>{text}</p>'))

**7. Extracting text from the Internet**

In [None]:
url = 'https://en.wikipedia.org/wiki/Artificial_intelligence'
# Use Goose as a context manager so it is closed once extraction is done;
# the extracted article stays available after the block.
with Goose() as g:
    article = g.extract(url=url)

In [202]:
# Function to summarize any text
def summerizer(text, number_of_sentences, percantage = 0):
    """Score the sentences of ``text`` by weighted word frequency and pick the best ones.

    Args:
        text: Raw text to summarize.
        number_of_sentences: How many top-scoring sentences to return when
            ``percantage`` is not greater than 0.
        percantage: When greater than 0, the fraction of the sentence count
            (e.g. ``0.5``) to return; ``number_of_sentences`` is then ignored.

    Returns:
        A 4-tuple ``(sentence_list, best_sentences, word_frequency, score_sentences)``:
        the sentences of the original text in order, the selected sentences in
        descending score order, the frequency distribution with every count
        divided by the highest count, and a dict mapping each sentence that
        contains at least one scored word to its score.
    """
    original_text = text
    formatted_text = preprocess(original_text)  # lowercase, drop stopwords/punctuation

    word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
    # Empty or stopword-only text leaves the distribution empty and max()
    # would raise ValueError, so only normalize when there is something there.
    if word_frequency:
        highest_frequency = max(word_frequency.values())
        for word in list(word_frequency.keys()):
            word_frequency[word] = word_frequency[word] / highest_frequency

    sentence_list = nltk.sent_tokenize(original_text)  # sentences of the original text

    score_sentences = {}
    for sentence in sentence_list:
        if sentence in score_sentences:
            # A repeated sentence was already scored; adding its words again
            # would multiply its score by the number of occurrences.
            continue
        word_scores = [
            word_frequency[word]
            for word in nltk.word_tokenize(sentence.lower())
            if word in word_frequency
        ]
        if word_scores:  # sentences without any scored word get no entry
            score_sentences[sentence] = sum(word_scores)

    if percantage > 0:
        summary_size = int(len(sentence_list) * percantage)
    else:
        summary_size = number_of_sentences
    best_sentences = heapq.nlargest(summary_size, score_sentences, key=score_sentences.get)

    return sentence_list, best_sentences, word_frequency, score_sentences

In [203]:
# Summarize the downloaded article, keeping up to 100 top-scoring sentences.
# This rebinds the names used for the sample text in the earlier cells.
sentence_list, best_sentences, word_frequency, score_sentences = summerizer(article.cleaned_text, 100)

In [208]:
# Visualization 
def visualize(title, sentence_list, best_sentences): 
    """Render the text in a scrollable HTML box with the summary sentences highlighted.

    Args:
        title: Heading shown above the text.
        sentence_list: All sentences of the text, in display order.
        best_sentences: Sentences to wrap in ``<mark>`` tags.

    Returns:
        None. The HTML is shown through ``display``.
    """
    from html import escape  # local import so the function carries its own dependency

    highlighted = set(best_sentences)  # constant-time membership test per sentence
    pieces = []
    for sentence in sentence_list:
        # Escape before wrapping: the text may come from a web page and must
        # not be interpreted as markup.
        safe_sentence = escape(sentence)
        if sentence in highlighted:
            pieces.append(f"<mark>{safe_sentence}</mark>")
        else:
            pieces.append(safe_sentence)
    text = " ".join(pieces)

    # Built once, after the loop, so an empty sentence_list still renders
    # (the template used to be assigned only inside the loop body).
    markup = f"""
    <h2>{escape(str(title))}</h2>
    <div style='max-height:300px; overflow:auto; border:1px solid #ccc; padding:10px;'>
        {text}
    </div>
    """
    display(HTML(markup))

In [209]:
# Show the article text with the selected sentences highlighted
visualize(article.title, sentence_list, best_sentences)

**8. Summarizing multiple texts**

In [210]:
# URLs of the pages to download and summarize
article_list = [
    "https://en.wikipedia.org/wiki/Artificial_intelligence",
    "https://en.wikipedia.org/wiki/Machine_learning",
    "https://en.wikipedia.org/wiki/Deep_learning"
]

In [211]:
# One Goose instance serves every URL; the context manager closes it when the
# loop ends instead of leaving one unclosed instance behind per article.
with Goose() as g:
    for url in article_list:
        article = g.extract(url)
        # percantage=0.5 selects half of the sentences; the count argument (100) is ignored in that case
        sentence_list, best_sentences, _, _ = summerizer(article.cleaned_text, 100, percantage=0.5)
        visualize(article.title, sentence_list, best_sentences)