# TEXT SUMMARIZATION - LUHN ALGORITHM

**1. Preprocessing the texts**

In [1]:
import re # regular expressions
import nltk # natural language toolkit
import string # for string operations
import heapq # for finding n largest elements
from IPython.core.display import HTML # for displaying HTML in Jupyter Notebook
from goose3 import Goose # for extracting text from web pages
from rouge_score import rouge_scorer

In [72]:
# Sample passage (eight short sentences) used as input for the summarizer below.
# The continuation lines are indented inside the literal, so the string contains
# newlines and runs of spaces until whitespace is normalized.
original_text = """Artificial intelligence is human like intelligence machines.
                   It is the study of intelligent artificial agents.
                   Science and engineering to produce intelligent machines.
                   Solve problems and have intelligence.
                   Related to intelligent behavior machines.
                   Developing of reasoning machines.
                   Learn from mistakes and successes.
                   Artificial intelligence is related to reasoning in everyday situations."""

In [6]:
# Collapse every run of whitespace (spaces, newlines, indentation) into one space.
original_text = re.compile(r'\s+').sub(' ', original_text)
original_text

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial intelligence is related to reasoning in everyday situations.'

In [2]:
# English stopword list from NLTK; preprocess() reads this module-level name.
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)
# A bare `len(stopwords)` in the middle of a cell is evaluated and discarded,
# so the count has to be printed explicitly to show up in the output.
print(len(stopwords))  # number of stopwords
print(string.punctuation)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [3]:
def preprocess(text):
    """Lowercase ``text`` and strip stopwords and punctuation tokens from it.

    The text is lowercased, split with ``nltk.word_tokenize`` and filtered:
    a token is dropped when it is in the module-level ``stopwords`` list or
    when it consists only of characters from ``string.punctuation``.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    # Build sets once per call so each membership test is O(1) instead of a
    # scan over the stopword list.
    stopword_set = set(stopwords)
    punctuation_chars = set(string.punctuation)

    tokens = nltk.word_tokenize(text.lower(), language="english", preserve_line=False)

    # Test every character of the token: a plain `token in string.punctuation`
    # is a substring check, which lets multi-character punctuation tokens such
    # as "``", "''", "..." or "--" slip through as if they were words.
    kept_tokens = [
        token
        for token in tokens
        if token not in stopword_set
        and not all(char in punctuation_chars for char in token)
    ]

    return " ".join(kept_tokens)

In [7]:
# Run the whole sample text through preprocess() and show the cleaned string.
formatted_text = preprocess(original_text)
formatted_text

'artificial intelligence human like intelligence study intelligent artificial agents science engineering produce intelligent machines solve problems intelligence related intelligent behavior developing reasoning machines learn mistakes successes artificial intelligence related reasoning everyday situations'

**2. Function to calculate sentence scores**

In [91]:
def calculate_sentences_score(sentences, important_words, distance):
    """Score sentences by their densest cluster of important words (Luhn).

    Each sentence is tokenized with ``nltk.word_tokenize``. The positions of
    all tokens found in ``important_words`` are split into groups: a position
    joins the current group when its gap to the previous important position
    is smaller than ``distance``, otherwise it starts a new group. A group
    scores ``(important words in group) ** 2 / (tokens spanned by group)``
    and the sentence score is the best group score.

    Args:
        sentences: Iterable of sentence strings. Tokens are compared to
            ``important_words`` verbatim, so both should be normalized the
            same way (e.g. lowercased).
        important_words: Iterable of the significant words (hashable).
        distance: Gap threshold; important positions closer than this belong
            to the same group.

    Returns:
        A list of ``(score, sentence_index)`` tuples in sentence order, where
        ``sentence_index`` is the position of the sentence in ``sentences``.
        Sentences containing no important word are omitted.
    """
    significant = set(important_words)
    scores = []

    # enumerate() ties the index to the sentence itself, so skipping a
    # sentence below can never shift the indices of the sentences after it.
    for sentence_index, sentence in enumerate(sentences):
        tokens = nltk.word_tokenize(sentence)

        # Every occurrence counts, not only the first one of each word;
        # scanning the tokens in order also yields the positions sorted.
        word_index = [
            position for position, token in enumerate(tokens) if token in significant
        ]
        if not word_index:
            continue

        # Split the sorted positions into groups, e.g. with distance=2:
        # [0, 1, 2, 5, 8, 9] -> [[0, 1, 2], [5], [8, 9]]
        group_list = []
        group = [word_index[0]]
        for previous, current in zip(word_index, word_index[1:]):
            if current - previous < distance:
                group.append(current)
            else:
                group_list.append(group)
                group = [current]
        group_list.append(group)  # the last group is still open here

        # True division keeps the score a float; the span includes both ends.
        max_group_score = max(
            len(g) ** 2 / (g[-1] - g[0] + 1) for g in group_list
        )
        scores.append((max_group_score, sentence_index))

    return scores

**6. Function to summarize the texts**

In [100]:
def summarize(text, top_n_words, distance, number_of_sentences):
    """Pick the highest-scoring sentences of ``text`` as its summary.

    Args:
        text: The text to summarize.
        top_n_words: How many of the most frequent preprocessed words are
            treated as important words.
        distance: Grouping threshold forwarded to
            ``calculate_sentences_score``.
        number_of_sentences: Maximum number of sentences to select.

    Returns:
        A tuple ``(original_sentences, best_sentences, sentences_score)``:
        the sentences from ``nltk.sent_tokenize``, the selected sentences
        ordered from highest to lowest score, and the raw list returned by
        ``calculate_sentences_score``.
    """
    # Split into sentences, then clean each sentence on its own.
    original_sentences = list(nltk.sent_tokenize(text))
    cleaned_sentences = [preprocess(sentence) for sentence in original_sentences]

    # Word frequencies over the whole cleaned text.
    vocabulary = []
    for cleaned in cleaned_sentences:
        vocabulary.extend(nltk.word_tokenize(cleaned))
    frequency = nltk.FreqDist(vocabulary)
    important_words = [word for word, _count in frequency.most_common(top_n_words)]

    # Score the cleaned sentences and keep the top-ranked (score, index) pairs.
    sentences_score = calculate_sentences_score(cleaned_sentences, important_words, distance)
    ranked = heapq.nlargest(number_of_sentences, sentences_score)

    # Map the winning indices back to the untouched sentences.
    best_sentences = [original_sentences[index] for _score, index in ranked]
    return original_sentences, best_sentences, sentences_score

In [101]:
# Summarize the sample text: use the 5 most frequent words, group important
# words whose gap is below 2, and keep the 3 best sentences.
original_setences, best_sentences, sentences_score = summarize(
    original_text,
    top_n_words=5,
    distance=2,
    number_of_sentences=3,
)

In [102]:
# Render the full text with the sentences chosen for the summary highlighted.
# NOTE(review): sentences are interpolated into the markup without HTML
# escaping; that is fine for the hard-coded sample, but text from untrusted
# sources should be escaped (e.g. html.escape) before it is rendered.
display(HTML("<h2>Summary</h2>"))

selected = set(best_sentences)
# Joining with a space separates every sentence from its neighbour, including
# highlighted ones, which would otherwise be glued to the preceding sentence.
text = " ".join(
    f"<mark>{sentence}</mark>" if sentence in selected else sentence
    for sentence in original_setences
)
display(HTML(text))

**9. Evaluation**