In [1]:
#Extractive Summarization Model-II
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import bs4 as BeautifulSoup
import urllib.request

In [2]:
def _create_dictionary_table(text_string) -> dict:
    #removing stop words
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    #reducing words to their root form
    stem = PorterStemmer()
    #creating dictionary for the word frequency table
    frequency_table = dict()
    for wd in words:
        wd = stem.stem(wd)
        if wd in stop_words:
            continue
        if wd in frequency_table:
            frequency_table[wd] += 1
        else:
            frequency_table[wd] = 1

    return frequency_table

In [3]:
def _calculate_sentence_scores(sentences, frequency_table) -> dict:   

    #algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:4] in sentence_weight:
                    sentence_weight[sentence[:4]] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:4]] = frequency_table[word_weight]

        sentence_weight[sentence[:4]] = sentence_weight[sentence[:4]] / sentence_wordcount_without_stop_words

       

    return sentence_weight

In [4]:
def _calculate_average_score(sentence_weight) -> int:
   
    #calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    #getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score

In [5]:
def _get_article_summary(sentences, sentence_weight, threshold):
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:4] in sentence_weight and sentence_weight[sentence[:4]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary


In [6]:
def _run_article_summary(article):
    
    #creating a dictionary for the word frequency table
    frequency_table = _create_dictionary_table(article)

    #tokenizing the sentences
    sentences = sent_tokenize(article)

    #algorithm for scoring a sentence by its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)

    #getting the threshold
    threshold = _calculate_average_score(sentence_scores)

    #producing the summary
    article_summary = _get_article_summary(sentences, sentence_scores, 1.25*threshold)

    return article_summary

In [7]:
# fetching the content from the URL
fetched_data = urllib.request.urlopen('https://www.vedantu.com/english/essay-on-diwali')

article_read = fetched_data.read()

#parsing the URL content and storing in a variable
article_parsed = BeautifulSoup.BeautifulSoup(article_read,'html.parser')

#returning <p> tags
paragraphs = article_parsed.find_all('p')

article_content = ''

#looping through the paragraphs and adding them to the variable
for p in paragraphs: 
    article_content += p.text
# print(article_content)

# f=open('input.txt','r',encoding='utf-8')
# article_content1=''
# article_content1=f.read().replace('\n','')
# f.close()
summary_results = _run_article_summary(article_content)
print(summary_results)

 It is an Indian festival that marks the victory of good over bad (evil). It is a festival celebrated by Indians with great enthusiasm. The festival commemorates joy, harmony and victory. Thus, it becomes the festival of lights. It is celebrated in the Hindu month called Kartika. It is one of the biggest and grandest festivals celebrated mainly in India. It is celebrated after 20 days of the Dussehra festival. The word ‘Deepavali’ is a Hindi word which means an array of lamps (‘Deep’ means earthen lamps and ‘Avali’ means a queue or an array).Diwali is celebrated in the honour of Lord Ramchandra because on this day Lord Rama returned to Ayodhya after 14 years of exile. The five days are Dhanteras, Naraka Chaturdashi, Lakshmi Pooja, Govardhan Pooja, and Bhai Dooj. The preparations start from one month before the actual date of the festival and people indulge in buying new clothes, gifts, new books, lights, crackers, sweets, dry fruits, etc.Some also believe in discarding old things and b