<a href="https://colab.research.google.com/github/abhisriv-466/Extractive_Text_Summarization/blob/main/Text_Summarization_Extractive_model_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Source: https://blog.floydhub.com/gentle-introduction-to-text-summarization-in-machine-learning/

In [None]:
import bs4 as BeautifulSoup
import urllib.request

# Fetching the content from the URL
ARTICLE_URL = 'https://en.wikipedia.org/wiki/20th_century'

# The context manager closes the HTTP response as soon as the body has been
# read, instead of leaving the connection open for the rest of the session.
with urllib.request.urlopen(ARTICLE_URL) as fetched_data:
    article_read = fetched_data.read()

# Parsing the URL content and storing in a variable
article_parsed = BeautifulSoup.BeautifulSoup(article_read,'html.parser')

# Returning <p> tags
paragraphs = article_parsed.find_all('p')

# Concatenating the text of every paragraph into a single string
# (join builds the result once rather than re-copying it on every +=).
article_content = ''.join(p.text for p in paragraphs)

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer

def _create_dictionary_table(text_string):
  """Build a word-frequency table for ``text_string``.

  The text is split with ``word_tokenize``; each token is lowercased,
  dropped if it is an English stop word, and otherwise reduced to its
  Porter stem. The returned dict maps each stem to the number of tokens
  that produced it.

  :param text_string: raw text to analyse
  :return: dict mapping stemmed token -> occurrence count
  """
  #removing stopwords
  stop_words=set(stopwords.words('english'))

  #Reducing words to root form through stemming
  stem=PorterStemmer()

  #creating dictionary for the word frequency table
  freq_table=dict()
  for word in word_tokenize(text_string):
    # Compare in lowercase so capitalised stop words ("The", "And") are
    # filtered too; the check runs before stemming because stemming can
    # alter a stop word so that it no longer matches the list.
    lowered=word.lower()
    if lowered in stop_words:
      continue
    root=stem.stem(lowered)
    freq_table[root]=freq_table.get(root,0)+1

  return freq_table

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def cal_sentence_scores(sentences,freq_table)->dict:
  """Score each sentence by the average frequency of the table words it contains.

  A table word counts as present when it occurs as a substring of the
  lowercased sentence. The score is the sum of the frequencies of all
  matching words divided by the number of matching words.

  Sentences are keyed by their first 7 characters (``sentence[:7]``), so
  sentences sharing that prefix share one entry and the later sentence's
  score wins. Sentences that match no table word get no entry.

  :param sentences: iterable of sentence strings
  :param freq_table: dict mapping word -> frequency
  :return: dict mapping sentence[:7] -> average matched-word frequency
  """
  #sentence_score_dict
  sentence_weight=dict()
  for sentence in sentences:
    lowered=sentence.lower()
    # Collect the weights locally so every matching word contributes to the
    # total and nothing leaks in from an earlier sentence with the same key.
    matched=[freq_table[word_weight] for word_weight in freq_table if word_weight in lowered]
    if not matched:
      # Nothing to average; skipping avoids a division by zero.
      continue
    sentence_weight[sentence[:7]]=sum(matched)/len(matched)

  return sentence_weight

In [None]:
def calculate_average_score(sentence_weight):
  """Return the arithmetic mean of the values in ``sentence_weight``.

  :param sentence_weight: dict mapping a sentence key to its numeric score
  :return: the mean score, or 0.0 when the dict is empty
  """
  # An empty table has no mean; return 0.0 instead of dividing by zero.
  if not sentence_weight:
    return 0.0

  average_score=sum(sentence_weight.values())/len(sentence_weight)
  return average_score

In [None]:
def get_article_summary(sentences,sentence_weight,threshold):
  """Assemble the summary from the sentences that score at or above ``threshold``.

  A sentence is looked up in ``sentence_weight`` by its first 7 characters;
  sentences without an entry are ignored. Every kept sentence is emitted in
  its original order, preceded by a newline.

  :param sentences: iterable of sentence strings
  :param sentence_weight: dict mapping sentence[:7] -> score
  :param threshold: minimum score a sentence needs to be kept
  :return: the selected sentences, each prefixed with "\\n", as one string
  """
  chosen=[
    candidate
    for candidate in sentences
    if candidate[:7] in sentence_weight and sentence_weight[candidate[:7]]>=threshold
  ]
  return "".join("\n"+candidate for candidate in chosen)

In [None]:
def run_article_summary(article):
  """Produce an extractive summary of ``article``.

  Steps: build the word-frequency table, split the article into sentences,
  score every sentence, use the average score as the threshold, and keep
  the sentences that reach it.

  :param article: raw article text
  :return: the summary string; empty when no sentence could be scored
  """
  import nltk
  # Fetch the tokenizer data before anything tokenizes: the frequency table
  # is built with word_tokenize, so the download has to come first.
  nltk.download('punkt')
  freq_table=_create_dictionary_table(article)
  sentences=sent_tokenize(article)
  sentence_scores=cal_sentence_scores(sentences,freq_table)
  # With no scored sentence there is no average to compute and nothing to keep.
  if not sentence_scores:
    return ""
  threshold=calculate_average_score(sentence_scores)
  article_summary=get_article_summary(sentences,sentence_scores,threshold)
  return article_summary

In [None]:
if __name__=='__main__':
  # Make sure the punkt tokenizer data is available before summarising.
  import nltk
  nltk.download('punkt')
  # `summary` is assigned at notebook (module) level on purpose: the later
  # display_string_matching cells read it to highlight the overlap with
  # article_content.
  summary=run_article_summary(article_content)
  print(summary)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.



The Earth's sixth mass extinction event, the Holocene extinction, continued, and human conservation efforts increased.
Germany returned as a great power in 1933, when the Nazi Party replaced the Weimar Republic as the new government of Germany.
The end of World War II saw the United States, France, United Kingdom, Soviet Union and the Republic of China emerging as the primary victors.
Political pressures from the United States, the Soviet Union and the United Nations led France and the United Kingdom to withdraw from Egypt during the Suez Crisis.
The episode demonstrated to the world that the United Kingdom and France had ceased to be global superpowers.
China began rising rapidly as an economic and geopolitical power after the USSR's collapse.
[12] It was the last century of the 2nd millennium.
Humans explored space for the first time, taking their first footsteps on the Moon.
World War II alone killed over 60 million people, while nuclear weapons gave humankind the means to annihila

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import difflib
def utils_split_sentences(a, b):
    '''
    Find the matching substrings in 2 strings.

    Matching blocks longer than 20 characters (as reported by
    difflib.SequenceMatcher) are kept as single segments; the text before,
    between and after those blocks is split with nltk.sent_tokenize. When no
    block is long enough, both strings are simply sentence-tokenized.
    :parameter
        :param a: string - raw text
        :param b: string - raw text
    :return
        2 lists (segments of a, segments of b) used to display matches
    '''
    matcher = difflib.SequenceMatcher(isjunk=None, a=a, b=b, autojunk=True)
    long_blocks = [blk for blk in matcher.get_matching_blocks() if blk.size > 20]

    ## difflib didn't find any match
    if not long_blocks:
        return nltk.sent_tokenize(a), nltk.sent_tokenize(b)

    sizes = [blk.size for blk in long_blocks]

    def _segment(text, starts):
        # Interleave sentence-tokenized gaps with the verbatim matched blocks.
        pieces = list(nltk.sent_tokenize(text[0 : starts[0]]))
        for pos, (start, size) in enumerate(zip(starts, sizes)):
            stop = start + size
            pieces.append(text[start : stop])
            # gap between this block and the next one (none after the last)
            if pos + 1 < len(starts):
                pieces.extend(nltk.sent_tokenize(text[stop : starts[pos + 1]]))
        # whatever follows the last matched block
        tail_from = starts[-1] + sizes[-1]
        pieces.extend(nltk.sent_tokenize(text[tail_from :]))
        return pieces

    ## work with matches
    lst_a = _segment(a, [blk.a for blk in long_blocks])
    lst_b = _segment(b, [blk.b for blk in long_blocks])
    return lst_a, lst_b

import html
import re
def display_string_matching(a, b, both=True, sentences=True, titles=[]):
    '''
    Highlights the matched strings in text.

    Fragments of a (and optionally b) whose lowercase, punctuation-stripped
    form also occurs among the other text's fragments are wrapped in a
    highlighted <span>. The text of a and b is HTML-escaped (&, <, >) before
    being embedded, so markup-like characters in the raw text are shown
    literally. titles are inserted as given, without escaping.
    :parameter
        :param a: string - raw text
        :param b: string - raw text
        :param both: bool - search a in b and, if True, viceversa
        :param sentences: bool - if False matches single words
        :param titles: list - optional headings; titles[0] for a, titles[1] for b
    :return
        text html, it can be visualized on notebook with display(HTML(text))
    '''
    span_open = '<span style="background-color:rgba(255,215,0,0.3);">'

    def _normalize(fragment):
        # comparison key: lowercase with punctuation removed
        return re.sub(r'[^\w\s]', '', fragment.lower())

    def _highlight(fragments, other_keys):
        # Wrap every fragment whose key appears on the other side.
        rendered = []
        for fragment in fragments:
            safe = html.escape(fragment, quote=False)
            if _normalize(fragment) in other_keys:
                rendered.append(span_open + safe + '</span>')
            else:
                rendered.append(safe)
        return ' '.join(rendered)

    if sentences is True:
        lst_a, lst_b = utils_split_sentences(a, b)
    else:
        lst_a, lst_b = a.split(), b.split()

    ## highlight a
    # The key sets are built once; rebuilding them for every fragment made
    # the comparison quadratic in the number of fragments.
    first_text = _highlight(lst_a, {_normalize(z) for z in lst_b})

    ## highlight b
    if both is True:
        second_text = _highlight(lst_b, {_normalize(z) for z in lst_a})
    else:
        second_text = html.escape(b, quote=False)

    ## concatenate
    if len(titles) > 0:
        first_text = "<strong>"+titles[0]+"</strong><br>"+first_text
    if len(titles) > 1:
        second_text = "<strong>"+titles[1]+"</strong><br>"+second_text
    else:
        # no heading for b: separate the two texts with a rule of dashes
        second_text = "---"*65+"<br><br>"+second_text
    final_text = first_text +'<br><br>'+ second_text
    return final_text

In [None]:
# Word-level comparison of the article and its summary.
# display/HTML are imported from IPython.display: importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
match = display_string_matching(article_content, summary, both=True, sentences=False, titles=["Full text", "Predicted Summary"])
display(HTML(match))

In [None]:
# Sentence-level comparison of the article and its summary.
# display/HTML are imported from IPython.display: importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
match = display_string_matching(article_content, summary, both=True, sentences=True, titles=["Full Text", "Predicted Summary"])

display(HTML(match))