In [1]:
from bs4 import BeautifulSoup as bs
import re
from urllib.request import urlopen
import nltk
from nltk.stem import WordNetLemmatizer 

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aman2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aman2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aman2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
lemmatizer = WordNetLemmatizer() 
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('.')

# Scraping raw data

In [4]:


def scrap_text(title):
    """
    scrap text data from wikipedia of given title
    
    """
    
    wiki = "https://en.wikipedia.org/wiki/"+title
    scraped_data = urlopen(wiki)
    article = scraped_data.read()
    parsed_article = bs(article,'html.parser')
    paragraphs = parsed_article.find_all('p')
    article_text = ""

    for p in paragraphs:
        article_text += p.text
        
    return article_text

# Text Preprocessing

In [5]:
def preprocessing_text(article_text):
    """
    removing digits and special sumbols
    """
    
    article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
    article_text = re.sub(r'\s+', ' ', article_text)
    
    formatted_article_text = re.sub('[^a-zA-Z\.]', ' ', article_text )
    formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)
    return formatted_article_text

- ## Convert paragraphs to sentences
- ## Tokenizing the sentences
- ## Find weighted frequency of occurrence
- ## Replace words by weighted frequency in sentences

In [6]:
def find_word_frequencies(article_text,lemmatizer = lemmatizer,stopwords = stopwords):
    word_frequencies = {}
    for w in nltk.word_tokenize(article_text):
        if w not in stopwords:
            w = w.lower()
            word = lemmatizer.lemmatize(w)
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
    return word_frequencies

In [7]:
def find_weighted_word_frequencies(word_frequencies):
    max_frequency = max(word_frequencies.values())
    for i in word_frequencies:
        word_frequencies[i] /= max_frequency 
    return word_frequencies


In [8]:
def find_sentence_weight(sentence_list,word_frequencies):
    sentences = []
    weighted_sentences = []
    for sent in sentence_list:
        w_sent = []
        weight = 0
        for word in nltk.word_tokenize(sent):
            if(word in word_frequencies):
                weight+=word_frequencies[word]
                w_sent.append(word_frequencies[word])
        weighted_sentences.append(w_sent)
        weight = weight/len(sent)
        sentences.append((sent,weight))
    return sentences,weighted_sentences

# Sort sentences in descending order of weights


In [9]:
def select_top_sentences(article_text,k = 0.3,get_weighted_sentences = False):
    word_frequencies = find_word_frequencies(article_text)   
    word_frequencies = find_weighted_word_frequencies(word_frequencies)
    sentence_list = nltk.sent_tokenize(article_text)
    sentences,w_s = find_sentence_weight(sentence_list,word_frequencies)
    '''
    In w_s words are replaced by weighted frequency in sentences
    
    '''
    if(get_weighted_sentences):
        return w_s
    
    sorted_sentences = sorted(sentences,key = lambda x: -x[1])
    length = int(len(sentences)*k)
    selected_sentences = sorted_sentences[:length]
    selected_sentences = [sent[0] for sent in selected_sentences]
    return selected_sentences
    
    

In [10]:
def replace_words_in_sentences(sentences,word_frequencies,lemmatizer = lemmatizer,stopwords = stopwords):
    weighted_sent = []
    for sent in sentences:
        w_sent = []
        for w in nltk.word_tokenize(sent):
            if w not in stopwords:
                w = w.lower()
                word = lemmatizer.lemmatize(w)
                if word in word_frequencies.keys():
                    w_sent.append(word)
        weighted_sent.append(w_sent)

# Summarizing the Article

In [11]:
def text_summarization_pipeline(Titles =['science'],k = 0.1,get_weighted_sentences = False):
    for title in Titles:
        print()
        print("{} Summary-------------------------------------------------".format(title.upper()))
        print()
        article_text = scrap_text(title)
        article_text = preprocessing_text(article_text)
        top_sentences = select_top_sentences(article_text,k =k,get_weighted_sentences = get_weighted_sentences)
        if(get_weighted_sentences):
            return top_sentences
        return ' '.join(top_sentences)

In [12]:
text_summarization_pipeline()


SCIENCE Summary-------------------------------------------------



'Modern science is commonly divided into three major branches natural science social science and formal science. it integrates various themes and activities such as science communication science museums science festivals science fairs citizen science and science in popular culture. It can be divided into two main branches life science or biological science and physical science. Formal science is involved in the study of formal systems. of PhDs in science and engineering fields. Modern natural science is the successor to the natural philosophy that began in Ancient Greece. This is especially the case in the more macroscopic fields of science e.g. There are different schools of thought in the philosophy of science. Tangentially the science fiction genre primarily fantastic in nature engages the public imagination and transmits the ideas if not the methods of science. Modern science is distinct in its approach and successful in its results so it now defines what science is in the strictes

In [13]:
Titles = ['technology','space']
text_summarization_pipeline(Titles = Titles,k = 0.05)


TECHNOLOGY Summary-------------------------------------------------



'Theories of technology often attempt to predict the future of technology based on the high technology and science of the time. that is created by technology. State of the art technology refers to the high technology available to humanity in any field. The use of the term technology has changed significantly over the last years. Theories and concepts in technology Economics of technology Technology journalism Other The simplest form of technology is the development and use of basic tools. By the s technology referred not only to the study of the industrial arts but to the industrial arts themselves. Generally technicism is the belief in the utility of technology for improving human societies. Information technology subsequently led to the birth in the s of the Internet which ushered in the current Information Age. The term is often used to imply a specific field of technology or to refer to high technology or just consumer electronics rather than technology as a whole.'

# Getting Weighted sentences replace by weighted word

In [14]:


Titles = ['Science']
text_summarization_pipeline(Titles = Titles,k = 0.05,get_weighted_sentences=True)


SCIENCE Summary-------------------------------------------------



[[0.3395061728395062,
  0.05555555555555555,
  0.006172839506172839,
  0.012345679012345678,
  0.25925925925925924,
  0.037037037037037035,
  0.037037037037037035,
  0.018518518518518517,
  0.018518518518518517,
  0.006172839506172839,
  0.25925925925925924,
  0.18518518518518517,
  0.3395061728395062,
  0.030864197530864196,
  0.012345679012345678,
  0.3395061728395062,
  0.037037037037037035],
 [0.018518518518518517,
  1.0,
  0.006172839506172839,
  0.18518518518518517,
  0.018518518518518517],
 [0.08024691358024691,
  0.037037037037037035,
  0.08641975308641975,
  0.006172839506172839,
  0.006172839506172839,
  0.25925925925925924,
  0.12345679012345678,
  0.06790123456790123,
  0.037037037037037035,
  0.006172839506172839,
  0.09259259259259259,
  0.07407407407407407,
  0.012345679012345678,
  0.18518518518518517,
  0.3395061728395062,
  0.04938271604938271,
  0.08024691358024691,
  0.06172839506172839,
  0.006172839506172839,
  0.25925925925925924],
 [0.3395061728395062,
  0.01851