<a href="https://colab.research.google.com/github/ace26597/DataSummarization/blob/master/DataSummarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import bs4 as bs   #beautiful soup , useful Python utility for web scraping

In [40]:
import urllib.request    # parse XML and HTML

In [41]:
import re     #for regular expressions

In [42]:
import nltk         #for nlp processing

In [43]:
from nltk.tokenize import word_tokenize

In [44]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [45]:
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [46]:
scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Elon_Musk')     #accessing/scraping wiki data

In [47]:
article = scraped_data.read()    #reading the data byte-by-byte

In [48]:
print(article)



In [49]:
parsed_article = bs.BeautifulSoup(article,'lxml')      #to parse the data

In [50]:
paragraphs = parsed_article.find_all('p')     #text enclosed within <p> tag is retrieved

In [51]:
article_text = ""     #to combine the paragraphs

In [52]:
for p in paragraphs:  
    article_text += p.text

In [53]:
#Pre-processing
#Removing square brackets and extra spaces
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)  
article_text = re.sub(r'\s+', ' ', article_text)

In [54]:
print(article_text)



In [55]:
#Removing special characters and digits
formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )  
formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

In [56]:
print(formatted_article_text)



In [57]:
nltk.download('punkt')
text = word_tokenize(article_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [58]:
nltk.download('averaged_perceptron_tagger')
tagged = nltk.pos_tag(text)
print(tagged)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [59]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [60]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(tagged)
print(cs)

(S
  Elon/NNP
  Reeve/NNP
  Musk/NNP
  FRS/NNP
  (/(
  /ˈiːlɒn//JJ
  EE-lon/NNP
  ;/:
  born/VBN
  June/NNP
  28/CD
  ,/,
  1971/CD
  )/)
  is/VBZ
  (NP a/DT business/NN)
  (NP magnate/NN)
  ,/,
  (NP industrial/JJ designer/NN)
  ,/,
  and/CC
  (NP engineer/NN)
  ./.
  He/PRP
  is/VBZ
  (NP the/DT founder/NN)
  ,/,
  CEO/NNP
  ,/,
  CTO/NNP
  ,/,
  and/CC
  (NP chief/JJ designer/NN)
  of/IN
  SpaceX/NNP
  ;/:
  (NP early/JJ stage/NN)
  (NP investor/NN)
  ,/,
  [/NNP
  note/VBP
  2/CD
  ]/NNP
  CEO/NNP
  ,/,
  and/CC
  (NP product/NN)
  (NP architect/NN)
  of/IN
  Tesla/NNP
  ,/,
  Inc./NNP
  ;/:
  (NP founder/NN)
  of/IN
  The/DT
  Boring/NNP
  Company/NNP
  ;/:
  (NP co-founder/NN)
  of/IN
  Neuralink/NNP
  ;/:
  and/CC
  (NP co-founder/NN)
  and/CC
  (NP initial/JJ co-chairman/NN)
  of/IN
  OpenAI/NNP
  ./.
  A/NNP
  (NP centibillionaire/NN)
  ,/,
  Musk/NNP
  is/VBZ
  one/CD
  of/IN
  the/DT
  richest/JJS
  people/NNS
  in/IN
  (NP the/DT world/NN)
  ./.
  Musk/NNP
  was/VBD
  born/

In [61]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

In [62]:
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)
#B - beginning, O - outsite, I - Internal

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 ('cohesive', 'JJ', 'I-NP'),
 ('business', 'NN', 'I-NP'),
 ('model', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('the', 'DT', 'B-NP'),
 ('board', 'NN', 'I-NP'),
 ('ousted', 'VBD', 'O'),
 ('Musk', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('replaced', 'VBD', 'O'),
 ('him', 'PRP', 'O'),
 ('with', 'IN', 'O'),
 ('Thiel', 'NNP', 'O'),
 ('in', 'IN', 'O'),
 ('September', 'NNP', 'O'),
 ('2000', 'CD', 'O'),
 ('.', '.', 'O'),
 ('[', 'JJ', 'B-NP'),
 ('note', 'NN', 'I-NP'),
 ('3', 'CD', 'O'),
 (']', 'NN', 'B-NP'),
 ('Under', 'IN', 'O'),
 ('Thiel', 'NNP', 'O'),
 (',', ',', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('focused', 'VBD', 'O'),
 ('on', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('PayPal', 'NNP', 'O'),
 ('service', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('was', 'VBD', 'O'),
 ('renamed', 'VBN', 'O'),
 ('PayPal', 'NNP', 'O'),
 ('in', 'IN', 'O'),
 ('2001', 'CD', 'O'),
 ('.', '.', 'O'),
 ('In', 'IN', 'O'),
 ('2002', 'CD', 'O'),
 (',', ','

In [63]:
ne_tree = nltk.ne_chunk(nltk.pos_tag(word_tokenize(article_text)))
print(ne_tree)

(S
  (PERSON Elon/NNP)
  (PERSON Reeve/NNP Musk/NNP FRS/NNP)
  (/(
  /ˈiːlɒn//JJ
  EE-lon/NNP
  ;/:
  born/VBN
  June/NNP
  28/CD
  ,/,
  1971/CD
  )/)
  is/VBZ
  a/DT
  business/NN
  magnate/NN
  ,/,
  industrial/JJ
  designer/NN
  ,/,
  and/CC
  engineer/NN
  ./.
  He/PRP
  is/VBZ
  the/DT
  founder/NN
  ,/,
  (ORGANIZATION CEO/NNP)
  ,/,
  (ORGANIZATION CTO/NNP)
  ,/,
  and/CC
  chief/JJ
  designer/NN
  of/IN
  (GPE SpaceX/NNP)
  ;/:
  early/JJ
  stage/NN
  investor/NN
  ,/,
  [/NNP
  note/VBP
  2/CD
  ]/NNP
  CEO/NNP
  ,/,
  and/CC
  product/NN
  architect/NN
  of/IN
  (GPE Tesla/NNP)
  ,/,
  (GPE Inc./NNP)
  ;/:
  founder/NN
  of/IN
  The/DT
  (ORGANIZATION Boring/NNP Company/NNP)
  ;/:
  co-founder/NN
  of/IN
  (GPE Neuralink/NNP)
  ;/:
  and/CC
  co-founder/NN
  and/CC
  initial/JJ
  co-chairman/NN
  of/IN
  (ORGANIZATION OpenAI/NNP)
  ./.
  A/NNP
  centibillionaire/NN
  ,/,
  (PERSON Musk/NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  richest/JJS
  people/NNS
  in/IN
  the/DT
  worl

In [64]:
for sent in nltk.sent_tokenize(article_text):
   for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
      if hasattr(chunk, 'label'):
         print(chunk.label(), ' '.join(c[0] for c in chunk))

PERSON Elon
PERSON Reeve Musk FRS
ORGANIZATION CEO
ORGANIZATION CTO
GPE SpaceX
GPE Tesla
GPE Inc.
ORGANIZATION Boring Company
GPE Neuralink
ORGANIZATION OpenAI
PERSON Musk
PERSON Musk
GPE Canadian
GPE South African
GPE Pretoria
GPE South Africa
ORGANIZATION University
GPE Pretoria
GPE Canada
PERSON Queen
ORGANIZATION University
ORGANIZATION University
GPE Pennsylvania
GPE California
ORGANIZATION Stanford University
PERSON Kimbal
ORGANIZATION Compaq
GPE Musk
ORGANIZATION Confinity
ORGANIZATION PayPal
ORGANIZATION eBay
PERSON Musk
ORGANIZATION SpaceX
ORGANIZATION CEO
ORGANIZATION CTO
PERSON Tesla Motors
GPE Inc.
PERSON Tesla
ORGANIZATION SolarCity
ORGANIZATION OpenAI
PERSON Neuralink
ORGANIZATION Boring Company
GPE Musk
ORGANIZATION Hyperloop
GPE Musk
ORGANIZATION Tham Luang
GPE California
GPE Musk
ORGANIZATION US Securities
ORGANIZATION Exchange Commission
ORGANIZATION SEC
GPE Tesla
ORGANIZATION SEC
GPE Musk
PERSON Elon
PERSON Reeve Musk
GPE Pretoria
GPE South Africa
PERSON Maye Musk
GP

In [65]:
sentence_list = nltk.sent_tokenize(article_text)  #fommatted data doesn;t contain punctuation so article_text used

In [66]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')     #all stopwords stored here

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
word_frequencies = {}    #calculating freq of words(excluding stopwords)

In [68]:
for word in nltk.word_tokenize(formatted_article_text):  
    if word not in stopwords:
        if word not in word_frequencies.keys():    #encountered for the first time
            word_frequencies[word] = 1     
        else:                                      #incrementing count
            word_frequencies[word] += 1

In [69]:
print(word_frequencies)



In [70]:
maximum_frequncy = max(word_frequencies.values())

In [71]:
#Calculating weighted freq of all words
for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)

In [72]:
sentence_scores = {}    #calculating freq of sentences

In [73]:
for sent in sentence_list:  
    for word in nltk.word_tokenize(sent.lower()):   #convert to lowercase
        if word in word_frequencies.keys():        
            if len(sent.split(' ')) < 30:      #for smaller sentences < 30
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]

In [74]:
import heapq    #to retrieve highest scores

In [75]:
summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)

In [76]:
summary = ' '.join(summary_sentences)
print(summary) 

In 2016, he co-founded Neuralink, a neurotechnology company focused on developing brain–computer interfaces, and founded The Boring Company, a tunnel construction company. In 2020, SpaceX launched its first manned flight, the Demo-2, becoming the first private company to place a person into orbit and dock a crewed space-craft with the ISS. A second set of test satellites and the first large deployment of a piece of the constellation occurred in May 2019, when the first 60 operational satellites were launched. The track was used in January 2017, and Musk also announced that the company started a tunnel project with Hawthorne airport as its destination. Musk says that before the company became successful, he could not afford an apartment and instead slept on the office couch and showered at the YMCA. Musk co-founded online bank X.com that same year, which merged with Confinity in 2000 to form the company PayPal and was bought by eBay in 2002 for $1.5 billion. Musk instead decided to star