<a href="https://colab.research.google.com/github/ayushb2002/textSummarization/blob/main/Audio_to_text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Machine Learning algorithm for text summariation and notes making using Natural Language Toolkit

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import os
import nltk
import nltk.corpus
from IPython.display import Image
from punctuator import Punctuator
# Identification of different figures of speech such as nouns, pronouns, verbs etc.
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [6]:
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [7]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [8]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
from nltk.corpus import stopwords # for stopwords

#stopwords.words("english")

In [20]:
para = "Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions. The term may also be applied to any machine that exhibits traits associated with a human mind such as learning and problem-solving."

In [21]:
AI_tokens = word_tokenize(para)

In [22]:
import re

punctuation = re.compile(r'[-.?!,:;()|0-9]')

In [23]:
post_punctuation = []
for words in AI_tokens:
  word = punctuation.sub("", words)
  if len(word)>0:
    post_punctuation.append(word)

In [19]:
len(post_punctuation), post_punctuation

(45,
 ['Artificial',
  'intelligence',
  'AI',
  'refers',
  'to',
  'the',
  'simulation',
  'of',
  'human',
  'intelligence',
  'in',
  'machines',
  'that',
  'are',
  'programmed',
  'to',
  'think',
  'like',
  'humans',
  'and',
  'mimic',
  'their',
  'actions',
  'The',
  'term',
  'may',
  'also',
  'be',
  'applied',
  'to',
  'any',
  'machine',
  'that',
  'exhibits',
  'traits',
  'associated',
  'with',
  'a',
  'human',
  'mind',
  'such',
  'as',
  'learning',
  'and',
  'problemsolving'])

In [None]:
sent1 = "John is eating a delicious cake"
sent1_tokens = word_tokenize(sent1)
for token in sent1_tokens:
  print(nltk.pos_tag([token]))

[('John', 'NNP')]
[('is', 'VBZ')]
[('eating', 'VBG')]
[('a', 'DT')]
[('delicious', 'JJ')]
[('cake', 'NN')]


In [None]:
from nltk import ne_chunk

In [None]:
NE_sent = "The US President stays in the white house"
NE_tokens = word_tokenize(NE_sent)
NE_tags = nltk.pos_tag(NE_tokens)

In [None]:
NE_ner = ne_chunk(NE_tags)
print(NE_ner)

(S
  The/DT
  (ORGANIZATION US/NNP)
  President/NNP
  stays/VBZ
  in/IN
  the/DT
  white/JJ
  house/NN)


In [None]:
pip install ghostscript

Collecting ghostscript
  Downloading ghostscript-0.7-py2.py3-none-any.whl (25 kB)
Installing collected packages: ghostscript
Successfully installed ghostscript-0.7


In [None]:
# Process of chunking

newStr = "The big cat ate the little mouse who was after the fresh cheese"
newToken = nltk.pos_tag(word_tokenize(newStr))
newToken

[('The', 'DT'),
 ('big', 'JJ'),
 ('cat', 'NN'),
 ('ate', 'VBD'),
 ('the', 'DT'),
 ('little', 'JJ'),
 ('mouse', 'NN'),
 ('who', 'WP'),
 ('was', 'VBD'),
 ('after', 'IN'),
 ('the', 'DT'),
 ('fresh', 'JJ'),
 ('cheese', 'NN')]

In [None]:
# Creating a grammer which we want in chunk phrase 

grammer_np = r"NP: {<DT>?<JJ>*<NN>}"

In [None]:
chunk_parser = nltk.RegexpParser(grammer_np)

In [None]:
chunk_result = chunk_parser.parse(newToken)
chunk_result # Giving error as colab does not support this method. Run on system to avoid this error.

TclError: ignored

Tree('S', [Tree('NP', [('The', 'DT'), ('big', 'JJ'), ('cat', 'NN')]), ('ate', 'VBD'), Tree('NP', [('the', 'DT'), ('little', 'JJ'), ('mouse', 'NN')]), ('who', 'WP'), ('was', 'VBD'), ('after', 'IN'), Tree('NP', [('the', 'DT'), ('fresh', 'JJ'), ('cheese', 'NN')])])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 

In [None]:
print(os.listdir(nltk.data.find("corpora")))

['movie_reviews.zip', 'movie_reviews', 'stopwords.zip', 'stopwords', 'words', 'words.zip']


In [None]:
# To generate synonyms etc.

from nltk.corpus import wordnet
syns = wordnet.synsets("important") 
  
print(syns[0].name()) 
  
print(syns[0].lemmas()[0].name()) 
  
print(syns[0].definition()) 
  
print(syns[0].examples())

important.a.01
important
of great significance or value
['important people', 'the important questions of the day']


In [None]:
from gensim.models import Word2Vec
import nltk
# define training data
content="""Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked.
In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations that can be simple or elaborate, and that share features with other desserts such as pastries, meringues, custards, and pies."""
sentences=nltk.sent_tokenize(content)
words=[]

for i in sentences:
    words.append(nltk.word_tokenize(i))

# train model
model = Word2Vec(words, min_count=1)

# summarize the loaded model
print(model)

# summarize vocabulary
word_vec_words = list(model.wv.vocab)
print(word_vec_words)

# access vector for one word
print(model['sugar'])

# save model
model.save('model.bin')

# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

Word2Vec(vocab=48, size=100, alpha=0.025)
['Cake', 'is', 'a', 'form', 'of', 'sweet', 'food', 'made', 'from', 'flour', ',', 'sugar', 'and', 'other', 'ingredients', 'that', 'usually', 'baked', '.', 'In', 'their', 'oldest', 'forms', 'cakes', 'were', 'modifications', 'bread', 'but', 'now', 'cover', 'wide', 'range', 'preparations', 'can', 'be', 'simple', 'or', 'elaborate', 'share', 'features', 'with', 'desserts', 'such', 'as', 'pastries', 'meringues', 'custards', 'pies']
[-1.0738604e-03 -3.5498452e-03 -3.3292570e-03  4.7651720e-03
  2.1713910e-04 -2.5128417e-03 -1.4641852e-03  2.1422445e-03
 -3.1401056e-03 -3.8788847e-03  1.3065654e-04 -3.3815168e-03
  2.0487199e-03 -4.9280291e-03 -4.0164446e-03 -2.4478142e-03
 -2.9044661e-03 -2.3760861e-03 -3.0464644e-03  3.9215842e-03
 -4.2540435e-04  3.7881292e-03 -4.0674102e-03  4.1024084e-04
 -3.5644886e-03 -4.1863038e-03  3.8727461e-03  2.8823775e-03
  4.4398019e-03 -3.6213074e-03  2.0774538e-03 -1.4005350e-03
  3.8649875e-03  1.8776137e-03  4.6833856



In [None]:
# IMDB dataset classification using tf keras utils and sequential model

import numpy as np
from tensorflow.keras.utils import to_categorical
from keras import models
from keras import layers
from keras.datasets import imdb
 
(train_data, train_target), (test_data, test_target) = imdb.load_data(num_words=10000)
dt = np.concatenate((train_data, test_data), axis=0)
tar = np.concatenate((train_target, test_target), axis=0)
 
def convert(sequences, dimension = 10000):
 results = np.zeros((len(sequences), dimension))
 for i, sequence in enumerate(sequences):
  results[i, sequence] = 1
 return results
 
dt = convert(dt)
tar = np.array(tar).astype("float32")
test_x = dt[:9000]
test_y = tar[:9000]
train_x = dt[9000:]
train_y = tar[9000:]
model = models.Sequential()
# Input - Layer
model.add(layers.Dense(50, activation = "relu", input_shape=(10000, )))
# Hidden - Layers
model.add(layers.Dropout(0.4, noise_shape=None, seed=None))
model.add(layers.Dense(50, activation = "relu"))
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(50, activation = "relu"))
# Output- Layer
model.add(layers.Dense(1, activation = "sigmoid"))
model.summary()
# compiling the model
 
model.compile(
 optimizer = "adam",
 loss = "binary_crossentropy",
 metrics = ["accuracy"]
)
results = model.fit(
 train_x, train_y,
 epochs= 2,
 batch_size = 500,
 validation_data = (test_x, test_y)
)
print("Test-Accuracy:", np.mean(results.history["val_accuracy"]))

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 50)                500050    
                                                                 
 dropout_4 (Dropout)         (None, 50)                0         
                                                                 
 dense_9 (Dense)             (None, 50)                2550      
                                                                 
 dropout_5 (Dropout)         (None, 50)                0         
                                                                 
 dense_10 (Dense)            (None, 50)                2550      
                                                                 
 dense_11 (Dense)            (None, 1)                 51        
                                                                 
Total params: 505,201
Trainable params: 505,201
Non-tr

In [24]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [61]:
para = "The endless source of knowledge, information, entertainment, and training are books. Before the age of the internet, book's were the most dominating source of knowledge. But of course, with technology, the forms of books have changed, and books have become more accessible to everyone. Regardless of all other supplements of books, the contribution and role of books in our life are indispensable. In the education system, books are mostly followed to date for knowledge providing and gain. Books on several subjects enhance several aspects of education and learning. Writers can express their thoughts, views, and observations about any topic through their writings, which are published in books."
para

"The endless source of knowledge, information, entertainment, and training are books. Before the age of the internet, book's were the most dominating source of knowledge. But of course, with technology, the forms of books have changed, and books have become more accessible to everyone. Regardless of all other supplements of books, the contribution and role of books in our life are indispensable. In the education system, books are mostly followed to date for knowledge providing and gain. Books on several subjects enhance several aspects of education and learning. Writers can express their thoughts, views, and observations about any topic through their writings, which are published in books."

In [62]:
sent_list = nltk.sent_tokenize(para)
sent_list

['The endless source of knowledge, information, entertainment, and training are books.',
 "Before the age of the internet, book's were the most dominating source of knowledge.",
 'But of course, with technology, the forms of books have changed, and books have become more accessible to everyone.',
 'Regardless of all other supplements of books, the contribution and role of books in our life are indispensable.',
 'In the education system, books are mostly followed to date for knowledge providing and gain.',
 'Books on several subjects enhance several aspects of education and learning.',
 'Writers can express their thoughts, views, and observations about any topic through their writings, which are published in books.']

In [63]:
post_punctuation = []
for sentences in sent_list:
  sent = punctuation.sub(" ", sentences)
  if len(sent)>0:
    post_punctuation.append(sent)

post_punctuation

['The endless source of knowledge  information  entertainment  and training are books ',
 "Before the age of the internet  book's were the most dominating source of knowledge ",
 'But of course  with technology  the forms of books have changed  and books have become more accessible to everyone ',
 'Regardless of all other supplements of books  the contribution and role of books in our life are indispensable ',
 'In the education system  books are mostly followed to date for knowledge providing and gain ',
 'Books on several subjects enhance several aspects of education and learning ',
 'Writers can express their thoughts  views  and observations about any topic through their writings  which are published in books ']

In [64]:
formatted_str = ' '.join([str(pp) for pp in post_punctuation])
formatted_str

"The endless source of knowledge  information  entertainment  and training are books  Before the age of the internet  book's were the most dominating source of knowledge  But of course  with technology  the forms of books have changed  and books have become more accessible to everyone  Regardless of all other supplements of books  the contribution and role of books in our life are indispensable  In the education system  books are mostly followed to date for knowledge providing and gain  Books on several subjects enhance several aspects of education and learning  Writers can express their thoughts  views  and observations about any topic through their writings  which are published in books "

In [65]:
stopwords = nltk.corpus.stopwords.words('english')

notes = []

for sent in post_punctuation:
  newSent = ""
  if sent not in stopwords:
      newSent+=sent+" "
  notes.append(newSent)

notes

['The endless source of knowledge  information  entertainment  and training are books  ',
 "Before the age of the internet  book's were the most dominating source of knowledge  ",
 'But of course  with technology  the forms of books have changed  and books have become more accessible to everyone  ',
 'Regardless of all other supplements of books  the contribution and role of books in our life are indispensable  ',
 'In the education system  books are mostly followed to date for knowledge providing and gain  ',
 'Books on several subjects enhance several aspects of education and learning  ',
 'Writers can express their thoughts  views  and observations about any topic through their writings  which are published in books  ']

In [66]:
stopwords = nltk.corpus.stopwords.words('english')

word_frequencies = {}
for word in nltk.word_tokenize(formatted_str):
    if word not in stopwords:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

In [67]:
word_frequencies

{"'s": 1,
 'Before': 1,
 'Books': 1,
 'But': 1,
 'In': 1,
 'Regardless': 1,
 'The': 1,
 'Writers': 1,
 'accessible': 1,
 'age': 1,
 'aspects': 1,
 'become': 1,
 'book': 1,
 'books': 7,
 'changed': 1,
 'contribution': 1,
 'course': 1,
 'date': 1,
 'dominating': 1,
 'education': 2,
 'endless': 1,
 'enhance': 1,
 'entertainment': 1,
 'everyone': 1,
 'express': 1,
 'followed': 1,
 'forms': 1,
 'gain': 1,
 'indispensable': 1,
 'information': 1,
 'internet': 1,
 'knowledge': 3,
 'learning': 1,
 'life': 1,
 'mostly': 1,
 'observations': 1,
 'providing': 1,
 'published': 1,
 'role': 1,
 'several': 2,
 'source': 2,
 'subjects': 1,
 'supplements': 1,
 'system': 1,
 'technology': 1,
 'thoughts': 1,
 'topic': 1,
 'training': 1,
 'views': 1,
 'writings': 1}

In [68]:
maximum_frequncy = max(word_frequencies.values())

for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)

In [69]:
sentence_scores = {}
for sent in sent_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            if len(sent.split(' ')) < 30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]

In [70]:
sentence_scores

{"Before the age of the internet, book's were the most dominating source of knowledge.": 1.4285714285714284,
 'Books on several subjects enhance several aspects of education and learning.': 2.428571428571428,
 'But of course, with technology, the forms of books have changed, and books have become more accessible to everyone.': 2.9999999999999996,
 'In the education system, books are mostly followed to date for knowledge providing and gain.': 2.571428571428571,
 'Regardless of all other supplements of books, the contribution and role of books in our life are indispensable.': 2.714285714285714,
 'The endless source of knowledge, information, entertainment, and training are books.': 2.2857142857142856,
 'Writers can express their thoughts, views, and observations about any topic through their writings, which are published in books.': 1.9999999999999998}

In [71]:
import heapq
summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)

summary = ' '.join(summary_sentences)
print(summary)

But of course, with technology, the forms of books have changed, and books have become more accessible to everyone. Regardless of all other supplements of books, the contribution and role of books in our life are indispensable. In the education system, books are mostly followed to date for knowledge providing and gain. Books on several subjects enhance several aspects of education and learning. The endless source of knowledge, information, entertainment, and training are books. Writers can express their thoughts, views, and observations about any topic through their writings, which are published in books. Before the age of the internet, book's were the most dominating source of knowledge.


In [148]:
note = {}

i=0
for key in sentence_scores:
  note[notes[i]] = sentence_scores[key]
  i+=1

notes_sentences = heapq.nlargest(3, note, key=note.get)
notes_sentences

['But of course  with technology  the forms of books have changed  and books have become more accessible to everyone  ',
 'Regardless of all other supplements of books  the contribution and role of books in our life are indispensable  ',
 'In the education system  books are mostly followed to date for knowledge providing and gain  ']

reference from - https://stackabuse.com/text-summarization-with-nltk-in-python/

In [37]:
from nltk.tokenize import word_tokenize, sent_tokenize
import heapq
import nltk
import re
from nltk.corpus import stopwords
# n - number of lines for summary to be generated , para - paragraph to be summarized 

def generateSummary(n, para):
  sent_list = nltk.sent_tokenize(para)
  if n>len(sent_list)/2:
    return "Summary cannot be greater in length than half of provided data!"
  post_punctuation = [] 
  punctuation = re.compile(r'[-.?!,:;()|0-9]')
  for sentences in sent_list:
    sent = punctuation.sub(" ", sentences)
    sent = re.sub(r'\[[0-9]*\]', ' ', sentences)
    sent = re.sub(r'\s+', ' ', sentences)
    if len(sent)>0:
      post_punctuation.append(sent)
  
  formatted_str = ' '.join([str(pp) for pp in post_punctuation])
  stopwords = nltk.corpus.stopwords.words('english')

  word_frequencies = {}
  for word in nltk.word_tokenize(formatted_str):
      if word not in stopwords:
          if word not in word_frequencies.keys():
              word_frequencies[word] = 1
          else:
              word_frequencies[word] += 1

  maximum_frequncy = max(word_frequencies.values())

  for word in word_frequencies.keys():
      word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)  

  sentence_scores = {}
  for sent in sent_list:
      for word in nltk.word_tokenize(sent.lower()):
          if word in word_frequencies.keys():
              if len(sent.split(' ')) < 30:
                  if sent not in sentence_scores.keys():
                      sentence_scores[sent] = word_frequencies[word]
                  else:
                      sentence_scores[sent] += word_frequencies[word]  

  summary_sentences = heapq.nlargest(n, sentence_scores, key=sentence_scores.get)

  summary = ' '.join(summary_sentences)
  return summary

In [38]:
para1 = "France (French: [fʁɑ̃s] Listen), officially the French Republic (French: République française[12]), is a transcontinental country spanning Western Europe and overseas regions and territories in the Americas and the Atlantic, Pacific and Indian Oceans.[XII] Its metropolitan area extends from the Rhine to the Atlantic Ocean and from the Mediterranean Sea to the English Channel and the North Sea; overseas territories include French Guiana in South America, Saint Pierre and Miquelon in the North Atlantic, the French West Indies, and many islands in Oceania and the Indian Ocean. Due to its several coastal territories, France has the largest exclusive economic zone in the world. France borders Belgium, Luxembourg, Germany, Switzerland, Monaco, Italy, Andorra and Spain in Europe, as well as the Netherlands, Suriname and Brazil in the Americas. Its eighteen integral regions (five of which are overseas) span a combined area of 643,801 km2 (248,573 sq mi) and over 67 million people (as of May 2021).[3] France is a unitary semi-presidential republic with its capital in Paris, the country's largest city and main cultural and commercial centre; other major urban areas include Marseille, Lyon, Toulouse, Lille, Bordeaux, and Nice. Inhabited since the Palaeolithic era, the territory of Metropolitan France was settled by Celtic tribes known as Gauls during the Iron Age. Rome annexed the area in 51 BC, leading to a distinct Gallo-Roman culture that laid the foundation of the French language. The Germanic Franks formed the Kingdom of Francia, which became the heartland of the Carolingian Empire. The Treaty of Verdun of 843 partitioned the empire, with West Francia becoming the Kingdom of France in 987. In the High Middle Ages, France was a powerful but highly decentralised feudal kingdom. Philip II successfully strengthened royal power and defeated his rivals to double the size of the crown lands; by the end of his reign, France had emerged as the most powerful state in Europe. From the mid-14th to the mid-15th century, France was plunged into a series of dynastic conflicts involving England, collectively known as the Hundred Years' War, and a distinct French identity emerged as a result. The French Renaissance saw art and culture flourish, conflict with the House of Habsburg, and the establishment of a global colonial empire, which by the 20th century would become the second-largest in the world.[13] The second half of the 16th century was dominated by religious civil wars between Catholics and Huguenots that severely weakened the country. France again emerged as Europe's dominant power in the 17th century under Louis XIV following the Thirty Years' War.[14] Inadequate economic policies, inequitable taxes and frequent wars (notably a defeat in the Seven Years' War and costly involvement in the American War of Independence), left the kingdom in a precarious economic situation by the end of the 18th century. This precipitated the French Revolution of 1789, which overthrew the Ancien Régime and produced the Declaration of the Rights of Man, which expresses the nation's ideals to this day."

In [41]:
summ = generateSummary(9, para1)
summ

"France borders Belgium, Luxembourg, Germany, Switzerland, Monaco, Italy, Andorra and Spain in Europe, as well as the Netherlands, Suriname and Brazil in the Americas. This precipitated the French Revolution of 1789, which overthrew the Ancien Régime and produced the Declaration of the Rights of Man, which expresses the nation's ideals to this day. Its eighteen integral regions (five of which are overseas) span a combined area of 643,801 km2 (248,573 sq mi) and over 67 million people (as of May 2021). Due to its several coastal territories, France has the largest exclusive economic zone in the world. Rome annexed the area in 51 BC, leading to a distinct Gallo-Roman culture that laid the foundation of the French language. The Treaty of Verdun of 843 partitioned the empire, with West Francia becoming the Kingdom of France in 987. Inhabited since the Palaeolithic era, the territory of Metropolitan France was settled by Celtic tribes known as Gauls during the Iron Age. The Germanic Franks 

In [59]:
ntags = nltk.pos_tag(nltk.word_tokenize(summ))
print(ntags)

[('France', 'NNP'), ('borders', 'NNS'), ('Belgium', 'NNP'), (',', ','), ('Luxembourg', 'NNP'), (',', ','), ('Germany', 'NNP'), (',', ','), ('Switzerland', 'NNP'), (',', ','), ('Monaco', 'NNP'), (',', ','), ('Italy', 'NNP'), (',', ','), ('Andorra', 'NNP'), ('and', 'CC'), ('Spain', 'NNP'), ('in', 'IN'), ('Europe', 'NNP'), (',', ','), ('as', 'RB'), ('well', 'RB'), ('as', 'IN'), ('the', 'DT'), ('Netherlands', 'NNP'), (',', ','), ('Suriname', 'NNP'), ('and', 'CC'), ('Brazil', 'NNP'), ('in', 'IN'), ('the', 'DT'), ('Americas', 'NNPS'), ('.', '.'), ('This', 'DT'), ('precipitated', 'VBD'), ('the', 'DT'), ('French', 'JJ'), ('Revolution', 'NNP'), ('of', 'IN'), ('1789', 'CD'), (',', ','), ('which', 'WDT'), ('overthrew', 'VBD'), ('the', 'DT'), ('Ancien', 'NNP'), ('Régime', 'NNP'), ('and', 'CC'), ('produced', 'VBD'), ('the', 'DT'), ('Declaration', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('Rights', 'NNPS'), ('of', 'IN'), ('Man', 'NNP'), (',', ','), ('which', 'WDT'), ('expresses', 'VBZ'), ('the', 'DT'), 

In [140]:
ntags = nltk.pos_tag(nltk.word_tokenize(notes_sentences[0]))
ntags

[('But', 'CC'),
 ('of', 'IN'),
 ('course', 'NN'),
 ('with', 'IN'),
 ('technology', 'NN'),
 (',', ','),
 (',', ','),
 ('the', 'DT'),
 ('forms', 'NNS'),
 ('of', 'IN'),
 ('books', 'NNS'),
 ('have', 'VBP'),
 ('changed', 'VBN'),
 ('and', 'CC'),
 ('books', 'NNS'),
 ('have', 'VBP'),
 ('become', 'VBN'),
 ('more', 'RBR'),
 ('accessible', 'JJ'),
 ('to', 'TO'),
 ('everyone', 'NN'),
 ('..', 'NN')]

In [60]:
print(nltk.pos_tag(nltk.word_tokenize('nations')))

[('nations', 'NNS')]


In [174]:
def punctuate(para):
  after = ['NN', 'NNP', 'NNPS', 'NNS']
  before = ['WDT']
  avoids = ['IN', 'CC', 'VBP']
  para = re.sub(' +', ' ', para)
  tagDef = nltk.pos_tag(nltk.word_tokenize(para))
  n = len(tagDef)
  for i in range(0,n-1):
    words = para.split()
    if tagDef[i][1] == 'NNS' and tagDef[i+1][1] == 'NNS':  
      temp = len(words[i])-1
      newWord = words[i][:temp-1]+"'s"
      words.pop(i)
      words.insert(i, newWord)
    elif tagDef[i][1] == 'NN' and tagDef[i+1][1] == 'NN':
      pass
    elif tagDef[i][1] in after and tagDef[i+1][1] not in avoids:
      words.insert(i+1, ',')
    elif tagDef[i][1] in before:
      words.insert(i-1, ',')
    elif tagDef[i][1] == ',' and tagDef[i+1][1] == ',':
      words.pop(i)
      words.pop(i+1)
    para = ' '.join(str(x) for x in words)
    tagDef = nltk.pos_tag(nltk.word_tokenize(para))
    n = len(tagDef)

  return para

In [159]:
punctuated_notes = []
for i in range(len(notes_sentences)):
   punctuated_notes.append(punctuate(notes_sentences[i]))

punctuated_notes

['But of course with technology , the forms of books have changed and books have become more accessible to everyone.',
 'Regardless of all other supplements of books , the contribution and role of books in our life are indispensable.',
 'In the education system , books are mostly followed to date for knowledge providing and gain.']

In [191]:
def generateNotes(para, n=3):
  sent_list = nltk.sent_tokenize(para)

  if n>len(sent_list)/2:
      return "Summary cannot be greater in length than half of provided data!"

  post_punctuation = [] 
  punctuation = re.compile(r'[-.?!,:;()|0-9]')
  for sentences in sent_list:
    sent = punctuation.sub(" ", sentences)
    sent = re.sub(r'\[[0-9]*\]', ' ', sentences)
    sent = re.sub(r'\s+', ' ', sentences)
    sent = re.sub("[\(\[].*?[\)\]]", "", sentences)
    if len(sent)>0:
      post_punctuation.append(sent)

  formatted_str = ' '.join([str(pp) for pp in post_punctuation])
  stopwords = nltk.corpus.stopwords.words('english')

  word_frequencies = {}
  for word in nltk.word_tokenize(formatted_str):
      if word not in stopwords:
          if word not in word_frequencies.keys():
              word_frequencies[word] = 1
          else:
              word_frequencies[word] += 1

  maximum_frequncy = max(word_frequencies.values())

  for word in word_frequencies.keys():
      word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)  

  sentence_scores = {}
  for sent in sent_list:
      for word in nltk.word_tokenize(sent.lower()):
          if word in word_frequencies.keys():
              if len(sent.split(' ')) < 30:
                  if sent not in sentence_scores.keys():
                      sentence_scores[sent] = word_frequencies[word]
                  else:
                      sentence_scores[sent] += word_frequencies[word]  

  notes = []
  for sent in post_punctuation:
    newSent = ""
    if sent not in stopwords:
        newSent+=sent+" "
    notes.append(newSent)

    note = {}

  i=0
  for key in sentence_scores:
    note[notes[i]] = sentence_scores[key]
    i+=1

  notes_sentences = heapq.nlargest(n, note, key=note.get)
  
  # punctuated_notes = []
  # for i in range(len(notes_sentences)):
  #   punctuated_notes.append(punctuate(notes_sentences[i]))

  return notes_sentences

In [193]:
notes = generateNotes(para1, 5)
for i in range(len(notes)):
  print(f'{i+1}: {notes[i]}\n')

1:  Its metropolitan area extends from the Rhine to the Atlantic Ocean and from the Mediterranean Sea to the English Channel and the North Sea; overseas territories include French Guiana in South America, Saint Pierre and Miquelon in the North Atlantic, the French West Indies, and many islands in Oceania and the Indian Ocean. 

2: In the High Middle Ages, France was a powerful but highly decentralised feudal kingdom. 

3: France  Listen), officially the French Republic ), is a transcontinental country spanning Western Europe and overseas regions and territories in the Americas and the Atlantic, Pacific and Indian Oceans. 

4: Its eighteen integral regions  span a combined area of 643,801 km2  and over 67 million people . 

5: Inhabited since the Palaeolithic era, the territory of Metropolitan France was settled by Celtic tribes known as Gauls during the Iron Age. 

