In [54]:
import pandas as pd
from keybert import KeyBERT
from bertopic import BERTopic
from simplet5 import SimpleT5
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import json

KeyBERT - Extracting Keywords from the corpus.

BERTopic - Retrive the topic of give input text.

SimpleT5 - Utilized to get the Title of the text



In [55]:
!pip install keybert
!pip install bertopic
!pip install simplet5



In [56]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Fetch the data from predefined corpus

In [57]:
docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']

**docs[0] is our input text for rest of the implementation**

In [58]:
input_text = docs[0]
print(input_text)



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




**Keywords Extraction**

In [59]:
model_keyBert = KeyBERT('distilbert-base-nli-mean-tokens')

In [60]:
keywords = model_keyBert.extract_keywords(input_text, highlight = True)

In [61]:
keywords

[('killing', 0.1001),
 ('playoffs', 0.0917),
 ('devils', 0.0917),
 ('massacre', 0.0858),
 ('bashers', 0.0443)]

In [62]:
keyword_items = list(map(lambda x:x[0], keywords))
print(keyword_items)

['killing', 'playoffs', 'devils', 'massacre', 'bashers']


**Abstractive Summarization**

In [63]:
!pip install transformers



In [64]:
from transformers import pipeline
summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base")
summarizer(input_text, min_length=5, max_length=100)

[{'summary_text': "mike downey: some pens fans are puzzled about lack of posts about recent massacre . he says the pens are killing the Devils worse than i thought . downee: i am going to put an end to non-PIttsburghers' relief with praise for the Pens ."}]

In [65]:
Abs_summ = summarizer(input_text, min_length=5, max_length=100)
Abs_summ

[{'summary_text': "mike downey: some pens fans are puzzled about lack of posts about recent massacre . he says the pens are killing the Devils worse than i thought . downee: i am going to put an end to non-PIttsburghers' relief with praise for the Pens ."}]

**Extractive summarizer**

In [66]:
!pip install bert-extractive-summarizer



In [67]:
from summarizer import Summarizer
body = input_text
model = Summarizer()
result1 = model.run_embeddings(body, ratio=0.2)  # Specified with ratio. 
result2 = model.run_embeddings(body, num_sentences=3)  # Will return (3, N) embedding numpy matrix.
result3 = model.run_embeddings(body, num_sentences=3, aggregate='mean')  # Will return Mean aggregate over embeddings. 
result1,result2,result3

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(array([[-0.16550298, -0.43937498, -0.496167  , ..., -0.30777082,
          0.6834121 , -0.0481769 ],
        [-0.13165128, -0.5110878 , -0.40450478, ..., -0.40772933,
          0.2574005 ,  0.12032402]], dtype=float32),
 array([[-0.16550298, -0.43937498, -0.496167  , ..., -0.30777082,
          0.6834121 , -0.0481769 ],
        [-1.1733667 , -1.1871829 , -0.32441446, ...,  0.08517452,
          0.8798207 ,  0.58078945],
        [-0.13165128, -0.5110878 , -0.40450478, ..., -0.40772933,
          0.2574005 ,  0.12032402]], dtype=float32),
 array([-0.49017367, -0.71254855, -0.40836206, ..., -0.21010853,
         0.60687774,  0.21764553], dtype=float32))

In [68]:
result = model(body, min_length=60)
full = ''.join(result)
print(full)

I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens.


One line summary

In [69]:
from transformers import pipeline
summarizer = pipeline("summarization", model="snrspeaks/t5-one-line-summary", tokenizer="snrspeaks/t5-one-line-summary")
title = summarizer(input_text, min_length=5, max_length=100)

In [70]:
title

[{'summary_text': 'The Pens RULE!'}]

**Contextual topic modeling**

In [71]:
!pip install contextualized-topic-models==2.2.0



In [72]:
!pip install pyldavis



In [73]:
from contextualized_topic_models.models.ctm import ZeroShotTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing


In [74]:
nltk.download('stopwords')

documents = docs[0:5]
sp = WhiteSpacePreprocessing(documents, stopwords_language='english')

preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
preprocessed_documents[:2]

['sure bashers pens fans pretty confused lack kind posts recent pens massacre devils actually bit puzzled bit relieved however going put end non pittsburghers relief bit praise pens man killing devils worse thought jagr showed much better regular season stats also lot fo fun watch playoffs bowman let jagr lot fun next couple games since pens going beat pulp jersey anyway disappointed see islanders lose final regular season game pens rule',
 'brother market high performance video card supports vesa local bus ram anyone suggestions ideas diamond stealth pro local bus orchid farenheit ati graphics ultra pro high performance vlb card please post email thank matt']

In [76]:
tp = TopicModelDataPreparation("paraphrase-multilingual-mpnet-base-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [77]:
tp.vocab[:10]

['ability',
 'actually',
 'also',
 'ancient',
 'announced',
 'another',
 'anyone',
 'anyway',
 'april',
 'aquire']

In [78]:
ctm = ZeroShotTM(bow_size=len(tp.vocab), contextual_size=768, n_components=50, num_epochs=20)
ctm.fit(training_dataset) # run the model

Epoch: [20/20]	 Seen Samples: [100/100]	Train Loss: 451.3189453125	Time: 0:00:00.444889: : 20it [00:09,  2.09it/s]


In [79]:
ctm.get_topic_lists(5)

[['disappointed', 'upsate', 'pay', 'lie', 'compatability'],
 ['watch', 'end', 'forgot', 'much', 'lose'],
 ['dream', 'government', 'area', 'killings', 'containing'],
 ['anyone', 'device', 'drive', 'formatters', 'system'],
 ['inexpensive', 'swedish', 'called', 'seen', 'pro'],
 ['massacre', 'question', 'disk', 'thought', 'modern'],
 ['diamond', 'fans', 'nothing', 'killer', 'pay'],
 ['send', 'actually', 'transfers', 'device', 'used'],
 ['july', 'price', 'post', 'start', 'tape'],
 ['also', 'os', 'matt', 'ready', 'market'],
 ['beckup', 'islanders', 'showed', 'thank', 'posts'],
 ['attached', 'see', 'wanting', 'showed', 'old'],
 ['aquire', 'recent', 'tasking', 'next', 'devices'],
 ['actually', 'armenian', 'high', 'government', 'armenia'],
 ['fo', 'seeked', 'send', 'july', 'butter'],
 ['going', 'give', 'personel', 'armenian', 'program'],
 ['season', 'program', 'vlb', 'showed', 'seeked'],
 ['planes', 'suggestions', 'attack', 'data', 'high'],
 ['pretty', 'idea', 'vesa', 'dma', 'market'],
 ['weapo

**Question generation**

In [80]:
from transformers import pipeline
text2text_generator = pipeline("text2text-generation",model="ZhangCheng/T5v1.1-Base-Fine-Tuned-for-Question-Generation",tokenizer="ZhangCheng/T5v1.1-Base-Fine-Tuned-for-Question-Generation")
text = "<answer> The Pens RULE! <context>"+str(input_text)
# text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything")

In [81]:
list_questions = text2text_generator(text)

In [82]:
list_questions

[{'generated_text': 'What was the final regular season game called?'}]

**Various Summary**

In [83]:
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer


def set_seed(seed):
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

model = T5ForConditionalGeneration.from_pretrained('t5-base')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("device ",device)
model = model.to(device)

sentence = input_text
text =  input_text


max_len = 256

encoding = tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)


# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
beam_outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    do_sample=True,
    max_length=256,
    top_k=120,
    top_p=0.98,
    early_stopping=True,
    num_return_sequences=10
)


print ("\nOriginal doc ::")
print (sentence)
print ("\n")
print ("various summaries :: ")
final_outputs =[]
for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    if sent.lower() != sentence.lower() and sent not in final_outputs:
        final_outputs.append(sent)

summaries = []
for i, final_output in enumerate(final_outputs):
    print("{}: {}".format(i, final_output))
    summaries.append(final_output)


device  cuda

Original doc ::


I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




various summaries :: 
0: some fans must be pretty frustrated. But I have to say,. Jagr is a real threat to Pens fans. Jagr is a good guy.. Islanders Islanders. MAN!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! But still, THANK YOU! NO WORRY! So disappointe

In [84]:
list_questions[0]["generated_text"]

'What was the final regular season game called?'

In [85]:
def getInformationInJson(title, inputText, full, abs_summary,summaries, questions, keyword_items):
  dict_json = {'title' : title}
  dict_json['inputText'] = inputText
  dict_json['extractive_summarization'] = full
  dict_json['abstractive_summarization'] = abs_summary
  dict_json['summaries'] = summaries
  dict_json['questions'] = questions
  dict_json['keywords'] = keyword_items
  return json.dumps(dict_json, indent=2)

# Final Json result from the given corpus

In [86]:

finalJsonResult = getInformationInJson(title[0]["summary_text"], 
                           input_text.replace('\n',""),
                           full, 
                           Abs_summ[0]['summary_text'], 
                           summaries, 
                           list_questions[0]["generated_text"], 
                           keyword_items)

print(finalJsonResult)

{
  "title": "The Pens RULE!",
  "inputText": "I am sure some bashers of Pens fans are pretty confused about the lackof any kind of posts about the recent Pens massacre of the Devils. Actually,I am  bit puzzled too and a bit relieved. However, I am going to put an endto non-PIttsburghers' relief with a bit of praise for the Pens. Man, theyare killing those Devils worse than I thought. Jagr just showed you whyhe is much better than his regular season stats. He is also a lotfo fun to watch in the playoffs. Bowman should let JAgr have a lot offun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the finalregular season game.          PENS RULE!!!",
  "extractive_summarization": "I am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of pr

# **More information to retrive from corpus**

**POS Tagger**

In [87]:
!pip install flair

Collecting flair
  Downloading flair-0.10-py3-none-any.whl (322 kB)
[?25l[K     |█                               | 10 kB 32.3 MB/s eta 0:00:01[K     |██                              | 20 kB 9.9 MB/s eta 0:00:01[K     |███                             | 30 kB 8.4 MB/s eta 0:00:01[K     |████                            | 40 kB 4.4 MB/s eta 0:00:01[K     |█████                           | 51 kB 4.4 MB/s eta 0:00:01[K     |██████                          | 61 kB 5.2 MB/s eta 0:00:01[K     |███████                         | 71 kB 5.6 MB/s eta 0:00:01[K     |████████▏                       | 81 kB 5.3 MB/s eta 0:00:01[K     |█████████▏                      | 92 kB 5.9 MB/s eta 0:00:01[K     |██████████▏                     | 102 kB 5.2 MB/s eta 0:00:01[K     |███████████▏                    | 112 kB 5.2 MB/s eta 0:00:01[K     |████████████▏                   | 122 kB 5.2 MB/s eta 0:00:01[K     |█████████████▏                  | 133 kB 5.2 MB/s eta 0:00:01[K     |█

In [88]:
from flair.data import Sentence
from flair.models import SequenceTagger

# load tagger
tagger = SequenceTagger.load("flair/pos-english")

# make example sentence
sentence = Sentence(input_text)

# predict NER tags
tagger.predict(sentence)

# print sentence
print(sentence)

# print predicted NER spans
print('The following NER tags are found:')
# iterate over entities and print
for entity in sentence.get_spans('pos'):
    print(entity)


Downloading:   0%|          | 0.00/249M [00:00<?, ?B/s]

2022-03-27 00:56:46,604 loading file /root/.flair/models/pos-english/a9a73f6cd878edce8a0fa518db76f441f1cc49c2525b2b4557af278ec2f0659e.121306ea62993d04cd1978398b68396931a39eb47754c8a06a87f325ea70ac63
Sentence: "I am sure some bashers of Pens fans are pretty confused about the lack of any kind of posts about the recent Pens massacre of the Devils . Actually , I am bit puzzled too and a bit relieved . However , I am going to put an end to non-PIttsburghers' relief with a bit of praise for the Pens . Man , they are killing those Devils worse than I thought . Jagr just showed you why he is much better than his regular season stats . He is also a lot fo fun to watch in the playoffs . Bowman should let JAgr have a lot of fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway . I was very disappointed not to see the Islanders lose the final regular season game . PENS RULE !! !"   [− Tokens: 150  − Token-Labels: "I <PRP> am <VBP> sure <JJ> some <DT> bashe

tag	meaning
ADD	Email
AFX	Affix
CC	Coordinating conjunction
CD	Cardinal number
DT	Determiner
EX	Existential there
FW	Foreign word
HYPH	Hyphen
IN	Preposition or subordinating conjunction
JJ	Adjective
JJR	Adjective, comparative
JJS	Adjective, superlative
LS	List item marker
MD	Modal
NFP	Superfluous punctuation
NN	Noun, singular or mass
NNP	Proper noun, singular
NNPS	Proper noun, plural
NNS	Noun, plural
PDT	Predeterminer
POS	Possessive ending
PRP	Personal pronoun
PRP$	Possessive pronoun
RB	Adverb
RBR	Adverb, comparative
RBS	Adverb, superlative
RP	Particle
SYM	Symbol
TO	to
UH	Interjection
VB	Verb, base form
VBD	Verb, past tense
VBG	Verb, gerund or present participle
VBN	Verb, past participle
VBP	Verb, non-3rd person singular present
VBZ	Verb, 3rd person singular present
WDT	Wh-determiner
WP	Wh-pronoun
WP$	Possessive wh-pronoun
WRB	Wh-adverb
XX	Unknown

Consider verb related tags as relations

In [89]:
# tag	meaning
# ADD	Email
# AFX	Affix
# CC	Coordinating conjunction
# CD	Cardinal number
# DT	Determiner
# EX	Existential there
# FW	Foreign word
# HYPH	Hyphen
# IN	Preposition or subordinating conjunction
# JJ	Adjective
# JJR	Adjective, comparative
# JJS	Adjective, superlative
# LS	List item marker
# MD	Modal
# NFP	Superfluous punctuation
# NN	Noun, singular or mass
# NNP	Proper noun, singular
# NNPS	Proper noun, plural
# NNS	Noun, plural
# PDT	Predeterminer
# POS	Possessive ending
# PRP	Personal pronoun
# PRP$	Possessive pronoun
# RB	Adverb
# RBR	Adverb, comparative
# RBS	Adverb, superlative
# RP	Particle
# SYM	Symbol
# TO	to
# UH	Interjection
# VB	Verb, base form
# VBD	Verb, past tense
# VBG	Verb, gerund or present participle
# VBN	Verb, past participle
# VBP	Verb, non-3rd person singular present
# VBZ	Verb, 3rd person singular present
# WDT	Wh-determiner
# WP	Wh-pronoun
# WP$	Possessive wh-pronoun
# WRB	Wh-adverb
# XX	Unknown

# Consider verb related tags as relations

In [90]:
# SimpleT5 model is used to get the Title of the paragraph
model_T5 = SimpleT5()
model_T5.load_model("t5","snrspeaks/t5-one-line-summary")

In [91]:
title_t5 = model_T5.predict(input_text)
print(title_t5)

['The Pens Rule!']
