# Summarizer

In [1]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from sentence_splitter import SentenceSplitter, split_text_into_sentences


In [2]:
def get_response(input_text, num_return_sequences):
    batch = tokenizer.prepare_seq2seq_batch([input_text], truncation=True, padding='longest', max_length=60,
                                            return_tensors="pt").to(torch_device)
    translated = model1.generate(**batch, max_length=60, num_beams=10, num_return_sequences=num_return_sequences,
                                 temperature=1.5)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

In [3]:
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model1 = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

In [4]:
paraphrase = []
splitter = SentenceSplitter(language='en')
text=input("Enter Text :")
print()
print(text)
sentence_list = splitter.split(text)

Enter Text :A blockchain is an open, distributed ledger that records transactions in code. In practice, it’s a little like a checkbook that’s distributed across countless computers around the world. Transactions are recorded in “blocks” that are then linked together on a “chain” of previous cryptocurrency transactions.  “Imagine a book where you write down everything you spend money on each day,” says Buchi Okoro, CEO and co-founder of African cryptocurrency exchange Quidax. “Each page is similar to a block, and the entire book, a group of pages, is a blockchain.”  With a blockchain, everyone who uses a cryptocurrency has their own copy of this book to create a unified transaction record. Software logs each new transaction as it happens, and every copy of the blockchain is updated simultaneously with the new information, keeping all records identical and accurate.  To prevent fraud, each transaction is checked using one of two main validation techniques: proof of work or proof of stake

In [5]:
for i in sentence_list:
    if i == '':
        continue
    a = get_response(i, 1)
    paraphrase.append(a)

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



In [6]:
print(paraphrase)
paraphrase2=[' '.join(x[0] for x in paraphrase)]
textsumm=paraphrase2[0]
textsumm

[['There is a distributed ledger that records transactions.'], ['It is similar to a checkbook that is distributed across countless computers around the world.'], ['Transactions are recorded in blocks that are linked together on a chain of previous transactions.'], ['"Imagine a book where you write down everything you spend money on each day," says Buchi Okoro, CEO and co-founder of Africancryptocurrencies exchangeQuidax.'], ['The entire book, a group of pages, is a block.'], ['Everyone who uses acryptocurrencies has their own copy of the book to create a unified transaction record.'], ['Every copy of the blockchain is updated with new information at the same time as the software logs the new transactions.'], ['Proof of work or proof of stake are two main validation techniques that are used to prevent fraud.']]


'There is a distributed ledger that records transactions. It is similar to a checkbook that is distributed across countless computers around the world. Transactions are recorded in blocks that are linked together on a chain of previous transactions. "Imagine a book where you write down everything you spend money on each day," says Buchi Okoro, CEO and co-founder of Africancryptocurrencies exchangeQuidax. The entire book, a group of pages, is a block. Everyone who uses acryptocurrencies has their own copy of the book to create a unified transaction record. Every copy of the blockchain is updated with new information at the same time as the software logs the new transactions. Proof of work or proof of stake are two main validation techniques that are used to prevent fraud.'

# Dataframe

In [7]:
import pandas as pd

In [8]:
df = pd.DataFrame(columns = ['Question', 'Answer','Options'])

In [9]:
df

Unnamed: 0,Question,Answer,Options


# Bert Keyword Extractor

In [10]:
from keybert import KeyBERT
kw_model = KeyBERT()

In [11]:
for i in paraphrase:
    a=kw_model.extract_keywords(i[0], keyphrase_ngram_range=(1,2),stop_words=None)[0][0]
    b=kw_model.extract_keywords(i[0], keyphrase_ngram_range=(1,1),stop_words=None)[0][0]
    df.loc[len(df.index)]=[i[0],a,[]]
   # print(b,a,b in a)
    if b not in a:
        print(b not in a,b,a)
        df.loc[len(df.index)]=[i[0],b,[]]
    else:
        continue

True blockchain new transactions


In [12]:
df

Unnamed: 0,Question,Answer,Options
0,There is a distributed ledger that records tra...,distributed ledger,[]
1,It is similar to a checkbook that is distribut...,to checkbook,[]
2,Transactions are recorded in blocks that are l...,previous transactions,[]
3,"""Imagine a book where you write down everythin...",africancryptocurrencies exchangequidax,[]
4,"The entire book, a group of pages, is a block.",of pages,[]
5,Everyone who uses acryptocurrencies has their ...,uses acryptocurrencies,[]
6,Every copy of the blockchain is updated with n...,new transactions,[]
7,Every copy of the blockchain is updated with n...,blockchain,[]
8,Proof of work or proof of stake are two main v...,validation techniques,[]


# Distractors (MCQ Options)

In [14]:
import spacy

nlp = spacy.load("en_core_web_sm")
s2v = nlp.add_pipe("sense2vec")
s2v.from_disk("s2v_old/") 
for row in df.iterrows():
    ans=df['Answer']
    assert ans in s2v
    vector = s2v[ans]
    freq = s2v.get_freq(ans)
    most_similar = s2v.most_similar(ans, n=3)
    print(ans,most_similar)

ValueError: [E003] Not a valid pipeline component. Expected callable, but got 'sense2vec' (name: 'None').[E004] If you meant to add a built-in component, use `create_pipe`: `nlp.add_pipe(nlp.create_pipe('sense2vec'))`

In [None]:
most_similar