# Text Summarizer

## Import dependencies from transformers

In [1]:
import warnings
# Ignore every warning raised from here on (presumably to keep the notebook
# output free of library notices — this also hides deprecation warnings).
warnings.filterwarnings('ignore')

In [2]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

## Load tokenizer

In [3]:
# Load the Pegasus tokenizer from the pretrained "google/pegasus-xsum" checkpoint.
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

## Load model

In [4]:
# Load the Pegasus conditional-generation model from the same
# "google/pegasus-xsum" checkpoint that the tokenizer was loaded from.
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

## Perform abstractive summarization

In [5]:
text = """
Data science is an interdisciplinary academic field that uses statistics, 
scientific computing, scientific methods, processes, algorithms and 
systems to extract or extrapolate knowledge and insights from noisy, 
structured, and unstructured data.

Data science also integrates domain knowledge from the underlying application 
domain (e.g., natural sciences, information technology, and medicine).
Data science is multifaceted and can be described as a science, a 
research paradigm, a research method, a discipline, a workflow, and a 
profession.

Data science is a "concept to unify statistics, data analysis, informatics, 
and their related methods" to "understand and analyze actual phenomena" 
with data. It uses techniques and theories drawn from many fields within 
the context of mathematics, statistics, computer science, information 
science, and domain knowledge.

However, data science is different from computer science and information 
science. Turing Award winner Jim Gray imagined data science as 
a "fourth paradigm" of science (empirical, theoretical, computational, and 
now data-driven) and asserted that "everything about science is changing 
because of the impact of information technology" and the data deluge.
"""

## Create tokens

In [6]:
# Encode the passage as tensors ('pt'), with truncation enabled and padding
# set to the longest sequence in the batch.
tokens = tokenizer(
    text,
    truncation=True,
    padding='longest',
    return_tensors='pt',
)

In [7]:
tokens

{'input_ids': tensor([[ 2331,  1578,   117,   142,   115, 13920,  2232,   764,   120,  1481,
          4412,   108,  3189,  6506,   108,  3189,  1625,   108,  1994,   108,
          8970,   111,   747,   112,  5703,   132, 63533,   825,   111,  4275,
           135, 16208,   108,  7314,   108,   111, 41831,   335,   107,  2331,
          1578,   163, 15133,  2641,   825,   135,   109,  5910,   723,  2641,
           143,   326,   107,   838,   107,   108,   710,  9059,   108,   257,
           552,   108,   111,  3025,   250,  2331,  1578,   117, 35657,   111,
           137,   129,  2540,   130,   114,  1578,   108,   114,   473, 17142,
           108,   114,   473,  1356,   108,   114,  6270,   108,   114,  9901,
           108,   111,   114,  5948,   107,  2331,  1578,   117,   114,   198,
         33683,   112, 51573,  4412,   108,   335,  1382,   108, 52482,   108,
           111,   153,   985,  1625,   194,   112,   198, 36169,   111,  5935,
          1916, 16327,   194,   122,  

## Summarize

In [8]:
from transformers import GenerationConfig

In [9]:
# Load the generation settings stored with the "google/pegasus-xsum"
# checkpoint so they can be inspected in the next cell.
generation_config = GenerationConfig.from_pretrained("google/pegasus-xsum")

In [10]:
generation_config

GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "forced_eos_token_id": 1,
  "length_penalty": 0.6,
  "max_length": 64,
  "num_beams": 8,
  "pad_token_id": 0,
  "transformers_version": "4.30.1"
}

In [11]:
# Generate summary token ids from the encoded input. Only the tokenizer output
# is passed; the `generation_config` object loaded earlier is not supplied, so
# generation relies on the model's own defaults.
# NOTE(review): presumably those defaults match the values displayed above,
# since both come from "google/pegasus-xsum" — confirm.
summary = model.generate(**tokens)

In [12]:
summary

tensor([[   0, 2331, 1578,  117,  109,  692,  111, 1382,  113,  423, 3912,  113,
          335,  107,    1]])

## Decode

In [13]:
# Turn the first generated sequence back into text, dropping special tokens
# and cleaning up tokenization spaces.
output = tokenizer.decode(
    summary[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

In [14]:
output

'Data science is the study and analysis of large amounts of data.'

## Create a function that wraps the whole pipeline

In [15]:
def text_summarization(text):
    """Return a summary string for ``text``.

    Runs the three pipeline steps in order using the module-level
    ``tokenizer`` and ``model`` objects: encode, generate, decode.

    Args:
        text: Input passed straight to ``tokenizer``. Truncation is enabled
            and padding is set to ``'longest'``.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed and tokenization spaces cleaned up.
    """
    # Step 1: encode the input as tensors.
    encoded = tokenizer(
        text,
        truncation=True,
        padding='longest',
        return_tensors='pt',
    )

    # Step 2: generate summary token ids using the model's default settings.
    generated_ids = model.generate(**encoded)

    # Step 3: decode only the first sequence of the generated batch.
    first_sequence = generated_ids[0]
    return tokenizer.decode(
        first_sequence,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

In [16]:
# Try the wrapper function on a short two-sentence input.
result = text_summarization("My name is Nikhil Raj and I am a data scientist. I have total of 5 years of experience.")

In [17]:
result

'Hello my name is Nikhil Raj and I am a data scientist.'