In [2]:
# NLP_Summarization_Code.ipynb

# -------------------------------------------------------------
# 1. Install Dependencies
# -------------------------------------------------------------
!pip install transformers rouge-score pandas

# -------------------------------------------------------------
# 2. Import Libraries
# -------------------------------------------------------------
import pandas as pd
from transformers import pipeline
from rouge_score import rouge_scorer

# -------------------------------------------------------------
# 3. Load Dataset
# -------------------------------------------------------------
# Using a small demo dataset for summarization
data = {
    "text": [
        "Natural Language Processing (NLP) is a field of AI that focuses on the interaction between computers and humans through language. "
        "It has applications in chatbots, machine translation, sentiment analysis, and summarization.",
        "Transformers are a powerful model architecture in NLP. "
        "Models like BERT, GPT, and T5 have revolutionized text understanding and generation tasks.",
        "Data preprocessing is essential for NLP tasks. Tokenization, lowercasing, and removing special characters help models learn effectively."
    ]
}

df = pd.DataFrame(data)
print("Dataset Loaded:\n", df)

# -------------------------------------------------------------
# 4. Initialize Summarization Pipeline
# -------------------------------------------------------------
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# -------------------------------------------------------------
# 5. Generate Summaries
# -------------------------------------------------------------
summaries = []
for text in df['text']:
    summary = summarizer(text, max_length=50, min_length=20, do_sample=False)
    summaries.append(summary[0]['summary_text'])

df['summary'] = summaries
print("\nSummaries Generated:\n", df[['text','summary']])

# -------------------------------------------------------------
# 6. Evaluate Summaries using ROUGE
# -------------------------------------------------------------
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

for i, row in df.iterrows():
    scores = scorer.score(row['text'], row['summary'])
    print(f"\nOriginal Text:\n{row['text']}")
    print(f"Summary:\n{row['summary']}")
    print(f"ROUGE Scores: {scores}")


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=d0d2e1d63ef355591b9956cace5e3a8298986ecd70d87b21c8212dcc6595bfc3
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Dataset Loaded:
                                                 text
0  Natural Language Processing (NLP) is a field o...
1  Transformers are a powerful model architecture...
2  Data preprocessing is essential for NLP tasks....


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu
Your max_length is set to 50, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max_length is set to 50, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)
Your max_length is set to 50, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)



Summaries Generated:
                                                 text  \
0  Natural Language Processing (NLP) is a field o...   
1  Transformers are a powerful model architecture...   
2  Data preprocessing is essential for NLP tasks....   

                                             summary  
0  Natural Language Processing (NLP) is a field o...  
1  Transformers are powerful model architecture i...  
2  Data preprocessing is essential for NLP tasks....  

Original Text:
Natural Language Processing (NLP) is a field of AI that focuses on the interaction between computers and humans through language. It has applications in chatbots, machine translation, sentiment analysis, and summarization.
Summary:
Natural Language Processing (NLP) is a field of AI that focuses on the interaction between computers and humans through language. It has applications in chatbots, machine translation, sentiment analysis, and summarization.
ROUGE Scores: {'rouge1': Score(precision=1.0, recall=1.0, fme

In [3]:
# Transformer Models - Small Demos

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import T5Tokenizer, T5ForConditionalGeneration

# ---- BERT Demo ----
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
inputs = bert_tokenizer("Transformers are amazing for NLP tasks.", return_tensors="pt")
outputs = bert_model(**inputs)
print("BERT logits:", outputs.logits)

# ---- GPT2 Demo ----
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2")
input_text = "Artificial intelligence is"
inputs = gpt_tokenizer(input_text, return_tensors="pt")
outputs = gpt_model.generate(**inputs, max_length=50)
print("GPT2 text:", gpt_tokenizer.decode(outputs[0]))

# ---- T5 Demo ----
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
input_text = "summarize: Transformers are powerful models for NLP."
inputs = t5_tokenizer(input_text, return_tensors="pt")
outputs = t5_model.generate(**inputs, max_length=50)
print("T5 summary:", t5_tokenizer.decode(outputs[0]))


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT logits: tensor([[0.2301, 0.2425]], grad_fn=<AddmmBackward0>)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


GPT2 text: Artificial intelligence is a new field of research that has been in the works for a while now. It is a field that has been in the works for a while now. It is a field that has been in the works for a while now.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5 summary: <pad> Transformers are powerful models for NLP.</s>
