In [1]:
import requests
from bs4 import BeautifulSoup
from transformers import T5Tokenizer, T5ForConditionalGeneration

2025-02-22 12:50:28.906924: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-22 12:50:28.923906: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740214228.943887    9762 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740214228.950133    9762 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-22 12:50:28.971370: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

### Model

In [2]:
def initialize_summarization_model(model_name='t5-base'):
    """Load a pretrained T5 model and its matching tokenizer.

    Args:
        model_name: Checkpoint identifier handed unchanged to both
            ``from_pretrained`` calls. Because the T5-specific classes are
            used here, this must be a T5-architecture checkpoint (for
            example 't5-small', 't5-base' or 't5-large'); a BART checkpoint
            such as 'facebook/bart-large-cnn' needs the BART or Auto classes
            instead and is not supported by this function.

    Returns:
        tuple: ``(model, tokenizer)`` loaded from the same checkpoint.
    """
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    # Tokenizer and model must come from the same checkpoint so that the
    # vocabulary ids line up.
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    return model, tokenizer

In [3]:
def preprocess_text(text, tokenizer, max_length=512):
    """Turn raw text into tokenized model inputs for T5 summarization.

    Args:
        text: The article text to summarize.
        tokenizer: Callable tokenizer; it is invoked with the prompt plus
            ``return_tensors="pt"``, ``max_length`` and ``truncation=True``.
        max_length: Maximum number of input tokens requested from the
            tokenizer. Defaults to 512, the value previously hard-coded.

    Returns:
        Whatever the tokenizer returns for the prompt (the caller reads
        ``['input_ids']`` from it).
    """
    # "summarize: " is the task prefix that tells T5 which task to perform.
    input_text = "summarize: " + text
    # NOTE: truncation=True means anything beyond ``max_length`` tokens is
    # dropped, so for long articles only the leading part is summarized.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
    )
    return inputs

In [4]:
def generate_summary(text, model, tokenizer, max_length=150, min_length=30,
                     num_beams=4, length_penalty=2.0):
    """Generate a summary of ``text`` with beam search.

    Args:
        text: The article text to summarize.
        model: Object exposing ``generate(input_ids, **kwargs)``.
        tokenizer: Tokenizer used both to encode the prompt (via
            ``preprocess_text``) and to decode the generated ids.
        max_length: Upper bound on the generated summary length, in tokens.
        min_length: Lower bound on the generated summary length, in tokens.
        num_beams: Number of beams used by beam search.
        length_penalty: Length penalty passed to ``generate``.

    Returns:
        str: The decoded summary with special tokens removed.
    """
    inputs = preprocess_text(text, tokenizer)

    generation_kwargs = {
        "max_length": max_length,
        "min_length": min_length,
        "length_penalty": length_penalty,
        "num_beams": num_beams,
        "early_stopping": True,
    }
    # Forward the tokenizer's attention mask when it produced one, so that
    # generate() does not have to guess which positions are padding.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        generation_kwargs["attention_mask"] = attention_mask

    summary_ids = model.generate(inputs["input_ids"], **generation_kwargs)
    # generate() returns a batch; there is exactly one input sequence, so
    # only the first result is decoded.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

### Data Scraping

In [5]:
def scrape_wikipedia_content(url, timeout=10):
    """Download a page and return the concatenated text of its ``<p>`` tags.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        str | None: The text of every paragraph joined together with no
        separator, or ``None`` when the request fails or the response status
        is not 200 (a message is printed in both failure cases).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts follow the same "print and return
        # None" contract as a bad status code instead of crashing the cell.
        print(f"Failed to retrieve page. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    # join() builds the result in one pass instead of repeated string +=.
    return "".join(para.text for para in soup.find_all('p'))

In [6]:
def summarize_wikipedia_page(url, model=None, tokenizer=None):
    """Scrape a page and return a generated summary of its paragraph text.

    Args:
        url: Address of the page to summarize.
        model: Optional already-loaded model to reuse across calls.
        tokenizer: Optional already-loaded tokenizer matching ``model``.
            ``model`` and ``tokenizer`` must be supplied together; if either
            is missing, a fresh default pair is loaded with
            ``initialize_summarization_model()``.

    Returns:
        str: The summary, or the literal message
        ``"Failed to retrieve article content."`` when scraping yields no
        text (``None`` or an empty string).
    """
    print(f"Scraping content from {url}...")
    article_content = scrape_wikipedia_content(url)

    if not article_content:
        return "Failed to retrieve article content."

    # Load the model only once there is something to summarize, so a failed
    # download does not pay the model-loading cost.
    if model is None or tokenizer is None:
        model, tokenizer = initialize_summarization_model()

    print("\nGenerating summary...")
    return generate_summary(article_content, model, tokenizer)

### Check

In [7]:
# Demo run: summarize a single page end to end (scrape, then summarize).
url = 'https://en.wikipedia.org/wiki/Statue_of_George_Washington_(Trenton,_New_Jersey)'  # Change this URL to any Wikipedia page you want to scrape
# When the page comes back with a non-200 status or yields no paragraph text,
# `summary` holds the failure message string rather than a real summary.
summary = summarize_wikipedia_page(url)
# The leading "\n" separates the result from the progress messages printed
# by summarize_wikipedia_page.
print(f"\nSummary of the Wikipedia page:\n{summary}")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Scraping content from https://en.wikipedia.org/wiki/Statue_of_George_Washington_(Trenton,_New_Jersey)...

Generating summary...

Summary of the Wikipedia page:
the sculpture depicts general George Washington in a pose taken from the 1851 painting . it was owned by the banker Mahlon Dickerson Eyre . the statue is currently in the mill hill neighborhood of the city of Trenton in Mercer County, new jersey .
