In [None]:
!pip install transformers --upgrade


Collecting transformers
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m711.1 kB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.44.0-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.42.4
    Uninstalling transformers-4.42.4:
      Successfully uninstalled transformers-4.42.4
Successfully installed transformers-4.44.0


In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
# NOTE(review): `load_metric` is unused in this cell and is deprecated in favour
# of the separate `evaluate` package — confirm before removing.
from datasets import load_dataset, load_metric

# Load the tokenizer and model.
# `transformers` has no `XLNetForConditionalGeneration` class (importing it
# raises ImportError) and XLNet ships no encoder-decoder head. The label
# preparation and `generate` call in this cell need a sequence-to-sequence
# model, so a T5 checkpoint is loaded instead.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Load the CNN/DailyMail summarization dataset ("3.0.0" configuration)
dataset = load_dataset('cnn_dailymail', '3.0.0')

# Preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch mapping handed over by `dataset.map(batched=True)`;
            the "article" and "highlights" columns are read.

    Returns:
        The tokenizer output for the articles (truncated to 1024 tokens) with
        an added "labels" entry holding the token ids of the highlights
        (truncated to 128 tokens).
    """
    model_inputs = tokenizer(examples["article"], max_length=1024, truncation=True)

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Imported here so this training section is self-contained.
from transformers import DataCollatorForSeq2Seq

# Tokenize every split of the dataset in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
)

# The tokenized examples are not padded (preprocess_function only truncates),
# so each batch contains sequences of different lengths. Without a padding
# collator the Trainer cannot stack them into tensors. DataCollatorForSeq2Seq
# pads the inputs per batch and pads the labels with -100 so the padded label
# positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the model and tokenizer side by side so they can be reloaded together
model.save_pretrained("./summarization_model")
tokenizer.save_pretrained("./summarization_model")

# Function to generate summaries
def generate_summary(text):
    """Generate a beam-search summary of `text` with the module-level model.

    Args:
        text: The article to summarize; it is truncated to 1024 tokens.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Training leaves the model in train mode; `generate` does not switch
    # modes itself, so disable dropout explicitly before decoding.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    # The Trainer may have moved the model to an accelerator; the input tensors
    # must live on the same device or `generate` raises a device-mismatch error.
    inputs = inputs.to(model.device)

    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=40,
        length_penalty=5.,
        num_beams=4,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage: smoke-test the summarizer on a placeholder string.
# generate_summary asks for at least 40 generated tokens (min_length=40), so
# replace `text` with a real article before judging the output.
text = "Your long article goes here."
summary = generate_summary(text)
print(summary)


ImportError: cannot import name 'XLNetForConditionalGeneration' from 'transformers' (/usr/local/lib/python3.10/dist-packages/transformers/__init__.py)

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset

# Checkpoint name shared by the tokenizer and the model so they cannot drift apart.
T5_CHECKPOINT = "t5-small"

# Pretrained T5 tokenizer and sequence-to-sequence model
tokenizer = T5Tokenizer.from_pretrained(T5_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(T5_CHECKPOINT)

# CNN/DailyMail dataset, "3.0.0" configuration
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries for T5.

    Args:
        examples: Batch mapping handed over by `dataset.map(batched=True)`;
            the "article" and "highlights" columns are read.

    Returns:
        The tokenizer output for the prefixed articles (truncated to 1024
        tokens) with an added "labels" entry holding the token ids of the
        highlights (truncated to 128 tokens).
    """
    # Every article is prefixed with the "summarize: " task prompt.
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split of the dataset in batches.
# NOTE(review): no cell visible here reads `tokenized_dataset` afterwards (the
# summaries are generated from raw strings), so this full-corpus pass may be
# unnecessary — confirm nothing later depends on it before removing.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Function to generate summaries
def generate_summary(text, max_length=150, min_length=40, num_beams=4, length_penalty=5.):
    """Summarize `text` with the module-level T5 model using beam search.

    Args:
        text: The article to summarize. The "summarize: " task prefix is added
            here and the result is truncated to 1024 input tokens.
        max_length: Upper bound on the generated length. Summaries that would
            run longer are cut off, so raise it for long articles.
        min_length: Lower bound on the generated length. Lower it for short
            inputs, which otherwise get padded out with filler text.
        num_beams: Number of beams used by beam search.
        length_penalty: Length penalty forwarded to `generate`.

    Returns:
        The decoded summary string with special tokens removed.
    """
    inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    # Keep the input tensors on whatever device the model currently lives on.
    inputs = inputs.to(model.device)

    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage: smoke-test the summarizer on a placeholder string.
# generate_summary asks for at least 40 generated tokens (min_length=40), so a
# short placeholder like this one produces filler text rather than a summary.
text = "Your long article goes here."
summary = generate_summary(text)
print(summary)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]



Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Your long article goes here. your long article goes here. click here to read your long article. if you have a question, you have a look at your long article. if you have a question about your long article, it’s time to write a post about your long article. if you have a question about your long article, it’s time to write a post about your long article. if you have a question about your long article, it’s time to


In [None]:
# News excerpt used as a realistic summarization input. The literal is passed
# to the model as-is, including its line breaks and leading spaces.
sample_text = ''' Making a strong pitch for a Uniform Civil Code in his Independence Day speech this morning,
Prime Minister Narendra Modi said laws that divide the country have no place in a modern society and must be done away with.
The Supreme Court has repeatedly held discussions on uniform civil code, given orders,
because a large section of the country feels, and rightly so, that the current civil code is a communal civil code,
 a discriminatory civil code. The Constitution tells us, the Supreme Court tells us to and it was the dream of the Constitution makers.
  So it is our duty to fulfill it," the Prime Minister said from the ramparts of Red Fort, two months after he started his third term.
There must be widespread discussions, everyone should come forward with their opinions and laws that divide the country on religious
lines must be done away with. They have no place in a modern society. Time demands a secular civil code. And then we will be free of religious discrimination,
 the Prime Minister said. The Prime Minister today said he feels the outrage in society over atrocities against women
 and state governments need to take this seriously. His statement comes at a time when the Mamata Banerjee government
 is facing massive anger and protests over the rape and murder of a doctor at a Kolkata hospital.

'''
# `generate_summary` is not defined in this cell; the call uses whichever
# definition was executed most recently in the kernel.
summary = generate_summary(sample_text)
print(summary)

a large section of the country feels that the current civil code is a communal civil code, a discriminatory civil code. the Constitution tells us, the Supreme Court tells us to and it was the dream of the Constitution makers. there must be widespread discussions, everyone should come forward with their opinions and laws that divide the country on religious lines must be done away with. the prime minister today said he feels the outrage in society over atrocities against women and state governments need to take this seriously.


In [None]:
# Second news excerpt used as a summarization input. The literal is passed to
# the model as-is, including its blank lines and leading spaces.
sample_text = '''  A large group of unidentified men stormed into Kolkata’s RG Kar Medical College and Hospital
around 12.40am on Thursday and destroyed hospital property
at rampant in the middle of the junior doctors’ ongoing agitation against the August 9 rape and murder of a post-graduate trainee doctor.
The incident took place while midnight protests named ‘Reclaim the Night’ were being held by women against the horrific
rape-murder of the doctor at the hospital.

Kolkata Police said 40 people, allegedly disguised as protestors, entered the hospital premises,
vandalised property and cars, and pelted stones at the police.

BJP leader Suvendu Adhikari alleged that the vandalism was carried out by "TMC goons" sent by chief minister Mamata Banerjee.

The protests were a reaction to the rape and murder of a postgraduate trainee doctor at the RG Kar Medical College and Hospital in Kolkata while on duty.

The body of the 31-year-old woman was found on August 9 in the seminar hall of the hospital.

Kolkata police commissioner Vineet Goyal blamed the media for the mob attack on August 15,
 alleging that what happened was the result of 'wrong and malicious media campaign'.

He said rumours about the main accused having political connections spread by the media had caused
 anger which led to the outburst. He added that the police could not arrest someone based on hearsay and asked for patience. '''

# `generate_summary` is not defined in this cell; the call uses whichever
# definition was executed most recently in the kernel.
summary = generate_summary(sample_text)
# Break after every ". " so each sentence of the summary prints on its own line.
formatted_summary = summary.replace('. ', '.\n')
print(formatted_summary)


a large group of unidentified men stormed into RG Kar Medical College and Hospital.
they destroyed hospital property at rampant in the middle of the junior doctors' agitation against the rape and murder of a post-graduate trainee doctor.
the incident took place while midnight protests called ‘Reclaim the Night’ were being held by women against the horrific rape-murder of the doctor at the hospital.
BJP leader Suvendu Adhikari alleged that the vandalism was carried out


In [None]:
import torch
from transformers import XLNetTokenizer, XLNetModel, BartTokenizer, BartForConditionalGeneration
from datasets import load_dataset

# Checkpoint names, kept in one place so each tokenizer/model pair stays in sync.
XLNET_CHECKPOINT = "xlnet-base-cased"
BART_CHECKPOINT = "facebook/bart-large-cnn"

# XLNet tokenizer and base model.
# NOTE(review): neither XLNet object is referenced again in the cells visible
# here — confirm whether they are still needed.
xlnet_tokenizer = XLNetTokenizer.from_pretrained(XLNET_CHECKPOINT)
xlnet_model = XLNetModel.from_pretrained(XLNET_CHECKPOINT)

# BART tokenizer and conditional-generation model used for summarization
bart_tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
bart_model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)

# CNN/DailyMail dataset, "3.0.0" configuration
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries with BART.

    Args:
        examples: Batch mapping handed over by `dataset.map(batched=True)`;
            the "article" and "highlights" columns are read.

    Returns:
        The BART tokenizer output for the articles (truncated to 1024 tokens)
        with an added "labels" entry holding the token ids of the highlights
        (truncated to 128 tokens).
    """
    model_inputs = bart_tokenizer(examples["article"], max_length=1024, truncation=True)

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = bart_tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split of the dataset in batches.
# NOTE(review): no cell visible here reads `tokenized_dataset` afterwards (the
# summaries are generated from raw strings), so this full-corpus pass may be
# unnecessary — confirm nothing later depends on it before removing.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Function to generate summaries using BART
def generate_summary(text):
    """Return a beam-search summary of `text` produced by the BART model.

    The text is tokenized (truncated to 1024 tokens), passed to
    `bart_model.generate`, and the first generated sequence is decoded with
    special tokens stripped.
    """
    encoded = bart_tokenizer(
        text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )
    # Decoding settings gathered in one place for readability.
    generation_options = {
        "max_length": 150,
        "min_length": 40,
        "length_penalty": 5.,
        "num_beams": 4,
    }
    generated = bart_model.generate(encoded["input_ids"], **generation_options)
    best_sequence = generated[0]
    return bart_tokenizer.decode(best_sequence, skip_special_tokens=True)

# Example usage: smoke-test the summarizer on a placeholder string.
# generate_summary asks for at least 40 generated tokens (min_length=40), so a
# short placeholder like this one produces filler text rather than a summary.
text = "Your long article goes here."
summary = generate_summary(text)
print(summary)


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]



Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Your long article goes here. Your long article on CNN.com will appear at the bottom of the page. Click here to read the rest of the article. For more information on CNN iReport, go to www.cnn.com.
