# Install necessary libraries

Using PyTorch here because the Hugging Face models we use (including Facebook's BART) are built on it 😛

In [None]:
%pip install -q torch transformers datasets accelerate evaluate rouge_score textstat

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/105.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.3/105.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# import stuff
import pandas as pd
import torch
from datasets import load_dataset, DatasetDict
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import TrainingArguments, Trainer
import evaluate
import textstat

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load the dataset

In [None]:
# Read the JSON-lines file into a pandas DataFrame for a quick look at its columns.
# NOTE: the name `dataset` is re-bound to a Hugging Face Dataset in a later cell.
dataset = pd.read_json("/content/data.jsonl", lines=True)

In [None]:
# List the column names available in the raw data.
cols = dataset.columns
cols

Index(['title', 'input_text', 'target_text', 'compression_ratio',
       'summary_len', 'readability_score', 'rouge-1', 'rouge-2', 'rouge-l'],
      dtype='object')

In [None]:
# Load the JSON-lines file as a Hugging Face Dataset (the loader exposes it as "train").
dataset = load_dataset("json", data_files="/content/data.jsonl")["train"]

# Keep only the input/output text pairs; the remaining columns (title and the
# precomputed statistics) are not used for fine-tuning. Dropping them here makes a
# fresh "Restart & Run All" reproduce the two-column splits shown in the outputs below.
columns_to_keep = {"input_text", "target_text"}
dataset = dataset.remove_columns(
    [name for name in dataset.column_names if name not in columns_to_keep]
)

In [None]:
# 80% train / 20% held out, with a fixed seed so the split is reproducible.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train = split_dataset["train"]

# Cut the held-out 20% in half: one half for "valid", the other for "test".
test_valid = split_dataset["test"].train_test_split(test_size=0.5, seed=42)
valid, test = test_valid["train"], test_valid["test"]

# Re-assemble the three splits under a single DatasetDict.
split_dataset = DatasetDict({
    "train": train,
    "test": test,
    "valid": valid
})

print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 7890
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 987
    })
    valid: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 986
    })
})


# Tokenization

In [None]:
# Load the tokenizer that matches the "facebook/bart-base" checkpoint.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of input/target text pairs for seq2seq training.

    Intended for ``Dataset.map(..., batched=True)``: ``examples["input_text"]`` and
    ``examples["target_text"]`` are lists of strings.

    Args:
        examples: Batch mapping with "input_text" and "target_text" columns.

    Returns:
        The tokenized inputs (padded/truncated to 512 tokens) with an added "labels"
        entry holding the target token ids (padded/truncated to 128 tokens), where
        every padding position is replaced by -100.
    """
    model_inputs = tokenizer(examples["input_text"],
                             max_length=512,
                             truncation=True,
                             padding='max_length')
    labels = tokenizer(examples["target_text"],
                       max_length=128,
                       truncation=True,
                       padding="max_length")
    # Replace pad tokens with -100 so they are ignored by the loss.
    # In batched mode `labels["input_ids"]` is a list of sequences, so the replacement
    # has to be applied per token inside each sequence, not per sequence.
    model_inputs["labels"] = [
        [(token_id if token_id != tokenizer.pad_token_id else -100) for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [None]:
# Tokenize every split in batches; the raw text columns are dropped afterwards so the
# result carries the tokenized features instead.
tokenized_dataset = split_dataset.map(preprocess_function,
                                batched=True,
                                remove_columns=["input_text", "target_text"])

Map:   0%|          | 0/7890 [00:00<?, ? examples/s]

Map:   0%|          | 0/987 [00:00<?, ? examples/s]

Map:   0%|          | 0/986 [00:00<?, ? examples/s]

In [None]:
# Show the features and row counts of the tokenized splits.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7895
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 987
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 987
    })
})

# BART

In [None]:
# Load the pretrained "facebook/bart-base" checkpoint with its conditional-generation head.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

## Training

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs go; keep at most three checkpoints on disk.
    output_dir="./bart-finetuned",
    logging_dir="./logs",
    save_total_limit=3,
    report_to="none",
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Mixed-precision training.
    fp16=True,
)

In [None]:
# Increase dropout to reduce overfitting.
# BART layers copy the dropout rates out of the config when they are constructed, so
# editing `model.config` on an already-built model does not change the rates that are
# applied. Re-instantiate the model with the overrides so they actually take effect.
# NOTE: the metrics recorded in the outputs below were produced without this fix
# (i.e. with the checkpoint's original dropout), so a re-run will give different numbers.
model = BartForConditionalGeneration.from_pretrained(
    "facebook/bart-base",
    dropout=0.3,
    attention_dropout=0.3,
    activation_dropout=0.3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    # NOTE(review): the "test" split is used for per-epoch evaluation here, while the
    # final ROUGE/readability numbers are computed on "valid" — confirm this is intended.
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on Trainer (it triggers the FutureWarning seen in this
    # cell's output); `processing_class` is its replacement.
    processing_class=tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.4038,0.637854
2,0.6459,0.618152
3,0.5971,0.61251




TrainOutput(global_step=2961, training_loss=0.7701749225115301, metrics={'train_runtime': 773.3074, 'train_samples_per_second': 30.628, 'train_steps_per_second': 3.829, 'total_flos': 7220804006707200.0, 'train_loss': 0.7701749225115301, 'epoch': 3.0})

In [None]:
def generate_predictions(model, dataset, tokenizer, batch_size=8):
    """Generate a summary for every sample in ``dataset``.

    Args:
        model: Seq2seq model exposing ``eval()``, ``device`` and ``generate()``.
        dataset: Iterable of samples, each a mapping with "input_text" and
            "target_text" string entries.
        tokenizer: Tokenizer matching ``model``.
        batch_size: Number of samples generated per forward pass. Must be >= 1.

    Returns:
        A ``(predictions, references)`` tuple of two lists of strings, in dataset order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()
    predictions, references = [], []

    def _summarize_batch(texts):
        # Pad only up to the longest text in the batch (instead of always 512 tokens)
        # and hand the attention mask to `generate` so padding is explicitly ignored.
        inputs = tokenizer(
            texts, return_tensors="pt",
            truncation=True, padding="longest", max_length=512
        ).to(model.device)

        with torch.no_grad():
            summary_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=128,
                num_beams=4,
                early_stopping=True,
            )

        return tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    pending_texts = []
    for sample in dataset:
        pending_texts.append(sample["input_text"])
        references.append(sample["target_text"])

        if len(pending_texts) == batch_size:
            predictions.extend(_summarize_batch(pending_texts))
            pending_texts = []

    # Flush the final, possibly smaller, batch.
    if pending_texts:
        predictions.extend(_summarize_batch(pending_texts))

    return predictions, references

In [None]:
# Load the ROUGE metric through the `evaluate` library.
rouge = evaluate.load("rouge")

# Generate summaries for the raw-text "valid" split (generate_predictions tokenizes internally).
preds, refs = generate_predictions(model, split_dataset["valid"], tokenizer)

# Score the generated summaries against the reference summaries.
rouge_scores = rouge.compute(predictions=preds, references=refs)
print(rouge_scores)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': np.float64(0.6479886046080181), 'rouge2': np.float64(0.510203191329782), 'rougeL': np.float64(0.5940310623278866), 'rougeLsum': np.float64(0.6056519015107994)}


In [None]:
def compute_readability_scores(texts):
    """Return the mean Flesch Reading Ease score over ``texts``.

    Args:
        texts: Iterable of strings to score with ``textstat.flesch_reading_ease``.

    Returns:
        The arithmetic mean of the per-text scores.

    Raises:
        ZeroDivisionError: If ``texts`` is empty.
    """
    total = 0
    count = 0
    for score in map(textstat.flesch_reading_ease, texts):
        total += score
        count += 1
    return total / count

# Average readability of the generated summaries vs. the reference summaries.
avg_pred_score = compute_readability_scores(preds)
avg_ref_score = compute_readability_scores(refs)

print(f"Average Flesch Reading Ease (Preds): {avg_pred_score:.2f}")
print(f"Average Flesch Reading Ease (Refs): {avg_ref_score:.2f}")

Average Flesch Reading Ease (Preds): 58.50
Average Flesch Reading Ease (Refs): 62.90


In [None]:
# Save the fine-tuned model and its tokenizer side by side in ./bart-finetuned.
# NOTE(review): a later cell loads the model from
# /content/drive/MyDrive/bart-finetuned/bart-finetuned/ — the step that copies this
# directory to Drive is not part of the notebook.
trainer.save_model("./bart-finetuned")
tokenizer.save_pretrained("./bart-finetuned")

('./bart-finetuned/tokenizer_config.json',
 './bart-finetuned/special_tokens_map.json',
 './bart-finetuned/vocab.json',
 './bart-finetuned/merges.txt',
 './bart-finetuned/added_tokens.json')

## Continue training on some summarization data (CNN/DailyMail)

In [None]:
# Reload the fine-tuned model and tokenizer from the copy stored on Google Drive.
# Drive must already be mounted at /content/drive for this path to exist.
model_path = "/content/drive/MyDrive/bart-finetuned/bart-finetuned/"
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained(model_path)

In [None]:
# Download the CNN/DailyMail summarization dataset (configuration "3.0.0") from the Hub.
cnn_dataset = load_dataset("cnn_dailymail", "3.0.0")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
def preprocess_cnn(examples):
    """Tokenize a batch of CNN/DailyMail articles and their highlight summaries.

    Intended for ``Dataset.map(..., batched=True)``. No padding is applied here so that
    the data collator can pad each training batch dynamically.

    Args:
        examples: Batch mapping with "article" and "highlights" columns (lists of strings).

    Returns:
        The tokenized articles (truncated to 512 tokens) with an added "labels" entry
        holding the tokenized highlights (truncated to 128 tokens).
    """
    model_inputs = tokenizer(
        examples["article"],
        max_length=512,
        truncation=True,
        padding=False,  # Let DataCollator handle padding
    )

    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["highlights"],  # Target summaries
        max_length=128,
        truncation=True,
        padding=False,
    )

    # Defensive: mask any pad token as -100 so it is ignored by the loss. With
    # padding disabled above, the tokenizer itself adds no padding.
    model_inputs["labels"] = [
        [token_id if token_id != tokenizer.pad_token_id else -100 for token_id in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

In [None]:
# Take a fixed (seeded) random subset of 1,000 training articles to keep this run short.
cnn_small = cnn_dataset["train"].shuffle(seed=42).select(range(1000))
# Tokenize in small batches and drop the raw columns afterwards.
tokenized_cnn = cnn_small.map(
    preprocess_cnn,
    batched=True,
    remove_columns=["article", "highlights", "id"],
    batch_size=8
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



In [None]:
training_args = TrainingArguments(
    output_dir="./bart-cnn-sequential",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    num_train_epochs=2,
    fp16=True,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    # This run is short (the recorded run finished after 124 optimizer steps and showed
    # "No log" for the training loss), so log often enough to actually see the loss.
    logging_steps=10,
    report_to="none",
    # Reload the checkpoint with the lowest validation loss once training finishes.
    # This is checkpoint selection only: no early-stopping callback is attached, and the
    # selection metric is the validation loss, not ROUGE (that would need a custom
    # compute_metrics function).
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [None]:
from transformers import DataCollatorForSeq2Seq

In [None]:
# BART layers copy the dropout rates out of the config when they are constructed, so
# editing `model.config` on an already-loaded model does not change the rates that are
# applied. Reload the fine-tuned checkpoint with the overrides so they take effect.
model = BartForConditionalGeneration.from_pretrained(
    model_path,
    dropout=0.2,
    attention_dropout=0.2,
    activation_dropout=0.2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cnn,      # CNN/DailyMail data
    # NOTE(review): `tokenized_dataset` is created in the first half of the notebook;
    # those cells must have been run in the current session before this one.
    eval_dataset=tokenized_dataset["test"],  # Original Wikipedia test set
    # `tokenizer=` is deprecated on Trainer (it triggers the FutureWarning seen in this
    # cell's output); `processing_class` is its replacement.
    processing_class=tokenizer,
    # Pad each batch only up to its longest sequence.
    data_collator=DataCollatorForSeq2Seq(tokenizer, padding="longest"),
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.658505


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=124, training_loss=5.1235307262789815, metrics={'train_runtime': 112.7086, 'train_samples_per_second': 17.745, 'train_steps_per_second': 1.1, 'total_flos': 602419620741120.0, 'train_loss': 5.1235307262789815, 'epoch': 1.976})

In [None]:
# Save the sequentially fine-tuned model and its tokenizer in ./bart-cnn-sequential.
trainer.save_model("./bart-cnn-sequential")
tokenizer.save_pretrained("./bart-cnn-sequential")

('./bart-cnn-sequential/tokenizer_config.json',
 './bart-cnn-sequential/special_tokens_map.json',
 './bart-cnn-sequential/vocab.json',
 './bart-cnn-sequential/merges.txt',
 './bart-cnn-sequential/added_tokens.json')

In [None]:
!ls /content/drive/MyDrive/bart-finetuned/

bart-finetuned


In [None]:
!mv bart-cnn-sequential/ /content/drive/MyDrive/bart-finetuned/

In [None]:
!ls /content/drive/MyDrive/bart-finetuned/

bart-cnn-sequential  bart-finetuned


## Generate some summaries

In [None]:
# model_path = "/content/bart-finetuned"
# model = BartForConditionalGeneration.from_pretrained(model_path)
# tokenizer = BartTokenizer.from_pretrained(model_path)

def summarize_text(text, model=model, tokenizer=tokenizer, max_length=512):
    """Generate a summary of ``text`` with the fine-tuned model.

    Args:
        text: Source passage to summarize. Inputs longer than 512 tokens are truncated.
        model: Seq2seq model used for generation. The default is whatever ``model``
            referred to when this cell was executed, so re-run the cell after loading
            a different model.
        tokenizer: Tokenizer matching ``model``; the same definition-time binding applies.
        max_length: Upper bound passed to ``generate`` for the summary length.

    Returns:
        The decoded summary string with special tokens removed.

    Note:
        Generation samples (``do_sample=True``), so repeated calls can return different
        text unless the random seed is fixed beforehand.
    """
    # Make sure dropout is switched off, as generate_predictions does.
    model.eval()

    # A single text needs no padding; pass the attention mask along explicitly.
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=512
    ).to(model.device)

    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,       # Maximum summary length
            num_beams=4,                 # Number of beams
            do_sample=True,              # Sample within the beams (non-deterministic)
            top_p=0.95,
            early_stopping=True,         # Stop when complete
            repetition_penalty=2.5       # Reduce repetitive phrases
        )

    # Decode the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
input_text = """
One of the primary drivers of economic disparity is financialization -- the increasing dominance of financial markets, institutions, and elites over economic policy.
This shift has redirected wealth accumulation toward asset-holders while stagnating wage growth for lower-income populations.
Additionally, automation and artificial intelligence threaten to displace millions of low-skilled workers, further widening the income gap.
While proponents argue that technological progress fosters new industries and employment opportunities, critics highlight that the benefits disproportionately favor those with advanced education and capital investments.
"""

# Summarize the passage above and print it next to the original.
# The output can differ between runs because summarize_text samples (do_sample=True).
summary = summarize_text(input_text, model, tokenizer)
print("Original Text:", input_text)
print("Generated Summary:", summary)

Original Text: 
One of the primary drivers of economic disparity is financialization -- the increasing dominance of financial markets, institutions, and elites over economic policy. 
This shift has redirected wealth accumulation toward asset-holders while stagnating wage growth for lower-income populations. 
Additionally, automation and artificial intelligence threaten to displace millions of low-skilled workers, further widening the income gap. 
While proponents argue that technological progress fosters new industries and employment opportunities, critics highlight that the benefits disproportionately favor those with advanced education and capital investments.

Generated Summary: One of the primary drivers of economic disparity is financialization .
This shift has increased wealth accumulation toward asset-holders while stagnating wage growth for lower-income populations.
Many experts argue that technological progress helps create new industries and employment opportunities, but crit

In [None]:
input_text = """
    We often hear that the world is getting smaller, but what does that really mean?
    In the age of globalization, the world is more interconnected than ever before.
    Advances in technology, communication, and transportation have made it easier for people to connect across vast distances.
    This has led to a greater exchange of ideas, cultures, and goods, creating a more integrated global economy.
"""

# Summarize the passage above and print it next to the original.
summary = summarize_text(input_text, model, tokenizer)
print("Original Text:", input_text)
print("Generated Summary:", summary)

Original Text: 
    We often hear that the world is getting smaller, but what does that really mean?
    In the age of globalization, the world is more interconnected than ever before.
    Advances in technology, communication, and transportation have made it easier for people to connect across vast distances.
    This has led to a greater exchange of ideas, cultures, and goods, creating a more integrated global economy.

Generated Summary: The world is more interconnected than ever before.
In the age of globalization, technology, communication, and transportation have made it easier for people to connect across vast distances.
This has led to a greater exchange of ideas, cultures, and goods, creating a more integrated global economy.


In [None]:
input_text = '''The transformer is a deep learning architecture that was developed by researchers at Google and is based on the multi-head attention mechanism, which was proposed in the 2017 paper "Attention Is All You Need". Text is converted to numerical representations called tokens, and each token is converted into a vector via lookup from a word embedding table. At each layer, each token is then contextualized within the scope of the context window with other (unmasked) tokens via a parallel multi-head attention mechanism, allowing the signal for key tokens to be amplified and less important tokens to be diminished.'''

# Summarize the passage above; this call relies on the default model/tokenizer that
# were bound when summarize_text was defined.
summary = summarize_text(input_text)
print("Original Text:", input_text)
print("Generated Summary:", summary)

Original Text: The transformer is a deep learning architecture that was developed by researchers at Google and is based on the multi-head attention mechanism, which was proposed in the 2017 paper "Attention Is All You Need". Text is converted to numerical representations called tokens, and each token is converted into a vector via lookup from a word embedding table. At each layer, each token is then contextualized within the scope of the context window with other (unmasked) tokens via a parallel multi-head attention mechanism, allowing the signal for key tokens to be amplified and less important tokens to be diminished.
Generated Summary: Attention Is All You Need is a deep learning architecture that was developed by researchers at Google.
The transformer is based on the multi-head attention mechanism, which was proposed in the 2017 paper .
Each token is converted into a vector via lookup from a word embedding table .
At each layer, each token is then contextualized within the scope of

In [None]:
input_text = """
Ethical standards are required at both the individual and system levels of the information
organization enterprise, but are those standards the same? For example, are the ethical responsibilities
of DDC’s editorial board fundamentally the same as for an individual cataloger? And, what are the consequences of decisions made using different ethical frameworks to the users of knowledge organization
systems? A selection of ethical theories suitable for evaluating moral dilemmas at all levels in information organization is presented, including utilitarianism, deontology, and pragmatism, as well as the more
contemporary approaches of justice, feminist, and Derridean ethics. Finally, a selection of criteria is outlined, taken from the existing ethical frameworks, to use as a starting point for development of an ethical framework specifically
for information organization.
"""

# Summarize the passage above and print it next to the original.
summary = summarize_text(input_text, model, tokenizer)
print("Original Text:", input_text)
print("Generated Summary:", summary)

Original Text: 
Ethical standards are required at both the individual and system levels of the information
organization enterprise, but are those standards the same? For example, are the ethical responsibilities
of DDC’s editorial board fundamentally the same as for an individual cataloger? And, what are the consequences of decisions made using different ethical frameworks to the users of knowledge organization
systems? A selection of ethical theories suitable for evaluating moral dilemmas at all levels in information organization is presented, including utilitarianism, deontology, and pragmatism, as well as the more
contemporary approaches of justice, feminist, and Derridean ethics. Finally, a selection of criteria is outlined, taken from the existing ethical frameworks, to use as a starting point for development of an ethical framework specifically
for information organization.

Generated Summary: Ethical standards are required at both the individual and system levels of information 