In [1]:
pip install transformers datasets torch
pip install transformers
pip install rouge_score



DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [14]:
# from transformers import PegasusTokenizer
# from datasets import load_dataset

# # Load the cleaned dataset
# dataset = load_dataset("xsum", "xsum-cleaned",split="train[:1%]")

# # Initialize tokenizer
# tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

# # Tokenize data
# def preprocess_data(examples):
#     inputs = tokenizer(examples['document'], max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
#     targets = tokenizer(examples['summary'], max_length=128, truncation=True, padding="max_length", return_tensors="pt")
#     inputs["labels"] = targets["input_ids"]
#     return inputs

# tokenized_datasets = dataset.map(preprocess_data, batched=True)
# tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

from transformers import PegasusTokenizer
from datasets import load_dataset

# Load dataset
dataset = load_dataset("xsum","xsum-cleaned")

def sample_one_percent(dataset_dict):
    sampled_dataset_dict = dataset
    for split in dataset_dict:
        num_rows = dataset_dict[split].num_rows
        sample_size = max(1, int(num_rows * 0.01))  # Ensure at least one sample is taken
        sampled_dataset = dataset_dict[split].shuffle(seed=42).select(range(sample_size))
        sampled_dataset_dict[split] = sampled_dataset
    return sampled_dataset_dict

dataset = sample_one_percent(dataset)


# Initialize tokenizer
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

# Tokenize data
def preprocess_data(examples):
    inputs = tokenizer(examples['document'], max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
    targets = tokenizer(examples['summary'], max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_datasets = dataset.map(preprocess_data, batched=True)
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map: 100%|██████████| 2040/2040 [00:03<00:00, 590.92 examples/s]
Map: 100%|██████████| 113/113 [00:00<00:00, 479.90 examples/s]
Map: 100%|██████████| 113/113 [00:00<00:00, 551.34 examples/s]


In [13]:
import torch
from torch import nn
from transformers import PegasusConfig, PegasusModel

class MiniPegasus(nn.Module):
    def __init__(self, config):
        super(MiniPegasus, self).__init__()
        self.model = PegasusModel(config)
        self.linear = nn.Linear(config.d_model, config.vocab_size)

    def forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask,
                             decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask)
        sequence_output = outputs[0]
        logits = self.linear(sequence_output)
        return logits
    
    def generate(self, input_ids, attention_mask, max_length=1000, num_beams=4, early_stopping=True):
        # Initialize decoder input
        decoder_input_ids = torch.tensor([[self.model.config.decoder_start_token_id]]).to(input_ids.device)
        generated_ids = []

        for _ in range(max_length):
            outputs = self.forward(input_ids=input_ids, attention_mask=attention_mask,
                                   decoder_input_ids=decoder_input_ids, decoder_attention_mask=None)
            next_token_logits = outputs[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1)
            decoder_input_ids = torch.cat([decoder_input_ids, next_token.unsqueeze(-1)], dim=-1)
            generated_ids.append(next_token)

            if next_token == self.model.config.eos_token_id:
                break

        return torch.cat(generated_ids, dim=-1)

# Define configuration
config = PegasusConfig(
    vocab_size=tokenizer.vocab_size,
    d_model=512,  # Model dimension
    encoder_layers=6,
    decoder_layers=6,
    encoder_attention_heads=8,
    decoder_attention_heads=8,
    encoder_ffn_dim=2048,
    decoder_ffn_dim=2048,
)

# Initialize model
model = MiniPegasus(config)


In [15]:
from torch.optim import AdamW
from torch.utils.data import DataLoader

# DataLoader
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=4, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
model.train()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):  # Train for 3 epochs
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        decoder_input_ids = labels[:, :-1]
        decoder_attention_mask = (decoder_input_ids != tokenizer.pad_token_id).float().to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                        decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask)

        loss = nn.CrossEntropyLoss()(outputs.view(-1, config.vocab_size), labels[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")


Epoch: 0, Loss: 11.821613311767578
Epoch: 0, Loss: 5.8452582359313965
Epoch: 0, Loss: 3.3950271606445312
Epoch: 0, Loss: 2.627937078475952
Epoch: 0, Loss: 2.3773906230926514
Epoch: 0, Loss: 3.0399465560913086
Epoch: 0, Loss: 2.1268343925476074
Epoch: 0, Loss: 2.152273654937744
Epoch: 0, Loss: 2.779311418533325
Epoch: 0, Loss: 2.6075732707977295
Epoch: 0, Loss: 2.060424327850342
Epoch: 0, Loss: 1.9470489025115967
Epoch: 0, Loss: 2.5823678970336914
Epoch: 0, Loss: 2.6925699710845947
Epoch: 0, Loss: 2.8955209255218506
Epoch: 0, Loss: 2.660137414932251
Epoch: 0, Loss: 2.067235231399536
Epoch: 0, Loss: 2.3007190227508545
Epoch: 0, Loss: 2.5502824783325195
Epoch: 0, Loss: 2.13461971282959
Epoch: 0, Loss: 1.8792449235916138
Epoch: 0, Loss: 3.087343692779541
Epoch: 0, Loss: 2.054056406021118
Epoch: 0, Loss: 2.21085786819458
Epoch: 0, Loss: 1.9278545379638672
Epoch: 0, Loss: 1.9925284385681152
Epoch: 0, Loss: 2.784196376800537
Epoch: 0, Loss: 2.3884119987487793
Epoch: 0, Loss: 2.440117835998535

In [16]:
from datasets import load_metric

# Load ROUGE metric
rouge = load_metric('rouge')

def evaluate(model, dataloader):
    model.eval()
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        decoder_input_ids = labels[:, :-1]
        decoder_attention_mask = (decoder_input_ids != tokenizer.pad_token_id).float().to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                            decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask)

        predictions = torch.argmax(outputs, dim=-1)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        rouge.add_batch(predictions=decoded_preds, references=decoded_labels)

    result = rouge.compute()
    return result

# Evaluate
val_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=4)
rouge_scores = evaluate(model, val_dataloader)
print(rouge_scores)


  rouge = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'rouge1': AggregateScore(low=Score(precision=0.15892103962995452, recall=0.11825619171968937, fmeasure=0.13439012563242303), mid=Score(precision=0.17528400435894465, recall=0.13025859016855917, fmeasure=0.14763963037862488), high=Score(precision=0.19210773512206408, recall=0.14315322575086398, fmeasure=0.16147917556981306)), 'rouge2': AggregateScore(low=Score(precision=0.0030574240959456997, recall=0.0021962760884512164, fmeasure=0.0025448161040844985), mid=Score(precision=0.00664564699572508, recall=0.004824351931943921, fmeasure=0.005517103769496904), high=Score(precision=0.011413630617170439, recall=0.008339456111347123, fmeasure=0.009547772780111851)), 'rougeL': AggregateScore(low=Score(precision=0.13165222968602852, recall=0.09721710859941869, fmeasure=0.11086195459899947), mid=Score(precision=0.14532528832817873, recall=0.10721815949627735, fmeasure=0.12213923859442383), high=Score(precision=0.15887537891857348, recall=0.11640173425460398, fmeasure=0.13251129408843954)), 'rougeL

In [24]:


# Assuming `model` is your trained model
# model_path = 'trained_model_azure.pth'
# torch.save(model.state_dict(), model_path)
# print(f"Model saved to {model_path}")

In [47]:
text = """
UK house prices rose by 5.7percentage in the year to the end of September, according to the latest house price index from lender Halifax.
The annual rate of increase picked up from 5.2percentage in August, Halifax said.
Across the UK, the average house price in September was £267,587, up from £263,786 in August, a rise of 1.7%.
Russell Galley, managing director at Halifax, said: "Housing market activity has remained solid with decent levels of buyer enquiries.
"However, some of the drivers of the buoyant market we saw earlier in the year, such as the lack of properties for sale and buyers racing to benefit from the stamp duty holiday, have fallen away to some extent.
"That said, underlying demand is still strong and is serving to underpin a degree of pricing pressure for homes, which we expect will lead to a further period of sustained house price growth."
The stamp duty holiday in England and Northern Ireland was phased out in stages over the summer, coming to a complete end from 1 October.
In Wales, the tax break on house purchases ended on 30 June, while in Scotland it ran until 31 March.
Halifax said the performance of the housing market was being supported by a number of other factors.
These included the continuing low mortgage rate environment, with products priced at close to historical lows, and the ongoing shortage of properties for sale.
The lender said the latter was helping to put upward pressure on house prices.
However, it added that affordability challenges for buyers remained acute, with the average first-time buyer in the UK now paying the equivalent of 40percentage of their annual gross income on mortgage repayments.
"With pressures on the cost of living mounting, and the prospect of interest rates increasing from the current low level, the house price to income ratio is becoming even more of a constraint," Mr Galley said.
"Most experts are anticipating a slowing of house price inflation next year as affordability issues and other economic headwinds exert greater influence."
Regional variations
Halifax's figures showed that Wales remained the strongest performer across the UK nations and regions, with annual house price inflation of 12.9%.
This was followed by Northern Ireland at 10.7%, the South West of England at 9.8%, and the East Midlands at 8.8%.
The weakest regions were the North East, where prices rose by 3.9percentage over the past year, Scotland at 4.4%, and London at 4.5%.
Within London, the average house price was £541,920.
Halifax said the capital was the only area of the UK where prices remained below their August 2007 peak, before the global financial crisis struck.
Separate figures released by the Bank of England on Thursday showed mortgage approvals for house purchases fell in September to their lowest level since June 2020.
Some 72,453 mortgages were approved for house purchase, down from 74,145 in August.
The Bank's Money and Credit report said mortgage approvals for house purchase had fallen in September for the fifth month in a row.
"This is likely reflecting increasing pressures on household finances as well as rising mortgage rates," said Nitesh Patel, strategic economist at the Bank.
 The article discusses the latest Halifax house price index showing UK house prices rose 5.7percentage annually to the end of September 2022, up from 5.2percentage in August. It provides details on the average UK house price, regional variations, factors supporting the housing market like low mortgage rates and supply shortages, as well as affordability challenges for buyers. 
 The article mentions mortgage approval figures from the Bank of England showing approvals for house purchases fell in September to the lowest level since June 2020, likely reflecting pressures on household finances and rising mortgage rates. 
 The article quotes commentary from Russell Galley, managing director at Halifax, analyzing the housing market dynamics and anticipating a period of sustained but slowing house price growth due to factors like affordability constraints and economic headwinds.
"""

In [51]:
from transformers import PegasusConfig, PegasusForConditionalGeneration

class MiniPegasus(PegasusForConditionalGeneration):
    def __init__(self, config):
        super(MiniPegasus, self).__init__(config)
        self.model = PegasusForConditionalGeneration(config)

# Define configuration
config = PegasusConfig(
    vocab_size=tokenizer.vocab_size,
    d_model=512,  # Model dimension
    encoder_layers=6,
    decoder_layers=6,
    encoder_attention_heads=8,
    decoder_attention_heads=8,
    encoder_ffn_dim=2048,
    decoder_ffn_dim=2048,
)

# Initialize model
model = MiniPegasus(config)


In [52]:
from transformers import PegasusForConditionalGeneration

# Load the pre-trained Pegasus model
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
model.to(device)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_nor

In [53]:
def generate_summary(model, tokenizer, text, max_length=128, num_beams=4):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize the input text
    inputs = tokenizer(text, max_length=1024, truncation=True, padding="max_length", return_tensors="pt").to(device)

    # Generate summary
    summary_ids = model.generate(
        input_ids=inputs['input_ids'], 
        attention_mask=inputs['attention_mask'],
        max_length=max_length, 
        num_beams=num_beams, 
        early_stopping=True
    )

    # Decode the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [54]:
text = """
UK house prices rose by 5.7percentage in the year to the end of September, according to the latest house price index from lender Halifax.
The annual rate of increase picked up from 5.2percentage in August, Halifax said.
Across the UK, the average house price in September was £267,587, up from £263,786 in August, a rise of 1.7%.
Russell Galley, managing director at Halifax, said: "Housing market activity has remained solid with decent levels of buyer enquiries.
"""

# Example usage
summary = generate_summary(model, tokenizer, text)
print("Summary:", summary)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
