In [1]:
!pip install transformers datasets torch




DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [2]:
from transformers import PegasusTokenizer
from datasets import load_dataset

# Load dataset
dataset = load_dataset("xsum")

# Initialize tokenizer
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

# Tokenize data
def preprocess_data(examples):
    inputs = tokenizer(examples['document'], max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
    targets = tokenizer(examples['summary'], max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_datasets = dataset.map(preprocess_data, batched=True)
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|██████████| 5.76k/5.76k [00:00<?, ?B/s]
Downloading readme: 100%|██████████| 6.24k/6.24k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 255M/255M [01:11<00:00, 3.54MB/s] 
Downloading data: 2.72MB [00:00, 16.7MB/s]                   
Generating train split: 100%|██████████| 204045/204045 [00:25<00:00, 7883.60 examples/s]
Generating validation split: 100%|██████████| 11332/11332 [00:15<00:00, 713.80 examples/s]
Generating test split: 100%|██████████| 11334/11334 [00:15<00:00, 711.64 examples/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/

In [22]:
import torch
from torch import nn
from transformers import PegasusConfig, PegasusModel

class MiniPegasus(nn.Module):
    def __init__(self, config):
        super(MiniPegasus, self).__init__()
        self.model = PegasusModel(config)
        self.linear = nn.Linear(config.d_model, config.vocab_size)

    def forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask,
                             decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask)
        sequence_output = outputs[0]
        logits = self.linear(sequence_output)
        return logits
    
    def generate(self, input_ids, attention_mask, max_length=1000, num_beams=4, early_stopping=True):
        # Initialize decoder input
        decoder_input_ids = torch.tensor([[self.model.config.decoder_start_token_id]]).to(input_ids.device)
        generated_ids = []

        for _ in range(max_length):
            outputs = self.forward(input_ids=input_ids, attention_mask=attention_mask,
                                   decoder_input_ids=decoder_input_ids, decoder_attention_mask=None)
            next_token_logits = outputs[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1)
            decoder_input_ids = torch.cat([decoder_input_ids, next_token.unsqueeze(-1)], dim=-1)
            generated_ids.append(next_token)

            if next_token == self.model.config.eos_token_id:
                break

        return torch.cat(generated_ids, dim=-1)

# Define configuration
config = PegasusConfig(
    vocab_size=tokenizer.vocab_size,
    d_model=512,  # Model dimension
    encoder_layers=6,
    decoder_layers=6,
    encoder_attention_heads=8,
    decoder_attention_heads=8,
    encoder_ffn_dim=2048,
    decoder_ffn_dim=2048,
)

# Initialize model
model = MiniPegasus(config)


In [4]:
from torch.optim import AdamW
from torch.utils.data import DataLoader

# DataLoader
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=4, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
model.train()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):  # Train for 3 epochs
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        decoder_input_ids = labels[:, :-1]
        decoder_attention_mask = (decoder_input_ids != tokenizer.pad_token_id).float().to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                        decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask)

        loss = nn.CrossEntropyLoss()(outputs.view(-1, config.vocab_size), labels[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")


Epoch: 0, Loss: 11.648351669311523
Epoch: 0, Loss: 7.838944911956787
Epoch: 0, Loss: 5.423264026641846
Epoch: 0, Loss: 4.305593967437744
Epoch: 0, Loss: 3.0526986122131348
Epoch: 0, Loss: 2.6809840202331543
Epoch: 0, Loss: 2.830188512802124
Epoch: 0, Loss: 2.253309726715088
Epoch: 0, Loss: 3.1467092037200928
Epoch: 0, Loss: 2.383150339126587
Epoch: 0, Loss: 2.5124270915985107
Epoch: 0, Loss: 2.0647737979888916
Epoch: 0, Loss: 2.262897491455078
Epoch: 0, Loss: 2.511320114135742
Epoch: 0, Loss: 2.6562740802764893
Epoch: 0, Loss: 2.915217161178589
Epoch: 0, Loss: 3.304669141769409
Epoch: 0, Loss: 2.9000487327575684
Epoch: 0, Loss: 2.561016082763672
Epoch: 0, Loss: 3.1329619884490967
Epoch: 0, Loss: 2.223836898803711
Epoch: 0, Loss: 2.8049874305725098
Epoch: 0, Loss: 2.427738904953003
Epoch: 0, Loss: 2.4282071590423584
Epoch: 0, Loss: 3.2223072052001953
Epoch: 0, Loss: 2.6382038593292236
Epoch: 0, Loss: 2.80983567237854
Epoch: 0, Loss: 2.807852029800415
Epoch: 0, Loss: 2.5724077224731445
E

In [11]:
!pip install rouge_score

Collecting rouge_score

DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-profiling 3.2.0 requires joblib~=1.1.0, but you have joblib 1.4.2 which is incompatible.
pandas-profiling 3.2.0 requires visions[type_image_path]==0.7.4, but you have visions 0.7.6 which is incompatible.
spacy-langdetect 0.1.2 requires langdetect==1.0.7, but you have langdetect 1.0.9 which is incompatible.
textract 1.6.5 requires six~=1.12.0, but you have six 1.16.0 which is incompatible.



  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting six>=1.14.0 (from rouge_score)
  Using cached six-1.16.0-py2.py3-none-any.whl.metadata (1.8 kB)
Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py): started
  Building wheel for rouge_score (setup.py): finished with status 'done'
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24972 sha256=e44342393b88ec1bdf899f52212ee8e15e96e1e5cd99f3da9cf25aebaac210da
  Stored in directory: c:\users\sudhy\appdata\local\pip\cache\wheels\85\9d\af\01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: six, rouge_score
  Attempting uninstall: six
    Found existing installation: six 1.12.0
    Uninstalling six-1.12.0:
      Successfully uninstalled six-1.12.0
S

In [12]:
from datasets import load_metric

# Load ROUGE metric
rouge = load_metric('rouge')

def evaluate(model, dataloader):
    model.eval()
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        decoder_input_ids = labels[:, :-1]
        decoder_attention_mask = (decoder_input_ids != tokenizer.pad_token_id).float().to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                            decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask)

        predictions = torch.argmax(outputs, dim=-1)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        rouge.add_batch(predictions=decoded_preds, references=decoded_labels)

    result = rouge.compute()
    return result

# Evaluate
val_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=4)
rouge_scores = evaluate(model, val_dataloader)
print(rouge_scores)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'rouge1': AggregateScore(low=Score(precision=0.3338943182818295, recall=0.2972392898675594, fmeasure=0.31388897756962914), mid=Score(precision=0.3362589400662424, recall=0.29949400916850405, fmeasure=0.3162285846941425), high=Score(precision=0.33849686751238056, recall=0.30157605004597704, fmeasure=0.31835374388996796)), 'rouge2': AggregateScore(low=Score(precision=0.08249705154173016, recall=0.07390002392955983, fmeasure=0.07781839790558305), mid=Score(precision=0.08427032112638727, recall=0.07546833635831286, fmeasure=0.07946528078343817), high=Score(precision=0.0860486111359052, recall=0.07709344235558434, fmeasure=0.08115915142644035)), 'rougeL': AggregateScore(low=Score(precision=0.2972881383874342, recall=0.2649841279850748, fmeasure=0.2797137438127607), mid=Score(precision=0.2995632097112868, recall=0.2670558663320951, fmeasure=0.2818485448170519), high=Score(precision=0.3015523466206361, recall=0.26887785389039626, fmeasure=0.28367352203991203)), 'rougeLsum': AggregateScore(lo

In [6]:
def generate_summary(model, tokenizer, text, max_length=128, num_beams=4):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize the input text
    inputs = tokenizer(text, max_length=1024, truncation=True, padding="max_length", return_tensors="pt").to(device)

    # Generate summary
    summary_ids = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'],
                                 max_length=max_length, num_beams=num_beams, early_stopping=True)

    # Decode the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [7]:
test_text = """"
UK house prices rose by 5.7% in the year to the end of September, according to the latest house price index from lender Halifax.
The annual rate of increase picked up from 5.2% in August, Halifax said.
Across the UK, the average house price in September was £267,587, up from £263,786 in August, a rise of 1.7%.
Russell Galley, managing director at Halifax, said: "Housing market activity has remained solid with decent levels of buyer enquiries.
"However, some of the drivers of the buoyant market we saw earlier in the year, such as the lack of properties for sale and buyers racing to benefit from the stamp duty holiday, have fallen away to some extent.
"That said, underlying demand is still strong and is serving to underpin a degree of pricing pressure for homes, which we expect will lead to a further period of sustained house price growth."
The stamp duty holiday in England and Northern Ireland was phased out in stages over the summer, coming to a complete end from 1 October.
In Wales, the tax break on house purchases ended on 30 June, while in Scotland it ran until 31 March.
Halifax said the performance of the housing market was being supported by a number of other factors.
These included the continuing low mortgage rate environment, with products priced at close to historical lows, and the ongoing shortage of properties for sale.
The lender said the latter was helping to put upward pressure on house prices.
However, it added that affordability challenges for buyers remained acute, with the average first-time buyer in the UK now paying the equivalent of 40% of their annual gross income on mortgage repayments.
"With pressures on the cost of living mounting, and the prospect of interest rates increasing from the current low level, the house price to income ratio is becoming even more of a constraint," Mr Galley said.
"Most experts are anticipating a slowing of house price inflation next year as affordability issues and other economic headwinds exert greater influence."
Regional variations
Halifax's figures showed that Wales remained the strongest performer across the UK nations and regions, with annual house price inflation of 12.9%.
This was followed by Northern Ireland at 10.7%, the South West of England at 9.8%, and the East Midlands at 8.8%.
The weakest regions were the North East, where prices rose by 3.9% over the past year, Scotland at 4.4%, and London at 4.5%.
Within London, the average house price was £541,920.
Halifax said the capital was the only area of the UK where prices remained below their August 2007 peak, before the global financial crisis struck.
Separate figures released by the Bank of England on Thursday showed mortgage approvals for house purchases fell in September to their lowest level since June 2020.
Some 72,453 mortgages were approved for house purchase, down from 74,145 in August.
The Bank's Money and Credit report said mortgage approvals for house purchase had fallen in September for the fifth month in a row.
"This is likely reflecting increasing pressures on household finances as well as rising mortgage rates," said Nitesh Patel, strategic economist at the Bank.
 The article discusses the latest Halifax house price index showing UK house prices rose 5.7% annually to the end of September 2022, up from 5.2% in August. It provides details on the average UK house price, regional variations, factors supporting the housing market like low mortgage rates and supply shortages, as well as affordability challenges for buyers. 
 The article mentions mortgage approval figures from the Bank of England showing approvals for house purchases fell in September to the lowest level since June 2020, likely reflecting pressures on household finances and rising mortgage rates. 
 The article quotes commentary from Russell Galley, managing director at Halifax, analyzing the housing market dynamics and anticipating a period of sustained but slowing house price growth due to factors like affordability constraints and economic headwinds.
"""

In [8]:
# Example usage:

summary = generate_summary(model, tokenizer, test_text)
print("Summary:", summary)

NameError: name 'model' is not defined

In [None]:
import torch

# Assuming `model` is your trained PyTorch model
torch.save(model.state_dict(), 'model.pth')
