In [16]:
!pip install -q transformers[torch] datasets accelerate tokenizers seqeval evaluate

In [1]:
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [3]:
import datasets
multi_news = datasets.load_dataset("alexfabbri/multi_news",trust_remote_code=True)

In [4]:
multi_news

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})

In [5]:
multi_news["train"]

Dataset({
    features: ['document', 'summary'],
    num_rows: 44972
})

In [6]:
multi_news["train"][0]

{'document': 'National Archives \n \n Yes, it’s that time again, folks. It’s the first Friday of the month, when for one ever-so-brief moment the interests of Wall Street, Washington and Main Street are all aligned on one thing: Jobs. \n \n A fresh update on the U.S. employment situation for January hits the wires at 8:30 a.m. New York time offering one of the most important snapshots on how the economy fared during the previous month. Expectations are for 203,000 new jobs to be created, according to economists polled by Dow Jones Newswires, compared to 227,000 jobs added in February. The unemployment rate is expected to hold steady at 8.3%. \n \n Here at MarketBeat HQ, we’ll be offering color commentary before and after the data crosses the wires. Feel free to weigh-in yourself, via the comments section. And while you’re here, why don’t you sign up to follow us on Twitter. \n \n Enjoy the show. ||||| Employers pulled back sharply on hiring last month, a reminder that the U.S. economy 

In [7]:
multi_news["train"].features

{'document': Value(dtype='string', id=None),
 'summary': Value(dtype='string', id=None)}

In [8]:
multi_news["train"].description

'\nMulti-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n  - document: text of news articles seperated by special token "|||||".\n  - summary: news summary.\n'

In [9]:
multi_news["train"].features["document"]

Value(dtype='string', id=None)

In [10]:
multi_news["train"].features["summary"]

Value(dtype='string', id=None)

In [11]:
example_text = multi_news["train"][0]

In [12]:
example_text

{'document': 'National Archives \n \n Yes, it’s that time again, folks. It’s the first Friday of the month, when for one ever-so-brief moment the interests of Wall Street, Washington and Main Street are all aligned on one thing: Jobs. \n \n A fresh update on the U.S. employment situation for January hits the wires at 8:30 a.m. New York time offering one of the most important snapshots on how the economy fared during the previous month. Expectations are for 203,000 new jobs to be created, according to economists polled by Dow Jones Newswires, compared to 227,000 jobs added in February. The unemployment rate is expected to hold steady at 8.3%. \n \n Here at MarketBeat HQ, we’ll be offering color commentary before and after the data crosses the wires. Feel free to weigh-in yourself, via the comments section. And while you’re here, why don’t you sign up to follow us on Twitter. \n \n Enjoy the show. ||||| Employers pulled back sharply on hiring last month, a reminder that the U.S. economy 

In [13]:
example_text["document"]

'National Archives \n \n Yes, it’s that time again, folks. It’s the first Friday of the month, when for one ever-so-brief moment the interests of Wall Street, Washington and Main Street are all aligned on one thing: Jobs. \n \n A fresh update on the U.S. employment situation for January hits the wires at 8:30 a.m. New York time offering one of the most important snapshots on how the economy fared during the previous month. Expectations are for 203,000 new jobs to be created, according to economists polled by Dow Jones Newswires, compared to 227,000 jobs added in February. The unemployment rate is expected to hold steady at 8.3%. \n \n Here at MarketBeat HQ, we’ll be offering color commentary before and after the data crosses the wires. Feel free to weigh-in yourself, via the comments section. And while you’re here, why don’t you sign up to follow us on Twitter. \n \n Enjoy the show. ||||| Employers pulled back sharply on hiring last month, a reminder that the U.S. economy may not be gr

In [14]:
example_text["summary"]

'– The unemployment rate dropped to 8.2% last month, but the economy only added 120,000 jobs, when 203,000 new jobs had been predicted, according to today\'s jobs report. Reaction on the Wall Street Journal\'s MarketBeat Blog was swift: "Woah!!! Bad number." The unemployment rate, however, is better news; it had been expected to hold steady at 8.3%. But the AP notes that the dip is mostly due to more Americans giving up on seeking employment.'

In [15]:
#model = "bert-base-uncased"
from transformers import AutoTokenizer 
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [17]:
def preprocess_function(examples):
    inputs = tokenizer(examples["document"], padding="max_length", truncation=True, max_length=512)
    outputs = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = outputs["input_ids"]
    return inputs

tokenized_datasets = multi_news.map(preprocess_function, batched=True)

Map: 100%|██████████| 44972/44972 [06:49<00:00, 109.93 examples/s]
Map: 100%|██████████| 5622/5622 [00:35<00:00, 156.78 examples/s]
Map: 100%|██████████| 5622/5622 [00:48<00:00, 116.61 examples/s]


In [18]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert-summarization",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=500
)
    

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [20]:
from transformers import Trainer, DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()


  trainer = Trainer(


: 

In [None]:
import nltk
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def compute_metrics(pred):
    predictions, labels = pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    rouge_scores = [scorer.score(pred, label) for pred, label in zip(decoded_preds, decoded_labels)]
    return {"rouge1": sum([x["rouge1"].fmeasure for x in rouge_scores]) / len(rouge_scores)}

trainer.evaluate()


In [None]:
trainer.save_model("./bert-summarization-model")
tokenizer.save_pretrained("./bert-summarization-model")


In [None]:
text = "Some news article text..."
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
output = model.generate(**inputs)
summary = tokenizer.decode(output[0], skip_special_tokens=True)
print(summary)
