<a href="https://colab.research.google.com/github/ahmedazaz32/Text-Summarization-with-BART-T5/blob/main/Text_summarization_with_BART_%26_T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summarise classical books with state-of-the-art machine learning models
BART and T5 are state-of-the-art machine learning models developed by [Lewis et al. 2019 (Facebook Research)](https://arxiv.org/abs/1910.13461) and [Raffel et al. 2019 (Google Research)](https://arxiv.org/abs/1910.10683). They have been trained to summarize text and are made available for easy use by [@HuggingFace](https://twitter.com/huggingface)'s [Transformers library](https://huggingface.co/transformers/). This notebook shows how to summarise history's most influential books like the Communist Manifesto or Orwell's 1984 in a few lines of code in a few minutes with these two models. You can copy the notebook, run and change it yourself and compare the two models. Notebook by [@MoritzLaurer](https://twitter.com/MoritzLaurer)





In [1]:
## installation
# see https://twitter.com/huggingface/status/1242512382800400384
# details https://github.com/huggingface/transformers/releases/tag/v2.6.0
!pip install transformers --upgrade

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.2


In [2]:
# Hugging Face factory used below to build the summarization pipelines
from transformers import pipeline
# HTTP client used to download the book texts
import requests
import pprint
# used to time each summarization call
import time
# shared pretty-printer used to display the generated summaries
pp = pprint.PrettyPrinter(indent=14)

RuntimeError: ignored

In [None]:
## documentation for summarizer: https://huggingface.co/transformers/main_classes/pipelines.html#summarizationpipeline
# summarize with BART
# NOTE: the checkpoint is referenced by its namespaced Hub id; the bare
# "bart-large-cnn" shortcut is not resolved by current transformers releases.
summarizer_bart = pipeline(task='summarization', model="facebook/bart-large-cnn")
# summarize with T5
summarizer_t5 = pipeline(task='summarization', model="t5-large") # options: 't5-small', 't5-base', 't5-large', 't5-3b', 't5-11b'
# For T5 you can choose the size of the model. Everything above t5-base is very slow, even on GPU or TPU.

## 1. Karl Marx, Friedrich Engels - Manifesto of the Communist Party

In [None]:
## download book
# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() reports an HTTP error right here instead of letting an
# error page reach the cleaning step and fail there with an opaque IndexError.
book_raw = requests.get("http://www.gutenberg.org/cache/epub/61/pg61.txt", timeout=60)
book_raw.raise_for_status()
communist_manifesto = book_raw.text
# cleaning: drop everything up to and including the edition note ...
delimiter = "[From the English edition of 1888, edited by Friedrich Engels]"
communist_manifesto_cl = communist_manifesto.split(delimiter, 1)[1]
# ... and everything after the closing slogan (the slogan itself is kept)
delimiter2 = "WORKING MEN OF ALL COUNTRIES, UNITE!"
communist_manifesto_cl = communist_manifesto_cl.split(delimiter2, 1)[0] + delimiter2
#print(communist_manifesto_cl)

#### 1.1 - BART model, machine-generated summary - Communist Manifesto

In [None]:
## summarize book with BART model
t0 = time.time() # timer
# truncation=True clips the tokenized input to the model's maximum input length.
# The pipeline does not truncate by default, so without it the whole book would be
# pushed through a model that only accepts a limited number of tokens.
# Consequence: the summary is based on the opening part of the text only.
summary_manifesto_bart = summarizer_bart(
    communist_manifesto_cl,
    truncation=True,
    min_length=150,
    max_length=500,  # change min_ and max_length for different output
)
print(f"Summarization took {round((time.time() - t0) / 60, 2)} minutes.")

In [None]:
# Pretty-print the text of the first (index 0) entry of the BART result for the Communist Manifesto.
pp.pprint(
    summary_manifesto_bart[0]["summary_text"]
)

#### 1.2 - T5 model, machine-generated summary - Communist Manifesto

In [None]:
## summarize book with T5 model
t0 = time.time() # timer
# truncation=True clips the tokenized input to the model's maximum input length.
# The pipeline does not truncate by default, so without it the whole book would be
# pushed through the model in a single pass.
# Consequence: the summary is based on the opening part of the text only.
summary_manifesto_t5 = summarizer_t5(
    communist_manifesto_cl,
    truncation=True,
    min_length=150,
    max_length=500,  # change min_ and max_length for different output
)
print(f"Summarization took {round((time.time() - t0) / 60, 2)} minutes.") # timer

In [None]:
# Pretty-print the text of the first (index 0) entry of the T5 result for the Communist Manifesto.
pp.pprint(
    summary_manifesto_t5[0]["summary_text"]
)

## 2. George Orwell - 1984

In [None]:
## download book
# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() reports an HTTP error right here instead of letting an
# error page reach the cleaning step and fail there with an opaque IndexError.
book_raw = requests.get("http://gutenberg.net.au/ebooks01/0100021.txt", timeout=60)
book_raw.raise_for_status()
orwell_1984 = book_raw.text
# cleaning: keep the text from the first "PART ONE" marker (marker re-attached) ...
delimiter = 'PART ONE'
orwell_1984_cl = delimiter + orwell_1984.split(delimiter, 1)[1]
# ... up to and including the first "THE END" marker
delimiter2 = "THE END"
orwell_1984_cl = orwell_1984_cl.split(delimiter2, 1)[0] + delimiter2
#print(orwell_1984_cl)

#### 2.1 - BART model, machine-generated summary - Orwell 1984

In [None]:
## summarize book with BART model
t0 = time.time() # timer
summary_orwell_bart = summarizer_bart(orwell_1984_cl, min_length=150, max_length=500) # change min_ and max_length for different output
print("Summarization took " + str(round((time.time() - t0) / 60, 2)) + " minutes.") # timer

In [None]:
# Pretty-print the text of the first (index 0) entry of the BART result for 1984.
pp.pprint(
    summary_orwell_bart[0]["summary_text"]
)

#### 2.2 - T5 model, machine-generated summary - Orwell 1984



In [None]:
## summarize book with T5 model
t0 = time.time() # timer
# truncation=True clips the tokenized input to the model's maximum input length.
# The pipeline does not truncate by default, so without it the whole book would be
# pushed through the model in a single pass.
# Consequence: the summary is based on the opening part of the text only.
summary_orwell_t5 = summarizer_t5(
    orwell_1984_cl,
    truncation=True,
    min_length=150,
    max_length=500,  # change min_ and max_length for different output
)
print(f"Summarization took {round((time.time() - t0) / 60, 2)} minutes.") # timer

In [None]:
# Pretty-print the text of the first (index 0) entry of the T5 result for 1984.
pp.pprint(
    summary_orwell_t5[0]["summary_text"]
)

## 3. Charles Darwin - The Origin of Species by Means of Natural Selection

In [None]:
## download book
# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() reports an HTTP error right here instead of letting an
# error page reach the cleaning step and fail there with an opaque IndexError.
book_raw = requests.get("http://www.gutenberg.org/cache/epub/2009/pg2009.txt", timeout=60)
book_raw.raise_for_status()
darwin_origin_of_species = book_raw.text
# cleaning: keep the text from the first "INTRODUCTION." marker, prefixed with the title ...
delimiter = 'INTRODUCTION.'
darwin_origin_of_species_cl = "ORIGIN OF SPECIES." + delimiter + darwin_origin_of_species.split(delimiter, 1)[1]
# ... and cut everything from the glossary heading onwards (heading not kept)
delimiter2 = "GLOSSARY OF THE PRINCIPAL SCIENTIFIC TERMS USED IN THE PRESENT VOLUME."
darwin_origin_of_species_cl = darwin_origin_of_species_cl.split(delimiter2, 1)[0]
# Left disabled like in the other download cells: printing dumps the entire book into the cell output.
#print(darwin_origin_of_species_cl)

#### 3.1 - BART model, machine-generated summary - Darwin, Origin of Species

In [None]:
## summarize book with BART model
t0 = time.time() # timer
# truncation=True clips the tokenized input to the model's maximum input length.
# The pipeline does not truncate by default, so without it the whole book would be
# pushed through a model that only accepts a limited number of tokens.
# Consequence: the summary is based on the opening part of the text only.
summary_darwin_bart = summarizer_bart(
    darwin_origin_of_species_cl,
    truncation=True,
    min_length=150,
    max_length=500,  # change min_ and max_length for different output
)
print(f"Summarization took {round((time.time() - t0) / 60, 2)} minutes.") # timer

In [None]:
# Pretty-print the text of the first (index 0) entry of the BART result for The Origin of Species.
pp.pprint(
    summary_darwin_bart[0]["summary_text"]
)

#### 3.2 - T5 model, machine-generated summary - Darwin, Origin of Species

In [None]:
## summarize book with T5 model
t0 = time.time() # timer
# truncation=True clips the tokenized input to the model's maximum input length.
# The pipeline does not truncate by default, so without it the whole book would be
# pushed through the model in a single pass.
# Consequence: the summary is based on the opening part of the text only.
summary_darwin_t5 = summarizer_t5(
    darwin_origin_of_species_cl,
    truncation=True,
    min_length=150,
    max_length=500,  # change min_ and max_length for different output
)
print(f"Summarization took {round((time.time() - t0) / 60, 2)} minutes.") # timer

In [None]:
# Pretty-print the text of the first (index 0) entry of the T5 result for The Origin of Species.
pp.pprint(
    summary_darwin_t5[0]["summary_text"]
)

## 4. Mary Wollstonecraft - A Vindication of the Rights of Woman

In [None]:
## download book
# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() reports an HTTP error right here instead of letting an
# error page reach the cleaning step and fail there with an opaque IndexError.
book_raw = requests.get("http://www.gutenberg.org/cache/epub/3420/pg3420.txt", timeout=60)
book_raw.raise_for_status()
rights_woman = book_raw.text
# cleaning: keep the text from the first occurrence of the title line (title re-attached)
delimiter = 'A VINDICATION OF THE RIGHTS OF WOMAN,'
rights_woman_cl = delimiter + rights_woman.split(delimiter, 1)[1]
#print(rights_woman_cl)

#### 4.1 - BART model, machine-generated summary - Rights of Woman

In [None]:
## summarize book
t0 = time.time() # timer
# truncation=True clips the tokenized input to the model's maximum input length.
# The pipeline does not truncate by default, so without it the whole book would be
# pushed through a model that only accepts a limited number of tokens.
# Consequence: the summary is based on the opening part of the text only.
summary_rights_woman_bart = summarizer_bart(
    rights_woman_cl,
    truncation=True,
    min_length=150,
    max_length=500,  # change min_ and max_length for different output
)
print(f"Summarization took {round((time.time() - t0) / 60, 2)} minutes.") # timer

In [None]:
# Pretty-print the text of the first (index 0) entry of the BART result for A Vindication of the Rights of Woman.
pp.pprint(
    summary_rights_woman_bart[0]["summary_text"]
)

#### 4.2 - T5 model, machine-generated summary - Rights of Woman

In [None]:
## summarize book
t0 = time.time() # timer
# truncation=True clips the tokenized input to the model's maximum input length.
# The pipeline does not truncate by default, so without it the whole book would be
# pushed through the model in a single pass.
# Consequence: the summary is based on the opening part of the text only.
summary_rights_woman_t5 = summarizer_t5(
    rights_woman_cl,
    truncation=True,
    min_length=150,
    max_length=500,  # change min_ and max_length for different output
)
print(f"Summarization took {round((time.time() - t0) / 60, 2)} minutes.") # timer

In [None]:
# Pretty-print the text of the first (index 0) entry of the T5 result for A Vindication of the Rights of Woman.
pp.pprint(
    summary_rights_woman_t5[0]["summary_text"]
)