In [1]:
import pandas as pd
import re
import gc
import time

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

from os.path import basename
from os.path import join
from os import makedirs

from urllib.request import urlopen
from urllib.parse import urljoin

In [2]:
# download a file from a URL, returns content of downloaded file
def download_url(urlpath):
    try:
        # open a connection to the server
        with urlopen(urlpath, timeout=3) as connection:
            # read the contents of the url as bytes and return it
            return connection.read()
    except:
        return None

In [3]:
def download_book(book_id, save_path):
    # construct the download url
    url = f'https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt'
    # download the content
    data = download_url(url)
    if data is None:
        return f'Failed to download {url}'
    # create local path
    save_file = join(save_path, f'war_peace.txt')
    # save book to file
    with open(save_file, 'wb') as file:
        file.write(data)
    return f'Saved {save_file}'

In [4]:
# war and peace is 2600
download_book(2600, './')

'Saved ./war_peace.txt'

In [5]:
data = pd.read_fwf('war_peace.txt')

In [6]:
data[data['The Project Gutenberg eBook of War and Peace, by Leo Tolstoy'].str.contains('PROJECT')]

Unnamed: 0,"The Project Gutenberg eBook of War and Peace, by Leo Tolstoy"
15,*** START OF THE PROJECT GUTENBERG EBOOK WAR A...
51877,*** END OF THE PROJECT GUTENBERG EBOOK WAR AND...
51886,Gutenberg-tm electronic works to protect the P...
51899,THE FULL PROJECT GUTENBERG LICENSE


In [7]:
df = data.iloc[403:51877]

In [8]:
novel = df.apply(lambda x: ' '.join(x)).values[0]

In [9]:
def get_chunks(entire_text, window_size=512):
    start = 0
    end = window_size
    tokens = entire_text.split(' ')
    total_token_length = len(tokens)
    chunks = []
    current_tokens_gathered = 0
    while (current_tokens_gathered < total_token_length):
        chunk = tokens[start:end]
        chunks.append(' '.join(chunk))
        start += window_size
        end += window_size
        current_tokens_gathered += window_size
    return chunks

In [10]:
wp_chunks = get_chunks(novel, 512)

In [11]:
wp_df = pd.DataFrame(wp_chunks)

In [12]:
wp_df.columns = ['chunks']

In [13]:
wp_df['length'] = wp_df['chunks'].apply(lambda x: len(x.split(' ')))

In [14]:
wp_df.head()

Unnamed: 0,chunks,length
0,"“Well, Prince, so Genoa and Lucca are now just...",512
1,has been decided? They have decided that Buona...,512
2,Emperor. Had you heard?” “I shall be delighted...,512
3,between them.” He said this smiling in a way m...,512
4,"Prince Vasíli’s daughter, the beautiful Hélène...",512


In [124]:
def clean_all_text(input_string):
    text = input_string.lower()
#     text = re.sub("\"","", text)
#     text = re.sub("\'", "", text)
    text = re.sub("á", "a", text)
    text = re.sub("é", "e", text)
    text = re.sub("í", "i", text)
    text = re.sub("ë", "e", text)
    text = re.sub("\.\.\.", ". ", text)
    text = re.sub("\.+", ". ", text)
    text = re.sub("\s+", " ", text)
    text = text.lower()
    return text

In [125]:
wp_df['clean_chunk'] = wp_df['chunks'].apply(lambda x: clean_all_text(x))

In [126]:
wp_df.head(1).style

Unnamed: 0,chunks,length,clean_chunk
0,"“Well, Prince, so Genoa and Lucca are now just family estates of the Buonapartes. But I warn you, if you don’t tell me that this means war, if you still try to defend the infamies and horrors perpetrated by that Antichrist—I really believe he is Antichrist—I will have nothing more to do with you and you are no longer my friend, no longer my ‘faithful slave,’ as you call yourself! But how do you do? I see I have frightened you—sit down and tell me all the news.” It was in July, 1805, and the speaker was the well-known Anna Pávlovna Schérer, maid of honor and favorite of the Empress Márya Fëdorovna. With these words she greeted Prince Vasíli Kurágin, a man of high rank and importance, who was the first to arrive at her reception. Anna Pávlovna had had a cough for some days. She was, as she said, suffering from la grippe; grippe being then a new word in St. Petersburg, used only by the elite. All her invitations without exception, written in French, and delivered by a scarlet-liveried footman that morning, ran as follows: “If you have nothing better to do, Count (or Prince), and if the prospect of spending an evening with a poor invalid is not too terrible, I shall be very charmed to see you tonight between 7 and 10—Annette Schérer.” “Heavens! what a virulent attack!” replied the prince, not in the least disconcerted by this reception. He had just entered, wearing an embroidered court uniform, knee breeches, and shoes, and had stars on his breast and a serene expression on his flat face. He spoke in that refined French in which our grandfathers not only spoke but thought, and with the gentle, patronizing intonation natural to a man of importance who had grown old in society and at court. He went up to Anna Pávlovna, kissed her hand, presenting to her his bald, scented, and shining head, and complacently seated himself on the sofa. “First of all, dear friend, tell me how you are. Set your friend’s mind at rest,” said he without altering his tone, beneath the politeness and affected sympathy of which indifference and even irony could be discerned. “Can one be well while suffering morally? Can one be calm in times like these if one has any feeling?” said Anna Pávlovna. “You are staying the whole evening, I hope?” “And the fete at the English ambassador’s? Today is Wednesday. I must put in an appearance there,” said the prince. “My daughter is coming for me to take me there.” “I thought today’s fete had been canceled. I confess all these festivities and fireworks are becoming wearisome.” “If they had known that you wished it, the entertainment would have been put off,” said the prince, who, like a wound-up clock, by force of habit said things he did not even wish to be believed. “Don’t tease! Well, and what has been decided about Novosíltsev’s dispatch? You know everything.” “What can one say about it?” replied the prince in a cold, listless tone. “What",512,"“well, prince, so genoa and lucca are now just family estates of the buonapartes. but i warn you, if you don’t tell me that this means war, if you still try to defend the infamies and horrors perpetrated by that antichrist—i really believe he is antichrist—i will have nothing more to do with you and you are no longer my friend, no longer my ‘faithful slave,’ as you call yourself! but how do you do? i see i have frightened you—sit down and tell me all the news. ” it was in july, 1805, and the speaker was the well-known anna pavlovna scherer, maid of honor and favorite of the empress marya fedorovna. with these words she greeted prince vasili kuragin, a man of high rank and importance, who was the first to arrive at her reception. anna pavlovna had had a cough for some days. she was, as she said, suffering from la grippe; grippe being then a new word in st. petersburg, used only by the elite. all her invitations without exception, written in french, and delivered by a scarlet-liveried footman that morning, ran as follows: “if you have nothing better to do, count (or prince), and if the prospect of spending an evening with a poor invalid is not too terrible, i shall be very charmed to see you tonight between 7 and 10—annette scherer. ” “heavens! what a virulent attack!” replied the prince, not in the least disconcerted by this reception. he had just entered, wearing an embroidered court uniform, knee breeches, and shoes, and had stars on his breast and a serene expression on his flat face. he spoke in that refined french in which our grandfathers not only spoke but thought, and with the gentle, patronizing intonation natural to a man of importance who had grown old in society and at court. he went up to anna pavlovna, kissed her hand, presenting to her his bald, scented, and shining head, and complacently seated himself on the sofa. “first of all, dear friend, tell me how you are. set your friend’s mind at rest,” said he without altering his tone, beneath the politeness and affected sympathy of which indifference and even irony could be discerned. “can one be well while suffering morally? can one be calm in times like these if one has any feeling?” said anna pavlovna. “you are staying the whole evening, i hope?” “and the fete at the english ambassador’s? today is wednesday. i must put in an appearance there,” said the prince. “my daughter is coming for me to take me there. ” “i thought today’s fete had been canceled. i confess all these festivities and fireworks are becoming wearisome. ” “if they had known that you wished it, the entertainment would have been put off,” said the prince, who, like a wound-up clock, by force of habit said things he did not even wish to be believed. “don’t tease! well, and what has been decided about novosiltsev’s dispatch? you know everything. ” “what can one say about it?” replied the prince in a cold, listless tone. “what"


In [43]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [45]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [47]:
model = model.to(device)

In [48]:
p = list("summarize: " + wp_df['clean_chunk'].values)

In [49]:
len(p)

1099

In [64]:
gc.collect()
 

356

In [65]:
torch.cuda.empty_cache()

In [66]:
gc.collect()
 

0

In [67]:
torch.cuda.empty_cache()

In [108]:
total_begin_time = time.perf_counter()
start = 0
end = 10
chunk_n = 1
chunk = {}
while end < len(p):
    begin_time = time.perf_counter()
    gc.collect()
    torch.cuda.empty_cache()
    inputs = tokenizer.batch_encode_plus(p[start:end], max_length=1024, return_tensors="pt", pad_to_max_length=True)
    inputs = inputs.to(device)
    outputs = model.generate(inputs['input_ids'], num_beams=4, max_length = 150, early_stopping=True)
    final = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in outputs]
    man = ' '.join([i for i in final])
    manfinal = re.sub(" \. ", ". ", man)
    chunk[f"section{chunk_n}"] = manfinal
    chunk_n += 1
    start += 10
    end += 10
    
    end_time = time.perf_counter()
    print(end_time - begin_time, f" seconds for loop {chunk_n}")
#     if chunk_n == 10:
#         break
total_end_time = time.perf_counter()
print(total_end_time - total_begin_time, " seconds for ENTIRE LOOP")



5.851248300000179  seconds for loop 2
4.792622200000096  seconds for loop 3
5.222704600000043  seconds for loop 4
5.70979269999998  seconds for loop 5
5.657800899999984  seconds for loop 6
5.5569066999998995  seconds for loop 7
5.446668199999976  seconds for loop 8
5.790183599999864  seconds for loop 9
5.238152599999921  seconds for loop 10
5.366132000000107  seconds for loop 11
5.823792500000081  seconds for loop 12
5.700449499999877  seconds for loop 13
5.813467299999957  seconds for loop 14
5.230469099999937  seconds for loop 15
5.639162700000043  seconds for loop 16
4.6524895000000015  seconds for loop 17
5.747362900000098  seconds for loop 18
5.367021799999975  seconds for loop 19
5.455220699999927  seconds for loop 20
4.416956700000128  seconds for loop 21
5.564123599999903  seconds for loop 22
5.780709599999909  seconds for loop 23
5.9157350999998926  seconds for loop 24
5.832592900000009  seconds for loop 25
5.794375200000104  seconds for loop 26
5.450829300000123  seconds for 

In [None]:
# should be 114 or 115 sections?

In [None]:
# each section is a summary of ~5120 words
# war_peace is 587,287 words

In [109]:
print(f"summary of {len(chunk)*5120} words of novel: ", "\n")
print("total_time: ", total_end_time - total_begin_time, " seconds", "\n")
[print(i, "\n", x, "\n") for i,x in chunk.items()]

summary of 558080 words of novel:  

total_time:  605.5581646  seconds 

section1 
 anna pavlovna scherer was maid of honor and favorite of the empress marya fedorovna. she was suffering from la grippe, a new word in st. petersburg, used only by the elite. prince vasili kuragin was the first to arrive at her reception in july, 1805. anna pavlovna scherer: i have faith only in god and the lofty destiny of our adored monarch. prussia has always declared that buonaparte is invincible, and that all europe is powerless before him. pavlovna: i am expecting two very interesting men tonight, le vicomte de mortemart and abbe morio. anna pavlovna, empress of dowager empress marya fedorovna, said: "baron funke has been recommended to the dowager empress by her sister" prince vasili wished to obtain this post for his son, but others were trying to secure it for the baron. "i don’t speak of anatole, your youngest. i don’t like him," she said. anna pavlovna's son anatole is a relation of hers, princ

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]