# Imports

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, create_optimizer, PushToHubCallback, pipeline
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tqdm import tqdm
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd

In [4]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Script to load & process data, set up model, fit model, & save model to Hugging Face Hub

In [6]:
#Load data
data = load_dataset(
    'csv', data_files={
        'train':'./drive/MyDrive/General_Assembly/Capstone_Project/data/training_data.csv',
        'test':'./drive/MyDrive/General_Assembly/Capstone_Project/data/testing_data.csv',
        'validation':'./drive/MyDrive/General_Assembly/Capstone_Project/data/validation_data.csv'
        }
    )

#Laod model & tokenizer
model_checkpoint = 'sshleifer/distilbart-xsum-1-1'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

#Process data for modeling
def preprocess_function(text_data):
  model_inputs = tokenizer(
      text_data['chapter_text'], max_length=1024, truncation=True)     
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(
        text_data['chapter_summary'], max_length=1024, truncation=True)   
    model_inputs['labels'] = labels['input_ids']
    return model_inputs
tokenized_data = data.map(preprocess_function)
tokenized_data_features = tokenized_data.remove_columns(
    data['train'].column_names)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors='tf')
tf_train = tokenized_data_features['train'].to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'labels'],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=1)
tf_validation = tokenized_data_features['validation'].to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'labels'],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=1)
tf_test = tokenized_data_features['test'].to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'labels'],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=1)

#Compile & fit model
num_train_epochs = 25
num_train_steps = len(tf_train) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=5.6e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.1,)
model.compile(optimizer=optimizer)
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=1, mode='auto')
push_to_hub = PushToHubCallback(     #callback to save model to Hugging Face hub
    output_dir='./summary_model', tokenizer=tokenizer, hub_model_id='AnnaR/literature_summarizer')
model_history = model.fit(tf_train, validation_data=tf_validation, callbacks=[early_stop, push_to_hub], epochs=25, verbose=0)

Using custom data configuration default-74d066c3e8549d77
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-74d066c3e8549d77/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at sshleifer/distilbart-xsum-1-1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-74d066c3e8549d77/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-6c6fe7e5e76a28b0.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-74d066c3e8549d77/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-13c913eb270e8082.arrow


  0%|          | 0/24 [00:00<?, ?ex/s]

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.
Cloning https://huggingface.co/AnnaR/literature_summarizer into local empty directory.


Epoch 11: early stopping


Upload file tf_model.h5:   0%|          | 32.0k/317M [00:00<?, ?B/s]

remote: tput: No value for $TERM and no -T specified        
remote: tput: No value for $TERM and no -T specified        
remote: tput: No value for $TERM and no -T specified        
remote: tput: No value for $TERM and no -T specified        
To https://huggingface.co/AnnaR/literature_summarizer
   26eb539..a6a6f32  main -> main



# Generate summaries predictions from test data

In [None]:
#Add reference and model-generated summaries to lists
test_labels = []
test_preds = []

for batch in tqdm(tf_test):
    predictions = model.generate(batch['input_ids'])
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = batch["labels"].numpy()
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    test_preds.extend(decoded_preds)
    test_labels.extend(decoded_labels)

#Save in dataframe
test_summaries = pd.DataFrame({'reference':test_labels, 'model-generated':test_preds})

100%|██████████| 59/59 [09:21<00:00,  9.52s/it]


## Model-generated Summaries

In [None]:
test_summaries

Unnamed: 0,reference,model-generated
0,Godliness I This is the first in a four-part s...,"In this chapter, Jesse, a farmhouse, has died ..."
1,"Next, the UM uses the analogy of the road to p...","In this chapter, the UM has become the first m..."
2,"Duke Frederick finds that Celia, Rosalind, and...",The Duchess of Cornwall has been told that she...
3,"After the celebration of one day in Rome, Albe...",Dantes and Dantes arrive at the house of Dante...
4,5 Getting into the rowboat that the murdere...,"In this chapter, Huck's Jim is saved by a boat..."
5,"Falstaff has agreed to go to the wood, hoping ...",Fenton Ford and Mrs.\nPage enters with Falstaf...
6,Oliver was overjoyed at the news that she woul...,Fagin decides that he wants to go back to his ...
7,Believing themselves to be close to freedom no...,This scene begins to show Jim on the island of...
8,"Part2, He hides the stolen items beneath a ro...",In the search for a man who has been imprisone...
9,: Rosalind enters as the boy actor who played ...,Duke and Duchess of Cornwall have completed th...


# Test pipeline from model saved on Hugging Face

In [18]:
summarizer = pipeline('summarization', model='AnnaR/literature_summarizer')

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at AnnaR/literature_summarizer.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [19]:
text = data['test']['chapter_text'][10]

In [16]:
#Reference text
text

'O and Lermontov\'s MASQUERADE. And all at once I felt horribly ashamed, so ashamed that I stopped the horse, got out of the sledge, and stood still in the snow in the middle of the street. The driver gazed at me, sighing and astonished. What was I to do? I could not go on there--it was evidently stupid, and I could not leave things as they were, because that would seem as though ... Heavens, how could I leave things! And after such insults! "No!" I cried, throwing myself into the sledge again. "It is ordained! It is fate! Drive on, drive on!" And in my impatience I punched the sledge-driver on the back of the neck. "What are you up to? What are you hitting me for?" the peasant shouted, but he whipped up his nag so that it began kicking. The wet snow was falling in big flakes; I unbuttoned myself, regardless of it. I forgot everything else, for I had finally decided on the slap, and felt with horror that it was going to happen NOW, AT ONCE, and that NO FORCE COULD STOP IT. The deserted

In [20]:
model_text = summarizer(text)

Token indices sequence length is longer than the specified maximum sequence length for this model (5642 > 1024). Running this sequence through the model will result in indexing errors


In [23]:
#Generated summary
model_text[0]['summary_text']

" \xa0  The next morning, Mr. Lermontown arrives at the house of the house, and he finds that Mr. Grange, and Mrs. Lernay, and Mr. Gardiner tells him that he would have to be his father's death. He says that he is"