##### Establishment of a baseline for model summary generation provides a baseline score to compare future models against. 

# Imports

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric
import nltk
from nltk.tokenize import sent_tokenize

In [43]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Read in data

## Read in full dataset as pandas dataframe

In [23]:
df_data = pd.read_csv('./modeling_data/cleaned_summaries_and_texts.csv')

In [24]:
df_data.head(2)

Unnamed: 0,chapter_title,chapter_summary,book_title,chapters,chapter_text
0,The Age of Innocence: Novel Summary: Chapters 1-3,The story opens at the opera in New York....,The Age of Innocence,Chapters 1-3,"byEdith Wharton Etext prepared by JudithBoss,..."
1,The Age of Innocence: Novel Summary: Chapters 4-6,Archer and May begin their round of betrotha...,The Age of Innocence,Chapters 4-6,In the course of the next day the first of th...


## Split data in testing, training, and validation sets

In [25]:
train, test = train_test_split(df_data, test_size=0.2)

In [26]:
train.shape

(236, 5)

In [27]:
test.shape

(59, 5)

In [28]:
train, validation = train_test_split(train, test_size=0.1)

In [29]:
train.shape

(212, 5)

In [30]:
validation.shape

(24, 5)

## Export split data back to separate csv files

In [31]:
train.to_csv('./modeling_data/training_data.csv', index=False)

In [32]:
test.to_csv('./modeling_data/testing_data.csv', index=False)

In [33]:
validation.to_csv('./modeling_data/validation_data.csv', index=False)

## Read in data in Huggingface data dictionary format

In [34]:
data = load_dataset(
    'csv', 
    data_files={
        'train':'./modeling_data/training_data.csv', 'test':'./modeling_data/testing_data.csv',
        './modeling_data/validation':'validation_data.csv'
        }
    )

Using custom data configuration default-b344bfb567d2fdc8


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-b344bfb567d2fdc8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-b344bfb567d2fdc8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [35]:
data

DatasetDict({
    train: Dataset({
        features: ['chapter_title', 'chapter_summary', 'book_title', 'chapters', 'chapter_text'],
        num_rows: 212
    })
    test: Dataset({
        features: ['chapter_title', 'chapter_summary', 'book_title', 'chapters', 'chapter_text'],
        num_rows: 59
    })
    validation: Dataset({
        features: ['chapter_title', 'chapter_summary', 'book_title', 'chapters', 'chapter_text'],
        num_rows: 24
    })
})

# Baseline summarization model
Uses the first three sentences of a piece of text as the summary
<br>
code from https://huggingface.co/course/chapter7/5?fw=tf

## Function to create baseline model-generated summaries by selecting first three sentences of chapter texts

In [45]:
def baseline_summary(chapter_text):
  return '\n'.join(sent_tokenize(chapter_text)[:3])

## Function to evaluate baseline three-sentence summaries with rouge scores

In [50]:
rouge = load_metric('rouge')

Downloading:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [52]:
def baseline_scores(data):
  baseline_summaries = [baseline_summary(chapter) for chapter in data['chapter_text']]
  return rouge.compute(predictions=baseline_summaries, references=data['chapter_summary'])

## Create baseline summaries and evaluate

In [53]:
r_score = baseline_scores(data['validation'])

In [54]:
r_score

{'rouge1': AggregateScore(low=Score(precision=0.35541433534046385, recall=0.05841849415360017, fmeasure=0.09492658966334183), mid=Score(precision=0.4319420767192303, recall=0.07971813584828565, fmeasure=0.12384609006840333), high=Score(precision=0.5077492824813814, recall=0.10318180537562414, fmeasure=0.1524584694958896)),
 'rouge2': AggregateScore(low=Score(precision=0.036488467155314015, recall=0.005422561409607098, fmeasure=0.009043354616527562), mid=Score(precision=0.05874820527377431, recall=0.008205463096863975, fmeasure=0.013324729370866805), high=Score(precision=0.08717972281483662, recall=0.011232759048428327, fmeasure=0.01824717757166429)),
 'rougeL': AggregateScore(low=Score(precision=0.22645932274437872, recall=0.03520678114259022, fmeasure=0.05821662702858322), mid=Score(precision=0.2798211757346323, recall=0.04640937427095732, fmeasure=0.07329302935422743), high=Score(precision=0.34289213365765714, recall=0.05829698213318197, fmeasure=0.08820308120127679)),
 'rougeLsum': 

In [56]:
r_score_types = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
rouge_dict = dict((rn, round(r_score[rn].mid.fmeasure * 100, 2)) for rn in r_score_types)
rouge_dict

{'rouge1': 12.38, 'rouge2': 1.33, 'rougeL': 7.33, 'rougeLsum': 8.73}

These rouge scores indicate that this baseline summarization model does generate summaries that have some very low level of similarity to the reference summaries, but that overall the model does not perform well
<br>
The goal for future fine tuned summarization will be to score higher than this baseline