Pip Install

In [41]:
!pip install langdetect
!pip install bert-score
!pip install transformers
!pip install kagglehub
!pip install openpyxl



### Data Import

In [2]:
import kagglehub
import pandas as pd

path = kagglehub.dataset_download("beridzeg45/book-reviews")

print("Path to dataset files:", path)
df_books = pd.read_csv(path + "/Book Reviews.csv")
df_books.head()

Path to dataset files: /kaggle/input/book-reviews


Unnamed: 0.1,Unnamed: 0,Book,Review,Review Date
0,0,To Kill a Mockingbird,/// gentle reminder that this is not the time ...,"March 24, 2022"
1,1,To Kill a Mockingbird,\n|\n|6.0 stars. I know I am risking a serious...,"May 24, 2011"
2,2,To Kill a Mockingbird,\n|\n|Looking for a new book but don't want to...,"December 10, 2020"
3,3,To Kill a Mockingbird,"To Kill a Mockingbird, Harper Lee|To Kill a Mo...","July 1, 2022"
4,4,To Kill a Mockingbird,Why is it when I pick up | To Kill A Mockingbi...,"October 25, 2009"


### Inspect Data

In [3]:
df_books.head()

Unnamed: 0.1,Unnamed: 0,Book,Review,Review Date
0,0,To Kill a Mockingbird,/// gentle reminder that this is not the time ...,"March 24, 2022"
1,1,To Kill a Mockingbird,\n|\n|6.0 stars. I know I am risking a serious...,"May 24, 2011"
2,2,To Kill a Mockingbird,\n|\n|Looking for a new book but don't want to...,"December 10, 2020"
3,3,To Kill a Mockingbird,"To Kill a Mockingbird, Harper Lee|To Kill a Mo...","July 1, 2022"
4,4,To Kill a Mockingbird,Why is it when I pick up | To Kill A Mockingbi...,"October 25, 2009"


In [4]:
print("\nMissing values per column:\n", df_books.isnull().sum())


Missing values per column:
 Unnamed: 0       0
Book             0
Review         309
Review Date      0
dtype: int64


In [5]:
#Keep only English Review Text

from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Set seed for consistency
DetectorFactory.seed = 0

def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"
# Only detect language if not empty
df_books['language'] = df_books['Review'].apply(lambda x: detect_language(str(x)) if str(x).strip() else "unknown")

# Filter for English-only reviews
df_books = df_books[df_books['language'] == 'en']


In [6]:
df_books.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23057 entries, 0 to 32077
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   23057 non-null  int64 
 1   Book         23057 non-null  object
 2   Review       23057 non-null  object
 3   Review Date  23057 non-null  object
 4   language     23057 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.1+ MB


In [7]:
#Merge Reviews to one row review per book
df_books['Review'] = df_books['Review'].astype(str)
df_reviews = df_books.groupby('Book')['Review'].apply(lambda reviews: ' '.join(reviews)).reset_index()
df_reviews.columns = ['Book', 'Review']
df_reviews.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_books['Review'] = df_books['Review'].astype(str)


Unnamed: 0,Book,Review
0,10:04,I’m going to let the text of 10:04 by Ben Lern...
1,1984,YOU. ARE. THE. DEAD.| Oh my God. I got the chi...
2,"1Q84 (1Q84, #1-3)",1Q84 is undoubtedly the biggest literary let-d...
3,2001: A Space Odyssey,"The book is always better than the film, but I..."
4,2666,Roberto Bolaño's |2666| has been described as ...


In [8]:
df_reviews[df_reviews.duplicated()]


Unnamed: 0,Book,Review


### Data Pre Processing

In [9]:
#Clean text for PEGASUS input

def clean_text(text):
    text = text.replace('\n', ' ').replace('\r', ' ').strip()
    return text

df_reviews['Review'] = df_reviews['Review'].apply(clean_text)

df_reviews.head()


Unnamed: 0,Book,Review
0,10:04,I’m going to let the text of 10:04 by Ben Lern...
1,1984,YOU. ARE. THE. DEAD.| Oh my God. I got the chi...
2,"1Q84 (1Q84, #1-3)",1Q84 is undoubtedly the biggest literary let-d...
3,2001: A Space Odyssey,"The book is always better than the film, but I..."
4,2666,Roberto Bolaño's |2666| has been described as ...


### PEGASUS Class

In [10]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch

In [11]:
#Generation Tuner PEGASUS Class

class PegasusSummarizer:
    def __init__(self, model_name="google/pegasus-xsum", device=None, generation_kwargs=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        self.model = PegasusForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.generation_kwargs = generation_kwargs or { #initalise summary generation tuners
            "max_length": 150, #Maximum length of the summary in tokens
            "min_length": 50, #Minimum length of the summary in tokens
            "do_sample": True, #Enables sampling instead of greedy decoding (for creativity)
            "top_k": 50, #Sample only from the top 50 most probable next words
            "top_p": 0.95, #Nucleus sampling: consider tokens with 95% cumulative probability
            "temperature": 0.8, #Controls randomness: <1 = more focused, >1 = more random
            "repetition_penalty": 1.2 #Penalizes repeated phrases or words to avoid looping
}


    def summarize(self, text, max_input_length=512):
        if not isinstance(text, str) or not text.strip():
            return ""

        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="longest",
            max_length=max_input_length
        ).to(self.device)

        summary_ids = self.model.generate(
            inputs["input_ids"],
            **self.generation_kwargs
        )

        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def summarize_books(self, df, text_column="Review", title_column="Book"):
        results = []
        for idx, row in df.iterrows():
            Book = row[title_column]
            Reviews = row[text_column]
            Summary = self.summarize(Reviews)
            results.append({
                "book_number": f"Book {idx + 1}",
                "Book": Book,
                "Summary": Summary
            })
            print(f"\n Book {idx + 1}: {Book}\n Summary: {Summary}\n{'-'*80}")
        return pd.DataFrame(results)




In [12]:
#Selected books for summary test
lBooks = ['To Kill a Mockingbird', '1984', 'Jane Eyre', 'Animal Farm', 'Crime and Punishment', 'Cataract', 'The Afternoon of a Writer', 'The History of the Siege of Lisbon', 'Flaubert\'s Parrot', 'Infinite Jest']


# Filter only for selected books
filtered_books_df = df_reviews[df_reviews['Book'].isin(lBooks)]

# Group reviews per book (if not already done)
df_selected = filtered_books_df.groupby('Book')['Review'].apply(lambda x: ' '.join(x)).reset_index()
df_selected.columns = ['Book', 'Review']


In [13]:
#Common creative decoding parameters (balanced sampling) variation I

custom_params = {
    "max_length": 200,
    "min_length": 60,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "temperature": 0.8,
    "repetition_penalty": 1.2
}

summarizer = PegasusSummarizer(generation_kwargs=custom_params)
summary_df = summarizer.summarize_books(df_selected)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]


 Book 1: 1984
 Summary: In our series of letters from African journalists, novelist and writer Adaobi Tricia Nwaubani considers George Orwell's dystopian novel 1984, which tells the story of Winston, a young man who is sent to live in a society ruled by a party called the Party (War is Peace), after his father dies.
--------------------------------------------------------------------------------

 Book 2: Animal Farm
 Summary: George Orwell's Animal Farm is one of the most banned books in the world and Amazon pulled it from sale on 17 July 2009 after discovering that the publisher lacked rights to publish the titles in question, prompting outcry and comparisons to Orwell's novel Nineteen Eighty-Four, which was also published by Amazon.
--------------------------------------------------------------------------------

 Book 3: Cataract
 Summary: Mykhaylo Osadchy's memoir, "Cataract", about his imprisonment and torture by the KGB in the mid-sixties, has been translated into English by Ma

In [14]:
#Deterministic beam search (no randomness) varaiation II
custom_params = {
    "max_length": 150,
    "min_length": 60,
    "do_sample": False, #Set to false = no randomness
    "num_beams": 5, #Deterministic beam search over 5 paths
    "repetition_penalty": 1.2
}

summarizer = PegasusSummarizer(generation_kwargs=custom_params)
summary_finetuneII = summarizer.summarize_books(df_selected)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Book 1: 1984
 Summary: In our series of letters from African journalists, film-maker and columnist Sharmila Tagore reflects on her experience of reading George Orwell's dystopian novel 1984, which she describes as "the best book I've read in a very long time" and "the best thing I've read in a very long time."
--------------------------------------------------------------------------------

 Book 2: Animal Farm
 Summary: George Orwell's Animal Farm is one of the most banned books in the world, and Amazon pulled it from sale in 2009 after discovering that the publisher lacked rights to publish the titles in question, prompting comparisons to Orwell's novel Nineteen Eighty-Four, which was also banned in many countries.
--------------------------------------------------------------------------------

 Book 3: Cataract
 Summary: Mykhaylo Osadchy's memoir, "Cataract", about his imprisonment by the KGB in the mid-sixties, has been translated into English by Marco Caynnyk, editor of the Ukr

In [15]:
#High creativity and more expresiveness variation III
custom_params = {
    "max_length": 100,
    "min_length": 80,
    "do_sample": True,
    "top_k": 40,
    "top_p": 0.90,
    "temperature": 1.0,
    "repetition_penalty": 1.0
}
summarizer = PegasusSummarizer(generation_kwargs=custom_params)
summary_finetuneIII = summarizer.summarize_books(df_selected)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Book 1: 1984
 Summary: In our series of letters from African journalists, film-maker and columnist Farai Sevenzo looks at George Orwell's dystopian novel 1984, which has been called "one of the greatest books of all time" and "one of the best books of the 20th Century" by The New York Times and The Guardian, both of which have called it "one of the greatest novels of all time."
--------------------------------------------------------------------------------

 Book 2: Animal Farm
 Summary: George Orwell's Animal Farm is one of the most Orwellian books ever written and it's also one of the most controversial - not least because of Amazon's decision in 2009 to remove the book from sale on Kindle devices after discovering that the book's publisher did not have the rights to sell it on the online retailer's Kindle platform, as well as its removal from Amazon's website.
--------------------------------------------------------------------------------

 Book 3: Cataract
 Summary: Mykhaylo Os

In [18]:
#Shorter summary variation IV
custom_params = {
    "max_length": 80,
    "min_length": 20,
    "do_sample": True,
    "top_k": 40,
    "top_p": 0.90,
    "temperature": 1.0,
    "repetition_penalty": 1.0
}
summarizer = PegasusSummarizer(generation_kwargs=custom_params)
summary_finetuneIV = summarizer.summarize_books(df_selected)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Book 1: 1984
 Summary: In our series of letters from African journalists, film-maker and columnist Sharmila Tagore reflects on her experience of reading George Orwell's 1984.
--------------------------------------------------------------------------------

 Book 2: Animal Farm
 Summary: George Orwell's novel Animal Farm was pulled from sale by Amazon in 2009 after the online retailer discovered it did not have the rights to publish the book.
--------------------------------------------------------------------------------

 Book 3: Cataract
 Summary: Mykhaylo Osadchy's memoir, "Cataract", about his imprisonment by the KGB in the 1960s, has been translated by Marco Caynnyk.
--------------------------------------------------------------------------------

 Book 4: Crime and Punishment
 Summary: Fyodor Dostoevsky's Crime and Punishment is one of my all-time favourite books.
--------------------------------------------------------------------------------

 Book 5: Flaubert's Parrot
 Summa

Summarize selected test books:

#### Full Book Summary

*only run and remove hashtags if needed, runtime 5hr+*

In [16]:

# Run summarization for all books with standard class generation_kwargs

#summarizer = PegasusSummarizer(generation_kwargs=custom_params)
#full_summary = summarizer.summarize_books(df_selected)



### BERT Score Evaluation

### Mean BERT Score over 10 sample books

In [19]:
from bert_score import score

# Get summary and source columns from summary_df and df_selected
summaries = summary_df['Summary'].tolist()
originals = df_selected['Review'].tolist()


# Compute BERTScore
P, R, F1 = score(summaries, originals, lang="en")

print(f"Precision: {P.mean():.4f}")
print(f"Recall: {R.mean():.4f}")
print(f"F1 Score: {F1.mean():.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.8450
Recall: 0.7842
F1 Score: 0.8133


In [20]:


# Get summary and source columns from summary_finetuneII and df_selected
summaries = summary_finetuneII['Summary'].tolist()
originals = df_selected['Review'].tolist()


# Compute BERTScore
P, R, F1 = score(summaries, originals, lang="en")

print(f"Precision: {P.mean():.4f}")
print(f"Recall: {R.mean():.4f}")
print(f"F1 Score: {F1.mean():.4f}")



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.8372
Recall: 0.7871
F1 Score: 0.8108


In [21]:

# Get summary and source columns from summary_finetuneIII and df_selected
summaries = summary_finetuneIII['Summary'].tolist()
originals = df_selected['Review'].tolist()


# Compute BERTScore
P, R, F1 = score(summaries, originals, lang="en")

print(f"Precision: {P.mean():.4f}")
print(f"Recall: {R.mean():.4f}")
print(f"F1 Score: {F1.mean():.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.8370
Recall: 0.7840
F1 Score: 0.8095


In [22]:


# Get summary and source columns from summary_finetuneIII and df_selected
summaries = summary_finetuneIV['Summary'].tolist()
originals = df_selected['Review'].tolist()


# Compute BERTScore
P, R, F1 = score(summaries, originals, lang="en")

print(f"Precision: {P.mean():.4f}")
print(f"Recall: {R.mean():.4f}")
print(f"F1 Score: {F1.mean():.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.8494
Recall: 0.7755
F1 Score: 0.8105


### BERT Score per Book

In [23]:

# Match the 10 books and summaries for variation I
originals = df_selected['Review'].tolist()
summaries = summary_df['Summary'].tolist()

# Compute BERTScores for each book
P, R, F1 = score(summaries, originals, lang="en")

# Store them in your summary_df
summary_df['bert_precision'] = P.tolist()
summary_df['bert_recall'] = R.tolist()
summary_df['bert_f1'] = F1.tolist()

# Check result
print(summary_df[['Book', 'bert_f1']].head())


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                   Book   bert_f1
0                  1984  0.803946
1           Animal Farm  0.839349
2              Cataract  0.821609
3  Crime and Punishment  0.811302
4     Flaubert's Parrot  0.806319


In [24]:
# Match the 10 books and summaries for variation II
originals = df_selected['Review'].tolist()
summaries = summary_finetuneII['Summary'].tolist()

# Compute BERTScores for each book
P, R, F1 = score(summaries, originals, lang="en")

# Store them in your summary_df
summary_finetuneII['bert_precision'] = P.tolist()
summary_finetuneII['bert_recall'] = R.tolist()
summary_finetuneII['bert_f1'] = F1.tolist()

# Check result
print(summary_finetuneII[['Book', 'bert_f1']].head())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                   Book   bert_f1
0                  1984  0.800437
1           Animal Farm  0.837467
2              Cataract  0.859019
3  Crime and Punishment  0.809805
4     Flaubert's Parrot  0.793419


In [25]:
# Match the 10 books and summaries for variation III
originals = df_selected['Review'].tolist()
summaries = summary_finetuneIII['Summary'].tolist()

# Compute BERTScores for each book
P, R, F1 = score(summaries, originals, lang="en")

# Store them in your summary_df
summary_finetuneIII['bert_precision'] = P.tolist()
summary_finetuneIII['bert_recall'] = R.tolist()
summary_finetuneIII['bert_f1'] = F1.tolist()

# Check result
print(summary_finetuneIII[['Book', 'bert_f1']].head())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                   Book   bert_f1
0                  1984  0.788137
1           Animal Farm  0.816724
2              Cataract  0.832976
3  Crime and Punishment  0.820951
4     Flaubert's Parrot  0.791589


In [26]:

# Match the 10 books and summaries for variation III
originals = df_selected['Review'].tolist()
summaries = summary_finetuneIV['Summary'].tolist()

# Compute BERTScores for each book
P, R, F1 = score(summaries, originals, lang="en")

# Store them in your summary_df
summary_finetuneIV['bert_precision'] = P.tolist()
summary_finetuneIV['bert_recall'] = R.tolist()
summary_finetuneIV['bert_f1'] = F1.tolist()

# Check result
print(summary_finetuneIII[['Book', 'bert_f1']].head())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                   Book   bert_f1
0                  1984  0.788137
1           Animal Farm  0.816724
2              Cataract  0.832976
3  Crime and Punishment  0.820951
4     Flaubert's Parrot  0.791589


### Manual Evaluation

Orginal vs Generated Review

In [27]:
#Variation I

sample = summary_df

for _, row in sample.iterrows():
    book_title = row['Book']
    summary = row['Summary']
    original = df_reviews[df_reviews['Book'] == book_title]['Review'].values[0]

    print(f"\n {book_title}")
    print(f" Summary: {summary}")
    print(f" Original Excerpt: {original[:300]}...\n")



 1984
 Summary: In our series of letters from African journalists, novelist and writer Adaobi Tricia Nwaubani considers George Orwell's dystopian novel 1984, which tells the story of Winston, a young man who is sent to live in a society ruled by a party called the Party (War is Peace), after his father dies.
 Original Excerpt: YOU. ARE. THE. DEAD.| Oh my God. I got the chills so many times toward the end of this book. It completely blew my mind. It managed to surpass my high expectations AND be nothing at all like I expected. Or in Newspeak "Double Plus Good."|Let me preface this with an apology. If I sound stunningly ina...


 Animal Farm
 Summary: George Orwell's Animal Farm is one of the most banned books in the world and Amazon pulled it from sale on 17 July 2009 after discovering that the publisher lacked rights to publish the titles in question, prompting outcry and comparisons to Orwell's novel Nineteen Eighty-Four, which was also published by Amazon.
 Original Excerpt: Amazon'

In [28]:
#Variation II
sample = summary_finetuneII

for _, row in sample.iterrows():
    book_title = row['Book']
    summary = row['Summary']
    original = df_reviews[df_reviews['Book'] == book_title]['Review'].values[0]

    print(f"\n {book_title}")
    print(f" Summary: {summary}")
    print(f" Original Excerpt: {original[:300]}...\n")


 1984
 Summary: In our series of letters from African journalists, film-maker and columnist Sharmila Tagore reflects on her experience of reading George Orwell's dystopian novel 1984, which she describes as "the best book I've read in a very long time" and "the best thing I've read in a very long time."
 Original Excerpt: YOU. ARE. THE. DEAD.| Oh my God. I got the chills so many times toward the end of this book. It completely blew my mind. It managed to surpass my high expectations AND be nothing at all like I expected. Or in Newspeak "Double Plus Good."|Let me preface this with an apology. If I sound stunningly ina...


 Animal Farm
 Summary: George Orwell's Animal Farm is one of the most banned books in the world, and Amazon pulled it from sale in 2009 after discovering that the publisher lacked rights to publish the titles in question, prompting comparisons to Orwell's novel Nineteen Eighty-Four, which was also banned in many countries.
 Original Excerpt: Amazon's very Orwellian i

In [29]:
#Variation III
sample = summary_finetuneIII

for _, row in sample.iterrows():
    book_title = row['Book']
    summary = row['Summary']
    original = df_reviews[df_reviews['Book'] == book_title]['Review'].values[0]

    print(f"\n {book_title}")
    print(f" Summary: {summary}")
    print(f" Original Excerpt: {original[:300]}...\n")


 1984
 Summary: In our series of letters from African journalists, film-maker and columnist Farai Sevenzo looks at George Orwell's dystopian novel 1984, which has been called "one of the greatest books of all time" and "one of the best books of the 20th Century" by The New York Times and The Guardian, both of which have called it "one of the greatest novels of all time."
 Original Excerpt: YOU. ARE. THE. DEAD.| Oh my God. I got the chills so many times toward the end of this book. It completely blew my mind. It managed to surpass my high expectations AND be nothing at all like I expected. Or in Newspeak "Double Plus Good."|Let me preface this with an apology. If I sound stunningly ina...


 Animal Farm
 Summary: George Orwell's Animal Farm is one of the most Orwellian books ever written and it's also one of the most controversial - not least because of Amazon's decision in 2009 to remove the book from sale on Kindle devices after discovering that the book's publisher did not have the 

In [30]:
#Variation IV

sample = summary_finetuneIV

for _, row in sample.iterrows():
    book_title = row['Book']
    summary = row['Summary']
    original = df_reviews[df_reviews['Book'] == book_title]['Review'].values[0]

    print(f"\n {book_title}")
    print(f" Summary: {summary}")
    print(f" Original Excerpt: {original[:300]}...\n")


 1984
 Summary: In our series of letters from African journalists, film-maker and columnist Sharmila Tagore reflects on her experience of reading George Orwell's 1984.
 Original Excerpt: YOU. ARE. THE. DEAD.| Oh my God. I got the chills so many times toward the end of this book. It completely blew my mind. It managed to surpass my high expectations AND be nothing at all like I expected. Or in Newspeak "Double Plus Good."|Let me preface this with an apology. If I sound stunningly ina...


 Animal Farm
 Summary: George Orwell's novel Animal Farm was pulled from sale by Amazon in 2009 after the online retailer discovered it did not have the rights to publish the book.
 Original Excerpt: Amazon's very Orwellian involvement with this book at the end. If Amazon ever partnered Facebook they'd own us.|This is not really a review, but one of those moments where everything that was clear to you suddenly becomes utterly muddied and you really can't say what lies beneath the murky waters al...




### Export Results from different models

In [35]:
summary_df.head()

Unnamed: 0,book_number,Book,Summary,bert_precision,bert_recall,bert_f1,original
0,Book 1,1984,In our series of letters from African journali...,0.827562,0.78164,0.803946,YOU. ARE. THE. DEAD.| Oh my God. I got the chi...
1,Book 2,Animal Farm,George Orwell's Animal Farm is one of the most...,0.88658,0.796895,0.839349,Amazon's very Orwellian involvement with this ...
2,Book 3,Cataract,"Mykhaylo Osadchy's memoir, ""Cataract"", about h...",0.858365,0.787871,0.821609,this book more than any deserves a new press r...
3,Book 4,Crime and Punishment,Fyodor Dostoevsky's Crime and Punishment is on...,0.837998,0.786254,0.811302,“Trying to untie the string and going to the w...
4,Book 5,Flaubert's Parrot,In our series of letters from African journali...,0.837512,0.777367,0.806319,How can we know the past? Old articles are sil...


In [43]:
#Variation I
import openpyxl

# Get matching originals for the summaries
originals = df_selected['Review'].tolist()

# Add original text to summary_df
summary_df['original'] = originals



# Save to CSV
summary_df.to_excel("pegasus_summary_VI.xlsx", index=False)

In [44]:
#Variation II
# Get matching originals for the summaries
originals = df_selected['Review'].tolist()

# Add original text to summary_df
summary_finetuneII['original'] = originals


# Save to CSV
summary_finetuneII.to_excel("pegasus_summary_VII.xlsx", index=False)


In [45]:
#Variation III
# Get matching originals for the summaries
originals = df_selected['Review'].tolist()

# Add original text to summary_df
summary_finetuneIII['original'] = originals


# Save to CSV
summary_finetuneIII.to_excel("pegasus_summary_VIII.xlsx", index=False)

In [46]:
#Variation IV
# Get matching originals for the summaries
originals = df_selected['Review'].tolist()

# Add original text to summary_df
summary_finetuneIV['original'] = originals


# Save to CSV
summary_finetuneIV.to_excel("pegasus_summary_VIV.xlsx", index=False)