## NLP Final
NGram token summarization

In [1]:
!pip install langdetect==1.0.9 > /dev/null 2>&1
import kagglehub
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from collections import Counter
from langdetect import detect, DetectorFactory

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Download latest version
path = kagglehub.dataset_download("beridzeg45/book-reviews")

print("Path to dataset files:", path)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Path to dataset files: /kaggle/input/book-reviews


In [2]:
df = pd.read_csv(path + "/Book Reviews.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Book,Review,Review Date
0,0,To Kill a Mockingbird,/// gentle reminder that this is not the time ...,"March 24, 2022"
1,1,To Kill a Mockingbird,\n|\n|6.0 stars. I know I am risking a serious...,"May 24, 2011"
2,2,To Kill a Mockingbird,\n|\n|Looking for a new book but don't want to...,"December 10, 2020"
3,3,To Kill a Mockingbird,"To Kill a Mockingbird, Harper Lee|To Kill a Mo...","July 1, 2022"
4,4,To Kill a Mockingbird,Why is it when I pick up | To Kill A Mockingbi...,"October 25, 2009"


In [3]:
# Drop null reviews
df.dropna(subset=['Review'], inplace=True)

# Only include reviews written in english
def is_english(text):
  try:
    return detect(text) == 'en'
  except:
    return False

df_english = df[df['Review'].apply(is_english)].copy()

In [4]:
book_reviews = df_english.groupby("Book")["Review"].apply(lambda x: " ".join(x.dropna())).reset_index()
book_reviews.columns = ["Book", "combined_reviews"]

In [5]:
import re
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()
    return text

book_reviews['Review'] = df_english['Review'].apply(clean_text)

In [6]:
stop_words = stopwords.words('english')
lem = WordNetLemmatizer()

# Create the data preprocessing function
def data_preprocess(text):
  if not isinstance(text, str):
        text = ""
  # Tokenize text
  wtokens = word_tokenize(text)

  # Filtering tokens
  t_filtered = []
  for t in wtokens:
    # Convert token to lowercase and check if it's not in stopwords and is alphabetic (use the isalpha() method)
    if t.lower() not in stop_words and t.isalpha():
      # Add the lowercase token to filtered_tokens
      t_filtered.append(t.lower())

  # Lemmatization
  t_lemmatized = []
  for t in t_filtered:
    # Lemmatize token
    lemma_t = lem.lemmatize(t)
    # Add lemmatized token to lemmatized_tokens
    t_lemmatized.append(lemma_t)

  # Rejoin the processed tokens into a single string
  return " ".join(t_lemmatized) #here we use a space (" ") as the separator

book_reviews['Preprocessed_Text'] = book_reviews['combined_reviews'].apply(data_preprocess)
book_reviews.head()

Unnamed: 0,Book,combined_reviews,Review,Preprocessed_Text
0,10:04,I’m going to let the text of 10:04 by Ben Lern...,gentle reminder that this is not the time to r...,going let text ben lerner talking excerpt come...
1,1984,YOU. ARE. THE. DEAD.| Oh my God. I got the chi...,60 stars i know i am risking a serious film at...,oh god got chill many time toward end book com...
2,"1Q84 (1Q84, #1-3)",1Q84 is undoubtedly the biggest literary let-d...,looking for a new book but dont want to commit...,undoubtedly biggest literary ever come across ...
3,2001: A Space Odyssey,"The book is always better than the film, but I...",,book always better film never read know readin...
4,2666,Roberto Bolaño's |2666| has been described as ...,why is it when i pick up to kill a mockingbird...,roberto bolaño described electrifying literary...


In [11]:
lBooks = ['To Kill a Mockingbird', '1984', 'Jane Eyre', 'Animal Farm', 'Crime and Punishment', 'Cataract', 'The Afternoon of a Writer',
          'The History of the Siege of Lisbon', 'Flaubert\'s Parrot', 'Infinite Jest']
selected_books = book_reviews[book_reviews['Book'].isin(lBooks)]
selected_books

Unnamed: 0,Book,combined_reviews,Review,Preprocessed_Text
1,1984,YOU. ARE. THE. DEAD.| Oh my God. I got the chi...,60 stars i know i am risking a serious film at...,oh god got chill many time toward end book com...
76,Animal Farm,Amazon's very Orwellian involvement with this ...,,amazon orwellian involvement book end amazon e...
149,Cataract,this book more than any deserves a new press r...,,book deserves new press run attention translat...
181,Crime and Punishment,“Trying to untie the string and going to the w...,someone i know claimed this no longer has valu...,trying untie string going window light window ...
266,Flaubert's Parrot,How can we know the past? Old articles are sil...,,know past old article silent witness day old o...
356,Infinite Jest,"USHER: Goodreads court is now in session, the ...",dont let the bastards grind you down theres a ...,usher goodreads court session honourable judge...
368,Jane Eyre,"Yes, I suppose you can view this book mostly a...",im in the minority unfortunately i thought the...,yes suppose view book mostly love story age le...
652,The Afternoon of a Writer,Handke’s narrator suffers from a writer’s bloc...,rebecca is a classic gothic novel that had bee...,handke narrator suffers writer block take walk...
791,The History of the Siege of Lisbon,Historical science and the real past of the ma...,so i finally got to find out for myself what t...,historical science real past always struck his...
1026,To Kill a Mockingbird,/// gentle reminder that this is not the time ...,,gentle reminder time read book first regret on...


In [17]:
def extract_ngrams(text, n=3):
    tokens = text.split()
    return list(ngrams(tokens, n))

book_summaries = {}

for row_index, row_data in selected_books.iterrows():
    book = row_data['Book']
    book_text = row_data['Preprocessed_Text']

    if not isinstance(book_text, str):
        book_text = ""

    trigrams = extract_ngrams(book_text, n=3)
    trigram_counts = Counter(trigrams)
    top_trigrams = [' '.join(t) for t, _ in trigram_counts.most_common(3)]

    summary = '. '.join(top_trigrams) + '.' if top_trigrams else ''
    book_summaries[book] = summary

for book, summary in book_summaries.items():
    print(f"\nSummary for: {book}")
    print(f"- {summary}")


Summary for: 1984
- brave new world. world war ii. quite bit time.

Summary for: Animal Farm
- animal equal animal. equal animal equal. four leg good.

Summary for: Cataract
- book deserves new. deserves new press. new press run.

Summary for: Crime and Punishment
- ivanovna sonia mother. many many many. richard pevear larissa.

Summary for: Flaubert's Parrot
- doctor geoffrey braithwaite. two parrot two. flaubert stuffed parrot.

Summary for: Infinite Jest
- david foster wallace. reading infinite jest. enfield tennis academy.

Summary for: Jane Eyre
- men would suffer. feel men feel. men feel need.

Summary for: The Afternoon of a Writer
- take walk around. find way back. unnamed european city.

Summary for: The History of the Siege of Lisbon
- history siege lisbon. raimundo benvindo silva. change word historical.

Summary for: To Kill a Mockingbird
- one thing abide. thing abide majority. abide majority rule.


In [18]:
!pip install bert-score==0.3.12 > /dev/null 2>&1
from bert_score import score

In [None]:
generated_summaries = list(book_summaries.values())
reference_reviews = selected_books['combined_reviews'].tolist()

P, R, F1 = score(generated_summaries, reference_reviews, lang="en")

In [23]:
print("\nBERTScore per Book:")
for i, book in enumerate(lBooks):
    print(f"\nBook: {book}")
    print(f"  BERTScore P: {P[i]:.4f}")
    print(f"  BERTScore R: {R[i]:.4f}")
    print(f"  BERTScore F1: {F1[i]:.4f}")


BERTScore per Book:

Book: To Kill a Mockingbird
  BERTScore P: 0.8481
  BERTScore R: 0.7595
  BERTScore F1: 0.8014

Book: 1984
  BERTScore P: 0.8347
  BERTScore R: 0.7617
  BERTScore F1: 0.7965

Book: Jane Eyre
  BERTScore P: 0.8864
  BERTScore R: 0.7542
  BERTScore F1: 0.8150

Book: Animal Farm
  BERTScore P: 0.7766
  BERTScore R: 0.7535
  BERTScore F1: 0.7649

Book: Crime and Punishment
  BERTScore P: 0.8124
  BERTScore R: 0.7629
  BERTScore F1: 0.7869

Book: Cataract
  BERTScore P: 0.8188
  BERTScore R: 0.7472
  BERTScore F1: 0.7814

Book: The Afternoon of a Writer
  BERTScore P: 0.8333
  BERTScore R: 0.7526
  BERTScore F1: 0.7909

Book: The History of the Siege of Lisbon
  BERTScore P: 0.8417
  BERTScore R: 0.7612
  BERTScore F1: 0.7994

Book: Flaubert's Parrot
  BERTScore P: 0.8085
  BERTScore R: 0.7598
  BERTScore F1: 0.7834

Book: Infinite Jest
  BERTScore P: 0.8352
  BERTScore R: 0.7695
  BERTScore F1: 0.8010


In [24]:
!pip install rouge-score==0.1.2 > /dev/null 2>&1
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)


In [26]:
print("\nROGUE Score per Book:")
for i in range(len(generated_summaries)):
    scores = scorer.score(reference_reviews[i], generated_summaries[i])
    print(f"\nBook: {lBooks[i]}")
    print(f"  ROUGE-1: {scores['rouge1'].fmeasure:.4f}")
    print(f"  ROUGE-L: {scores['rougeL'].fmeasure:.4f}")


ROGUE Score per Book:

Book: To Kill a Mockingbird
  ROUGE-1: 0.0021
  ROUGE-L: 0.0019

Book: 1984
  ROUGE-1: 0.0024
  ROUGE-L: 0.0024

Book: Jane Eyre
  ROUGE-1: 0.0214
  ROUGE-L: 0.0179

Book: Animal Farm
  ROUGE-1: 0.0014
  ROUGE-L: 0.0014

Book: Crime and Punishment
  ROUGE-1: 0.0019
  ROUGE-L: 0.0019

Book: Cataract
  ROUGE-1: 0.0005
  ROUGE-L: 0.0005

Book: The Afternoon of a Writer
  ROUGE-1: 0.0012
  ROUGE-L: 0.0012

Book: The History of the Siege of Lisbon
  ROUGE-1: 0.0086
  ROUGE-L: 0.0086

Book: Flaubert's Parrot
  ROUGE-1: 0.0033
  ROUGE-L: 0.0033

Book: Infinite Jest
  ROUGE-1: 0.0015
  ROUGE-L: 0.0015
