## NLP Final
Summarize by clustering

In [1]:
!pip install langdetect==1.0.9 > /dev/null 2>&1
!pip install bert_score==0.3.13 > /dev/null 2>&1
!pip install kagglehub > /dev/null 2>&1
!pip install sentence-transformers > /dev/null 2>&1

import kagglehub
import pandas as pd
import re
from langdetect import detect
from bert_score import score

import nltk
nltk.download('punkt', quiet = True)
nltk.download('punkt_tab', quiet= True)
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import silhouette_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import sent_tokenize
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

# Download the Kaggle "beridzeg45/book-reviews" dataset through kagglehub;
# `path` is the local directory that later cells read the CSV from.
path = kagglehub.dataset_download("beridzeg45/book-reviews")
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/book-reviews


In [2]:
# Read the raw reviews table from the downloaded dataset directory and preview
# the first rows (columns used later: "Book" and "Review").
df = pd.read_csv(path + "/Book Reviews.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Book,Review,Review Date
0,0,To Kill a Mockingbird,/// gentle reminder that this is not the time ...,"March 24, 2022"
1,1,To Kill a Mockingbird,\n|\n|6.0 stars. I know I am risking a serious...,"May 24, 2011"
2,2,To Kill a Mockingbird,\n|\n|Looking for a new book but don't want to...,"December 10, 2020"
3,3,To Kill a Mockingbird,"To Kill a Mockingbird, Harper Lee|To Kill a Mo...","July 1, 2022"
4,4,To Kill a Mockingbird,Why is it when I pick up | To Kill A Mockingbi...,"October 25, 2009"


In [3]:
# langdetect's detector is randomized unless it is seeded, so without this the
# set of reviews classified as English could differ from one run to the next.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# Drop null reviews (reassign rather than mutate in place)
df = df.dropna(subset=['Review'])

# Only include reviews written in english
def is_english(text):
  """Return True when langdetect classifies `text` as English ('en').

  Any failure inside `detect` (for example text it cannot analyse, or a
  non-string value) is treated as "not English" and yields False.
  """
  try:
    return detect(text) == 'en'
  except Exception:
    # Catch Exception instead of using a bare `except` so KeyboardInterrupt and
    # SystemExit still propagate and the long-running filter can be interrupted.
    return False

df_english = df[df['Review'].apply(is_english)].copy()

In [4]:
# One row per book: all of its English reviews joined into a single
# space-separated string stored in the "combined_reviews" column.
grouped = (
    df_english.groupby("Book")["Review"]
    .apply(lambda reviews: " ".join(reviews.dropna()))
    .reset_index(name="combined_reviews")
)

In [5]:
def clean_text(text):
    """Normalise separators and whitespace in a block of review text.

    Pipe characters, literal backslash-n / backslash-r sequences and real
    newlines are each replaced by a space; afterwards every run of whitespace
    is collapsed to a single space and the ends are trimmed.
    """
    for pattern in (r'\|', r'\\n|\\r|\n'):
        text = re.sub(pattern, ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# Clean every book's concatenated review text element-wise.
grouped["combined_reviews"] = grouped["combined_reviews"].map(clean_text)

In [None]:
# Sentence-embedding model; the clustering functions call model.encode() on
# the tokenized review sentences.
model = SentenceTransformer("all-MiniLM-L6-v2")

def find_optimal_k(embeddings, k_min=2, k_max=6):
    """Choose the KMeans cluster count with the highest silhouette score.

    Args:
        embeddings: Sequence/array of sample vectors, one row per sample.
        k_min: Smallest cluster count to try.
        k_max: Largest cluster count to try (inclusive).

    Returns:
        The best-scoring k in [k_min, k_max], additionally capped below the
        number of samples. Falls back to ``k_min`` when no candidate could be
        scored (for example when there are too few samples).
    """
    best_k = k_min
    best_silhouette = -1
    # silhouette_score needs between 2 and n_samples - 1 clusters, hence the
    # exclusive upper bound of len(embeddings).
    for k in range(k_min, min(k_max + 1, len(embeddings))):
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(embeddings)
        # With duplicate points KMeans can collapse to a single distinct
        # label; silhouette_score raises in that case, so skip the candidate.
        if len(np.unique(labels)) < 2:
            continue
        # Named `silhouette` so it is not mistaken for bert_score's `score`.
        silhouette = silhouette_score(embeddings, labels)
        if silhouette > best_silhouette:
            best_k = k
            best_silhouette = silhouette
    return best_k

def summarize_by_clustering(text, return_labels=False):
    """Build an extractive summary with one representative sentence per cluster.

    The text is split into sentences, the sentences are embedded with the
    module-level `model` and clustered with KMeans (cluster count chosen by
    `find_optimal_k`). For each cluster the sentence whose embedding is most
    cosine-similar to the cluster centre is kept. Sentences are joined in
    cluster-index order, not in their original order in the text.

    Args:
        text: The text to summarise.
        return_labels: When True, also return the per-sentence cluster labels
            and the tokenized sentences.

    Returns:
        The summary string, or ``(summary, labels, sentences)`` when
        `return_labels` is True.
    """
    sentences = sent_tokenize(text)
    if len(sentences) < 3:
        # Too few sentences to cluster: the text is its own summary.
        summary = " ".join(sentences)
        if return_labels:
            # Keep the 3-tuple shape so callers can always unpack the result;
            # all sentences are reported as belonging to cluster 0.
            return summary, np.zeros(len(sentences), dtype=int), sentences
        return summary

    embeddings = model.encode(sentences)
    optimal_k = find_optimal_k(embeddings)

    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    centers = kmeans.cluster_centers_

    summary_sentences = []
    for i in range(optimal_k):
        cluster_indices = np.where(labels == i)[0]
        if cluster_indices.size == 0:
            # KMeans may leave a cluster without members (e.g. duplicate
            # sentences); cosine_similarity would raise on an empty array.
            continue
        cluster_embeddings = embeddings[cluster_indices]
        similarities = cosine_similarity([centers[i]], cluster_embeddings)[0]
        # Map the best position inside the cluster back to a sentence index.
        best_idx = cluster_indices[np.argmax(similarities)]
        summary_sentences.append(sentences[best_idx])

    summary = " ".join(summary_sentences)

    if return_labels:
        return summary, labels, sentences
    return summary


In [None]:
# Titles to summarise and evaluate in the rest of the notebook.
lBooks = ['To Kill a Mockingbird', '1984', 'Jane Eyre', 'Animal Farm', 'Crime and Punishment', 'Cataract', 'The Afternoon of a Writer',
          'The History of the Siege of Lisbon', 'Flaubert\'s Parrot', 'Infinite Jest']
# .copy() gives an independent frame: adding columns to a filtered slice of
# `grouped` would otherwise trigger pandas' SettingWithCopyWarning.
sampled_books = grouped[grouped['Book'].isin(lBooks)].copy()
sampled_books["cluster_summary"] = sampled_books["combined_reviews"].apply(summarize_by_clustering)

In [8]:
# Preview each sampled book alongside its generated cluster summary.
print(sampled_books[["Book", "cluster_summary"]])

                                    Book  \
1                                   1984   
76                           Animal Farm   
149                             Cataract   
181                 Crime and Punishment   
266                    Flaubert's Parrot   
356                        Infinite Jest   
368                            Jane Eyre   
652            The Afternoon of a Writer   
791   The History of the Siege of Lisbon   
1026               To Kill a Mockingbird   

                                        cluster_summary  
1     Like it, Yay. These are the successes of 1984 ...  
76    3. Animal Farm is the story of a set of animal...  
149   Unfortunately that's where my enjoyment of the...  
181   I went into it like a wise man, and that was j...  
266   The beginning is confusing. Ironically enough,...  
356   But this book has become a part of me, has art...  
368   She is a rebel - setting out to have her own c...  
652   This book looks to get around that by spendin

In [24]:
# Look the summary up by title rather than by the hard-coded row label 1026:
# that label is only the book's position in `grouped`, so it shifts whenever
# the set of books that survive filtering changes.
sampled_books.loc[sampled_books['Book'] == 'To Kill a Mockingbird', 'cluster_summary'].iloc[0]

'BONUS QUOTE: This is Scout talking to Atticus after getting to know someone she had previously be afraid of: “ ‘When they finally saw him, why he hadn’t done any of those things . If you are someone who loves books, you might have already read this book. . However, in a story of so much evil and injustice there is also hope, bravery, and kindness. As a reader you are never allowed to feel with Tom Robinson, the Black man who is innocently convicted for raping a white woman, because all the Black characters in this tale are sidelined.'

In [30]:
# Show the individual English reviews that feed the 'To Kill a Mockingbird'
# summary, followed by how many of them there are.
bookreviews = df_english[df_english['Book'] == 'To Kill a Mockingbird']
print(bookreviews[['Book', 'Review']])
print('\nTotal Reviews: ', bookreviews['Review'].count())

                     Book                                             Review
0   To Kill a Mockingbird  /// gentle reminder that this is not the time ...
1   To Kill a Mockingbird  \n|\n|6.0 stars. I know I am risking a serious...
2   To Kill a Mockingbird  \n|\n|Looking for a new book but don't want to...
4   To Kill a Mockingbird  Why is it when I pick up | To Kill A Mockingbi...
5   To Kill a Mockingbird  I had a much longer review written for this bo...
7   To Kill a Mockingbird  With endless books and infinitely more to be w...
8   To Kill a Mockingbird  While the plot was very gripping and well-writ...
9   To Kill a Mockingbird  In the course of 5 years, I’ve read this book ...
10  To Kill a Mockingbird  So... I don't really know what to say.|I think...
11  To Kill a Mockingbird                                   Beautiful book. 
12  To Kill a Mockingbird  Life gives you a few things that you can count...
13  To Kill a Mockingbird  This is one book that I think is more relevant...

BERTScore is not a good evaluation metric for our model: the summary is built by pulling sentences verbatim from the aggregated reviews, so it will naturally score highly against that same text. A better evaluation is to use TF-IDF to find the topic keywords of each cluster and check how well the summary covers them.

In [None]:
def calculate_bert_score(generated_summary, source_text):
  """Score a generated summary against its source text with BERTScore.

  Args:
    generated_summary: The candidate summary text.
    source_text: The reference text the summary is compared against.

  Returns:
    A (precision, recall, f1) tuple of Python floats.
  """
  P, R, F1 = score([generated_summary], [source_text], lang="en")
  return P.mean().item(), R.mean().item(), F1.mean().item()

# Create the result columns up front. These assignments used to be indented
# beneath the function's `return`, which made them unreachable. NaN keeps the
# columns numeric for books whose score is never filled in.
sampled_books['bert_precision'] = float('nan')
sampled_books['bert_recall'] = float('nan')
sampled_books['bert_f1_score'] = float('nan')

# Score every sampled book's summary against that book's combined reviews and
# store the three BERTScore values on the book's row.
for row_label, book_row in sampled_books.iterrows():
    book_title = book_row['Book']
    matching_reviews = grouped.loc[grouped['Book'] == book_title, 'combined_reviews']

    if matching_reviews.empty:
        print(f"\nWarning: Combined reviews not found for book '{book_title}'. Skipping BERTScore calculation.")
        continue

    precision, recall, f1 = calculate_bert_score(book_row['cluster_summary'], matching_reviews.iloc[0])

    # Record the scores for this book
    sampled_books.loc[row_label, 'bert_precision'] = precision
    sampled_books.loc[row_label, 'bert_recall'] = recall
    sampled_books.loc[row_label, 'bert_f1_score'] = f1

In [12]:
# Show the BERTScore precision / recall / F1 recorded for each sampled book.
print(sampled_books[['Book', 'bert_precision', 'bert_recall', 'bert_f1_score']])

                                    Book  bert_precision  bert_recall  \
1                                   1984        0.824369     0.791572   
76                           Animal Farm        0.842073     0.780148   
149                             Cataract        0.932555     0.867907   
181                 Crime and Punishment        0.811542     0.804246   
266                    Flaubert's Parrot        0.819192     0.791765   
356                        Infinite Jest        0.813670     0.759504   
368                            Jane Eyre        0.832071     0.818741   
652            The Afternoon of a Writer        0.829993     0.793558   
791   The History of the Siege of Lisbon        0.819884     0.799080   
1026               To Kill a Mockingbird        0.832652     0.808001   

      bert_f1_score  
1          0.807638  
76         0.809929  
149        0.899071  
181        0.807877  
266        0.805245  
356        0.785654  
368        0.825352  
652        0.811367 

### Cluster Keywords

In [13]:
# Rebinds `model` to a freshly loaded "all-MiniLM-L6-v2" SentenceTransformer;
# extract_cluster_keywords uses it to embed sentences.
# NOTE(review): an identical model is already created earlier in the notebook,
# so this reload looks redundant — confirm before removing.
model = SentenceTransformer("all-MiniLM-L6-v2")

def extract_cluster_keywords(text, top_n=5):
    """Cluster the sentences of `text` and return top TF-IDF terms per cluster.

    Sentences are embedded with the module-level `model`, clustered with
    KMeans (cluster count chosen by `find_optimal_k`), and a separate
    TF-IDF model (English stop words removed, unigrams and bigrams) is fitted
    on each cluster's sentences. Terms are ranked by mean TF-IDF weight.

    Args:
        text: The text to analyse.
        top_n: Maximum number of keywords to keep per cluster.

    Returns:
        Dict mapping cluster id to its keyword list (highest weight first).
        Empty dict when the text has fewer than 3 sentences. A cluster whose
        sentences contain no usable terms maps to an empty list.
    """
    sentences = sent_tokenize(text)
    if len(sentences) < 3:
        return {}

    embeddings = model.encode(sentences)
    optimal_k = find_optimal_k(embeddings)
    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    labels = kmeans.fit_predict(embeddings)

    cluster_sentences = {i: [] for i in range(optimal_k)}
    for sent, label in zip(sentences, labels):
        cluster_sentences[int(label)].append(sent)

    # Extract keywords per cluster using TF-IDF
    cluster_keywords = {}
    for cluster_id, sents in cluster_sentences.items():
        if not sents:
            continue
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
        try:
            tfidf = vectorizer.fit_transform(sents)
        except ValueError:
            # Raised for an empty vocabulary, e.g. when every sentence in the
            # cluster consists solely of stop words.
            cluster_keywords[cluster_id] = []
            continue
        scores = np.asarray(tfidf.mean(axis=0)).flatten()
        # Reverse first, then slice: `[-top_n:]` would return *all* terms
        # when top_n is 0.
        top_indices = scores.argsort()[::-1][:top_n]
        # Fetch the vocabulary once instead of once per keyword.
        feature_names = vectorizer.get_feature_names_out()
        cluster_keywords[cluster_id] = [feature_names[i] for i in top_indices]

    return cluster_keywords


In [None]:
# Compute the per-cluster keyword dict for every sampled book.
sampled_books['cluster_keywords'] = sampled_books['combined_reviews'].apply(extract_cluster_keywords)

In [23]:
# Look the keywords up by title rather than by the hard-coded row label 1026,
# which is only the book's position in `grouped` and shifts if the set of
# books changes.
sampled_books.loc[sampled_books['Book'] == 'To Kill a Mockingbird', 'cluster_keywords'].iloc[0]

{0: ['atticus', 'scout', 'finch', 'jem', 'atticus finch'],
 1: ['book', 'read', 'loved', 'lee', 'books'],
 2: ['know', 'folks', 'didn', 'sure', 'really'],
 3: ['just', 'people', 'story', 'right', 'conscience'],
 4: ['white', 'black', 'people', 'book', 'man']}

In [16]:
# Embedding model handed to keyword_similarity() when scoring keywords against
# summaries (a SentenceTransformer, loaded from "all-MiniLM-L6-v2").
bert_model = SentenceTransformer("all-MiniLM-L6-v2")

def flatten_keywords(cluster_keywords_dict):
    """Concatenate the keyword lists of all clusters into one flat list.

    Keywords keep the dict's iteration order; duplicates are not removed.
    """
    flat = []
    for cluster_terms in cluster_keywords_dict.values():
        flat.extend(cluster_terms)
    return flat

def keyword_similarity(keywords, summary, model):
    """Average best-match cosine similarity between keywords and a summary.

    Each keyword and each sentence of `summary` is embedded with `model`; for
    every keyword the highest cosine similarity to any summary sentence is
    taken, and those maxima are averaged.

    Args:
        keywords: List of keyword strings.
        summary: Summary text; it is split into sentences internally.
        model: Object exposing ``encode(texts, convert_to_tensor=True)``.

    Returns:
        The mean of the per-keyword maximum similarities as a float, or NaN
        when there are no keywords or the summary has no sentences (the score
        is undefined and the tensor reductions would fail on empty input).
    """
    if len(keywords) == 0:
        return float("nan")

    summary_sentences = sent_tokenize(summary)
    if not summary_sentences:
        return float("nan")

    keyword_embeddings = model.encode(keywords, convert_to_tensor=True)
    sentence_embeddings = model.encode(summary_sentences, convert_to_tensor=True)

    # Rows: keywords, columns: summary sentences.
    sim_matrix = util.cos_sim(keyword_embeddings, sentence_embeddings)

    # max similarity for each keyword (best matching sentence), then average
    max_sim_per_keyword = sim_matrix.max(dim=1).values
    return max_sim_per_keyword.mean().item()


In [None]:
# For each sampled book, compare its flattened cluster keywords against its
# cluster summary and keep the resulting similarity score.
similarities = [
    keyword_similarity(
        flatten_keywords(row['cluster_keywords']),
        row['cluster_summary'],
        bert_model,
    )
    for _, row in sampled_books.iterrows()
]

sampled_books['keyword_similarity'] = similarities

In [18]:
# Display the keyword-to-summary similarity computed for each sampled book.
sampled_books[['Book','keyword_similarity']]

Unnamed: 0,Book,keyword_similarity
1,1984,0.345211
76,Animal Farm,0.389986
149,Cataract,0.29767
181,Crime and Punishment,0.33557
266,Flaubert's Parrot,0.369431
356,Infinite Jest,0.278527
368,Jane Eyre,0.296662
652,The Afternoon of a Writer,0.238759
791,The History of the Siege of Lisbon,0.32708
1026,To Kill a Mockingbird,0.384547
