## NLP Final
Summarize by clustering

In [1]:
!pip install langdetect==1.0.9 > /dev/null 2>&1
!pip install bert_score==0.3.13 > /dev/null 2>&1
!pip install kagglehub > /dev/null 2>&1
!pip install sentence-transformers > /dev/null 2>&1

import kagglehub
import pandas as pd
import re
from langdetect import detect
from bert_score import score

import nltk
nltk.download('punkt', quiet = True)
nltk.download('punkt_tab', quiet= True)
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import silhouette_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import sent_tokenize
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

path = kagglehub.dataset_download("beridzeg45/book-reviews")
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/book-reviews


In [2]:
df = pd.read_csv(path + "/Book Reviews.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Book,Review,Review Date
0,0,To Kill a Mockingbird,/// gentle reminder that this is not the time ...,"March 24, 2022"
1,1,To Kill a Mockingbird,\n|\n|6.0 stars. I know I am risking a serious...,"May 24, 2011"
2,2,To Kill a Mockingbird,\n|\n|Looking for a new book but don't want to...,"December 10, 2020"
3,3,To Kill a Mockingbird,"To Kill a Mockingbird, Harper Lee|To Kill a Mo...","July 1, 2022"
4,4,To Kill a Mockingbird,Why is it when I pick up | To Kill A Mockingbi...,"October 25, 2009"


In [3]:
# Drop null reviews
df.dropna(subset=['Review'], inplace=True)

# Only include reviews written in english
def is_english(text):
  try:
    return detect(text) == 'en'
  except:
    return False

df_english = df[df['Review'].apply(is_english)].copy()

In [4]:
grouped = df_english.groupby("Book")["Review"].apply(lambda x: " ".join(x.dropna())).reset_index()
grouped.columns = ["Book", "combined_reviews"]

In [5]:
def clean_text(text):
    text = re.sub(r'\|', ' ', text)
    text = re.sub(r'\\n|\\r|\n', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

grouped["combined_reviews"] = grouped["combined_reviews"].apply(clean_text)

In [None]:
model = SentenceTransformer("all-MiniLM-L6-v2")

def find_optimal_k(embeddings, k_min=2, k_max=6):
    best_k = k_min
    best_score = -1
    for k in range(k_min, min(k_max + 1, len(embeddings))):
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(embeddings)
        score = silhouette_score(embeddings, labels)
        if score > best_score:
            best_k = k
            best_score = score
    return best_k

def summarize_by_clustering(text, return_labels=False):
    sentences = sent_tokenize(text)
    if len(sentences) < 3:
        return " ".join(sentences)

    embeddings = model.encode(sentences)
    optimal_k = find_optimal_k(embeddings)

    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    centers = kmeans.cluster_centers_

    summary_sentences = []
    for i in range(optimal_k):
        cluster_indices = np.where(labels == i)[0]
        cluster_embeddings = embeddings[cluster_indices]
        similarities = cosine_similarity([centers[i]], cluster_embeddings)[0]
        best_idx = cluster_indices[np.argmax(similarities)]
        summary_sentences.append(sentences[best_idx])

    summary = " ".join(summary_sentences)

    if return_labels:
        return summary, labels, sentences
    return summary


In [7]:
lBooks = ['To Kill a Mockingbird', '1984', 'Jane Eyre', 'Animal Farm', 'Crime and Punishment', 'The Catcher in the Rye', 'Cataract', 'I\'m Not Scared',
          'Reasons to Live', 'Night Boat to Tangier', 'Infinite Jest']
sampled_books = grouped[grouped['Book'].isin(lBooks)]
sampled_books["cluster_summary"] = sampled_books["combined_reviews"].apply(
    lambda x: summarize_by_clustering(x)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_books["cluster_summary"] = sampled_books["combined_reviews"].apply(


In [8]:
print(sampled_books[["Book", "cluster_summary"]])

                        Book  \
1                       1984   
76               Animal Farm   
149                 Cataract   
181     Crime and Punishment   
337           I'm Not Scared   
356            Infinite Jest   
368                Jane Eyre   
496    Night Boat to Tangier   
571          Reasons to Live   
692   The Catcher in the Rye   
1026   To Kill a Mockingbird   

                                        cluster_summary  
1     Like it, Yay. These are the successes of 1984 ...  
76    Whatever. Yeah, yeah, everyone claims Orwell w...  
149   Unfortunately that's where my enjoyment of the...  
181   I went into it like a wise man, and that was j...  
337   And Michele is constantly being exposed to ter...  
356   But this book has become a part of me, has art...  
368   She is a rebel - setting out to have her own c...  
496   It took me a while to find any interest beyond...  
571   The thoughts and sentences are beautiful, but ...  
692   He can't see that he himself 

In [9]:
sampled_books['cluster_summary'][1]

"Like it, Yay. These are the successes of 1984 's paranoia, far outliving its original intent as a battery against where Communism was going (Orwell was a severely disappointed Marxist), and while people who compare their leaders to Big Brother are usually overreaching themselves and speak far away from Orwell's intent and vision, it is a useful catchcloth for dissent. I almost don't know what to think about this book."

In [10]:
combined_reviews_row = grouped[grouped['Book'] == '1984']
combined_reviews = combined_reviews_row['combined_reviews'].iloc[0]
print(combined_reviews)



In [12]:
bookreviews = df_english[df_english['Book'] == '1984']
print(bookreviews[['Book', 'Review']])

    Book                                             Review
31  1984  YOU. ARE. THE. DEAD.| Oh my God. I got the chi...
32  1984  This book is far from perfect. Its characters ...
33  1984  WAR IS PEACE.|FREEDOM IS SLAVERY.|IGNORANCE IS...
34  1984  In George Orwell's 1984, Winston Smith is an o...
35  1984  1984| is not a particularly good novel, but it...
36  1984  This was the book that started my love affair ...
37  1984  This was an up and down kind of read for me. T...
38  1984  I'm gonna ask myself a mandatory question and ...
41  1984  \n|\n|I am a big fan of speculative fiction an...
43  1984  “It was a bright cold day in April, and the cl...
44  1984  \n|“The best books... are those that tell you ...
45  1984  I know this is a well loved classic and I defi...
46  1984  Social media is a cage full of starved rats an...
47  1984  I wanted to understand the origin of the expre...
48  1984  Update: 1984=2024. FULL DYSTOPIAN PARADIGM FUL...
49  1984  Newspeaking ones way towards m

BERTScore is not a great representation for our model because it is just pulling sentences from the aggregated review. So of course it will score highly. A better representation is using TF_IDF to find cluster topics

In [13]:
def calculate_bert_score(generated_summary, source_text):
  P, R, F1 = score([generated_summary], [source_text], lang="en")
  return P.mean().item(), R.mean().item(), F1.mean().item()


for index, row in sampled_books.iterrows():
    book_title = row['Book']
    generated_summary = row['cluster_summary']
    combined_reviews_row = grouped[grouped['Book'] == book_title]

    if not combined_reviews_row.empty:
        combined_reviews = combined_reviews_row['combined_reviews'].iloc[0]

        # Calculate BERTScore
        bert_precision, bert_recall, bert_f1 = calculate_bert_score(generated_summary, combined_reviews)

        # Print the scores for the book
        print(f"\nBook: '{book_title}'")
        print(f"  BERTScore Precision: {bert_precision:.4f}")
        print(f"  BERTScore Recall: {bert_recall:.4f}")
        print(f"  BERTScore F1 Score: {bert_f1:.4f}")
    else:
        print(f"\nWarning: Combined reviews not found for book '{book_title}'. Skipping BERTScore calculation.")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Book: '1984'
  BERTScore Precision: 0.8244
  BERTScore Recall: 0.7916
  BERTScore F1 Score: 0.8076


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Book: 'Animal Farm'
  BERTScore Precision: 0.8483
  BERTScore Recall: 0.7807
  BERTScore F1 Score: 0.8131


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Book: 'Cataract'
  BERTScore Precision: 0.9326
  BERTScore Recall: 0.8679
  BERTScore F1 Score: 0.8991


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Book: 'Crime and Punishment'
  BERTScore Precision: 0.8115
  BERTScore Recall: 0.8042
  BERTScore F1 Score: 0.8079


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Book: 'I'm Not Scared'
  BERTScore Precision: 0.8837
  BERTScore Recall: 0.8198
  BERTScore F1 Score: 0.8506


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Book: 'Infinite Jest'
  BERTScore Precision: 0.8137
  BERTScore Recall: 0.7595
  BERTScore F1 Score: 0.7857


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Book: 'Jane Eyre'
  BERTScore Precision: 0.8321
  BERTScore Recall: 0.8187
  BERTScore F1 Score: 0.8254


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Book: 'Night Boat to Tangier'
  BERTScore Precision: 0.8471
  BERTScore Recall: 0.7993
  BERTScore F1 Score: 0.8225


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Book: 'Reasons to Live'
  BERTScore Precision: 0.8537
  BERTScore Recall: 0.7930
  BERTScore F1 Score: 0.8222


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Book: 'The Catcher in the Rye'
  BERTScore Precision: 0.8407
  BERTScore Recall: 0.7865
  BERTScore F1 Score: 0.8127


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Book: 'To Kill a Mockingbird'
  BERTScore Precision: 0.8286
  BERTScore Recall: 0.7962
  BERTScore F1 Score: 0.8121


### Cluster Keywords

In [14]:
model = SentenceTransformer("all-MiniLM-L6-v2")

def extract_cluster_keywords(text, top_n=5):
    sentences = sent_tokenize(text)
    if len(sentences) < 3:
        return {}

    embeddings = model.encode(sentences)
    optimal_k = find_optimal_k(embeddings)
    #k = min(5, len(sentences))  # limit max clusters
    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    labels = kmeans.fit_predict(embeddings)

    cluster_sentences = {i: [] for i in range(optimal_k)}
    for sent, label in zip(sentences, labels):
        cluster_sentences[label].append(sent)

    # Extract keywords per cluster using TF-IDF
    cluster_keywords = {}
    for cluster_id, sents in cluster_sentences.items():
        if len(sents) == 0:
            continue
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
        tfidf = vectorizer.fit_transform(sents)
        scores = np.asarray(tfidf.mean(axis=0)).flatten()
        top_indices = scores.argsort()[-top_n:][::-1]
        keywords = [vectorizer.get_feature_names_out()[i] for i in top_indices]
        cluster_keywords[cluster_id] = keywords

    return cluster_keywords


In [15]:
sampled_books['cluster_keywords'] = sampled_books['combined_reviews'].apply(
    lambda x: extract_cluster_keywords(x)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_books['cluster_keywords'] = sampled_books['combined_reviews'].apply(


In [16]:
sampled_books['cluster_keywords'][149]

{0: ['book', 'unfortunately', 'unfortunately enjoyment', 'ended', 'enjoyment'],
 1: ['osadchy', 'isn', 'act', 'soviet', 'reading'],
 2: ['live', 'soviet', 'literature', 'live live', 'references people'],
 3: ['problems', 'layers', 'different', 'today 1966', 'today'],
 4: ['list goes', 'list', 'goes'],
 5: ['translation', 'translation ears', 'near', 'ears', 'flawless']}

In [17]:
bert_model = SentenceTransformer("all-MiniLM-L6-v2")

def flatten_keywords(cluster_keywords_dict):
    return [kw for kws in cluster_keywords_dict.values() for kw in kws]

def keyword_similarity(keywords, summary, model):
    keyword_embeddings = model.encode(keywords, convert_to_tensor=True)

    from nltk.tokenize import sent_tokenize
    summary_sentences = sent_tokenize(summary)
    sentence_embeddings = model.encode(summary_sentences, convert_to_tensor=True)

    sim_matrix = util.cos_sim(keyword_embeddings, sentence_embeddings)

    # max similarity for each keyword (best matching sentence), then average
    max_sim_per_keyword = sim_matrix.max(dim=1).values
    return max_sim_per_keyword.mean().item()


In [18]:
similarities = []

for i, row in sampled_books.iterrows():
    keywords = flatten_keywords(row['cluster_keywords'])
    summary = row['cluster_summary']

    sim_score = keyword_similarity(keywords, summary, bert_model)
    similarities.append(sim_score)

sampled_books['keyword_similarity'] = similarities

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_books['keyword_similarity'] = similarities


In [19]:
sampled_books[['Book','keyword_similarity']]

Unnamed: 0,Book,keyword_similarity
1,1984,0.345211
76,Animal Farm,0.364459
149,Cataract,0.29767
181,Crime and Punishment,0.33557
337,I'm Not Scared,0.21855
356,Infinite Jest,0.278527
368,Jane Eyre,0.296662
496,Night Boat to Tangier,0.243933
571,Reasons to Live,0.232677
692,The Catcher in the Rye,0.346316
