In [None]:
"""  
After using LLMs to find books that are similar to each other and then to classify them (Fiction/Nonfiction), it is time for sentiment analysis, which determines the emotional tone of each description.
The target is to classify the text into 7 categories:
anger, disgust, fear, joy, sadness, surprise, and neutral
"""
import pandas as pd 
# Load the book table produced by the previous (categorisation) step.
books = pd.read_csv(filepath_or_buffer="books_with_categories.csv")

In [None]:
"""
The reported evaluation accuracy for this model is 66%. This is considered significantly higher than a random-chance baseline, which for 7 emotion classes would be 1/7 or approximately 14%. 
"""


from transformers import pipeline
import torch

# Emotion classifier. top_k=None makes the pipeline return a score for every
# label instead of only the best one.
# The device is picked at runtime: a hard-coded "cuda" raises on machines
# without a usable GPU, so fall back to the CPU when CUDA is unavailable.
classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Quick smoke test on a short sentence.
classifier("I love this!")

In [None]:
# Peek at the description of the first book (row label 0).
books.loc[0, "description"]

In [None]:
# Score the first book's description as one single piece of text.
classifier(books.loc[0, "description"])

In [None]:
# Score the first book's description fragment by fragment, splitting on ".".
classifier(books.loc[0, "description"].split("."))

In [None]:
# The idea is for each book to have a separate column for each emotion

from tqdm import tqdm
import numpy as np

def calculate_max_emotion_scores(predictions, emotion_labels):
    """
    Reduce per-sentence emotion predictions to one peak score per emotion.

    Args:
        predictions (list of list of dict): One inner list per sentence; each
            dict carries a 'label' and a 'score' entry.
        emotion_labels (list): The emotion labels to report on.

    Returns:
        dict: Maps every label in ``emotion_labels`` to the highest score seen
              for it in ``predictions``, or 0.0 when the label never appears.
    """
    # Collapse the sentence-level nesting into one flat sequence of entries.
    flattened = []
    for sentence_prediction in predictions:
        flattened.extend(sentence_prediction)

    best_by_emotion = {}
    for emotion in emotion_labels:
        # Gather every score reported for this emotion, whatever the sentence.
        matching_scores = []
        for entry in flattened:
            if entry['label'] == emotion:
                matching_scores.append(entry['score'])

        if matching_scores:
            best_by_emotion[emotion] = np.max(matching_scores)
        else:
            # Label absent from the predictions: report a neutral 0.0.
            best_by_emotion[emotion] = 0.0

    return best_by_emotion

In [None]:
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]

# One dictionary per book: its ISBN plus the maximum score of every emotion.
results_list = []

for index, row in tqdm(books.iterrows(), total=len(books), desc="Processing Books"):
    isbn = row["isbn13"]
    description = row["description"]

    if isinstance(description, str):
        # split(".") leaves empty / whitespace-only fragments (e.g. after the
        # final period); classifying those would distort the per-emotion maxima.
        sentences = [fragment for fragment in description.split(".") if fragment.strip()]
    else:
        # A missing description is read from CSV as NaN (a float) and has no .split().
        sentences = []

    # truncation=True keeps one over-long fragment from exceeding the model's
    # input limit and aborting the whole run. Skip the model call entirely
    # when there is nothing to classify.
    predictions = classifier(sentences, truncation=True) if sentences else []

    # With no predictions every emotion falls back to 0.0.
    max_scores = calculate_max_emotion_scores(predictions, emotion_labels)

    # Combine the book identifier with its emotion scores.
    book_result = {"isbn13": isbn}
    book_result.update(max_scores)

    results_list.append(book_result)


In [None]:
# Turn the per-book result dictionaries into a table: one row per book.
emotions_df = pd.DataFrame(data=results_list)

In [None]:
# Preview the first five rows of the emotion scores.
emotions_df.head(n=5)

In [None]:
# Attach the emotion columns to the book table, keeping every book row (left join on ISBN).
books = books.merge(emotions_df, how="left", on="isbn13")

In [None]:
# Display the book table to inspect the result of the merge with emotions_df.
books

In [None]:
# Persist the enriched table; index=False keeps the row index out of the file.
books.to_csv(path_or_buf="books_with_emotions.csv", index=False)