In [1]:
import numpy as np
import pandas as pd
books = pd.read_csv('books_with_categories.csv')

In [9]:
from transformers import pipeline
pipe = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)


Device set to use cpu


In [10]:
pipe(books['description'][0])

[[{'label': 'fear', 'score': 0.6548405885696411},
  {'label': 'neutral', 'score': 0.16985228657722473},
  {'label': 'sadness', 'score': 0.11640921980142593},
  {'label': 'surprise', 'score': 0.02070065587759018},
  {'label': 'disgust', 'score': 0.019100677222013474},
  {'label': 'joy', 'score': 0.01516144908964634},
  {'label': 'anger', 'score': 0.003935146611183882}]]

In [4]:
pipe(books['description'][0].split('.'))

[{'label': 'surprise', 'score': 0.7296020984649658},
 {'label': 'neutral', 'score': 0.44937002658843994},
 {'label': 'neutral', 'score': 0.6462159752845764},
 {'label': 'fear', 'score': 0.9281681180000305},
 {'label': 'sadness', 'score': 0.9671575427055359},
 {'label': 'joy', 'score': 0.932798445224762},
 {'label': 'joy', 'score': 0.6528708338737488},
 {'label': 'neutral', 'score': 0.5494767427444458}]

In [19]:
import numpy as np
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

def calculate_emotion_scores(predictions):
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        sorted_predictions = sorted(prediction, key=lambda x: x['label'])
        for index, label in enumerate(emotion_labels):
            per_emotion_scores[label].append(sorted_predictions[index]['score'])
    return {label: float(np.max(scores)) for label, scores in per_emotion_scores.items()}


In [20]:
for i in range(10):
    isbn.append(books['isbn13'][i])
    sentences = books['description'][i].split('.')
    predictions = pipe(sentences)
    max_scores = calculate_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

In [21]:
emotion_scores

{'anger': [0.0641336441040039,
  0.6126197576522827,
  0.0641336441040039,
  0.35148438811302185,
  0.08141235262155533,
  0.2322252243757248,
  0.5381842255592346,
  0.0641336441040039,
  0.3006700277328491,
  0.0641336441040039],
 'disgust': [0.27359163761138916,
  0.3482847511768341,
  0.10400667786598206,
  0.1507224589586258,
  0.18449543416500092,
  0.7271744608879089,
  0.155854731798172,
  0.10400667786598206,
  0.2794816195964813,
  0.17792661488056183],
 'fear': [0.9281681180000305,
  0.9425276517868042,
  0.9723208546638489,
  0.3607059419155121,
  0.09504333138465881,
  0.05136283114552498,
  0.7474274635314941,
  0.4044976532459259,
  0.9155241250991821,
  0.05136283114552498],
 'joy': [0.932798445224762,
  0.7044219970703125,
  0.7672380805015564,
  0.2518811821937561,
  0.04056443274021149,
  0.043375786393880844,
  0.872565746307373,
  0.04056443274021149,
  0.04056443274021149,
  0.04056443274021149],
 'sadness': [0.6462159752845764,
  0.8879395127296448,
  0.549476742

In [22]:
from tqdm import tqdm
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

for i in tqdm(range(len(books))):
    isbn.append(books['isbn13'][i])
    sentences = books['description'][i].split('.')
    predictions = pipe(sentences)
    max_scores = calculate_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

100%|██████████| 5197/5197 [09:33<00:00,  9.07it/s]


In [23]:
emotions_df = pd.DataFrame(emotion_scores)
emotions_df['isbn13'] = isbn

In [24]:
books = pd.merge(books, emotions_df, on='isbn13')

In [26]:
books.to_csv("books_with_emotions.csv", index=False)