In [31]:
import pandas as pd
import numpy as np
from tqdm import tqdm
books = pd.read_csv("./data/books_with_categories.csv")


In [9]:
from transformers import pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None, device=0)
classifier("I love this!")

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 8642535e-a649-423d-9b6d-f60902895720)')' thrown while requesting HEAD https://huggingface.co/j-hartmann/emotion-english-distilroberta-base/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Device set to use cuda:0


[[{'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'surprise', 'score': 0.008528673090040684},
  {'label': 'neutral', 'score': 0.005764594301581383},
  {'label': 'anger', 'score': 0.004419783595949411},
  {'label': 'sadness', 'score': 0.002092392183840275},
  {'label': 'disgust', 'score': 0.001611993182450533},
  {'label': 'fear', 'score': 0.00041385157965123653}]]

In [14]:
classifier(books["description"][0])


[[{'label': 'fear', 'score': 0.654841423034668},
  {'label': 'neutral', 'score': 0.16985194385051727},
  {'label': 'sadness', 'score': 0.11640876531600952},
  {'label': 'surprise', 'score': 0.02070068195462227},
  {'label': 'disgust', 'score': 0.019100716337561607},
  {'label': 'joy', 'score': 0.0151612414047122},
  {'label': 'anger', 'score': 0.003935152664780617}]]

In [15]:
classifier(books["description"][0].split("."))

[[{'label': 'surprise', 'score': 0.7296032905578613},
  {'label': 'neutral', 'score': 0.14038528501987457},
  {'label': 'fear', 'score': 0.06816212832927704},
  {'label': 'joy', 'score': 0.04794234782457352},
  {'label': 'anger', 'score': 0.00915635097771883},
  {'label': 'disgust', 'score': 0.002628469606861472},
  {'label': 'sadness', 'score': 0.002122158883139491}],
 [{'label': 'neutral', 'score': 0.449370801448822},
  {'label': 'disgust', 'score': 0.27359163761138916},
  {'label': 'joy', 'score': 0.10908281058073044},
  {'label': 'sadness', 'score': 0.0936271920800209},
  {'label': 'anger', 'score': 0.04047831892967224},
  {'label': 'surprise', 'score': 0.02697017416357994},
  {'label': 'fear', 'score': 0.006879049353301525}],
 [{'label': 'neutral', 'score': 0.6462163925170898},
  {'label': 'sadness', 'score': 0.24273289740085602},
  {'label': 'disgust', 'score': 0.043422624468803406},
  {'label': 'surprise', 'score': 0.028300542384386063},
  {'label': 'joy', 'score': 0.01421148143

In [22]:
sentences = books["description"][0].split(".")
predictions = classifier(sentences)

In [17]:
sentences[0]

'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives'

In [23]:
predictions[0]

[{'label': 'surprise', 'score': 0.7296032905578613},
 {'label': 'neutral', 'score': 0.14038528501987457},
 {'label': 'fear', 'score': 0.06816212832927704},
 {'label': 'joy', 'score': 0.04794234782457352},
 {'label': 'anger', 'score': 0.00915635097771883},
 {'label': 'disgust', 'score': 0.002628469606861472},
 {'label': 'sadness', 'score': 0.002122158883139491}]

In [None]:
sentences[4]


' John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers'

In [24]:
predictions[4]

[{'label': 'sadness', 'score': 0.9671574234962463},
 {'label': 'neutral', 'score': 0.015104176476597786},
 {'label': 'disgust', 'score': 0.0064806039445102215},
 {'label': 'fear', 'score': 0.005394001957029104},
 {'label': 'surprise', 'score': 0.0022869459353387356},
 {'label': 'anger', 'score': 0.0018428926123306155},
 {'label': 'joy', 'score': 0.0017338803736492991}]

In [25]:
sorted(predictions[0], key=lambda x: x['score'])

[{'label': 'sadness', 'score': 0.002122158883139491},
 {'label': 'disgust', 'score': 0.002628469606861472},
 {'label': 'anger', 'score': 0.00915635097771883},
 {'label': 'joy', 'score': 0.04794234782457352},
 {'label': 'fear', 'score': 0.06816212832927704},
 {'label': 'neutral', 'score': 0.14038528501987457},
 {'label': 'surprise', 'score': 0.7296032905578613}]

In [27]:
emotion_labels = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

def calculate_max_emotion_scores(predictions):
    per_emotion_score = {label: [] for label in emotion_labels}
    for prediction in predictions:
        sorted_prediction = sorted(prediction, key=lambda x: x['score'], reverse=True)
        for index, label in enumerate(emotion_labels):
            per_emotion_score[label].append(sorted_prediction[index]['score'])
    return {label: np.max(scores) for label, scores in per_emotion_score.items()}


In [28]:
# test it using a small subset first
for i in range(10):
    isbn.append(books["isbn13"][i])
    sentences = books["description"][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])



You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [29]:
emotion_scores

{'anger': [np.float64(0.9671574234962463),
  np.float64(0.9425276517868042),
  np.float64(0.9723208546638489),
  np.float64(0.7326855063438416),
  np.float64(0.8843895196914673),
  np.float64(0.7271749377250671),
  np.float64(0.8725652098655701),
  np.float64(0.8202822804450989),
  np.float64(0.9155241250991821),
  np.float64(0.8603722453117371)],
 'disgust': [np.float64(0.27359163761138916),
  np.float64(0.31934988498687744),
  np.float64(0.11169017851352692),
  np.float64(0.3514835834503174),
  np.float64(0.27261340618133545),
  np.float64(0.2719031274318695),
  np.float64(0.3853572905063629),
  np.float64(0.29216688871383667),
  np.float64(0.2794811427593231),
  np.float64(0.1779269576072693)],
 'fear': [np.float64(0.10908281058073044),
  np.float64(0.19543610513210297),
  np.float64(0.10400667041540146),
  np.float64(0.15072251856327057),
  np.float64(0.12224282324314117),
  np.float64(0.11962246149778366),
  np.float64(0.18529143929481506),
  np.float64(0.23448744416236877),
  np.

In [34]:
emotion_labels = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

# test it using a small subset first
for i in tqdm(range(len(books))):
    isbn.append(books["isbn13"][i])
    sentences = books["description"][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

100%|██████████| 5198/5198 [03:02<00:00, 28.44it/s]


In [39]:
emotions_df = pd.DataFrame(emotion_scores)
emotions_df["isbn13"] = isbn

In [40]:
emotions_df

Unnamed: 0,anger,disgust,fear,joy,neutral,sadness,surprise,isbn13
0,0.967157,0.273592,0.109083,0.093627,0.064134,0.051363,0.040564,9780002005883
1,0.942528,0.319350,0.195436,0.143348,0.077130,0.051363,0.040564,9780002261982
2,0.972321,0.111690,0.104007,0.078766,0.064134,0.051363,0.040564,9780006178736
3,0.732686,0.351484,0.150723,0.079672,0.064134,0.051363,0.040564,9780006280897
4,0.884390,0.272613,0.122243,0.095043,0.064134,0.051363,0.040564,9780006280934
...,...,...,...,...,...,...,...,...
5193,0.980877,0.305738,0.148209,0.127782,0.043363,0.030656,0.009569,9788172235222
5194,0.883198,0.338892,0.227765,0.078766,0.064134,0.051363,0.040564,9788173031014
5195,0.947779,0.339218,0.141733,0.066685,0.057625,0.009929,0.009055,9788179921623
5196,0.951104,0.368111,0.214132,0.078766,0.064134,0.051363,0.040564,9788185300535


In [41]:
books = pd.merge(books, emotions_df, on = "isbn13")

In [42]:
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,...,title_and_subtitle,tagged description,simple_categories,anger,disgust,fear,joy,neutral,sadness,surprise
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,...,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction,0.967157,0.273592,0.109083,0.093627,0.064134,0.051363,0.040564
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,...,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,Fiction,0.942528,0.319350,0.195436,0.143348,0.077130,0.051363,0.040564
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,...,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction,0.972321,0.111690,0.104007,0.078766,0.064134,0.051363,0.040564
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,...,The Four Loves,9780006280897 Lewis' work on the nature of lov...,Nonfiction,0.732686,0.351484,0.150723,0.079672,0.064134,0.051363,0.040564
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,...,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",Nonfiction,0.884390,0.272613,0.122243,0.095043,0.064134,0.051363,0.040564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5193,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,...,Mistaken Identity,9788172235222 On A Train Journey Home To North...,Fiction,0.980877,0.305738,0.148209,0.127782,0.043363,0.030656,0.009569
5194,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,...,Journey to the East,9788173031014 This book tells the tale of a ma...,Nonfiction,0.883198,0.338892,0.227765,0.078766,0.064134,0.051363,0.040564
5195,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,...,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...,Fiction,0.947779,0.339218,0.141733,0.066685,0.057625,0.009929,0.009055
5196,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,...,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction,0.951104,0.368111,0.214132,0.078766,0.064134,0.051363,0.040564


In [43]:
books.to_csv("./data/books_with_emotion_scores.csv", index=False)