In [1]:
import pandas as pd
books = pd.read_csv('books_with_categories.csv')

In [2]:
from transformers import pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
classifier("I love this!")


Device set to use cpu


[[{'label': 'anger', 'score': 0.004419783595949411},
  {'label': 'disgust', 'score': 0.0016119886422529817},
  {'label': 'fear', 'score': 0.0004138524236623198},
  {'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'neutral', 'score': 0.005764580797404051},
  {'label': 'sadness', 'score': 0.002092391485348344},
  {'label': 'surprise', 'score': 0.00852868054062128}]]

In [3]:
#modified
from transformers import pipeline
classifier = pipeline("text-classification",
                      model="j-hartmann/emotion-english-distilroberta-base",
                      top_k = None,
                      device = -1)
classifier("I love this!")


Device set to use cpu


[[{'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'surprise', 'score': 0.00852868054062128},
  {'label': 'neutral', 'score': 0.005764580797404051},
  {'label': 'anger', 'score': 0.004419783595949411},
  {'label': 'sadness', 'score': 0.002092391485348344},
  {'label': 'disgust', 'score': 0.0016119886422529817},
  {'label': 'fear', 'score': 0.0004138524236623198}]]

In [4]:
books["description"][0]

'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst the world ha

seems that we have a bit of a mixture of emotions and get only one emotion as the output might lose the information

In [5]:
classifier(books["description"][0])

[[{'label': 'fear', 'score': 0.6548405885696411},
  {'label': 'neutral', 'score': 0.16985228657722473},
  {'label': 'sadness', 'score': 0.11640921980142593},
  {'label': 'surprise', 'score': 0.02070065587759018},
  {'label': 'disgust', 'score': 0.019100677222013474},
  {'label': 'joy', 'score': 0.01516144908964634},
  {'label': 'anger', 'score': 0.003935146611183882}]]

seems not exactly right because we can see there is a mixture of tone emotions
so what we can do instead is split this description down into individual sentences and classify on that basis

In [6]:
classifier(books["description"][0].split("."))

[[{'label': 'surprise', 'score': 0.7296020984649658},
  {'label': 'neutral', 'score': 0.14038600027561188},
  {'label': 'fear', 'score': 0.06816228479146957},
  {'label': 'joy', 'score': 0.04794260859489441},
  {'label': 'anger', 'score': 0.009156366810202599},
  {'label': 'disgust', 'score': 0.0026284765917807817},
  {'label': 'sadness', 'score': 0.002122163539752364}],
 [{'label': 'neutral', 'score': 0.44937002658843994},
  {'label': 'disgust', 'score': 0.27359163761138916},
  {'label': 'joy', 'score': 0.10908330976963043},
  {'label': 'sadness', 'score': 0.09362746775150299},
  {'label': 'anger', 'score': 0.04047830402851105},
  {'label': 'surprise', 'score': 0.026970159262418747},
  {'label': 'fear', 'score': 0.006879047024995089}],
 [{'label': 'neutral', 'score': 0.6462159752845764},
  {'label': 'sadness', 'score': 0.24273329973220825},
  {'label': 'disgust', 'score': 0.04342271760106087},
  {'label': 'surprise', 'score': 0.028300564736127853},
  {'label': 'joy', 'score': 0.014211

In [7]:
sentences = books["description"][0].split(".")
predictions = classifier(sentences)

In [8]:
sentences[0]

'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives'

In [9]:
predictions[0]

[{'label': 'surprise', 'score': 0.7296020984649658},
 {'label': 'neutral', 'score': 0.14038600027561188},
 {'label': 'fear', 'score': 0.06816228479146957},
 {'label': 'joy', 'score': 0.04794260859489441},
 {'label': 'anger', 'score': 0.009156366810202599},
 {'label': 'disgust', 'score': 0.0026284765917807817},
 {'label': 'sadness', 'score': 0.002122163539752364}]

this idea somewhat seems confusing because now we introduce multiple emotion that can be associated with the same book, so how could we make sense of this ? each book can have separate column for each emotions and instead of categorizing which one is or is not, we can instead take the highest probability from across the whole description for that particular sentiment, example if joy is very high in one sentence but its low across the others we just take that highest score of joy for that book description and so on for the other 7 emotions

so what we need to do is process this output of predicter and amalgamate it in such a way that it only give us one maximum score for each of this label

In [10]:
#previously we have different order for every sentence, now what we can do is sort these by "score" instead and what it means is for each sentences we gonna have the same order

import numpy as np

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = [] #for holding isbn because later on we are gonna crate this dataframe and merge it back with full dataframe
emotion_scores = {label: [] for label in emotion_labels} # it is going to contain all the scores of every single description of each of these labels

def calculate_max_emotion_scores (predictions): #prediction will be list of prediction result
    per_emotion_scores = {label: [] for label in emotion_labels} # this is another dictionary comprehention to hold all of the predictions for a particular description (single description)
    for prediction in predictions: # then we loop over each sentences
        sorted_prediction = sorted(prediction, key=lambda x: x["label"]) #each sentences returned by prediction we sort it, all the keys of emotions are in the same order
        for index, label in enumerate(emotion_labels): #then we loop over that
            per_emotion_scores[label].append(sorted_prediction[index]["score"]) #and per of each emotions we extract the score and append using the correct label
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()} #then append this to the dictionary comprehension then we take the maximum score for each emotions

In [11]:
#now we need to apply this function to each of the books
for i in range(10):
    isbn.append(books["isbn10"][i]) #we take isbn10 for that book and append to isbn list
    sentences = books["description"][i].split(".") #take and split individual sentences
    predictions = classifier(sentences) #pass to classifier
    max_scores = calculate_max_emotion_scores(predictions) #pass the result to max score function
    for label in emotion_labels: #then take the result dictionary from that function
        emotion_scores[label].append(max_scores[label])

by this we can have dictionary that becaome the basis of our dataframe with max probability of each book

In [12]:
emotion_scores

{'anger': [np.float64(0.06413359194993973),
  np.float64(0.6126202344894409),
  np.float64(0.06413359194993973),
  np.float64(0.35148438811302185),
  np.float64(0.08141235262155533),
  np.float64(0.2322249710559845),
  np.float64(0.5381842255592346),
  np.float64(0.06413359194993973),
  np.float64(0.3006700277328491),
  np.float64(0.06413359194993973)],
 'disgust': [np.float64(0.27359163761138916),
  np.float64(0.3482847511768341),
  np.float64(0.10400661826133728),
  np.float64(0.1507224589586258),
  np.float64(0.18449543416500092),
  np.float64(0.727174699306488),
  np.float64(0.155854731798172),
  np.float64(0.10400661826133728),
  np.float64(0.2794816195964813),
  np.float64(0.17792661488056183)],
 'fear': [np.float64(0.9281681180000305),
  np.float64(0.9425276517868042),
  np.float64(0.9723208546638489),
  np.float64(0.3607059419155121),
  np.float64(0.09504347294569016),
  np.float64(0.051362793892621994),
  np.float64(0.7474274635314941),
  np.float64(0.4044976532459259),
  np.f

In [13]:
#now apply to all the books
from tqdm import tqdm


emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = [] #for holding isbn because later on we are gonna crate this dataframe and merge it back with full dataframe
emotion_scores = {label: [] for label in emotion_labels} # it is going to contain all the scores of every single description of each of these labels

for i in tqdm(range(len(books))):
    isbn.append(books["isbn10"][i])
    sentences = books["description"][i].split(".") #
    predictions = classifier(sentences) #
    max_scores = calculate_max_emotion_scores(predictions) #
    for label in emotion_labels: #
        emotion_scores[label].append(max_scores[label])

100%|██████████| 5197/5197 [22:14<00:00,  3.89it/s] 


In [15]:
#pass the result to panda dataframe, we don't want the index to be isbn but we gonna create a separate column for isbn and assign isbn list to that
emotion_df = pd.DataFrame(emotion_scores)
emotion_df["isbn10"] = isbn

In [16]:
emotion_df

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,neutral,isbn10
0,0.064134,0.273592,0.928168,0.932798,0.646216,0.967158,0.729602,0002005883
1,0.612620,0.348285,0.942528,0.704422,0.887940,0.111690,0.252546,0002261987
2,0.064134,0.104007,0.972321,0.767238,0.549477,0.111690,0.078765,0006178731
3,0.351484,0.150722,0.360706,0.251881,0.732684,0.111690,0.078765,0006280897
4,0.081412,0.184495,0.095043,0.040564,0.884390,0.475880,0.078765,0006280935
...,...,...,...,...,...,...,...,...
5192,0.148208,0.030643,0.919165,0.255172,0.853721,0.980877,0.030656,8172235224
5193,0.064134,0.114383,0.051363,0.400262,0.883198,0.111690,0.227765,8173031010
5194,0.009997,0.009929,0.339218,0.947779,0.375754,0.066685,0.057625,817992162X
5195,0.064134,0.104007,0.459269,0.759456,0.951104,0.368111,0.078765,8185300534


In [17]:
#final thing : merge back to the dataset
books = pd.merge(books, emotion_df, on="isbn10")

In [18]:
def count_summary (df):
    summary = pd.DataFrame({
        'total' : [df.shape[0]]*df.shape[1],
        'non-null count' : df.count(),
        'unique count' : df.nunique(),
        'missing-count' : df.isnull().sum(),
        'missing-percent' : df.isnull().sum() / df.shape[0] * 100
    })
    return summary.sort_values('missing-percent', ascending=False)

In [19]:
count_summary(books)

Unnamed: 0,total,non-null count,unique count,missing-count,missing-percent
thumbnail,5197,5031,5031,166,3.19415
authors,5197,5165,3045,32,0.61574
categories,5197,5167,479,30,0.577256
title,5197,5197,4969,0,0.0
isbn13,5197,5197,5197,0,0.0
isbn10,5197,5197,5197,0,0.0
description,5197,5197,5154,0,0.0
published_year,5197,5197,83,0,0.0
average_rating,5197,5197,190,0,0.0
num_pages,5197,5197,808,0,0.0


In [20]:
books.to_csv("books_with_emotions.csv", index=False)