In [1]:
import pandas as pd
import math, re, string, requests, json
from itertools import product
from inspect import getsourcefile
from os.path import abspath, join, dirname
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import random
%matplotlib inline 
matplotlib.style.use('ggplot')

In [3]:
df = pd.read_csv('datasets_recsys/reviews.csv', sep=';', encoding='latin')

In [4]:
print('total reviews: {}'.format(len(df['story_id'])))
df.head()

total reviews: 132711


Unnamed: 0,reviewer_name,review_date,review_text,story_id
0,3522972,11-04-14,Why is this so adorable? I blame the ending. S...,10000529
1,3700017,07-02-14,Oh my goodness. Eliot and Clint working togeth...,10000529
2,462777,1/30/2014,This was an awesome piece! I like how you bro...,10000529
3,7455526,05-abr,wtf,10001611
4,6451578,12/24/2016,I just read this a second time & I still think...,10001611


In [5]:
class SentimentIntensityAnalyzer(object):
    """
    Give a sentiment intensity score to sentences.
    """
    def __init__(self, lexicon_file="vader_lexicon.txt"):
        _this_module_file_path_ = abspath(getsourcefile(lambda:0))
        lexicon_full_filepath = join(dirname(_this_module_file_path_), lexicon_file)
        with open(lexicon_full_filepath) as f:
            self.lexicon_full_filepath = f.read()
        self.lexicon = self.make_lex_dict()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary
        """
        lex_dict = {}
        for line in self.lexicon_full_filepath.split('\n'):
            (word, measure) = line.strip().split('\t')[0:2]
            lex_dict[word] = float(measure)
        return lex_dict

    def polarity_scores(self, text):
        """
        Return a float for sentiment strength based on the input text.
        Positive values are positive valence, negative value are negative
        valence.
        """
        sentitext = SentiText(text)
        #text, words_and_emoticons, is_cap_diff = self.preprocess(text)

        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        for item in words_and_emoticons:
            valence = 0
            i = words_and_emoticons.index(item)
            if (i < len(words_and_emoticons) - 1 and item.lower() == "kind" and \
                words_and_emoticons[i+1].lower() == "of") or \
                item.lower() in BOOSTER_DICT:
                sentiments.append(valence)
                continue

            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)
        
        valence_dict = self.score_valence(sentiments, text)

        return valence_dict

    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            #get the sentiment valence
            valence = self.lexicon[item_lowercase]

            #check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += C_INCR
                else:
                    valence -= C_INCR

            for start_i in range(0,3):
                if i > start_i and words_and_emoticons[i-(start_i+1)].lower() not in self.lexicon:
                    # dampen the scalar modifier of preceding words and emoticons
                    # (excluding the ones that immediately preceed the item) based
                    # on their distance from the current item.
                    s = scalar_inc_dec(words_and_emoticons[i-(start_i+1)], valence, is_cap_diff)
                    if start_i == 1 and s != 0:
                        s = s*0.95
                    if start_i == 2 and s != 0:
                        s = s*0.9
                    valence = valence+s
                    valence = self._never_check(valence, words_and_emoticons, start_i, i)
                    if start_i == 2:
                        valence = self._idioms_check(valence, words_and_emoticons, i)

                        # future work: consider other sentiment-laden idioms
                        # other_idioms =
                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                        #  "upper hand": 1, "break a leg": 2,
                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                        #  "on the ball": 2,"under the weather": -2}

            valence = self._least_check(valence, words_and_emoticons, i)

        sentiments.append(valence)
        return sentiments

    def _least_check(self, valence, words_and_emoticons, i):
        # check for negation case using "least"
        if i > 1 and words_and_emoticons[i-1].lower() not in self.lexicon \
           and words_and_emoticons[i-1].lower() == "least":
            if words_and_emoticons[i-2].lower() != "at" and words_and_emoticons[i-2].lower() != "very":
                valence = valence*N_SCALAR
        elif i > 0 and words_and_emoticons[i-1].lower() not in self.lexicon \
             and words_and_emoticons[i-1].lower() == "least":
            valence = valence*N_SCALAR
        return valence

    def _but_check(self, words_and_emoticons, sentiments):
        # check for modification in sentiment due to contrastive conjunction 'but'
        if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons:
            try:
                bi = words_and_emoticons.index('but')
            except ValueError:
                bi = words_and_emoticons.index('BUT')
            for sentiment in sentiments:
                si = sentiments.index(sentiment)
                if si < bi:
                    sentiments.pop(si)
                    sentiments.insert(si, sentiment*0.5)
                elif si > bi:
                    sentiments.pop(si)
                    sentiments.insert(si, sentiment*1.5)
        return sentiments

    def _idioms_check(self, valence, words_and_emoticons, i):
        onezero = "{0} {1}".format(words_and_emoticons[i-1], words_and_emoticons[i])

        twoonezero = "{0} {1} {2}".format(words_and_emoticons[i-2],
                                       words_and_emoticons[i-1], words_and_emoticons[i])

        twoone = "{0} {1}".format(words_and_emoticons[i-2], words_and_emoticons[i-1])

        threetwoone = "{0} {1} {2}".format(words_and_emoticons[i-3],
                                        words_and_emoticons[i-2], words_and_emoticons[i-1])

        threetwo = "{0} {1}".format(words_and_emoticons[i-3], words_and_emoticons[i-2])

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        for seq in sequences:
            if seq in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[seq]
                break

        if len(words_and_emoticons)-1 > i:
            zeroone = "{0} {1}".format(words_and_emoticons[i], words_and_emoticons[i+1])
            if zeroone in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons)-1 > i+1:
            zeroonetwo = "{0} {1} {2}".format(words_and_emoticons[i], words_and_emoticons[i+1], words_and_emoticons[i+2])
            if zeroonetwo in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT:
            valence = valence+B_DECR
        return valence

    def _never_check(self, valence, words_and_emoticons, start_i, i):
        if start_i == 0:
            if negated([words_and_emoticons[i-1]]):
                    valence = valence*N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i-2] == "never" and\
               (words_and_emoticons[i-1] == "so" or
                words_and_emoticons[i-1] == "this"):
                valence = valence*1.5
            elif negated([words_and_emoticons[i-(start_i+1)]]):
                valence = valence*N_SCALAR
        if start_i == 2:
            if words_and_emoticons[i-3] == "never" and \
               (words_and_emoticons[i-2] == "so" or words_and_emoticons[i-2] == "this") or \
               (words_and_emoticons[i-1] == "so" or words_and_emoticons[i-1] == "this"):
                valence = valence*1.25
            elif negated([words_and_emoticons[i-(start_i+1)]]):
                valence = valence*N_SCALAR
        return valence

    def _punctuation_emphasis(self, sum_s, text):
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier+qm_amplifier
        return punct_emph_amplifier

    def _amplify_ep(self, text):
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count*0.292
        return ep_amplifier

    def _amplify_qm(self, text):
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count*0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier

    def _sift_sentiment_scores(self, sentiments):
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                pos_sum += (float(sentiment_score) +1) # compensates for neutral words that are counted as 1
            if sentiment_score < 0:
                neg_sum += (float(sentiment_score) -1) # when used with math.fabs(), compensates for neutrals
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count

    def score_valence(self, sentiments, text):
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif  sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += (punct_emph_amplifier)
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= (punct_emph_amplifier)

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = \
            {"neg" : round(neg, 3),
             "neu" : round(neu, 3),
             "pos" : round(pos, 3),
             "compound" : round(compound, 4)}

        return sentiment_dict

In [6]:
def negated(input_words, include_nt=True):
    """
    Determine if input contains negation words
    """
    neg_words = []
    neg_words.extend(NEGATE)
    for word in neg_words:
        if word in input_words:
            return True
    if include_nt:
        for word in input_words:
            if "n't" in word:
                return True
    if "least" in input_words:
        i = input_words.index("least")
        if i > 0 and input_words[i-1] != "at":
            return True
    return False


def normalize(score, alpha=15):
    """
    Normalize the score to be between -1 and 1 using an alpha that
    approximates the max expected value
    """
    norm_score = score/math.sqrt((score*score) + alpha)
    if norm_score < -1.0: 
        return -1.0
    elif norm_score > 1.0:
        return 1.0
    else:
        return norm_score


def allcap_differential(words):
    """
    Check whether just some words in the input are ALL CAPS
    :param list words: The words to inspect
    :returns: `True` if some but not all items in `words` are ALL CAPS
    """
    is_different = False
    allcap_words = 0
    for word in words:
        if word.isupper():
            allcap_words += 1
    cap_differential = len(words) - allcap_words
    if cap_differential > 0 and cap_differential < len(words):
        is_different = True
    return is_different


def scalar_inc_dec(word, valence, is_cap_diff):
    """
    Check if the preceding words increase, decrease, or negate/nullify the
    valence
    """
    scalar = 0.0
    word_lower = word.lower()
    if word_lower in BOOSTER_DICT:
        scalar = BOOSTER_DICT[word_lower]
        if valence < 0:
            scalar *= -1
        #check if booster/dampener word is in ALLCAPS (while others aren't)
        if word.isupper() and is_cap_diff:
            if valence > 0:
                scalar += C_INCR
            else: scalar -= C_INCR
    return scalar

In [7]:
REGEX_REMOVE_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))

# (empirically derived mean sentiment intensity rating increase for booster words)
B_INCR = 0.293
B_DECR = -0.293

# (empirically derived mean sentiment intensity rating increase for using
# ALLCAPs to emphasize a word)
C_INCR = 0.733

N_SCALAR = -0.74

# for removing punctuation
REGEX_REMOVE_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))

PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"",
             "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"]
NEGATE = \
["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
 "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
 "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
 "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
 "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
 "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
 "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
 "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]

# booster/dampener 'intensifiers' or 'degree adverbs'
# http://en.wiktionary.org/wiki/Category:English_degree_adverbs

BOOSTER_DICT = \
{"absolutely": B_INCR, "amazingly": B_INCR, "awfully": B_INCR, "completely": B_INCR, "considerably": B_INCR,
 "decidedly": B_INCR, "deeply": B_INCR, "effing": B_INCR, "enormously": B_INCR,
 "entirely": B_INCR, "especially": B_INCR, "exceptionally": B_INCR, "extremely": B_INCR,
 "fabulously": B_INCR, "flipping": B_INCR, "flippin": B_INCR,
 "fricking": B_INCR, "frickin": B_INCR, "frigging": B_INCR, "friggin": B_INCR, "fully": B_INCR, "fucking": B_INCR,
 "greatly": B_INCR, "hella": B_INCR, "highly": B_INCR, "hugely": B_INCR, "incredibly": B_INCR,
 "intensely": B_INCR, "majorly": B_INCR, "more": B_INCR, "most": B_INCR, "particularly": B_INCR,
 "purely": B_INCR, "quite": B_INCR, "really": B_INCR, "remarkably": B_INCR,
 "so": B_INCR, "substantially": B_INCR,
 "thoroughly": B_INCR, "totally": B_INCR, "tremendously": B_INCR,
 "uber": B_INCR, "unbelievably": B_INCR, "unusually": B_INCR, "utterly": B_INCR,
 "very": B_INCR,
 "almost": B_DECR, "barely": B_DECR, "hardly": B_DECR, "just enough": B_DECR,
 "kind of": B_DECR, "kinda": B_DECR, "kindof": B_DECR, "kind-of": B_DECR,
 "less": B_DECR, "little": B_DECR, "marginally": B_DECR, "occasionally": B_DECR, "partly": B_DECR,
 "scarcely": B_DECR, "slightly": B_DECR, "somewhat": B_DECR,
 "sort of": B_DECR, "sorta": B_DECR, "sortof": B_DECR, "sort-of": B_DECR}

# check for special case idioms using a sentiment-laden keyword known to VADER
SPECIAL_CASE_IDIOMS = {"the shit": 3, "the bomb": 3, "bad ass": 1.5, "yeah right": -2,
                       "cut the mustard": 2, "kiss of death": -1.5, "hand to mouth": -2}

class SentiText(object):
    """
    Identify sentiment-relevant string-level properties of input text.
    """
    def __init__(self, text):
        if not isinstance(text, str):
            text = str(text.encode('utf-8'))
        self.text = text
        self.words_and_emoticons = self._words_and_emoticons()
        # doesn't separate words from\
        # adjacent punctuation (keeps emoticons & contractions)
        self.is_cap_diff = allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        """
        no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
        # removes punctuation (but loses emoticons & contractions)
        words_only = no_punc_text.split()
        # remove singletons
        words_only = set( w for w in words_only if len(w) > 1 )
        # the product gives ('cat', ',') and (',', 'cat')
        punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
        punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
        words_punc_dict = punc_before
        words_punc_dict.update(punc_after)
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Removes leading and trailing puncutation
        Leaves contractions and most emoticons
            Does not preserve punc-plus-letter emoticons (e.g. :D)
        """
        wes = self.text.split()
        words_punc_dict = self._words_plus_punc()
        wes = [we for we in wes if len(we) > 1]
        for i, we in enumerate(wes):
            if we in words_punc_dict:
                wes[i] = words_punc_dict[we]
        return wes

In [8]:
analyzer = SentimentIntensityAnalyzer()

In [9]:
reviews = list(df['review_text'])

compound = []
positive =[]
neutral=[]
negative =[]

for r in reviews:
    vs = analyzer.polarity_scores(r)
    compound.append(vs['compound'])
    positive.append(vs['pos'])
    neutral.append(vs['neu'])
    negative.append(vs['neg'])

In [10]:
df['negative'] = pd.Series(negative)
df['neutral'] = pd.Series(neutral)
df['positive'] = pd.Series(positive)
df['compound'] = pd.Series(compound)
df['emotion'] = np.where(df['compound']>0.65, 'positive', 'negative')

# guardamos csv con reviews y sentiment analysis de c/u 
#df.to_csv('reviews_sentiment.csv', sep='|', index=False)

df.head()

Unnamed: 0,reviewer_name,review_date,review_text,story_id,negative,neutral,positive,compound,emotion
0,3522972,11-04-14,Why is this so adorable? I blame the ending. S...,10000529,0.158,0.469,0.373,0.7868,positive
1,3700017,07-02-14,Oh my goodness. Eliot and Clint working togeth...,10000529,0.0,0.638,0.362,0.7783,positive
2,462777,1/30/2014,This was an awesome piece! I like how you bro...,10000529,0.0,0.733,0.267,0.8805,positive
3,7455526,05-abr,wtf,10001611,1.0,0.0,0.0,-0.5859,negative
4,6451578,12/24/2016,I just read this a second time & I still think...,10001611,0.0,1.0,0.0,0.0,negative


## Sentiment analysis stories summary

In [13]:
df_summary = pd.read_csv('datasets_recsys/ff_users_fav_stories_data.csv', sep='|', encoding='latin')
df_summary.head()

Unnamed: 0,user,story_id,plays,franchise_x,story_title,author,franchise_y,story_summary,q_words,date_submit,date_update,chapters,reviews
0,752508,1984041,1,Alias,Trying to Carry On,590553.0,Alias,"He had left her, her and her daughter. She wan...",51285.0,"7/27/2004,9/6/2004,3/19/2005,9/6/2004,7/21/200...",11/24/2005,17.0,267.0
1,752508,2046469,1,Alias,A Second Chance,590553.0,Alias,"SV had an affair and as a result, Sydney becam...",34944.0,"7/27/2004,9/6/2004,3/19/2005,9/6/2004,7/21/200...",7/22/2005,11.0,238.0
2,752508,1991603,1,Alias,A Question of Fate,586795.0,Alias,"SV S3. Angst, Romance, Action, Humor aw, come...",52468.0,"2/3/2005,8/1/2004,10/28/2004",6/16/2005,21.0,122.0
3,752508,2179554,1,Alias,The Ace of Hearts,590553.0,Alias,Sydney and Vaughn play a card game to pass the...,17342.0,"7/27/2004,9/6/2004,3/19/2005,9/6/2004,7/21/200...",5/6/2005,5.0,138.0
4,752508,1879281,1,Alias,The Vaughn\'s,251435.0,Alias,Sequel to 'One wedding and a near funeral.' S...,26402.0,"5/26/2004,11/15/2004,4/21/2004,3/17/2004,12/12...",2/21/2005,20.0,55.0


In [15]:
summary = list(df_summary['story_summary'])

compound = []
positive =[]
neutral=[]
negative =[]

for r in summary:
    try:
        vs = analyzer.polarity_scores(r)
        compound.append(vs['compound'])
        positive.append(vs['pos'])
        neutral.append(vs['neu'])
        negative.append(vs['neg'])
    except:
        continue

In [None]:
df_summary['negative_sum'] = pd.Series(negative)
df_summary['neutral_sum'] = pd.Series(neutral)
df_summary['positive_sum'] = pd.Series(positive)
df_summary['compound_sum'] = pd.Series(compound)
df_summary['emotion_sum'] = np.where(df_summary['compound_sum']>0.65, 'positive', 'negative')

df_summary.head(10)

# reordenamos 
df_summary = df_summary[['story_id', 'author','franchise', 'story_title','story_summary', 'q_words', 'reviews', 'compound_sum', 'emotion_sum']]
df_summary.head()

Ahora hacemos una comparacion de sentiment analysis entre reviews y resumen de historias:

In [14]:
df_total = pd.merge(df, df_summary, on=['story_id'])

print('total stories: {}'.format(len(df_total)))

df_total.head()

total stories: 139754


Unnamed: 0,reviewer_name,review_date,review_text,story_id,negative,neutral,positive,compound,emotion,author,franchise,story_title,story_summary,q_words,reviews,compound_sum,emotion_sum
0,3522972,11-04-14,Why is this so adorable? I blame the ending. S...,10000529,0.158,0.469,0.373,0.7868,positive,4685422.0,Leverage & Avengers,A Little Way Up The Road,Clint asks an old friend for some assistance. ...,1793.0,5.0,0.8689,positive
1,3700017,07-02-14,Oh my goodness. Eliot and Clint working togeth...,10000529,0.0,0.638,0.362,0.7783,positive,4685422.0,Leverage & Avengers,A Little Way Up The Road,Clint asks an old friend for some assistance. ...,1793.0,5.0,0.8689,positive
2,462777,1/30/2014,This was an awesome piece! I like how you bro...,10000529,0.0,0.733,0.267,0.8805,positive,4685422.0,Leverage & Avengers,A Little Way Up The Road,Clint asks an old friend for some assistance. ...,1793.0,5.0,0.8689,positive
3,7455526,05-abr,wtf,10001611,1.0,0.0,0.0,-0.5859,negative,5439324.0,Frozen,Life\'s Too Short,Anna wants to know why Elsa keeps turning down...,8806.0,57.0,0.126,negative
4,6451578,12/24/2016,I just read this a second time & I still think...,10001611,0.0,1.0,0.0,0.0,negative,5439324.0,Frozen,Life\'s Too Short,Anna wants to know why Elsa keeps turning down...,8806.0,57.0,0.126,negative


In [15]:
df_total = df_total[['story_id','review_text','compound','emotion','franchise','story_summary','compound_sum','emotion_sum']]
df_total.head()

Unnamed: 0,story_id,review_text,compound,emotion,franchise,story_summary,compound_sum,emotion_sum
0,10000529,Why is this so adorable? I blame the ending. S...,0.7868,positive,Leverage & Avengers,Clint asks an old friend for some assistance. ...,0.8689,positive
1,10000529,Oh my goodness. Eliot and Clint working togeth...,0.7783,positive,Leverage & Avengers,Clint asks an old friend for some assistance. ...,0.8689,positive
2,10000529,This was an awesome piece! I like how you bro...,0.8805,positive,Leverage & Avengers,Clint asks an old friend for some assistance. ...,0.8689,positive
3,10001611,wtf,-0.5859,negative,Frozen,Anna wants to know why Elsa keeps turning down...,0.126,negative
4,10001611,I just read this a second time & I still think...,0.0,negative,Frozen,Anna wants to know why Elsa keeps turning down...,0.126,negative


In [16]:
count=0
for x in range(0,len(list(df_total['emotion']))):
    if list(df_total['emotion'])[x] == list(df_total['emotion_sum'])[x] :
        count+=1

print('coincidencias sentiment reviews/summary: {}'.format(count))
print('coincidencia porcentual sentiment summary/reviews: {} %'.format((count/len(list(df_total['emotion'])))*100))

coincidencias sentiment reviews/summary: 79590
coincidencia porcentual sentiment summary/reviews: 56.95006940767349 %


## Sentimiento del resumen de las franquicias: 

In [None]:
# sentimiento promedio de franchises summary (sample 50 franquicias aleatorias)
franchise_sentiment = df_summary[['story_id','franchise_x','compound_sum']].sample(30)
franchise_sentiment = franchise_sentiment.groupby(['franchise_x'])['compound_sum'].mean()

franchise_sentiment.plot.bar()

## Sentimiento de las reviews en las franquicias: 

In [2]:
# sentimiento promedio de las reviews franchises (random 50 franquicias)
franchise_sentiment_reviews = df_total[['story_id','franchise','compound']].sample(30)
franchise_sentiment_reviews = franchise_sentiment_reviews.groupby(['franchise'])['compound'].mean()
franchise_sentiment_reviews.plot.bar(figsize=(20, 8))

NameError: name 'df_total' is not defined

In [19]:
# sentimientos de los reviews 
df_total['compound'].describe()

count    139754.000000
mean          0.398409
std           0.473383
min          -1.000000
25%           0.000000
50%           0.540000
75%           0.796400
max           1.000000
Name: compound, dtype: float64

In [20]:
# sentimientos de los resumenes de las historias 
df_total['compound_sum'].describe()

count    139754.000000
mean          0.060523
std           0.566547
min          -0.986400
25%          -0.421500
50%           0.000000
75%           0.571900
max           0.989600
Name: compound_sum, dtype: float64

Los reviewers tienen pensamientos mas positivos que los mismos escritores de las historias en fanfiction...