In [1]:
from nltk.tokenize import TweetTokenizer
import pandas as pd

In [2]:
# Directory that groups all of the NRC lexicon folders listed below.
big_folder = "NRC-Sentiment-Emotion-Lexicons"

# Lexicon name -> location of its data file, relative to `big_folder`.
lexipaths = dict([
    ("VAD", "NRC-VAD-Lexicon/NRC-VAD-Lexicon.txt"),
    ("emotion", "NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"),
    ("affect intensity", "NRC-Affect-Intensity-Lexicon/NRC-AffectIntensity-Lexicon.txt"),
])

In [20]:
class Emo:
    """Score sentences against the NRC VAD, emotion and affect-intensity lexicons.

    Every lexicon named in ``lexicon_paths`` is read once, at construction
    time, into a pandas DataFrame stored in ``self.lexicons`` under the same
    key. ``classify_sentence`` then averages the per-word scores of a sentence.
    """

    # Index columns for the lexicons that hold one row per (word, dimension)
    # pair; any other lexicon is indexed by its "Word" column alone.
    _INDEX_COLUMNS = {
        "emotion": ["Word", "Sense"],
        "affect intensity": ["Word", "AffectDimension"],
    }

    # Dimensions reported for each supported database, in output order.
    _DIMENSIONS = {
        "emotion": ("anger", "anticipation", "disgust", "fear",
                    "joy", "sadness", "surprise", "trust"),
        "VAD": ("Valence", "Arousal", "Dominance"),
        "affect intensity": ("anger", "fear", "joy", "sadness"),
    }

    def __init__(self,lexicon_paths:dict=lexipaths,overfolder:str=big_folder):
        """Load the lexicon files and create the tokenizer.

        Parameters
        ----------
        lexicon_paths : dict
            Maps a lexicon name to the path of its tab-separated file,
            relative to ``overfolder``.
        overfolder : str
            Folder containing the lexicons; it is looked up one level above
            the current working directory (``../<overfolder>/<path>``).

        Raises
        ------
        OSError
            If a lexicon file cannot be opened.
        """
        self.tknzr = TweetTokenizer()
        self.lexicons = {}
        for lex, relative_path in lexicon_paths.items():
            filename = "../{}/{}".format(overfolder, relative_path)
            index_col = self._INDEX_COLUMNS.get(lex, "Word")
            with open(filename, "r", encoding="utf-8") as f:
                self.lexicons[lex] = pd.read_csv(f, sep="\t", index_col=index_col)

    def classify_sentence(self, sentence, database:str, tokenize:bool=True):
        """Return the average lexicon score of a sentence for each dimension.

        Parameters
        ----------
        sentence : str or list of str
            Text to score. A list (or any other non-string iterable) is taken
            to be already split into words and is used as is.
        database : str
            One of ``"VAD"``, ``"emotion"`` or ``"affect intensity"``.
        tokenize : bool, default True
            For a string ``sentence``: split it with the TweetTokenizer when
            true, on whitespace when false. Ignored for non-string input.

        Returns
        -------
        dict or None
            Maps each dimension of ``database`` to the sum of the scores of the
            words found in the lexicon divided by the number of such words.
            ``None`` (after printing a message) when no word was found.

        Raises
        ------
        ValueError
            If ``database`` is not one of the three supported names.
        KeyError
            If ``database`` is supported but was not loaded by ``__init__``.
        """
        try:
            dimensions = self._DIMENSIONS[database]
        except (KeyError, TypeError):
            # TypeError covers unhashable arguments, so that any invalid
            # ``database`` is reported with the same ValueError.
            raise ValueError("database must be one of VAD, emotion, or affect intensity") from None
        values = dict.fromkeys(dimensions, 0)
        df = self.lexicons[database]

        if isinstance(sentence, str):
            bag_of_words = self.tknzr.tokenize(sentence) if tokenize else sentence.split()
        else:
            # Pre-split input: the tokenizer only accepts strings, so a list
            # of words must bypass it even when ``tokenize`` is left at True.
            bag_of_words = sentence

        words_factored = 0
        for word in bag_of_words:
            word = word.lower()
            try:
                # One lookup per word: a row for VAD, or the word's
                # (dimension -> Score) sub-frame for the two-level lexicons.
                entry = df.loc[word]
            except KeyError:
                continue  # word is not in this lexicon
            words_factored += 1
            for key in values:
                if database == "VAD":
                    values[key] += entry[key]
                elif database == "emotion":
                    values[key] += entry.loc[key, "Score"]
                else:
                    # Affect-intensity words may lack a row for a given
                    # dimension; such a dimension adds nothing to the sum.
                    try:
                        score = entry.loc[key, "Score"]
                    except KeyError:
                        continue
                    values[key] += score

        if words_factored == 0:
            print("No words from this sentence found in lexicon.")
            return None
        return {key: total / words_factored for key, total in values.items()}
        

In [25]:
# Build a classifier from the default lexicon locations, then score a short
# sentence on the VAD scale; the bare last expression is the cell's output.
emojudge = Emo()
emojudge.classify_sentence("Damn it!", database="VAD")

{'Valence': 0.073, 'Arousal': 0.784, 'Dominance': 0.431}