In [1]:
from collections import OrderedDict, defaultdict, Counter
import pandas as pd
import csv
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Location of the NRC word-level emotion lexicon parsed by create_emotionList.
lexicon = '../resources/lexicons/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt'

# Load the scored tweets, then discard the "Unnamed: 0" column
# (presumably an index column written by an earlier to_csv -- confirm).
data = pd.read_csv("../data/clean/scores.csv")
data = data.drop("Unnamed: 0", axis=1)

In [3]:
def create_emotionList(lexicon, header_rows=46):
    """Parse a tab-separated emotion lexicon into a word -> emotions mapping.

    Every data row is expected to hold three fields: ``word``, ``emotion`` and
    a 0/1 flag. Only rows whose flag equals 1 contribute to the result.

    Parameters
    ----------
    lexicon : str
        Path of the lexicon file to read.
    header_rows : int, optional
        Number of leading rows to skip before the data starts. Defaults to 46,
        the value this notebook has always used (presumably the length of the
        preamble in the v0.92 lexicon file -- confirm when changing versions).

    Returns
    -------
    collections.defaultdict
        Maps each word to the list of emotions flagged 1 for it, in file
        order. Being a ``defaultdict(list)``, indexing an unknown word yields
        an empty list (and stores it).

    Raises
    ------
    ValueError
        If a non-empty data row does not have exactly three fields, or its
        flag is not an integer.
    """
    word_emotions = defaultdict(list)
    with open(lexicon, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        for _ in range(header_rows):
            # The default stops a file shorter than the header from leaking
            # StopIteration out of this function.
            next(reader, None)
        for row in reader:
            if not row:
                # csv yields [] for blank lines; these carry no data.
                continue
            word, emotion, present = row
            if int(present) == 1:
                word_emotions[word].append(emotion)
    return word_emotions

In [4]:
# Tokenizer shared by the scoring cells below.
tt = TweetTokenizer()
# word -> emotions lookup parsed from the lexicon file.
wordList = create_emotionList(lexicon)

In [5]:
def generate_emotion_count(string, tokenizer, word_emotions=None):
    """Return per-emotion frequencies for ``string``, normalised by token count.

    The text is tokenized, each token is lower-cased and looked up in the
    lexicon, and every emotion listed for a matching token is counted once.
    Each count is then divided by the total number of tokens in ``string``.

    Parameters
    ----------
    string : str
        Text to score.
    tokenizer : object
        Anything exposing ``tokenize(text)`` that returns a sequence of string
        tokens. When ``None``, the notebook-level ``tt`` is used.
    word_emotions : mapping, optional
        Maps a lower-cased word to an iterable of emotion names. Defaults to
        the notebook-level ``wordList``.

    Returns
    -------
    collections.Counter
        Emotion name -> float frequency. Emotions that never occur are absent;
        text without tokens or without lexicon hits gives an empty Counter.
    """
    if tokenizer is None:
        tokenizer = tt
    if word_emotions is None:
        word_emotions = wordList

    # Tokenize once: the token list drives both the lookup and the divisor.
    tokens = tokenizer.tokenize(string)

    raw_counts = Counter()
    for token in tokens:
        # .get() instead of indexing: indexing a defaultdict would store an
        # empty list for every unknown token and grow the shared lexicon.
        raw_counts.update(word_emotions.get(token.lower(), ()))

    # float() keeps true division regardless of interpreter version. With no
    # tokens raw_counts is empty, so the division is never evaluated.
    n_tokens = float(len(tokens))
    return Counter({emotion: count / n_tokens for emotion, count in raw_counts.items()})

In [6]:
def add_emotion_score(data):
    """Append one emotion-frequency column per emotion to ``data``.

    Each entry of ``data['text']`` is scored with ``generate_emotion_count``
    using the notebook-level tokenizer ``tt``. The per-tweet results become a
    frame sharing ``data``'s index; emotions missing for a tweet are filled
    with 0. The input frame and the score frame are returned joined
    column-wise, and ``data`` itself is not modified.
    """
    per_tweet_counts = []
    for tweet_text in data['text']:
        per_tweet_counts.append(generate_emotion_count(tweet_text, tt))

    # Building the frame on data.index keeps the rows aligned for the concat.
    scores = pd.DataFrame(per_tweet_counts, index=data.index).fillna(0)
    combined = pd.concat([data, scores], axis=1)
    return combined

In [7]:
# Score every tweet, then preview the first five rows of the augmented frame.
d = add_emotion_score(data)
d.head(5)

Unnamed: 0,id_str,score_keyword,score_textblob_bayes_sentiment,score_textblob_pattern_sentiment,text,score,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,921829947093733376,0.5,0.6,0.964574,"Keep hearing about ""tiny"" amount of money spen...",0.688191,0.03125,0.03125,0.0,0.03125,0.03125,0.09375,0.03125,0.0,0.03125,0.03125
1,925005659569041409,0.5,0.625,0.919302,"Sorry, but this is years ago, before Paul Mana...",0.681434,0.0,0.0,0.0,0.0,0.0,0.0,0.032258,0.0,0.032258,0.0
2,931877599034388480,0.5,0.55,0.950407,Crooked Hillary Clinton is the worst (and bigg...,0.666802,0.0,0.066667,0.0,0.0,0.033333,0.0,0.033333,0.0,0.033333,0.033333
3,926481563214376961,0.5,0.5125,0.932663,"The rigged Dem Primary, one of the biggest pol...",0.648388,0.037037,0.037037,0.037037,0.0,0.0,0.074074,0.037037,0.0,0.0,0.0
4,926456069047582721,1.0,0.401786,0.461401,Bernie Sanders supporters have every right to ...,0.621062,0.047619,0.0,0.047619,0.047619,0.0,0.047619,0.047619,0.047619,0.0,0.0


In [8]:
# Sanity check: (rows, columns) of the frame after the emotion columns were added.
d.shape

(294, 16)

In [9]:
# Export the tweets without the intermediate score/text columns.
# DataFrame.drop returns a new frame and, with the default axis=0, treats the
# labels as row labels, so the column axis is named explicitly and the result
# is captured. `d` itself is left untouched for the cells that follow.
# NOTE(review): the CSV now really omits the dropped columns -- confirm that no
# downstream reader of scores_emotion.csv expects them.
emotion_export = d.drop(
    ['score_keyword', 'score_textblob_bayes_sentiment', 'score_textblob_pattern_sentiment', 'text', 'score'],
    axis=1,
)
emotion_export.to_csv(path_or_buf="../data/clean/scores_emotion.csv")

In [12]:
# Number of distinct tweet ids; dropna=False counts a missing id as one value.
d["id_str"].nunique(dropna=False)

294