In [1]:
from collections import OrderedDict, defaultdict, Counter
import pandas as pd
import csv
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Location of the NRC word-level emotion lexicon (parsed as tab-separated text).
lexicon = '../data/lexicons/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt'

# Load the scored texts and discard the "Unnamed: 0" column
# (presumably a row index saved by an earlier export -- TODO confirm).
data = pd.read_csv("../data/clean/scores.csv").drop(columns="Unnamed: 0")

In [3]:
def create_emotionList(lexicon, header_rows=46):
    """Load a word-level emotion lexicon into a word -> emotions mapping.

    The file is parsed as tab-separated ``word<TAB>emotion<TAB>flag`` rows.
    An emotion is recorded for a word only when the flag parses to the
    integer 1.

    Parameters
    ----------
    lexicon : str
        Path to the tab-separated lexicon file.
    header_rows : int, optional
        Number of leading rows to discard before the data rows start.
        Defaults to 46, the number of rows this function previously always
        skipped.

    Returns
    -------
    collections.defaultdict
        Maps each word to the list of emotions flagged for it, in file order.
        It is a ``defaultdict(list)``: indexing an unknown word yields an
        empty list (and inserts that key).

    Raises
    ------
    ValueError
        If the flag column of a three-field row is not an integer.
    """
    wordList = defaultdict(list)
    # newline='' is what the csv module asks for when it is given a file object.
    with open(lexicon, 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        # Discard the preamble; stop quietly if the file is shorter than that
        # instead of letting StopIteration escape.
        for _ in range(header_rows):
            if next(reader, None) is None:
                break
        for row in reader:
            # Blank or malformed lines cannot be unpacked into three fields.
            if len(row) != 3:
                continue
            word, emotion, present = row
            if int(present) == 1:
                wordList[word].append(emotion)
    return wordList

In [4]:
# Tokenizer instance used by the scoring code.
tt = TweetTokenizer()
# word -> list of emotions, built from the lexicon file.
wordList = create_emotionList(lexicon)

In [5]:
def generate_emotion_count(string, tokenizer, word_emotions=None):
    """Score a text by the per-token frequency of each emotion.

    Every token is lower-cased and looked up in the lexicon; each emotion
    listed for it adds one to that emotion's tally. The tallies are then
    divided by the total number of tokens in the text.

    Parameters
    ----------
    string : str
        The text to score.
    tokenizer : object
        Any object with a ``tokenize(text)`` method returning a sequence of
        string tokens.
    word_emotions : mapping, optional
        Maps a lower-case word to an iterable of emotion names. When omitted,
        the notebook-level ``wordList`` is used.

    Returns
    -------
    collections.Counter
        Emotion name -> (occurrences / number of tokens). Emotions that never
        occur are absent; a text with no matching tokens gives an empty
        Counter.
    """
    # Fall back to the notebook-level lexicon when none is supplied.
    lookup = wordList if word_emotions is None else word_emotions
    # Tokenize once and reuse the result for both counting and normalizing.
    tokens = tokenizer.tokenize(string)
    emoCount = Counter()
    for token in tokens:
        # .get() keeps unseen tokens from being inserted into a defaultdict
        # lexicon as empty entries.
        emoCount.update(lookup.get(token.lower(), ()))
    total = len(tokens)
    # Only reached when at least one token matched, so total is never zero here.
    for emotion in emoCount:
        emoCount[emotion] = emoCount[emotion] * 1.0 / total
    return emoCount

In [6]:
def add_emotion_score(data):
    """Return ``data`` with one extra column per emotion found in its texts.

    Each entry of ``data['text']`` is scored with ``generate_emotion_count``
    using the notebook-level tokenizer ``tt``. The per-text scores become the
    rows of a frame aligned on ``data.index``; emotions absent from a text are
    filled with 0. The input frame is not modified.
    """
    per_text = []
    for text in data['text']:
        per_text.append(generate_emotion_count(text, tt))

    # Missing emotions come out as NaN when the frame is built; zero them.
    scores = pd.DataFrame(per_text, index=data.index).fillna(0)
    return pd.concat([data, scores], axis=1)

In [7]:
# Attach the emotion columns to the loaded frame, then preview the first rows.
d = add_emotion_score(data)
d.head(n=5)

Unnamed: 0,id_str,score,text,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,927851452185407490,-1.0,....and has been horrible on Virginia economy....,0.153846,0.076923,0.076923,0.076923,0.076923,0.153846,0.076923,0.076923,0.076923,0.153846
1,925806343855984647,-0.9,We mourn the horrifying terrorist attack in NY...,0.083333,0.0,0.083333,0.125,0.0,0.208333,0.0,0.166667,0.041667,0.0
2,924639422066384896,-0.9,"...""collusion,"" which doesn't exist. The Dems ...",0.1875,0.0,0.15625,0.15625,0.0,0.15625,0.0,0.125,0.0,0.0
3,922798321739161600,-0.7,"Bob Corker, who helped President O give us the...",0.071429,0.035714,0.035714,0.035714,0.035714,0.107143,0.071429,0.071429,0.035714,0.071429
4,927644826006425601,-0.5,"The state of Virginia economy, under Democrat ...",0.074074,0.074074,0.037037,0.074074,0.037037,0.074074,0.037037,0.074074,0.037037,0.111111


In [8]:
# Persist the augmented frame; the row index is written as well (pandas default).
d.to_csv("../data/clean/emoscores.csv")