In [None]:
! cp lexi

# Notebook for collecting and compiling lexicons for text mining

In [None]:
import pandas as pd
import numpy as np
import requests

## Argument dictionary
*Swapna Somasundaran, Josef Ruppenhofer and Janyce Wiebe (2007) Detecting Arguing and Sentiment in Meetings, SIGdial Workshop on Discourse and Dialogue, Antwerp, Belgium, September 2007 (SIGdial Workshop 2007).*

In [636]:

import os,re
# NOTE(review): absolute, machine-specific path; point it at the local copy of
# the arglex lexicon before running.
path = '/home/snorre/Dropbox/Forskning/PhD/undervisning/arglex_Somasundaran07/arglex_Somasundaran07/'
files = [path+i for i in os.listdir(path) if 'tff' in i]
# Files that only hold "@NAME={...}" macro definitions. They are expanded into
# the other .tff files instead of being compiled as classes of their own.
macros = ['modals.tff','spoken.tff',
'wordclasses.tff',
'pronoun.tff','intensifiers.tff']
# Macro name (the text before "={") -> its alternatives joined with "|".
macro2replace = {}
for macro in macros:
    filename = path+macro
    # Read inside a context manager so the handle is closed deterministically.
    # Every line is kept: blank lines (including the empty string after a
    # trailing newline) are filtered below, so a file without a trailing
    # newline no longer loses its last definition.
    with open(filename,'r') as fh:
        l = fh.read().split('\n')
    # The first line of each file is skipped.
    for i in l[1:]:
        parts = i.split('={')
        name = parts[0]
        # Only "@NAME={...}" lines are macro definitions; anything else
        # (blank lines, lines without "={") is ignored instead of raising.
        if not '@' in name or len(parts) < 2:
            continue
        # NOTE(review): alternatives are split on ',' only; if the files put a
        # space after each comma, that space ends up inside the regex
        # alternative - confirm against the .tff files.
        words = '|'.join(parts[1].strip('}').split(','))
        macro2replace[name] = words
        
# Class name -> compiled, case-insensitive regex that matches any of the
# class's (macro-expanded) expressions.
class2re = {}
# Macros are substituted longest name first, presumably so that a macro whose
# name is a prefix of a longer one cannot clobber it. The ordering does not
# depend on the file or expression, so it is computed once.
macros_by_length = sorted(macro2replace.items(),key=lambda x: len(x[0]),reverse=True)
for filename in files:
    # Macro files only provide substitutions; they define no class.
    if filename.split('/')[-1] in macros:
        continue
    # Context manager closes the handle. All lines are kept; empty ones are
    # dropped after expansion below, so the last expression survives even when
    # the file has no trailing newline.
    with open(filename,'r') as fh:
        l = fh.read().split('\n')
    # The class name is the first double-quoted string on the first line.
    name = l[0].split('"')[1]

    expressions = l[1:]
    expand_exp = []
    for exp in expressions:
        for macro,rep in macros_by_length:
            exp = exp.replace(macro,rep)
        if exp=='':
            continue
        expand_exp.append(exp)
    re_exp = re.compile('|'.join(expand_exp),flags=re.IGNORECASE)
    class2re[name] = re_exp
    
def text2argfeatures(text):
    """Count the matches of every arguing-class pattern in ``text``.

    Returns a dict keyed by the class names in ``class2re``; each value is the
    number of items ``findall`` returns for that class's compiled regex.
    """
    return {label: len(pattern.findall(text)) for label, pattern in class2re.items()}
# `codecs` was used while its import was commented out, which raises NameError
# on a fresh kernel.
import codecs
import pickle

with codecs.open(path+'patterntest','r','utf-8') as fh:
    string_test = fh.read()

# Store the compiled patterns together with the sample text, then read them
# back. Each handle is closed before the next step, so the file is fully
# written before it is reopened.
with open('lexicon_functions/text2arg.pkl','wb') as fh:
    pickle.dump([class2re,string_test],fh)
with open('lexicon_functions/text2arg.pkl','rb') as fh:
    class2re,string_test = pickle.load(fh)
text2argfeatures(string_test)

{'inconsistency': 18,
 'conditionals': 8,
 'contrast': 12,
 'emphasis': 30,
 'causation': 38,
 'wants': 6,
 'difficulty': 11,
 'inyourshoes': 4,
 'rhetoricalquestion': 5,
 'assessments': 24,
 'generalization': 5,
 'structure': 3,
 'necessity': 25,
 'doubt': 4,
 'priority': 8,
 'possibility': 21,
 'authority': 1}

## Subjectivity
http://mpqa.cs.pitt.edu/lexicons/subj_lexicon/


In [435]:
# NOTE(review): absolute, machine-specific path; point it at the local copy of
# the subjectivity clue file before running.
path = '/home/snorre/Dropbox/Forskning/PhD/undervisning/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff'

# Context manager closes the handle (the original left it open).
with open(path,'r') as fh:
    l = fh.read().split('\n')

# One dict per lexicon line. Each line is a run of whitespace-separated
# "key=value" tokens: the first carries the type, the second the number of
# tokens the word spans, then the word itself, then further attributes.
data = []
for val in l:
    fields = val.split()  # split once instead of once per field
    # Blank lines are skipped. This covers the empty string after a trailing
    # newline without dropping the last entry when there is no such newline.
    if not fields:
        continue
    typ = fields[0].split('=')[1]
    length = int(fields[1].split('=')[1])
    word = ' '.join(fields[2:2+length]).split('=')[1]
    vals = fields[2+length:]
    # Keep only well-formed "key=value" tokens (exactly one '=').
    d = dict([i.split('=') for i in vals if len(i.split('='))==2])
    d.update({'w':word,'length':length,'type':typ})
    data.append(d)




In [635]:
from collections import Counter
# The scoring functions use nltk.word_tokenize as their default tokenizer,
# which is evaluated when the `def` runs; nltk was only imported in a later
# cell, so a top-to-bottom run failed with NameError.
import nltk

df = pd.DataFrame(data)
# Distinct values of the "type" column, in order of first appearance.
subjectivity_types = list(df.type.unique())
# word -> subjectivity type
w2subj = dict(df[['w','type']].values)
def get_subjectivity(doc, tokenizer=nltk.word_tokenize,agg='mean'):
    """Score a document against the subjectivity lexicon in ``w2subj``.

    Parameters
    ----------
    doc : str or list of str
        Raw text (tokenized with ``tokenizer``) or an already tokenized
        document. Tokens are lower-cased before lookup.
    tokenizer : callable
        Only used when ``doc`` is a string.
    agg : {'mean', 'abs'} or callable
        'mean' divides each count by the number of tokens in ``doc``; 'abs'
        returns the raw counts; any other value is called with the pandas
        Series of counts and its result is converted with ``dict``.

    Returns
    -------
    dict
        One entry per label in ``subjectivity_types``. Every value is NaN when
        ``doc`` has no tokens, and 0 when no token is found in the lexicon.

    Raises
    ------
    AssertionError
        If ``doc`` is neither a string nor a list.
    """
    if isinstance(doc,str):
        doc = tokenizer(doc)
    assert type(doc)==list,"please input either a list or a string"
    if len(doc)==0:
        # Used to return a bare np.nan, which broke callers that iterate over
        # the result like the dict every other branch returns.
        return {typ:np.nan for typ in subjectivity_types}
    lowered = (w.lower() for w in doc)
    matches = Counter(w2subj[w] for w in lowered if w in w2subj)

    if len(matches)==0:
        return {typ:0 for typ in subjectivity_types}
    scores = pd.Series(np.array([matches[typ] for typ in subjectivity_types]),index=subjectivity_types)
    if agg=='mean':
        scores = scores/len(doc)
    elif agg!='abs':
        # 'abs' keeps the raw counts; anything else is a user-supplied function.
        scores = agg(scores)
    return dict(scores)

# Persist the lookup tables and read them back. The context managers close each
# handle, so the file is completely written before it is reopened.
with open('lexicon_functions/subjectivity_score.pkl','wb') as fh:
    pickle.dump([w2subj,subjectivity_types],fh)
with open('lexicon_functions/subjectivity_score.pkl','rb') as fh:
    w2subj,subjectivity_types = pickle.load(fh)
get_subjectivity('absolutely, i once kissed a girl and i liked it')

{'weaksubj': 0.0, 'strongsubj': 0.09090909090909091}

# Vader Sentiment

*Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.*

In [211]:
import nltk.sentiment
vader = nltk.sentiment.vader.SentimentIntensityAnalyzer()

In [220]:
vader.polarity_scores('Hello everybody. Nothing to see here.')

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

# NRC

**Not for commercial use**
http://sentiment.nrc.ca/lexicons-for-research/

In [306]:
#! wget http://sentiment.nrc.ca/lexicons-for-research/NRC-Sentiment-Emotion-Lexicons.zip
#! unzip NRC-Sentiment-Emotion-Lexicons.zip

In [304]:
#! ls NRC-Sentiment-Emotion-Lexicons/

In [253]:
path = 'NRC-Sentiment-Emotion-Lexicons/'

# Sub-folders whose name mentions "NRC", leaving out any with "Colour" in it.
dirs = []
for entry in os.listdir(path):
    if 'NRC' in entry and 'Colour' not in entry:
        dirs.append(path+entry+'/')

# Every entry of those folders except readmes, PDFs and anything with
# "ForVariousLanguages" in its name.
files = []
for directory in dirs:
    for fname in os.listdir(directory):
        if 'readme' in fname.lower() or '.pdf' in fname or 'ForVariousLanguages' in fname:
            continue
        files.append(directory+fname)

In [307]:
#! wget http://sentiment.nrc.ca/lexicons-for-research/NRC-VAD-Lexicon.zip
#! wget http://sentiment.nrc.ca/lexicons-for-research/NRC-Affect-Intensity-Lexicon.zip
#! unzip NRC-Affect-Intensity-Lexicon.zip
#! unzip NRC-VAD-Lexicon.zip

In [308]:
# Load every current, tab-separated .txt lexicon into its own DataFrame.
dfs = []
for filename in files:
    if 'Older' in filename or not '.txt' in filename:
        continue
    try:
        df = pd.read_csv(filename,sep='\t')
    except Exception as err:
        # Report the failing file and skip it. The bare `except` used to fall
        # through to the append below, which either added the previous file's
        # frame a second time or raised NameError on the first file.
        print(filename,'could not be read:',err)
        continue
    print(filename,df.columns)
    dfs.append(df)


NRC-Sentiment-Emotion-Lexicons/NRC-Affect-Intensity-Lexicon/NRC-AffectIntensity-Lexicon.txt Index(['term', 'score', 'AffectDimension'], dtype='object')
NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Senselevel-v0.92.txt Index(['gut--opening, fistula, tubule', 'fear', '0'], dtype='object')
NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt Index(['aback', 'anger', '0'], dtype='object')
NRC-Sentiment-Emotion-Lexicons/NRC-VAD-Lexicon/NRC-VAD-Lexicon.txt Index(['Word', 'Valence', 'Arousal', 'Dominance'], dtype='object')


In [311]:
# Select the lexicons by a column only they have rather than by position:
# `dfs` follows the order of os.listdir, which is arbitrary, so dfs[0] and
# dfs[-1] are not guaranteed to be these two files on another machine.
AIL_df = next(d for d in dfs if 'AffectDimension' in d.columns)
VAD_df = next(d for d in dfs if 'Valence' in d.columns)

In [637]:
import numpy as np
# word -> its valence/arousal/dominance scores, one entry per row of VAD_df.
w2scores = {
    word: {'valence':valence,'arousal':arousal,'dominance':dominance}
    for word,valence,arousal,dominance in VAD_df.values
}
def get_vad_score(doc,tokenizer=nltk.word_tokenize,agg='mean'):
    """Aggregate the valence/arousal/dominance scores of a document's words.

    Parameters
    ----------
    doc : str or list of str
        Raw text (tokenized with ``tokenizer``) or a list of tokens. Tokens
        are lower-cased before being looked up in ``w2scores``.
    tokenizer : callable
        Only used when ``doc`` is a string.
    agg : {'mean', 'max'} or callable
        Column-wise mean or max over the matched words; any other value must
        be a function, which is called with the DataFrame of matched scores
        (one row per matched token).

    Returns
    -------
    dict
        Keys 'arousal', 'dominance' and 'valence'. All values are NaN when no
        token is found in the lexicon.

    Raises
    ------
    AssertionError
        If ``doc`` is neither a string nor a list, or if ``agg`` is neither
        'mean', 'max' nor callable.
    """
    if isinstance(doc,str):
        doc = tokenizer(doc)
    assert type(doc)==list,"please input either a list or a string"

    lowered = (w.lower() for w in doc)
    matches = [w2scores[w] for w in lowered if w in w2scores]
    if len(matches)==0:
        return {'arousal':np.nan,'dominance':np.nan,'valence':np.nan}
    scores = pd.DataFrame(matches)
    if agg=='mean':
        scores = scores.mean()
    elif agg=='max':
        scores = scores.max()
    else:
        # Same check as get_conglomerate_scores: an unsupported string now
        # fails with a clear message instead of "'str' object is not callable".
        assert hasattr(agg,'__call__'),'"agg" should be a function if not "mean" or "max"'
        scores = agg(scores)
    return dict(scores)
# Persist the lookup table and read it back. The context managers close each
# handle, so the file is completely written before it is reopened.
with open('lexicon_functions/vad_score.pkl','wb') as fh:
    pickle.dump(w2scores,fh)
with open('lexicon_functions/vad_score.pkl','rb') as fh:
    w2scores = pickle.load(fh)
get_vad_score('and I love you! hate you',agg='mean')



{'arousal': 0.6605000000000001, 'dominance': 0.5515, 'valence': 0.5155}

In [344]:
#AIL_df.columns
AIL_df[AIL_df.term=='feeling']#.value_counts()

Unnamed: 0,term,score,AffectDimension
1417,feeling,0.147,anger
2838,feeling,0.328,fear
4458,feeling,0.172,sadness
5440,feeling,0.359,joy


In [638]:

# One single-score record per lexicon row: {<affect dimension>: score, 'w': term}.
w2affect = [{dim:score,'w':w} for w,score,dim in AIL_df.values]
# Summing per word merges the records into one row per word, with one column
# per affect dimension.
# NOTE(review): a dimension a word has no entry for presumably comes out of the
# sum as 0 rather than NaN - confirm that this is intended.
df = pd.DataFrame(w2affect).groupby('w').sum().reset_index()
# word -> its four affect intensity scores
w2affects = {
    w: {'anger':anger,'joy':joy,'sadness':sadness,'fear':fear}
    for w,anger,fear,joy,sadness in df[['w','anger','fear','joy','sadness']].values
}

def get_affect_intensity_score(doc,tokenizer=nltk.word_tokenize,agg='mean'):
    """Aggregate the affect intensity scores of a document's words.

    Parameters
    ----------
    doc : str or list of str
        Raw text (tokenized with ``tokenizer``) or a list of tokens. Tokens
        are lower-cased before being looked up in ``w2affects``.
    tokenizer : callable
        Only used when ``doc`` is a string.
    agg : {'mean', 'max'} or callable
        Column-wise mean or max over the matched words; any other value must
        be a function, which is called with the DataFrame of matched scores
        (one row per matched token).

    Returns
    -------
    dict
        Keys 'anger', 'joy', 'sadness' and 'fear'. All values are NaN when no
        token is found in the lexicon.

    Raises
    ------
    AssertionError
        If ``doc`` is neither a string nor a list, or if ``agg`` is neither
        'mean', 'max' nor callable.
    """
    if isinstance(doc,str):
        doc = tokenizer(doc)
    assert type(doc)==list,"please input either a list or a string"
    lowered = (w.lower() for w in doc)
    matches = [w2affects[w] for w in lowered if w in w2affects]
    if len(matches)==0:
        return {'anger':np.nan,'joy':np.nan,'sadness':np.nan,'fear':np.nan}
    scores = pd.DataFrame(matches)
    if agg=='mean':
        scores = scores.mean()
    elif agg=='max':
        scores = scores.max()
    else:
        # Same check as get_conglomerate_scores: an unsupported string now
        # fails with a clear message instead of "'str' object is not callable".
        assert hasattr(agg,'__call__'),'"agg" should be a function if not "mean" or "max"'
        scores = agg(scores)
    return dict(scores)
# Persist the lookup table and read it back. The context managers close each
# handle, so the file is completely written before it is reopened.
with open('lexicon_functions/ail_score.pkl','wb') as fh:
    pickle.dump(w2affects,fh)
with open('lexicon_functions/ail_score.pkl','rb') as fh:
    w2affects = pickle.load(fh)
get_affect_intensity_score('hello I love you and you suck so much balls. I hate you. ')



{'anger': 0.414, 'fear': 0.242, 'joy': 0.414, 'sadness': 0.32799999999999996}

## Afinn
Finn Nielsen (DTU)

http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010

In [None]:
! pip install afinn

In [580]:
from afinn import Afinn
afinn = Afinn(emoticons=True)
def get_afinn(text):
    """Return the AFINN score of ``text`` as ``{'afinn': score}``.

    A list is treated as tokens and joined with single spaces before scoring;
    any other input is passed to the scorer unchanged.
    """
    joined = ' '.join(text) if type(text)==list else text
    return {'afinn':afinn.score(joined)}
#get_afinn('hello I love you so much')
get_afinn(['hello','I','love','you','so','much'])

{'afinn': 3.0}

## Conglomerate
- Bing, Liu Opinion
- MPQA subjectivity
- Harvard General Inquirer
- NRC Emotion
				


In [586]:
df = pd.read_csv('https://raw.githubusercontent.com/beefoo/text-analysis/master/lexicons/lexicons_compiled.csv')
# One indicator column per category value of the four annotation columns.
df_dummy = pd.get_dummies(df[['emotion','orientation','sentiment','subjectivity']])
# Row labels ordered by how many string-valued cells each row has, most first.
is_string = df.apply(lambda row: row.apply(lambda cell: type(cell)==str),axis=1)
idx = is_string.sum(axis=1).sort_values(ascending=False).index
# word -> its row of indicator values, in the column order of df_dummy
w2conglomerate = dict(zip(df.word,df_dummy.values))
conglomerate_cols = df_dummy.columns



def get_conglomerate_scores(doc,tokenizer=nltk.word_tokenize,agg='mean'):
    """Aggregate the compiled-lexicon indicator values of a document's words.

    ``doc`` is either raw text (tokenized with ``tokenizer``) or a list of
    tokens; tokens are lower-cased before being looked up in
    ``w2conglomerate``. ``agg`` is 'mean', 'max' or a function applied to the
    DataFrame of matched rows. The result is a dict keyed by
    ``conglomerate_cols``, with NaN for every key when no token matches.
    """
    tokens = tokenizer(doc) if type(doc)==str else doc
    assert type(tokens)==list,"please input either a list or a string"

    matches = []
    for token in tokens:
        key = token.lower()
        if key in w2conglomerate:
            matches.append(dict(zip(conglomerate_cols,w2conglomerate[key])))
    if not matches:
        return dict.fromkeys(conglomerate_cols,np.nan)

    frame = pd.DataFrame(matches)
    if agg=='mean':
        return dict(frame.mean())
    if agg=='max':
        return dict(frame.max())
    assert hasattr(agg,'__call__'),'"agg" should be a function if not "mean" or "max"'
    return dict(agg(frame))

In [639]:
# The pickle holds exactly two objects: the lookup table and its column index.
# (A commented-out load that unpacked three values was removed; it did not
# match what is written here.)
with open('lexicon_functions/conglomerate.pkl','wb') as fh:
    pickle.dump([w2conglomerate,conglomerate_cols],fh)

In [593]:
# Read the lookup table back; the context manager closes the handle.
with open('lexicon_functions/conglomerate.pkl','rb') as fh:
    w2conglomerate,conglomerate_cols = pickle.load(fh)
get_conglomerate_scores('Do you like drugs? uugs. Do you like druuggs? I do. So what')

{'emotion_anger': 0.0,
 'emotion_anticipation': 0.0,
 'emotion_disgust': 0.0,
 'emotion_fear': 0.0,
 'emotion_joy': 0.0,
 'emotion_sadness': 0.0,
 'emotion_surprise': 0.0,
 'emotion_trust': 0.0,
 'orientation_active': 0.5,
 'orientation_passive': 0.3333333333333333,
 'sentiment_negative': 0.0,
 'sentiment_positive': 0.3333333333333333,
 'subjectivity_strong': 0.3333333333333333,
 'subjectivity_weak': 0.16666666666666666}

# Positive and negative Liu
https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html

In [644]:
from nltk.corpus import opinion_lexicon
positive_w = set(opinion_lexicon.positive())
negative_w = set(opinion_lexicon.negative())


def get_pos_neg_liu(doc,tokenizer=nltk.word_tokenize,agg='sum'):
    """Count the positive and negative opinion words in a document.

    Parameters
    ----------
    doc : str or list of str
        Raw text (tokenized with ``tokenizer``) or a list of tokens.
    tokenizer : callable
        Only used when ``doc`` is a string.
    agg : {'sum', 'mean'}
        'sum' returns the raw counts; 'mean' divides them by the number of
        tokens in ``doc``.

    Returns
    -------
    dict
        Keys 'positive_count' and 'negative_count'. Both are NaN when ``doc``
        has no tokens.

    Raises
    ------
    AssertionError
        If ``doc`` is neither a string nor a list.
    ValueError
        If ``agg`` is not 'sum' or 'mean' (this used to return None silently).
    """
    if isinstance(doc,str):
        doc = tokenizer(doc)
    assert type(doc)==list, 'input has to be either string or list'
    if len(doc)==0:
        return {'positive_count':np.nan,'negative_count':np.nan}
    # Lower-case for both input kinds. Previously only string input was
    # lower-cased, so the same text gave different counts when passed as
    # pre-tokenized list.
    doc = [i.lower() for i in doc]

    d = {'positive_count':count_words(doc,positive_w),
        'negative_count':count_words(doc,negative_w)}
    if agg=='sum':
        return d
    elif agg=='mean':
        return {key:val/len(doc) for key,val in d.items()}
    raise ValueError('"agg" should be "sum" or "mean"')
    
def count_words(doc,s):
    """Add up, for every element of ``s``, how often it occurs in ``doc``."""
    tally = Counter(doc)
    total = 0
    for word in s:
        total += tally[word]
    return total
get_pos_neg_liu('you love hating me right?')    

{'positive_count': 2, 'negative_count': 1}

## Hedonometer
https://hedonometer.org/index.html

https://raw.githubusercontent.com/andyreagan/hedonometer/master/hedonometer/static/hedonometer/labMT1.txt

In [642]:
# Column names for the headerless, tab-separated labMT file.
labmt_columns = ['w','rank','score','std','twitter','googlebooks','newyorktimes','lyrics']
df = pd.read_csv(
    'https://raw.githubusercontent.com/andyreagan/hedonometer/master/hedonometer/static/hedonometer/labMT1.txt',
    sep='\t',
    header=None,
    names=labmt_columns,
)
# Only the word, its score and the score's standard deviation are kept.
df = df[['w','score','std']]


In [643]:
w2happy = dict(df[['w','score']].values)
def get_happiness(doc,tokenizer=nltk.word_tokenize,agg='sum'):
    """Aggregate the labMT happiness scores of a document's words.

    Parameters
    ----------
    doc : str or list of str
        Raw text (tokenized with ``tokenizer``) or a list of tokens. Tokens
        are looked up in ``w2happy`` exactly as given (no lower-casing).
    tokenizer : callable
        Only used when ``doc`` is a string.
    agg : {'sum', 'mean'} or callable
        How the scores of the matched words are combined. Any other value must
        be a function, which is called with the list of matched scores.

    Returns
    -------
    dict
        ``{'happiness': score}``; the score is NaN when no token is found in
        the lexicon.

    Raises
    ------
    AssertionError
        If ``doc`` is neither a string nor a list, or if ``agg`` is neither
        'sum', 'mean' nor callable.
    """
    if isinstance(doc,str):
        doc = tokenizer(doc)
    assert type(doc)==list, 'please input string or list'

    scores = [w2happy[w] for w in doc if w in w2happy]
    if len(scores)==0:
        # Explicit NaN instead of np.mean([]) and its RuntimeWarning.
        return {'happiness':np.nan}
    # `agg` used to be ignored: the mean was always returned even though the
    # declared default is 'sum'.
    if agg=='sum':
        score = np.sum(scores)
    elif agg=='mean':
        score = np.mean(scores)
    else:
        assert hasattr(agg,'__call__'),'"agg" should be a function if not "sum" or "mean"'
        score = agg(scores)
    return {'happiness':score}
# Persist the lookup table and read it back. The context managers close each
# handle, so the file is completely written before it is reopened.
with open('lexicon_functions/happiness.pkl','wb') as fh:
    pickle.dump(w2happy,fh)
with open('lexicon_functions/happiness.pkl','rb') as fh:
    w2happy = pickle.load(fh)
get_happiness('i love laughing')

{'happiness': 22.54}

In [604]:
df.shape

(10221, 8)

## Wrap them all in one big function

In [None]:
def _load_pickle(filename):
    """Return the object stored in the pickle file ``filename``."""
    with open(filename,'rb') as fh:
        return pickle.load(fh)

# The unpacking targets mirror what the pickle.dump calls in this notebook
# write. The pickles contain lookup data only, not the scoring functions; the
# previous targets either raised ValueError or overwrote functions such as
# get_subjectivity with data.
w2subj,subjectivity_types = _load_pickle('lexicon_functions/subjectivity_score.pkl')
class2re,string_test = _load_pickle('lexicon_functions/text2arg.pkl')
w2scores = _load_pickle('lexicon_functions/vad_score.pkl')
w2affects = _load_pickle('lexicon_functions/ail_score.pkl')
w2conglomerate,conglomerate_cols = _load_pickle('lexicon_functions/conglomerate.pkl')
w2happy = _load_pickle('lexicon_functions/happiness.pkl')

In [631]:
# Scorers that are called with the tokenized document.
name2func = dict(
    liu=get_pos_neg_liu,
    conglomerate=get_conglomerate_scores,
    affect_intensity=get_affect_intensity_score,
    vad=get_vad_score,
    subjectivity=get_subjectivity,
    hedometer=get_happiness,
)
# Scorers that are called with the raw text.
textbased_funcs = dict(
    vader=vader.polarity_scores,
    afinn=get_afinn,
    argumentation=text2argfeatures,
)

def lexical_mining(text,tokenizer = nltk.word_tokenize,agg = {}):
    """Run every lexicon scorer on one document and collect the results.

    Parameters
    ----------
    text : str or list of str
        Raw text or an already tokenized document. A list is joined with
        single spaces for the scorers in ``textbased_funcs``.
    tokenizer : callable
        Used to tokenize ``text`` when it is a string.
    agg : dict
        Optional mapping from a key of ``name2func`` to the ``agg`` value
        forwarded to that scorer; scorers not listed use their own default.
        The dict is only read, never modified.

    Returns
    -------
    pandas.Series or float
        Series indexed by "<scorer>_<feature>", or ``np.nan`` when ``text`` is
        a float NaN (a missing value).
    """
    # `type(text)==np.nan` compared a class with a float and was never true.
    if isinstance(text,float) and np.isnan(text):
        return np.nan
    if isinstance(text,str):
        doc = tokenizer(text)
    else:
        # Token input used to leave `doc` undefined (NameError further down).
        doc = list(text)
        text = ' '.join(doc)
    d = {}
    for name,func in textbased_funcs.items():
        for key,val in func(text).items():
            d['%s_%s'%(name,key)] = val
    for name,func in name2func.items():
        kwargs = {'agg':agg[name]} if name in agg else {}
        for key,val in func(doc,**kwargs).items():
            d['%s_%s'%(name,key)] = val
    return pd.Series(d)
        
lexical_mining(string_test)

vader_neg                               0.102000
vader_neu                               0.783000
vader_pos                               0.115000
vader_compound                          0.985500
afinn_afinn                            65.000000
argumentation_inconsistency            18.000000
argumentation_conditionals              8.000000
argumentation_contrast                 12.000000
argumentation_emphasis                 30.000000
argumentation_causation                38.000000
argumentation_wants                     6.000000
argumentation_difficulty               11.000000
argumentation_inyourshoes               4.000000
argumentation_rhetoricalquestion        5.000000
argumentation_assessments              24.000000
argumentation_generalization            5.000000
argumentation_structure                 3.000000
argumentation_necessity                25.000000
argumentation_doubt                     4.000000
argumentation_priority                  8.000000
argumentation_possib

## Sentiwordnet

In [460]:
from nltk.corpus import sentiwordnet as swn

In [462]:
list(swn.senti_synsets('slow'))

[SentiSynset('decelerate.v.01'),
 SentiSynset('slow.v.02'),
 SentiSynset('slow.v.03'),
 SentiSynset('slow.a.01'),
 SentiSynset('slow.a.02'),
 SentiSynset('dense.s.04'),
 SentiSynset('slow.a.04'),
 SentiSynset('boring.s.01'),
 SentiSynset('dull.s.08'),
 SentiSynset('slowly.r.01'),
 SentiSynset('behind.r.03')]

In [464]:
happy = swn.senti_synsets('happy', 'a')
list(happy)

[SentiSynset('happy.a.01'),
 SentiSynset('felicitous.s.02'),
 SentiSynset('glad.s.02'),
 SentiSynset('happy.s.04')]