In [1]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

# Features for text sentiment classification

In [2]:
D = pd.read_csv('data/yelp_example_1_small.tsv', sep='\t')

In [3]:
D.head()

Unnamed: 0,content,score,business,avgstars
0,This place is WAAAY over priced for the generi...,1,Lee's Buffet,2.0
1,Our taxi driver had told us to go to this plac...,5,Village Pub and Cafe,3.5
2,Not worth the $20! I'm a Las Vegas buffet conn...,2,Golden Nugget Buffet,2.5
3,Great All-American cuisine with hearty helping...,5,Black Bear Diner,4.0
4,The bacon burger is a MUST! One of the most de...,5,Bacon Bar,3.5


## Terms and term frequencies

In [4]:
import spacy
from collections import defaultdict

In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
I = defaultdict(lambda: defaultdict(lambda: 0))

In [7]:
records = []
rows = tqdm_notebook(list(D.iterrows()))
for i, row in rows:
    doc = nlp(row.content)
    for s, sent in enumerate(doc.sents):
        for t, token in enumerate(sent):
            record = {'doc': i, 'sentence': s, 'position': t}
            record['token'] = token.text
            record['lower'] = token.text.lower()
            record['lemma'] = token.lemma_
            record['pos'] = token.pos_
            record['alpha'] = token.is_alpha
            record['stop'] = token.is_stop
            record['doc_size'] = len(doc)
            record['sentence_size'] = len(sent)
            records.append(record)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




## Store index

In [10]:
import pymongo

In [14]:
db = pymongo.MongoClient()['textsent']
yelp = db['yelp_simple']

In [10]:
yelp.insert_many(records)

<pymongo.results.InsertManyResult at 0x7fc2a97b50f0>

In [15]:
m = {'$match': {'pos': {'$in': ['NOUN', 'ADJ']}}}
p = {'$project': {'_id': 0, 'doc': 1, 'sentence': 1, 'position': 1, 'lemma': 1}}
s = {'$sort': {'doc': 1, 'sentence': 1}}
g = {'$group': {'_id': '$doc', 'tokens': {'$push': '$lemma'}}}

In [16]:
for record in yelp.aggregate([m, p, s, g], allowDiskUse=True):
    print(record)

{'_id': 3599, 'tokens': ['spot', 'breakfast', 'last', 'weekend', 'awesome', 'french', 'toast', 'amazing', 'bacon']}
{'_id': 4013, 'tokens': ['food', 'good', 'chicken', 'taco', 'fountain', 'pico', 'great', 'salsa', 'coconut', 'margarita', 'good', 'husband', 'drink', 'tic', 'tonight', 'deal', 'service', 'great']}
{'_id': 108, 'tokens': ['gross', 'summer', 'uncomfortable', 'thing', 'rubbery', 'fish', 'gross', 'fresh', 'sure', 'place', 'sick', 'service', 'poor', 'one', 'friendly', 'thing', 'great', 'food', 'case']}
{'_id': 3019, 'tokens': ['delicious', 'wait', 'staff', 'knowledgeable', 'food', 'cooking', 'process', 'star', 'service', 'place', 'dream', 'foodie', 'foodie', 'expensive', 'worth', 'experience', 'food', 'work', 'art', 'delicious']}
{'_id': 3843, 'tokens': ['good', 'food', 'scallop', 'good', 'delicious', 'fatty', 'tuna', 'sea', 'bass', 'good', 'salmon', 'good', 'scallop', 'delicious', 'lobster', 'delicious', 'steak', 'delicious']}
{'_id': 4162, 'tokens': ['hungry', 'family', 'lad

{'_id': 171, 'tokens': ['nice', 'warm', 'ambience', 'spacious', 'good', 'large', 'group', 'raman', 'previous', 'post', 'tender', 'broth', 'flavor', 'noodle', 'raman', 'typical', 'raman', 'noodle', 'bit', 'cooked', 'mushy', 'rice', 'ok', 'wrong', 'fried', 'oyster', 'nice', 'light', 'panko', 'batter', 'place', 'potential']}
{'_id': 417, 'tokens': ['food', 'ok', 'good', 'year', 'dish', 'sanitize', 'water', 'food', 'sorry']}
{'_id': 460, 'tokens': ['time', 'food', 'service', 'great']}
{'_id': 1020, 'tokens': ['recent', 'day', 'colleague', 'first', 'roll', 'lobster', 'roll', 'lobster', 'melt', 'fry', 'chip', 'yummy', 'blueberry', 'slaw', 'bit', 'lobster', 'solid', 'star', 'food', 'good', 'place', 'show', 'fry', 'chip', 'crisp', 'bread', 'bisque', 'good', 'place', 'fast', 'food', 'little', 'expensive', 'small', 'claw', 'meat', 'chip', 'store', 'ripple', 'bag', 'condiment', 'sized', 'bit', 'coleslaw', 'combo', 'substituting', 'chip', 'fry', 'no', 'brainer', 'drink', 'tail', 'meat', 'bit', 'sk

In [17]:
m = {'$match': {'pos': {'$in': ['NOUN', 'ADJ']}}}
g = {'$group': {'_id': {'doc': '$doc', 'size': '$doc_size', 'lemma': '$lemma'}, 'tf': {'$sum': 1}}}
h = {'$match': {'tf': {'$gte': 3}}}
s = {'$sort': {'tf': -1}}

In [18]:
m = {'$match': {'pos': {'$in': ['NOUN', 'ADJ']}}}
g = {'$group': {'_id': '$lemma', 'docs': {'$addToSet': '$doc'}}}
p = {'$project': {'_id': 1, 'docs': {'$size': '$docs'}}}

In [19]:
N = len(yelp.distinct('doc'))

In [20]:
for record in yelp.aggregate([m, g, p], allowDiskUse=True):
    print(record['_id'], np.log(N / record['docs']))

coating 7.1308988302963465
declined- 8.517193191416238
passionate 6.571283042360924
affliction 8.517193191416238
90th 8.517193191416238
conversion 7.824046010856292
lateness 8.517193191416238
lagging 8.517193191416238
hubbub 8.517193191416238
nori 7.824046010856292
gob 7.824046010856292
dry 3.5899395062590327
wok 7.418580902748128
yum 7.418580902748128
denser 8.517193191416238
glove 6.437751649736401
mmmm 7.418580902748128
closeby 7.824046010856292
loitering 8.517193191416238
warming 7.418580902748128
bar\/lounge 7.824046010856292
eyesight 8.517193191416238
snickerdoodle 7.824046010856292
cream 3.4673371841667002
excuse 5.472670753692815
marbling 8.517193191416238
brownish 8.517193191416238
fruit 4.268697949366879
manhattan 8.517193191416238
babymoon 8.517193191416238
pate 7.418580902748128
leery 7.418580902748128
sparkler 8.517193191416238
avvery 8.517193191416238
colorful 6.319968614080018
k 7.824046010856292
coq 8.517193191416238
reputation 7.418580902748128
drunchie 6.7254337221881

protion 8.517193191416238
herbal 6.907755278982137
blank 7.824046010856292
addictive 7.418580902748128
lot 2.5207411027972166
nautical 8.517193191416238
borne 8.517193191416238
clientele 7.1308988302963465
commercial 7.418580902748128
skimp 8.517193191416238
watering 7.1308988302963465
sanitize 8.517193191416238
sumac 8.517193191416238
career 8.517193191416238
alien 7.824046010856292
730pm 7.824046010856292
wasabi 6.319968614080018
pushy 6.437751649736401
ginger 5.381698975487088
refund 6.907755278982137
insane 6.214608098422191
room\/section 8.517193191416238
lobster 4.074541934925921
total 4.491841500681089
equivalent 6.907755278982137
smokiness 8.517193191416238
STATLER 8.517193191416238
intent 8.517193191416238
fill 7.1308988302963465
profit 8.517193191416238
korean 4.585367558691912
coffee 3.4802405890026082
taro 6.319968614080018
computer 6.725433722188183
mural 7.1308988302963465
hump 8.517193191416238
serendipity 8.517193191416238
pain 5.8781358618009785
a. 7.418580902748128
ta

sitting-- 8.517193191416238
lively 6.725433722188183
sog 8.517193191416238
vender 8.517193191416238
songs- 8.517193191416238
24th 7.824046010856292
eager 6.725433722188183
prefect 8.517193191416238
timer 6.319968614080018
stratosphere 8.517193191416238
guide 8.517193191416238
love 4.327538449389812
exaggerated 8.517193191416238
sweeter 8.517193191416238
envy 8.517193191416238
chestnut 7.824046010856292
quick 3.223888366691745
eggshell 8.517193191416238
collard 7.824046010856292
baja 7.824046010856292
lone 8.517193191416238
cover 6.119297918617867
hour 2.659260036932778
thirst 7.824046010856292
wavelength 8.517193191416238
slophouse 8.517193191416238
taiyaki 7.418580902748128
yesterday 5.184988681241034
stool 7.418580902748128
advertised 8.517193191416238
sleep 7.418580902748128
lemon 4.374058465024705
creativity 7.418580902748128
limitation 8.517193191416238
BANG 8.517193191416238
biscut 8.517193191416238
connection 7.418580902748128
lounge 5.683979847360021
teppanyaki 6.57128304236092

spice 4.226733750267846
musk 8.517193191416238
skewer 5.521460917862246
strict 6.907755278982137
sneer 8.517193191416238
athletic 8.517193191416238
zoo 7.824046010856292
palace 6.319968614080018
Sardegna 8.517193191416238
mein 6.119297918617867
basin 8.517193191416238
chewiness 8.517193191416238
eatable 8.517193191416238
sort 4.990832666800076
unwelcoming 8.517193191416238
searing 8.517193191416238
tot 6.319968614080018
ooey 8.517193191416238
legendary 8.517193191416238
sandwich 3.0491330502811063
rule 5.572754212249797
swoon 7.824046010856292
seamless 8.517193191416238
likely 6.119297918617867
taco 3.5972122655881127
s\/o 8.517193191416238
lion 7.418580902748128
selection 2.972015746936675
chernobyl 8.517193191416238
equal 6.725433722188183
housing 7.824046010856292
stinky 7.824046010856292
bankruptcy 8.517193191416238
chips\/salsa\/bean 8.517193191416238
comparaison 8.517193191416238
cancer 7.824046010856292
stain 8.517193191416238
produce 6.907755278982137
menu 1.995100393246085
has

jelly 7.1308988302963465
deer 7.1308988302963465
dietary 6.907755278982137
sizzling 8.517193191416238
cha 8.517193191416238
today 3.8076629901039034
pleny 8.517193191416238
childhood 6.725433722188183
day\/time 8.517193191416238
tryna 7.824046010856292
mister 6.725433722188183
smoke 5.221356325411908
itinerary 8.517193191416238
habit 7.1308988302963465
trafficky 8.517193191416238
ur 8.517193191416238
renovation 6.907755278982137
hype 4.961845129926823
body 5.683979847360021
euphoria 8.517193191416238
calf 8.517193191416238
whistle 8.517193191416238
tic 7.824046010856292
variable 8.517193191416238
smorgy 8.517193191416238
comparable 5.809142990314028
p 6.907755278982137
baguette 5.744604469176457
retention 8.517193191416238
diet 5.626821433520073
irraz 8.517193191416238
medjol 8.517193191416238
dissatisfied 7.418580902748128
extract 8.517193191416238
fixture 7.418580902748128
fund 8.517193191416238
magic 6.437751649736401
frequent 5.744604469176457
themed 7.824046010856292
deduction 7.8

In [21]:
def get_document(collection, doc_id, sentence=None, pos_filter=None, field='lower'):
    m = {'$match': {'doc': doc_id}}
    if sentence is not None:
        m['$match']['sentence'] = sentence
    if pos_filter is not None:
        m['$match']['pos'] = {'$in': pos_filter}
    p = {'$project': {'_id': 0, 'sentence': 1, 'position': 1, field: 1}}
    s = {'$sort': {'sentence': 1, 'position': 1}}
    g = {'$group': {'_id': '$sentence', 'tokens': {'$push': '${}'.format(field)}}}
    return [r['tokens'] for r in collection.aggregate([m, p, s, g])]

## Add sentiment lexicon to the index

In [22]:
from nltk.corpus import sentiwordnet as swn

In [23]:
doc = get_document(yelp, doc_id=0, pos_filter=['NOUN', 'ADJ', 'VERB'])

In [20]:
for sentence in doc:
    print(sentence)

['place', 'priced', 'generic', 'cuisine', 'serve']
['food', 'room', 'temperature']
['better', 'spending', 'dollars']
['least', 'decent', 'assortment', 'standard', 'mongolian']


In [25]:
def avg_score(token):
    synsets = list(swn.senti_synsets(token))
    scores = []
    for syn in synsets:
        scores.append([syn.pos_score(), syn.neg_score(), syn.obj_score()])
    if len(scores) > 0:
        m = np.array(scores).mean(axis=0)
    else:
        m = np.zeros(3)
    return m

In [26]:
I = defaultdict(lambda: defaultdict(lambda: 0))
for i, sentence in enumerate(doc):
    for token in sentence:
        scores = avg_score(token)
        w = scores[0] - scores[1]
        I[i][token] = w
I = pd.DataFrame(I)
I.fillna(0, inplace=True)

In [27]:
I

Unnamed: 0,0,1,2,3
least,0.0,0.0,0.0,0.0
decent,0.392857,0.0,0.0,0.0
assortment,0.0,0.0,0.0,0.0
standard,0.056818,0.0,0.0,0.0
mongolian,0.0,0.0,0.0,0.0
place,0.0,0.007812,0.0,0.0
priced,0.0,0.0,0.0,0.0
generic,0.0,-0.025,0.0,0.0
cuisine,0.0,0.0,0.0,0.0
serve,0.0,0.015625,0.0,0.0


## Deal with the logical structure of sentences
### Take into account negation using a dependency parser

In [28]:
from spacy import displacy

In [29]:
doc = get_document(yelp, doc_id=0, pos_filter=None)

In [30]:
sentence = " ".join(doc[0])

In [31]:
sentence

'at least there they have a decent assortment of sushi and a standard mongolian bbq .'

In [32]:
s = nlp(sentence)

In [33]:
displacy.render(s, style='dep')

In [32]:
table = {'token': [], 'token dep': [], 'head': [], 'head pos': [], 'children': [], 'ancestors': []}
for token in s:
    table['token'].append(token.text)
    table['token dep'].append(token.dep_)
    table['head'].append(token.head.text)
    table['head pos'].append(token.head.pos_)
    table['children'].append(", ".join([child.text for child in token.children]))
    table['ancestors'].append(", ".join([a.text for a in token.ancestors]))
S = pd.DataFrame(table)

In [33]:
S

Unnamed: 0,token,token dep,head,head pos,children,ancestors
0,this,det,place,NOUN,,"place, is"
1,place,nsubj,is,AUX,this,is
2,is,ROOT,is,AUX,"place, priced, .",
3,waaay,advmod,over,ADV,,"over, priced, is"
4,over,advmod,priced,VERB,waaay,"priced, is"
5,priced,acomp,is,AUX,"over, for",is
6,for,prep,priced,VERB,cuisine,"priced, is"
7,the,det,cuisine,NOUN,,"cuisine, for, priced, is"
8,generic,amod,cuisine,NOUN,,"cuisine, for, priced, is"
9,cuisine,pobj,for,ADP,"the, generic, serve","for, priced, is"


In [34]:
neg = nlp('In the restaurant they serve good food but the service is not so good')

In [35]:
displacy.render(neg, style='dep')

In [36]:
table = {'token': [], 'token dep': [], 'head': [], 'head pos': [], 'children': [], 'ancestors': []}
for token in neg:
    table['token'].append(token.text)
    table['token dep'].append(token.dep_)
    table['head'].append(token.head.text)
    table['head pos'].append(token.head.pos_)
    table['children'].append(", ".join([child.text for child in token.children]))
    table['ancestors'].append(", ".join([a.text for a in token.ancestors]))
N = pd.DataFrame(table)

In [37]:
N

Unnamed: 0,token,token dep,head,head pos,children,ancestors
0,In,prep,serve,VERB,restaurant,serve
1,the,det,restaurant,NOUN,,"restaurant, In, serve"
2,restaurant,pobj,In,ADP,the,"In, serve"
3,they,nsubj,serve,VERB,,serve
4,serve,ROOT,serve,VERB,"In, they, food, but, is",
5,good,amod,food,NOUN,,"food, serve"
6,food,dobj,serve,VERB,good,serve
7,but,cc,serve,VERB,,serve
8,the,det,service,NOUN,,"service, is, serve"
9,service,nsubj,is,AUX,the,"is, serve"


In [38]:
for chunk in neg.noun_chunks:
    print("\t".join([chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]))

the restaurant	restaurant	pobj	In
they	they	nsubj	serve
good food	food	dobj	serve
the service	service	nsubj	is


## Exercize: Use parse to get negation and use it to score polarity

In [45]:
def is_negation(word):
    for child in word.children:
        if child.dep_ == "neg":
            return True
        
    if word.pos_ in {"VERB"}:
        for anc in word.ancestors:
            if anc.pos_ in {"VERB"}:
                for sub_child in anc.children:
                    if sub_child.dep_ == "neg":
                        return True
    return False

In [55]:
def sum_strategy(text):
    s = np.zeros(3)
    sent_ = nlp(text)
    all_s = [list(swn.senti_synsets(token.text)) for token in sent_]
        
    for i, token in enumerate(sent_):
        try:
            synsets = all_s[i]
            sidf = np.log(max([len(l) for l in all_s]) / len(synsets))
            for syn in synsets:
                p, n, o = syn.pos_score(), syn.neg_score(), syn.obj_score()
                
                if is_negation(token):
                    p = -p
                    n = -n
                    o = -o
                    
                s[0] += p * sidf
                s[1] += n * sidf
                s[2] += o * sidf # this is neutral
        except ZeroDivisionError:
            pass
    return s

In [56]:
from sklearn.preprocessing import MinMaxScaler

res = sum_strategy('The service is not so good')
X = pd.DataFrame(res).T
X.columns = ['p', 'n', 'o']
scaler = MinMaxScaler()
Xs = scaler.fit_transform(X)
Xs = pd.DataFrame(X)
Xs

Unnamed: 0,p,n,o
0,-0.274083,1.927859,9.316252
