In [1]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

# Features for text sentiment classification

In [2]:
D = pd.read_csv('data/yelp_example_1_small.tsv', sep='\t')

In [3]:
D.head()

Unnamed: 0,content,score,business,avgstars
0,This place is WAAAY over priced for the generi...,1,Lee's Buffet,2.0
1,Our taxi driver had told us to go to this plac...,5,Village Pub and Cafe,3.5
2,Not worth the $20! I'm a Las Vegas buffet conn...,2,Golden Nugget Buffet,2.5
3,Great All-American cuisine with hearty helping...,5,Black Bear Diner,4.0
4,The bacon burger is a MUST! One of the most de...,5,Bacon Bar,3.5


## Terms and term frequencies

In [4]:
import spacy
from collections import defaultdict

In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
I = defaultdict(lambda: defaultdict(lambda: 0))

In [7]:
records = []
rows = tqdm_notebook(list(D.iterrows()))
for i, row in rows:
    doc = nlp(row.content)
    for s, sent in enumerate(doc.sents):
        for t, token in enumerate(sent):
            record = {'doc': i, 'sentence': s, 'position': t}
            record['token'] = token.text
            record['lower'] = token.text.lower()
            record['lemma'] = token.lemma_
            record['pos'] = token.pos_
            record['alpha'] = token.is_alpha
            record['stop'] = token.is_stop
            record['doc_size'] = len(doc)
            record['sentence_size'] = len(sent)
            records.append(record)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




## Store index

In [12]:
import pymongo

In [13]:
db = pymongo.MongoClient()['textsent']
yelp = db['yelp_simple']

In [10]:
yelp.insert_many(records)

<pymongo.results.InsertManyResult at 0x7fc2a97b50f0>

In [19]:
m = {'$match': {'pos': {'$in': ['NOUN', 'ADJ']}}}
p = {'$project': {'_id': 0, 'doc': 1, 'sentence': 1, 'position': 1, 'lemma': 1}}
s = {'$sort': {'doc': 1, 'sentence': 1}}
g = {'$group': {'_id': '$doc', 'tokens': {'$push': '$lemma'}}}
l = {'$limit' : 5 }

In [20]:
for record in yelp.aggregate([m, p, s, g, l], allowDiskUse=True):
    print(record)

{'_id': 1539, 'tokens': ['quest', 'dim', 'sum', 'place', 'home', 'place', 'previous', 'post', 'cart', 'menu', 'food', 'hot', 'fresh', 'selection', 'dim', 'sum', 'excellent', 'least', 'item', 'service', 'average', 'kudo', 'attentive', 'food', 'delicious', 'price', 'little', 'high', 'usual', 'dum', 'sum', 'place', 'only', 'criticism', 'item', 'major', 'pet', 'peeve', 'regular', 'menu', 'item', 'stock', 'job', 'inventory', 'multiple', 'item', 'understandable', 'good', 'experience']}
{'_id': 1177, 'tokens': ['guy', 'love', 'place', 'certificate', 'minimum', 'hard', 'sampler', 'platter', 'appetizer', 'delicious', 'nachos', 'quesadilla', 'entree', 'grill', 'food', 'portion', 'big', 'next', 'trip']}
{'_id': 655, 'tokens': ['boyfriend', 'place', 'show', 'good', 'food', 'fish', 'cake', 'boyfriend', 'northern', 'larb', 'pork', 'duck', 'red', 'curry', 'yellow', 'curry', 'amazing', 'staff', 'nice', 'attentive', 'nontheless', 'amazing', 'place']}
{'_id': 659, 'tokens': ['location', 'food', 'today',

In [21]:
m = {'$match': {'pos': {'$in': ['NOUN', 'ADJ']}}}
g = {'$group': {'_id': {'doc': '$doc', 'size': '$doc_size', 'lemma': '$lemma'}, 'tf': {'$sum': 1}}}
h = {'$match': {'tf': {'$gte': 3}}}
s = {'$sort': {'tf': -1}}

In [22]:
m = {'$match': {'pos': {'$in': ['NOUN', 'ADJ']}}}
g = {'$group': {'_id': '$lemma', 'docs': {'$addToSet': '$doc'}}}
p = {'$project': {'_id': 1, 'docs': {'$size': '$docs'}}}

In [23]:
N = len(yelp.distinct('doc'))

In [25]:
for record in yelp.aggregate([m, g, p, l], allowDiskUse=True):
    print(record['_id'], np.log(N / record['docs']))

tenderloin 6.907755278982137
edginess 8.517193191416238
loft 7.824046010856292
wha 8.517193191416238
vibrant 7.824046010856292


In [26]:
def get_document(collection, doc_id, sentence=None, pos_filter=None, field='lower'):
    m = {'$match': {'doc': doc_id}}
    if sentence is not None:
        m['$match']['sentence'] = sentence
    if pos_filter is not None:
        m['$match']['pos'] = {'$in': pos_filter}
    p = {'$project': {'_id': 0, 'sentence': 1, 'position': 1, field: 1}}
    s = {'$sort': {'sentence': 1, 'position': 1}}
    g = {'$group': {'_id': '$sentence', 'tokens': {'$push': '${}'.format(field)}}}
    return [r['tokens'] for r in collection.aggregate([m, p, s, g])]

## Add sentiment lexicon to the index

In [27]:
from nltk.corpus import sentiwordnet as swn

In [28]:
doc = get_document(yelp, doc_id=0, pos_filter=['NOUN', 'ADJ', 'VERB'])

In [29]:
for sentence in doc:
    print(sentence)

['better', 'spending', 'dollars']
['least', 'decent', 'assortment', 'standard', 'mongolian']
['food', 'room', 'temperature']
['place', 'priced', 'generic', 'cuisine', 'serve']


In [30]:
def avg_score(token):
    synsets = list(swn.senti_synsets(token))
    scores = []
    for syn in synsets:
        scores.append([syn.pos_score(), syn.neg_score(), syn.obj_score()])
    if len(scores) > 0:
        m = np.array(scores).mean(axis=0)
    else:
        m = np.zeros(3)
    return m

In [31]:
I = defaultdict(lambda: defaultdict(lambda: 0))
for i, sentence in enumerate(doc):
    for token in sentence:
        scores = avg_score(token)
        w = scores[0] - scores[1]
        I[i][token] = w
I = pd.DataFrame(I)
I.fillna(0, inplace=True)

In [32]:
I

Unnamed: 0,0,1,2,3
better,0.52918,0.0,0.0,0.0
spending,0.075,0.0,0.0,0.0
dollars,-0.03125,0.0,0.0,0.0
least,0.0,0.0,0.0,0.0
decent,0.0,0.392857,0.0,0.0
assortment,0.0,0.0,0.0,0.0
standard,0.0,0.056818,0.0,0.0
mongolian,0.0,0.0,0.0,0.0
food,0.0,0.0,-0.041667,0.0
room,0.0,0.0,0.1,0.0


## Deal with the logical structure of sentences
### Take into account negation using a dependency parser

In [33]:
from spacy import displacy

In [34]:
doc = get_document(yelp, doc_id=0, pos_filter=None)

In [35]:
sentence = " ".join(doc[0])

In [36]:
sentence

'you are much better off spending your dollars at buffet @ asia .'

In [37]:
s = nlp(sentence)

In [38]:
displacy.render(s, style='dep')

In [39]:
table = {'token': [], 'token dep': [], 'head': [], 'head pos': [], 'children': [], 'ancestors': []}
for token in s:
    table['token'].append(token.text)
    table['token dep'].append(token.dep_)
    table['head'].append(token.head.text)
    table['head pos'].append(token.head.pos_)
    table['children'].append(", ".join([child.text for child in token.children]))
    table['ancestors'].append(", ".join([a.text for a in token.ancestors]))
S = pd.DataFrame(table)

In [40]:
S

Unnamed: 0,token,token dep,head,head pos,children,ancestors
0,you,nsubj,are,AUX,,are
1,are,ROOT,are,AUX,"you, better, off, spending, .",
2,much,advmod,better,ADJ,,"better, are"
3,better,acomp,are,AUX,much,are
4,off,advmod,are,AUX,,are
5,spending,advcl,are,AUX,"dollars, at, @",are
6,your,poss,dollars,NOUN,,"dollars, spending, are"
7,dollars,dobj,spending,VERB,your,"spending, are"
8,at,prep,spending,VERB,buffet,"spending, are"
9,buffet,pobj,at,ADP,,"at, spending, are"


In [41]:
neg = nlp('In the restaurant they serve good food but the service is not so good')

In [42]:
displacy.render(neg, style='dep')

In [43]:
table = {'token': [], 'token dep': [], 'head': [], 'head pos': [], 'children': [], 'ancestors': []}
for token in neg:
    table['token'].append(token.text)
    table['token dep'].append(token.dep_)
    table['head'].append(token.head.text)
    table['head pos'].append(token.head.pos_)
    table['children'].append(", ".join([child.text for child in token.children]))
    table['ancestors'].append(", ".join([a.text for a in token.ancestors]))
N = pd.DataFrame(table)

In [44]:
N

Unnamed: 0,token,token dep,head,head pos,children,ancestors
0,In,prep,serve,VERB,restaurant,serve
1,the,det,restaurant,NOUN,,"restaurant, In, serve"
2,restaurant,pobj,In,ADP,the,"In, serve"
3,they,nsubj,serve,VERB,,serve
4,serve,ROOT,serve,VERB,"In, they, food, but, is",
5,good,amod,food,NOUN,,"food, serve"
6,food,dobj,serve,VERB,good,serve
7,but,cc,serve,VERB,,serve
8,the,det,service,NOUN,,"service, is, serve"
9,service,nsubj,is,AUX,the,"is, serve"


In [45]:
for chunk in neg.noun_chunks:
    print("\t".join([chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]))

the restaurant	restaurant	pobj	In
they	they	nsubj	serve
good food	food	dobj	serve
the service	service	nsubj	is


## Exercize: Use parse to get negation and use it to score polarity

In [46]:
def is_negation(word):
    for child in word.children:
        if child.dep_ == "neg":
            return True
        
    if word.pos_ in {"VERB"}:
        for anc in word.ancestors:
            if anc.pos_ in {"VERB"}:
                for sub_child in anc.children:
                    if sub_child.dep_ == "neg":
                        return True
    return False

In [47]:
def sum_strategy(text):
    s = np.zeros(3)
    sent_ = nlp(text)
    all_s = [list(swn.senti_synsets(token.text)) for token in sent_]
        
    for i, token in enumerate(sent_):
        try:
            synsets = all_s[i]
            sidf = np.log(max([len(l) for l in all_s]) / len(synsets))
            for syn in synsets:
                p, n, o = syn.pos_score(), syn.neg_score(), syn.obj_score()
                
                if is_negation(token):
                    p = -p
                    n = -n
                    o = -o
                    
                s[0] += p * sidf
                s[1] += n * sidf
                s[2] += o * sidf # this is neutral
        except ZeroDivisionError:
            pass
    return s

In [48]:
from sklearn.preprocessing import MinMaxScaler

res = sum_strategy('The service is not so good')
X = pd.DataFrame(res).T
X.columns = ['p', 'n', 'o']
scaler = MinMaxScaler()
Xs = scaler.fit_transform(X)
Xs = pd.DataFrame(X)
Xs

Unnamed: 0,p,n,o
0,-0.274083,1.927859,9.316252
