In [1]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

# Features for text sentiment classification

In [2]:
D = pd.read_csv('data/yelp_example_1_small.tsv', sep='\t')

In [3]:
D.head()

Unnamed: 0,content,score,business,avgstars
0,This place is WAAAY over priced for the generi...,1,Lee's Buffet,2.0
1,Our taxi driver had told us to go to this plac...,5,Village Pub and Cafe,3.5
2,Not worth the $20! I'm a Las Vegas buffet conn...,2,Golden Nugget Buffet,2.5
3,Great All-American cuisine with hearty helping...,5,Black Bear Diner,4.0
4,The bacon burger is a MUST! One of the most de...,5,Bacon Bar,3.5


## Terms and term frequencies

In [4]:
import spacy
from collections import defaultdict

In [5]:
nlp = spacy.load("en_core_web_sm")

In [None]:
I = defaultdict(lambda: defaultdict(lambda: 0))

In [None]:
records = []
rows = tqdm_notebook(list(D.iterrows()))
for i, row in rows:
    doc = nlp(row.content)
    for s, sent in enumerate(doc.sents):
        for t, token in enumerate(sent):
            record = {'doc': i, 'sentence': s, 'position': t}
            record['token'] = token.text
            record['lower'] = token.text.lower()
            record['lemma'] = token.lemma_
            record['pos'] = token.pos_
            record['alpha'] = token.is_alpha
            record['stop'] = token.is_stop
            record['doc_size'] = len(doc)
            record['sentence_size'] = len(sent)
            records.append(record)

## Store index

In [6]:
import pymongo

In [7]:
db = pymongo.MongoClient()['textsent']
yelp = db['yelp_simple']

In [None]:
yelp.insert_many(records)

In [None]:
m = {'$match': {'pos': {'$in': ['NOUN', 'ADJ']}}}
p = {'$project': {'_id': 0, 'doc': 1, 'sentence': 1, 'position': 1, 'lemma': 1}}
s = {'$sort': {'doc': 1, 'sentence': 1}}
g = {'$group': {'_id': '$doc', 'tokens': {'$push': '$lemma'}}}

In [None]:
for record in yelp.aggregate([m, p, s, g], allowDiskUse=True):
    print(record)

In [None]:
m = {'$match': {'pos': {'$in': ['NOUN', 'ADJ']}}}
g = {'$group': {'_id': {'doc': '$doc', 'size': '$doc_size', 'lemma': '$lemma'}, 'tf': {'$sum': 1}}}
h = {'$match': {'tf': {'$gte': 3}}}
s = {'$sort': {'tf': -1}}

In [None]:
m = {'$match': {'pos': {'$in': ['NOUN', 'ADJ']}}}
g = {'$group': {'_id': '$lemma', 'docs': {'$addToSet': '$doc'}}}
p = {'$project': {'_id': 1, 'docs': {'$size': '$docs'}}}

In [None]:
N = len(yelp.distinct('doc'))

In [None]:
for record in yelp.aggregate([m, g, p], allowDiskUse=True):
    print(record['_id'], np.log(N / record['docs']))

In [8]:
def get_document(collection, doc_id, sentence=None, pos_filter=None, field='lower'):
    m = {'$match': {'doc': doc_id}}
    if sentence is not None:
        m['$match']['sentence'] = sentence
    if pos_filter is not None:
        m['$match']['pos'] = {'$in': pos_filter}
    p = {'$project': {'_id': 0, 'sentence': 1, 'position': 1, field: 1}}
    s = {'$sort': {'sentence': 1, 'position': 1}}
    g = {'$group': {'_id': '$sentence', 'tokens': {'$push': '${}'.format(field)}}}
    return [r['tokens'] for r in collection.aggregate([m, p, s, g])]

## Add sentiment lexicon to the index

In [9]:
from nltk.corpus import sentiwordnet as swn

In [10]:
doc = get_document(yelp, doc_id=0, pos_filter=['NOUN', 'ADJ', 'VERB'])

In [11]:
for sentence in doc:
    print(sentence)

['least', 'have', 'decent', 'assortment', 'sushi', 'standard', 'mongolian', 'bbq']
['are', 'better', 'spending', 'dollars']
['food', 'is', 'room', 'temperature']
['place', 'is', 'priced', 'generic', 'cuisine', 'serve']


In [12]:
def avg_score(token):
    synsets = list(swn.senti_synsets(token))
    scores = []
    for syn in synsets:
        scores.append([syn.pos_score(), syn.neg_score(), syn.obj_score()])
    if len(scores) > 0:
        m = np.array(scores).mean(axis=0)
    else:
        m = np.zeros(3)
    return m

In [13]:
I = defaultdict(lambda: defaultdict(lambda: 0))
for i, sentence in enumerate(doc):
    for token in sentence:
        scores = avg_score(token)
        w = scores[0] - scores[1]
        I[i][token] = w
I = pd.DataFrame(I)
I.fillna(0, inplace=True)

In [14]:
I

Unnamed: 0,0,1,2,3
least,0.0,0.0,0.0,0.0
have,-0.03125,0.0,0.0,0.0
decent,0.392857,0.0,0.0,0.0
assortment,0.0,0.0,0.0,0.0
sushi,0.0,0.0,0.0,0.0
standard,0.056818,0.0,0.0,0.0
mongolian,0.0,0.0,0.0,0.0
bbq,0.0,0.0,0.0,0.0
are,0.0,0.008929,0.0,0.0
better,0.0,0.52918,0.0,0.0


## Deal with the logical structure of sentences
### Take into account negation using a dependency parser

In [15]:
from spacy import displacy

In [16]:
doc = get_document(yelp, doc_id=0, pos_filter=None)

In [17]:
sentence = " ".join(doc[0])

In [18]:
sentence

'at least there they have a decent assortment of sushi and a standard mongolian bbq .'

In [19]:
s = nlp(sentence)

In [20]:
displacy.render(s, style='dep')

In [21]:
table = {'token': [], 'token dep': [], 'head': [], 'head pos': [], 'children': [], 'ancestors': []}
for token in s:
    table['token'].append(token.text)
    table['token dep'].append(token.dep_)
    table['head'].append(token.head.text)
    table['head pos'].append(token.head.pos_)
    table['children'].append(", ".join([child.text for child in token.children]))
    table['ancestors'].append(", ".join([a.text for a in token.ancestors]))
S = pd.DataFrame(table)

In [22]:
S

Unnamed: 0,token,token dep,head,head pos,children,ancestors
0,at,advmod,least,ADJ,,"least, there, have"
1,least,advmod,there,ADV,at,"there, have"
2,there,advmod,have,VERB,least,have
3,they,nsubj,have,VERB,,have
4,have,ROOT,have,VERB,"there, they, assortment, .",
5,a,det,assortment,NOUN,,"assortment, have"
6,decent,amod,assortment,NOUN,,"assortment, have"
7,assortment,dobj,have,VERB,"a, decent, of",have
8,of,prep,assortment,NOUN,sushi,"assortment, have"
9,sushi,pobj,of,ADP,"and, bbq","of, assortment, have"


In [23]:
neg = nlp('In the restaurant they serve good food but the service is not so good')

In [24]:
displacy.render(neg, style='dep')

In [25]:
table = {'token': [], 'token dep': [], 'head': [], 'head pos': [], 'children': [], 'ancestors': []}
for token in neg:
    table['token'].append(token.text)
    table['token dep'].append(token.dep_)
    table['head'].append(token.head.text)
    table['head pos'].append(token.head.pos_)
    table['children'].append(", ".join([child.text for child in token.children]))
    table['ancestors'].append(", ".join([a.text for a in token.ancestors]))
N = pd.DataFrame(table)

In [26]:
N

Unnamed: 0,token,token dep,head,head pos,children,ancestors
0,In,prep,serve,VERB,restaurant,serve
1,the,det,restaurant,NOUN,,"restaurant, In, serve"
2,restaurant,pobj,In,ADP,the,"In, serve"
3,they,nsubj,serve,VERB,,serve
4,serve,ROOT,serve,VERB,"In, they, food, but, is",
5,good,amod,food,NOUN,,"food, serve"
6,food,dobj,serve,VERB,good,serve
7,but,cc,serve,VERB,,serve
8,the,det,service,NOUN,,"service, is, serve"
9,service,nsubj,is,VERB,the,"is, serve"


In [27]:
for chunk in neg.noun_chunks:
    print("\t".join([chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]))

the restaurant	restaurant	pobj	In
they	they	nsubj	serve
good food	food	dobj	serve
the service	service	nsubj	is


## Exercize: Use parse to get negation and use it to score polarity