In [1]:
import pymongo
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

## Feature selection
Parameters for selecting features:
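- **raw**: keep all tokens returned by the tokenizer
- **pos**: keep only adjectives and nouns (POS tags `ADJ` and `NOUN`)
- **dep**: keep adjectival modifiers (`amod`) together with their head words, plus the words attached to a negated head (`neg`); adjectives in a negated group are prefixed with `NOT_`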

Tokens:

- **text**: use the lowercased token text
- **lemma**: use the token lemma

Weight:

- **tfidf**: TF-IDF weighting
- **sentiwn**: average of the (positive − negative) SentiWordNet scores
- **combo**: TF-IDF × SentiWordNet score

In [2]:
# Connect with pymongo's default client settings and select the 'textsent' database.
db = pymongo.MongoClient()['textsent']
# Collection queried by documents() below; presumably one record per token
# (fields doc, sentence, position, pos, lower, lemma) — confirm against the loader.
yelp = db['yelp_simple']

In [3]:
def documents(collection, pos_filter=None, field='lower'):
    """Rebuild one text string per document from a token-level collection.

    The aggregation reads the record fields ``doc``, ``sentence``,
    ``position``, ``pos`` (only when filtering) and ``field``.

    Parameters
    ----------
    collection : object exposing ``aggregate(pipeline, allowDiskUse=True)``
        (e.g. a pymongo collection).
    pos_filter : list of str or None
        When given, only records whose ``pos`` value is in this list are
        kept. ``None`` keeps every record.
    field : str
        Name of the record field whose value is used as the token string.

    Returns
    -------
    list of (doc_id, text) tuples
        Tokens of a sentence are joined with a single space, sentences of a
        document are joined with ``". "`` and newline characters are removed.
    """
    pipeline = []
    if pos_filter is not None:
        pipeline.append({'$match': {'pos': {'$in': pos_filter}}})
    pipeline += [
        {'$project': {'_id': 0, 'doc': 1, 'sentence': 1, 'position': 1, field: 1}},
        # Order tokens so that $push collects them in reading order.
        {'$sort': {'doc': 1, 'sentence': 1, 'position': 1}},
        {'$group': {'_id': {'doc': '$doc', 'sentence': '$sentence'},
                    'tokens': {'$push': '${}'.format(field)}}},
        # $group does not guarantee the order of its output documents, so the
        # sentences must be re-sorted before they are pushed per document;
        # otherwise the sentences of a review can come back shuffled.
        {'$sort': {'_id.doc': 1, '_id.sentence': 1}},
        {'$group': {'_id': '$_id.doc', 'tokens': {'$push': '$tokens'}}},
    ]
    results = []
    for record in collection.aggregate(pipeline, allowDiskUse=True):
        sentences = [" ".join(sentence) for sentence in record['tokens']]
        results.append((record['_id'], ". ".join(sentences).replace('\n', '')))
    return results

In [4]:
# Build the four Mongo-derived feature sets as {doc_id: text} dicts.
# The nested loops enumerate (filter, field) in the order
# raw/lower, raw/lemma, pos/lower, pos/lemma, matching the target names.
raw_text, raw_lemma, pos_text, pos_lemma = (
    dict(documents(yelp, pos_filter=tags, field=attribute))
    for tags in (None, ['ADJ', 'NOUN'])
    for attribute in ('lower', 'lemma')
)

In [7]:
# Spot-check: ADJ/NOUN lemmas rebuilt for document id 12.
pos_lemma[12]

'real good pizza nothing bad home. great crust wood. oven nice guy counter. good value money'

### Dependency

In [8]:
import spacy
from spacy import displacy

In [9]:
# spaCy pipeline passed to dep() below, which reads its dependency labels,
# POS tags and lemmas.
nlp = spacy.load("en_core_web_sm")

In [10]:
# Tab-separated sample of reviews; the `content` column (read below) holds the review text.
D = pd.read_csv('../data/yelp_example_1_small.tsv', sep='\t')

In [11]:
# Raw review text of the row labelled 10, used as the running example.
D.loc[10].content

'Not great not bad if you are in a hurry and need a bite then stop in if you are looking for a great pizza then keep looking.'

In [12]:
def dep(text, nlp, lemma=False):
    """Extract dependency-based features from ``text``.

    ``nlp`` is called on ``text`` and the resulting tokens are scanned once:

    * a token labelled ``amod`` contributes itself followed by its head;
    * a token labelled ``neg`` contributes every child of its head followed
      by the head itself, where each word whose POS tag is ``ADJ`` is
      prefixed with ``NOT_``.

    Words are emitted as lemmas when ``lemma`` is true, otherwise as the
    lowercased token text. Returns the list of emitted strings in scan order.
    """
    def surface(tok):
        # Pick the string form requested by the caller.
        return tok.lemma_ if lemma else tok.text.lower()

    tokens = []
    for token in nlp(text):
        relation = token.dep_
        if relation == 'amod':
            tokens.extend((surface(token), surface(token.head)))
        elif relation == 'neg':
            governor = token.head
            # The negated group is the head's children plus the head itself.
            for member in list(governor.children) + [governor]:
                word = surface(member)
                tokens.append("NOT_{}".format(word) if member.pos_ == 'ADJ' else word)
    return tokens

In [13]:
# Pair each row label with its review text. Reading the `content` column
# directly avoids iterrows(), which builds a full Series for every row just
# to access a single field.
dep_data = list(zip(D.index, D['content']))

In [14]:
# Wrap the (row label, text) pairs in a notebook progress bar, then extract
# the dependency features using lowercased surface forms.
data = tqdm_notebook(dep_data)
dep_text = [(i, " ".join(dep(x, nlp, lemma=False))) for i, x in data]

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [15]:
# Same extraction with lemmas instead of surface forms.
# NOTE(review): dep() calls nlp(text) itself, so every review is parsed a
# second time here.
data = tqdm_notebook(dep_data)
dep_lemma = [(i, " ".join(dep(x, nlp, lemma=True))) for i, x in data]

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [16]:
# Rebind the (row label, text) pair lists as dicts, the same shape as the
# Mongo-derived feature sets.
# NOTE(review): these are keyed by DataFrame row label while the other sets
# are keyed by Mongo `doc` id — assumes the two coincide; confirm.
dep_text = dict(dep_text)
dep_lemma = dict(dep_lemma)

## Save

In [17]:
import json

In [18]:
# Bundle the six feature variants under their names for serialisation.
training = dict(
    raw_text=raw_text,
    raw_lemma=raw_lemma,
    pos_text=pos_text,
    pos_lemma=pos_lemma,
    dep_text=dep_text,
    dep_lemma=dep_lemma,
)

In [19]:
# Persist all feature variants in a single JSON file.
# NOTE: JSON object keys are always strings, so the ids used as dict keys
# come back as strings when the file is reloaded (hence the '10' lookup in
# the example section).
with open('../data/yelp_tset.json', 'w') as out:
    json.dump(training, out)

## Example

In [20]:
# Reload the saved feature sets to inspect what was written.
with open('../data/yelp_tset.json', 'r') as infile:
    T = json.load(infile)

In [23]:
# Original review text of row 10, for comparison with the features printed next.
D.loc[10].content

'Not great not bad if you are in a hurry and need a bite then stop in if you are looking for a great pizza then keep looking.'

In [24]:
# Print every feature variant for the example document; the key is the
# string '10' because the ids were serialised as JSON object keys.
for k, t in T.items():
    print(k)
    print(t['10'])

raw_text
not great not bad if you are in a hurry and need a bite then stop in if you are looking for a great pizza then keep looking .
raw_lemma
not great not bad if -PRON- be in a hurry and need a bite then stop in if -PRON- be look for a great pizza then keep look .
pos_text
great bad hurry bite great pizza
pos_lemma
great bad hurry bite great pizza
dep_text
not NOT_great NOT_great not stop keep . NOT_bad great pizza
dep_lemma
not NOT_great NOT_great not stop keep . NOT_bad great pizza
