In [None]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

# Features for text sentiment classification

In [None]:
# Read the tab-separated example file into a DataFrame.
D = pd.read_csv(
    'data/yelp_example_1_small.tsv',
    sep='\t',
)

In [None]:
# Preview the first five rows of the loaded frame.
D.head(n=5)

## Terms and term frequencies

In [None]:
import spacy
from collections import defaultdict

In [None]:
# Load the `en_core_web_sm` spaCy pipeline once; it is reused for every text.
nlp = spacy.load(name="en_core_web_sm")

In [None]:
# Two-level counter: I[outer][inner] reads as 0 for any pair of keys that has
# not been written yet.
I = defaultdict(lambda: defaultdict(int))

In [None]:
# Build one flat record per token: where the token occurs (doc / sentence /
# position) plus the spaCy annotations that the later cells filter and count on.
records = []
# Stream the rows instead of materialising list(D.iterrows()); passing `total`
# keeps the progress bar's length known.
rows = tqdm_notebook(D.iterrows(), total=len(D))
for i, row in rows:
    doc = nlp(row['content'])
    # Loop-invariant sizes are computed once per document / sentence rather
    # than once per token.
    doc_size = len(doc)
    for s, sent in enumerate(doc.sents):
        sentence_size = len(sent)
        for t, token in enumerate(sent):
            # `position` is the token's index within its sentence, not within
            # the document.
            records.append({
                'doc': i,
                'sentence': s,
                'position': t,
                'token': token.text,
                'lower': token.text.lower(),
                'lemma': token.lemma_,
                'pos': token.pos_,
                'alpha': token.is_alpha,
                'stop': token.is_stop,
                'doc_size': doc_size,
                'sentence_size': sentence_size,
            })

## Store index

In [None]:
import pymongo

In [None]:
# Connect with the client's default settings (no host or port is passed) and
# take handles on the database and on the collection that stores the tokens.
db = pymongo.MongoClient().get_database('textsent')
yelp = db.get_collection('yelp_simple')

In [None]:
# Store the token records. Tokens previously stored for the same documents are
# removed first so that re-running the notebook replaces them instead of
# appending a second copy (duplicates would inflate every term frequency
# computed from this collection).
if records:  # insert_many raises on an empty list
    doc_ids = list({record['doc'] for record in records})
    yelp.delete_many({'doc': {'$in': doc_ids}})
    yelp.insert_many(records)

In [None]:
# Aggregation stages: the noun/adjective lemmas of each document, in reading order.
# Keep only nouns and adjectives.
m = {'$match': {'pos': {'$in': ['NOUN', 'ADJ']}}}
# Drop everything except the ordering keys and the lemma.
p = {'$project': {'_id': 0, 'doc': 1, 'sentence': 1, 'position': 1, 'lemma': 1}}
# Sort down to the token position: $push below appends lemmas in arrival order,
# and documents that tie on the sort keys may arrive in any order, so sorting
# on doc and sentence alone leaves the order inside a sentence undefined.
s = {'$sort': {'doc': 1, 'sentence': 1, 'position': 1}}
# One output document per source document, with its lemmas collected in order.
g = {'$group': {'_id': '$doc', 'tokens': {'$push': '$lemma'}}}

In [None]:
# Run the four stages in order (match, project, sort, group) and print every
# resulting document; allowDiskUse is passed through to the aggregation.
stages = [m, p, s, g]
for record in yelp.aggregate(stages, allowDiskUse=True):
    print(record)

In [None]:
# Aggregation stages for per-document term frequencies.
# NOTE(review): no visible cell runs these stages before `m` and `g` are
# reassigned further down -- confirm whether an executing cell is missing.

# Keep only nouns and adjectives.
m = {
    '$match': {'pos': {'$in': ['NOUN', 'ADJ']}},
}
# Count occurrences of each lemma within each document; doc_size is carried
# along in the group key as `size`.
g = {
    '$group': {
        '_id': {'doc': '$doc', 'size': '$doc_size', 'lemma': '$lemma'},
        'tf': {'$sum': 1},
    },
}
# Discard lemmas that occur fewer than three times in a document.
h = {
    '$match': {'tf': {'$gte': 3}},
}
# Highest counts first.
s = {
    '$sort': {'tf': -1},
}

In [None]:
# Aggregation stages for document frequencies: in how many documents each
# noun/adjective lemma occurs.

# Keep only nouns and adjectives.
m = {
    '$match': {'pos': {'$in': ['NOUN', 'ADJ']}},
}
# Per lemma, collect the set of documents it appears in.
g = {
    '$group': {
        '_id': '$lemma',
        'docs': {'$addToSet': '$doc'},
    },
}
# Replace that set by its size.
p = {
    '$project': {
        '_id': 1,
        'docs': {'$size': '$docs'},
    },
}

In [None]:
# N is the number of distinct `doc` values stored in the collection.
doc_ids = yelp.distinct('doc')
N = len(doc_ids)

In [None]:
# For every group produced by the stages, print its key followed by
# log(N / docs), where `docs` is the count the projection stage attached.
for record in yelp.aggregate([m, g, p], allowDiskUse=True):
    lemma, doc_freq = record['_id'], record['docs']
    print(lemma, np.log(N / doc_freq))

## Add sentiment lexicon to the index

## Deal with the logical structure of sentences
### Take into account negation using a dependency parser