In [1]:
import json

import numpy as np
import pandas as pd
import pymongo
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

In [2]:
# Load the prepared text sets. T maps a set name to a {document id: text}
# dict (that is how the cells below consume it). The encoding is given
# explicitly so decoding does not depend on the platform's default locale.
with open('../data/yelp_tset.json', 'r', encoding='utf-8') as infile:
    T = json.load(infile)

## Weighting schemes
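- **tfidf**: TF-IDF term weights
- **sentiwn**: SentiWordNet polarity of each token, i.e. the average of (positive score − negative score) over the token's synsets; the sign is flipped for tokens prefixed with `NOT_`
- **combo**: TF-IDF weight × SentiWordNet polarity, computed per token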

### tfidf

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD

In [4]:
def tf_idf(data, components=200, random_state=None):
    """Build a TF-IDF matrix for a document collection and reduce it with SVD.

    Parameters
    ----------
    data : dict
        Maps a document id to its text (one string per document).
    components : int, optional
        Number of dimensions kept by ``TruncatedSVD``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``TruncatedSVD``. Pass an integer to make the reduced
        matrix reproducible between runs; the default (``None``) keeps the
        previous, unseeded behaviour.

    Returns
    -------
    tuple
        ``(weights, reduced, docs, vectorizer, transformer)`` where
        ``weights`` is the sparse TF-IDF matrix, ``reduced`` is its
        SVD projection, ``docs`` lists the document ids in row order, and
        ``vectorizer`` / ``transformer`` are the fitted ``CountVectorizer``
        and ``TfidfTransformer``.
    """
    # Freeze the key order once so that matrix rows and `docs` stay aligned.
    docs = list(data)
    texts = [data[doc] for doc in docs]

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer(use_idf=True)
    counts = vectorizer.fit_transform(texts)
    weights = transformer.fit_transform(counts)

    svd = TruncatedSVD(n_components=components, random_state=random_state)
    return weights, svd.fit_transform(weights), docs, vectorizer, transformer

In [5]:
# Tx collects every representation, keyed "<set name>_<scheme>".
# Start it off with the TF-IDF result of each text set in T.
Tx = {}
for k in T:
    tset = T[k]
    Tx['{}_tfidf'.format(k)] = tf_idf(tset, components=200)

In [6]:
# Inspect one entry. Each "*_tfidf" value is the 5-tuple returned by tf_idf:
# (sparse TF-IDF matrix, SVD-reduced array, document ids,
#  fitted CountVectorizer, fitted TfidfTransformer).
Tx['raw_text_tfidf']

(<5000x15941 sparse matrix of type '<class 'numpy.float64'>'
 	with 351949 stored elements in Compressed Sparse Row format>,
 array([[ 0.11718984, -0.04305845,  0.02964049, ..., -0.08335055,
         -0.00749146, -0.00362476],
        [ 0.23313605, -0.10159979,  0.00855815, ...,  0.00563066,
          0.01971707, -0.02069645],
        [ 0.33740785,  0.15747412, -0.12231325, ..., -0.00653974,
         -0.03229353, -0.00732497],
        ...,
        [ 0.40064878,  0.01901839,  0.06522342, ...,  0.01159464,
         -0.00321371, -0.04169511],
        [ 0.26785394, -0.15032798,  0.00865237, ..., -0.03048094,
         -0.02308914,  0.00787002],
        [ 0.13334714, -0.03210702, -0.01240796, ..., -0.01018819,
          0.1186839 , -0.00968295]]),
 ['4126',
  '3873',
  '2827',
  '2822',
  '2858',
  '3264',
  '441',
  '4528',
  '3614',
  '4306',
  '754',
  '1268',
  '4793',
  '4297',
  '172',
  '2003',
  '1667',
  '3915',
  '4945',
  '838',
  '4061',
  '2230',
  '4823',
  '1146',
  '4206',
  

### sentiwn

In [7]:
from nltk.corpus import sentiwordnet as swn
from scipy.sparse import csr_matrix

In [8]:
def _sentiwn_token_score(token):
    """Return the mean SentiWordNet polarity of ``token`` (0 if it is unknown).

    Polarity of a synset is ``pos_score() - neg_score()``. A token carrying
    the ``NOT_`` marker is looked up without the marker and its polarity is
    negated.
    """
    if token.startswith('NOT_'):
        lemma, modifier = token.replace('NOT_', ''), -1
    else:
        lemma, modifier = token, 1
    synsets = list(swn.senti_synsets(lemma))
    if not synsets:
        # Token has no SentiWordNet entry: treat it as neutral.
        return 0
    total = sum((syn.pos_score() - syn.neg_score()) * modifier for syn in synsets)
    return total / len(synsets)


def sentiwn(data, components=200, random_state=None):
    """Build a document x token matrix of SentiWordNet polarities plus its SVD.

    Parameters
    ----------
    data : dict
        Maps a document id to its text; the text is tokenised on whitespace.
    components : int, optional
        Number of dimensions kept by ``TruncatedSVD``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``TruncatedSVD``; ``None`` (default) keeps the previous,
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(csr, reduced, docs, dictionary)``: the sparse polarity matrix, its
        SVD projection, the document ids in row order, and the mapping from
        token to column index.
    """
    docs = list(data)
    tokenized = [data[doc].split() for doc in docs]

    indptr, indices, values = [0], [], []
    dictionary = {}  # token -> column index, assigned in order of first appearance
    score_cache = {}  # token -> polarity, so each distinct token is looked up once
    for doc_tokens in tqdm(tokenized):
        for token in doc_tokens:
            indices.append(dictionary.setdefault(token, len(dictionary)))
            if token not in score_cache:
                score_cache[token] = _sentiwn_token_score(token)
            # One entry per occurrence: scipy adds up duplicate (row, column)
            # entries, so a repeated token contributes its score repeatedly.
            values.append(score_cache[token])
        indptr.append(len(indices))

    csr = csr_matrix(
        (values, indices, indptr),
        shape=(len(docs), len(dictionary)),
        dtype=np.float64,
    )
    svd = TruncatedSVD(n_components=components, random_state=random_state)
    return csr, svd.fit_transform(csr), docs, dictionary

In [9]:
# Add the SentiWordNet representation of every text set to Tx.
for k in T:
    tset = T[k]
    Tx['{}_sentiwn'.format(k)] = sentiwn(tset)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




### combo

In [10]:
def combo(case, Tx, T, components=200, random_state=None):
    """Combine the stored SentiWordNet and TF-IDF weights of one text set.

    For every token of every document the result holds
    ``sentiwn weight * tfidf weight`` in the token's SentiWordNet column.

    Parameters
    ----------
    case : str
        Name of the text set; ``Tx['<case>_sentiwn']`` and
        ``Tx['<case>_tfidf']`` must already exist.
    Tx : dict
        Holds the previously computed representations.
    T : dict
        Maps a set name to a ``{document id: text}`` dict.
    components : int, optional
        Number of dimensions kept by ``TruncatedSVD``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``TruncatedSVD``; ``None`` (default) keeps the previous,
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(out, reduced, docs1, d1)``: the sparse combined matrix (same shape
        as the SentiWordNet matrix), its SVD projection, the document ids in
        row order, and the token -> column mapping of the SentiWordNet matrix.
    """
    senti_csr, _, docs1, d1 = Tx['{}_sentiwn'.format(case)]
    tfidf_csr, _, docs2, d2, _ = Tx['{}_tfidf'.format(case)]

    tfidf_vocab = d2.vocabulary_
    # The vectorizer normalises text before building its vocabulary (by
    # default it lower-cases), so raw tokens such as "NOT_good" must go through
    # the same preprocessing or they never match and silently get weight 0.
    # Tokens the vectorizer's tokenizer would split or drop still miss.
    normalize = d2.build_preprocessor()
    # Row lookup built once instead of a linear docs2.index() call per document.
    tfidf_row_of = {doc: row for row, doc in enumerate(docs2)}

    rows, cols, vals = [], [], []
    for i, doc in tqdm(list(enumerate(docs1))):
        # Densify one row at a time; converting the whole matrices would need
        # several n_docs x vocabulary float arrays in memory at once.
        senti_row = senti_csr[i].toarray().ravel()
        tfidf_row = tfidf_csr[tfidf_row_of[doc]].toarray().ravel()

        row_weights = {}  # column -> value; repeated tokens overwrite, not add
        for t in T[case][doc].split():
            t1_index = d1.get(t)
            if t1_index is None:
                continue
            t2_index = tfidf_vocab.get(normalize(t))
            tw = tfidf_row[t2_index] if t2_index is not None else 0
            row_weights[t1_index] = senti_row[t1_index] * tw

        for col, weight in row_weights.items():
            if weight != 0:  # store non-zero entries only
                rows.append(i)
                cols.append(col)
                vals.append(weight)

    out = csr_matrix((vals, (rows, cols)), shape=senti_csr.shape, dtype=np.float64)
    svd = TruncatedSVD(n_components=components, random_state=random_state)
    return out, svd.fit_transform(out), docs1, d1

In [11]:
# Add the combined (TF-IDF x SentiWordNet) representation of every text set.
# combo() reads the texts from T itself, so only the set name is needed here.
for k in T:
    Tx['{}_combo'.format(k)] = combo(k, Tx, T)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




### Save

In [12]:
import pickle

In [13]:
# Prepare the payload to persist: every Tx entry minus its first element
# (the full sparse matrix); the remaining elements are kept in a list.
to_save = {}
for k in Tx:
    v = Tx[k]
    to_save[k] = list(v[1:])

In [14]:
# Write the prepared payload to disk as a pickle (binary mode is required).
with open('../data/yelp_training.pkl', mode='wb') as out:
    pickle.dump(to_save, out)