In [1]:
import pymongo
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import json

In [2]:
# JSON text is UTF-8; pin the encoding so the load does not depend on the
# platform's default locale encoding.
with open('../data/yelp_tset.json', 'r', encoding='utf-8') as infile:
    T = json.load(infile)

## Weighting schemes
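- **tfidf**: TF-IDF weight of each token in each document
- **sentiwn**: average SentiWordNet polarity (positive minus negative score) over a token's synsets, sign-flipped for `NOT_`-prefixed tokens
- **combo**: TF-IDF weight × average SentiWordNet polarity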

### tfidf

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD

In [4]:
def tf_idf(data, components=200, random_state=None):
    """Compute TF-IDF weights for a set of documents and reduce them with truncated SVD.

    Parameters
    ----------
    data : dict
        Mapping of document id -> document text.
    components : int
        Number of SVD components to keep.
    random_state : int, RandomState instance or None
        Forwarded to ``TruncatedSVD``; pass an int for reproducible projections.

    Returns
    -------
    tuple
        ``(weights, reduced, docs, vectorizer, transformer)``: the TF-IDF
        matrix, its SVD projection, the document ids in row order, the fitted
        ``CountVectorizer`` and the fitted ``TfidfTransformer``.
    """
    docs = list(data)
    texts = [data[doc] for doc in docs]
    vectorizer = CountVectorizer()
    # Named `transformer` (not `tf_idf`) so the function name is not shadowed.
    transformer = TfidfTransformer(use_idf=True)
    counts = vectorizer.fit_transform(texts)
    weights = transformer.fit_transform(counts)
    svd = TruncatedSVD(n_components=components, random_state=random_state)
    return weights, svd.fit_transform(weights), docs, vectorizer, transformer

In [5]:
# One '<set name>_tfidf' entry per text set found in T.
Tx = {
    '{}_tfidf'.format(name): tf_idf(corpus, components=200)
    for name, corpus in T.items()
}

In [6]:
# Inspect the 'raw_text' entry: the tuple returned by tf_idf(), i.e.
# (TF-IDF matrix, SVD projection, document ids, CountVectorizer, TfidfTransformer).
Tx['raw_text_tfidf']

(<5000x15942 sparse matrix of type '<class 'numpy.float64'>'
 	with 351950 stored elements in Compressed Sparse Row format>,
 array([[ 0.02058127, -0.0042384 ,  0.00566932, ..., -0.02628164,
         -0.0157799 ,  0.00767678],
        [ 0.19324864,  0.02701875,  0.03589353, ..., -0.01188111,
          0.0221455 ,  0.0127169 ],
        [ 0.0775461 , -0.06638321, -0.0182733 , ...,  0.00641266,
          0.02611651,  0.01186202],
        ...,
        [ 0.20034007,  0.08485615, -0.0248902 , ..., -0.04429403,
          0.001204  ,  0.03813316],
        [ 0.24272672,  0.06623954,  0.02953674, ...,  0.00634126,
          0.02407981,  0.00464192],
        [ 0.27584979,  0.02346916, -0.01204607, ...,  0.02053717,
          0.04135146,  0.02880469]]),
 ['2674',
  '710',
  '2924',
  '1735',
  '103',
  '1780',
  '1736',
  '143',
  '85',
  '667',
  '134',
  '2943',
  '3202',
  '672',
  '1341',
  '1926',
  '4700',
  '185',
  '4703',
  '3',
  '88',
  '218',
  '563',
  '2328',
  '574',
  '576',
  '577

### sentiwn

In [7]:
from nltk.corpus import sentiwordnet as swn
from scipy.sparse import csr_matrix

In [8]:
def _sentiwn_polarity(token):
    """Return the mean SentiWordNet polarity (pos - neg) of ``token``, or 0 if it has no synsets."""
    if token.startswith('NOT_'):
        # Negated token: score the underlying word and flip the sign.
        # Note: replace() removes every 'NOT_' occurrence, not only the prefix.
        lemma, modifier = token.replace('NOT_', ''), -1
    else:
        lemma, modifier = token, 1
    synsets = list(swn.senti_synsets(lemma))
    if not synsets:
        return 0
    total = 0
    for syn in synsets:
        total += (syn.pos_score() - syn.neg_score()) * modifier
    return total / len(synsets)


def sentiwn(data, components=200, random_state=None):
    """Weight every token of every document by its average SentiWordNet polarity.

    Parameters
    ----------
    data : dict
        Mapping of document id -> whitespace-tokenised text.
    components : int
        Number of SVD components to keep.
    random_state : int, RandomState instance or None
        Forwarded to ``TruncatedSVD``; pass an int for reproducible projections.

    Returns
    -------
    tuple
        ``(csr, reduced, docs, dictionary)``: the documents x tokens sparse
        matrix, its SVD projection, the document ids in row order, and the
        token -> column index mapping. A token that occurs several times in a
        document contributes one stored entry per occurrence.
    """
    docs = list(data)
    texts = tqdm_notebook([data[k].split() for k in docs])
    # `values` holds the matrix entries; kept distinct from the `data` argument.
    indptr, indices, values, dictionary = [0], [], [], {}
    polarity_cache = {}  # token -> polarity, so each token is looked up only once
    for doc in texts:
        for token in doc:
            indices.append(dictionary.setdefault(token, len(dictionary)))
            if token not in polarity_cache:
                polarity_cache[token] = _sentiwn_polarity(token)
            values.append(polarity_cache[token])
        indptr.append(len(indices))
    csr = csr_matrix(
        (values, indices, indptr),
        shape=(len(docs), len(dictionary)),
        dtype=np.float64,
    )
    svd = TruncatedSVD(n_components=components, random_state=random_state)
    return csr, svd.fit_transform(csr), docs, dictionary

In [9]:
# Add one '<set name>_sentiwn' entry per text set.
for k, tset in T.items():
    Tx['{}_sentiwn'.format(k)] = sentiwn(tset)

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4999), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4999), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




### combo

In [10]:
def combo(case, Tx, T, components=200, random_state=None):
    """Combine the SentiWordNet and TF-IDF weights of one text set token by token.

    Each cell of the result is ``sentiwn weight * tf-idf weight`` for a
    (document, token) pair, laid out like the sentiwn matrix (same rows, same
    token -> column dictionary).

    Parameters
    ----------
    case : str
        Name of the text set; ``Tx['<case>_sentiwn']`` and ``Tx['<case>_tfidf']``
        must already exist.
    Tx : dict
        Results produced by ``sentiwn`` and ``tf_idf``.
    T : dict
        Text sets: ``T[case][doc]`` is the whitespace-tokenised text of ``doc``.
    components : int
        Number of SVD components to keep.
    random_state : int, RandomState instance or None
        Forwarded to ``TruncatedSVD``; pass an int for reproducible projections.

    Returns
    -------
    tuple
        ``(out, reduced, docs1, d1)``: the combined sparse matrix, its SVD
        projection, the document ids in row order and the token dictionary.

    Raises
    ------
    ValueError
        If a document of the sentiwn result is missing from the tf-idf result.
    """
    senti_matrix, _, docs1, d1 = Tx['{}_sentiwn'.format(case)]
    tfidf_matrix, _, docs2, d2, _ = Tx['{}_tfidf'.format(case)]
    senti_matrix = senti_matrix.tocsr()
    tfidf_matrix = tfidf_matrix.tocsr()

    # Document id -> tf-idf row (first occurrence), replacing a list.index()
    # scan for every document.
    tfidf_row_of = {}
    for row, doc in enumerate(docs2):
        tfidf_row_of.setdefault(doc, row)

    # The vectorizer normalised tokens (lower-casing by default) before
    # building its vocabulary; apply the same normalisation before the lookup,
    # otherwise tokens such as 'NOT_good' never match and get weight 0.
    normalise = d2.build_preprocessor()
    vocabulary = d2.vocabulary_

    rows, cols, vals = [], [], []
    for i, doc in tqdm_notebook(list(enumerate(docs1))):
        d2_index = tfidf_row_of.get(doc)
        if d2_index is None:
            raise ValueError('{!r} is not in list'.format(doc))
        # Densify one row at a time instead of whole matrices.
        senti_row = senti_matrix[i].toarray().ravel()
        tfidf_row = tfidf_matrix[d2_index].toarray().ravel()
        seen = set()
        for t in T[case][doc].split():
            t1_index = d1.get(t)
            if t1_index is None or t1_index in seen:
                continue
            seen.add(t1_index)
            t2_index = vocabulary.get(normalise(t))
            if t2_index is None:
                continue  # token unknown to the vectorizer: weight stays 0
            weight = senti_row[t1_index] * tfidf_row[t2_index]
            if weight != 0:
                rows.append(i)
                cols.append(t1_index)
                vals.append(float(weight))
    out = csr_matrix((vals, (rows, cols)), shape=senti_matrix.shape, dtype=np.float64)
    svd = TruncatedSVD(n_components=components, random_state=random_state)
    return out, svd.fit_transform(out), docs1, d1

In [11]:
# Add one '<set name>_combo' entry per text set; combo() reads the texts from T itself.
for k in T:
    Tx['{}_combo'.format(k)] = combo(k, Tx, T)

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4999), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4999), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




### Save

In [None]:
import pickle

In [None]:
# Keep everything but the first element of each result tuple (the full sparse matrix).
to_save = {name: list(result)[1:] for name, result in Tx.items()}

In [None]:
# Serialise the reduced representations for the training notebooks.
with open('../data/yelp_training.pkl', mode='wb') as pkl_file:
    pickle.dump(to_save, pkl_file)