In [1]:
# code for loading the format for the notebook; the working directory is
# switched to '../notebook_format' before importing `formats`
# (presumably because that is where the module lives - TODO confirm)
import os

# path : store the current path to convert back to it later,
# the next cell calls os.chdir(path) to restore it
path = os.getcwd()
os.chdir('../notebook_format')
from formats import load_style
load_style()

In [2]:
os.chdir(path)
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8, 6 # change default figure size

# 1. magic to print version
# 2. magic so that the notebook will reload external python modules
%load_ext watermark
%load_ext autoreload 
%autoreload 2

# python -m pip install pymongo
import json
import gensim
from gensim import corpora
from gensim.models import LdaModel
import pymongo
from tqdm import tqdm_notebook # progress bar

# remove stop words built in with sklearn and nltk and punctuation marks; 
# emoticons are not removed (hopefully)
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

# merge nltk's and sklearn's english stop words with `|` (set union), then
# add string.punctuation, which only holds english punctuation marks
STOPWORDS = set( stopwords.words('english') ) | set(ENGLISH_STOP_WORDS)
STOPLIST  = STOPWORDS | set(string.punctuation)

# loading the spacy nlp english model takes a moment;
# `parser` is later used as the default `parser` argument of upload_to_mongo
import spacy
parser = spacy.load('en')

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib,pymongo,tqdm,spacy,nltk,scikit-learn,gensim

Ethen 2016-08-14 13:23:19 

CPython 3.5.2
IPython 4.2.0

numpy 1.11.1
pandas 0.18.1
matplotlib 1.5.1
pymongo 3.3.0
tqdm 4.8.2
spacy 0.101.0
nltk 3.2.1
scikit-learn 0.17.1
gensim 0.13.1


## Preprocessing

In [3]:
FILE = 'yelp_dataset_challenge_academic_dataset'
DATASET = 'yelp_academic_dataset_review.json'
DATASET_PATH = os.path.join( FILE, DATASET )

# peek at the data: read the first line of the file
# and print it after parsing it as json
with open(DATASET_PATH) as f:
    first_line = f.readline()

print( json.loads(first_line) )

{'text': 'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.', 'stars': 4, 'date': '2012-08-01', 'type': 'review', 'votes': {'cool': 0, 'useful': 0, 'funny': 0}, 'review_id': 'Ya85v4eqdd6k9Od8HbQjyA', 'business_id': '5UmKMjUEUNdYWqANhGckJw', 'user_id': 'PUFPaY9KxDAcGqfsorJp3Q'}


In [4]:
def count_file_len(fname):
    """
    count the total number of lines (minus 1) for a given file,
    i.e. the zero-based index of the file's last line
    http://stackoverflow.com/questions/845058/how-to-get-line-count-cheaply-in-python

    fname: path of the file whose lines are counted

    returns -1 for an empty file (instead of failing with an
    UnboundLocalError because the loop variable was never assigned)
    """
    # start at -1 so that the result is still defined when the
    # loop body never runs, which is the case for an empty file
    last_index = -1
    with open(fname) as f:
        for last_index, _ in enumerate(f):
            pass
    return last_index

In [5]:
def upload_to_mongo( N, dataset_path = DATASET_PATH, stop_list = STOPLIST, parser = parser ):
    """
    read the reviews from the json file and store them in the
    'Review' collection of a MongoDB database called 'Yelp';
    nothing is uploaded when the collection already holds documents

    N: specify the number of data to read
    dataset_path: path of the json file, one review is read per line
    stop_list: tokens that are dropped from the cleaned review text
    parser: callable (spacy model) that turns the review text into tokens

    returns the database and the collection
    """
    conn = pymongo.MongoClient()
    db = conn['Yelp']
    collection = db['Review']

    # the collection was already populated, hand it back untouched
    if collection.count():
        return db, collection

    # never ask for more reviews than the length
    # that count_file_len reports for the file
    N = min( N, count_file_len(dataset_path) )

    with open(dataset_path) as f:
        for _ in tqdm_notebook( range(N) ):
            review = json.loads( next(f) )

            # run the stripped text through the `parser`, take the lemma_
            # of every token and keep the ones that are not part of
            # the pre-specified stop list (stop words, punctuation marks)
            text = review['text'].strip()
            lemmas = ( token.lemma_ for token in parser(text) )
            text_cleaned = ' '.join( lemma for lemma in lemmas if lemma not in stop_list )

            collection.insert_one({
                'review_id': review['review_id'],
                'business_id': review['business_id'],
                'stars': review['stars'],
                'text': text,
                'text_cleaned': text_cleaned
            })

    return db, collection

Before calling `upload_to_mongo`, make sure that a MongoDB instance is already running; one can be started by typing

```bash
mongod
```
in the command line.

We can remove the collection and re-upload the whole thing again using

```python
db['Review'].drop()
```

In [6]:
# upload the reviews (or reuse the existing collection) and confirm
# that the collection holds exactly the specified number of documents
number_of_data_to_read = 100000
db, collection = upload_to_mongo(number_of_data_to_read)
assert number_of_data_to_read == collection.count()

In [7]:
class MyCorpus:
    """
    gensim corpus that is built in a streaming fashion: the
    bag-of-words vectors are produced one document at a time

    cursor: rewindable iterable of documents with a 'text_cleaned' field
    corpora_dict: object whose doc2bow converts a list of tokens
    """
    def __init__( self, cursor, corpora_dict ):
        self.corpora_dict = corpora_dict
        self.cursor = cursor

    def __iter__(self):
        # go back to the first document, otherwise the corpus
        # could only be looped over a single time
        self.cursor.rewind()
        yield from (
            self.corpora_dict.doc2bow( doc['text_cleaned'].split() )
            for doc in self.cursor
        )

In [17]:
cursor = collection.find()

# build the id2word dictionary from the tokenized reviews and filter out
# words that appear in less than 2 documents; the other filter_extremes
# arguments are left at their defaults
tokenized_reviews = ( review['text_cleaned'].split() for review in cursor )
corpora_dict = corpora.Dictionary(tokenized_reviews)
corpora_dict.filter_extremes(no_below = 2)
corpora_dict.compactify()
print( 'number of unique tokens: ', len(corpora_dict) )

# create the corpus (map the word to id) using the built dictionary,
# it streams the reviews from the same cursor
corpus = MyCorpus( cursor, corpora_dict )

number of unique tokens:  38407


Before building up the corpus, we can use the following piece of code to keep only the `keep_n` most frequent words, in case we want to limit the size of our vocabulary.

```python
# specify the number for keep_n
corpora_dict.filter_extremes(keep_n = 10000)
corpora_dict.compactify()
```

For more info, check the [gensim Dictionary API](https://radimrehurek.com/gensim/corpora/dictionary.html).

In [18]:
def save_corpus_and_corpora_dict( corpus, corpora_dict, dir_name ):
    """
    Creates a `dir_name` directory (together with any missing
    parent directory) and store corpus and corpora dict in it.
    A directory that already exists is reused.

    corpus: corpus that is serialized with corpora.BleiCorpus
    corpora_dict: dictionary that is saved with corpora.Dictionary.save
    dir_name: directory in which both files are written

    Returns the corpus's path and the dictionary's path
    so we can load them later for training models
    """
    # makedirs with exist_ok replaces the isdir check followed by mkdir:
    # it also works for nested paths and does not fail when the directory
    # gets created between the check and the creation
    os.makedirs( dir_name, exist_ok = True )

    corpora_dict_path = os.path.join( dir_name, 'corpora_dict.dict' )
    corpora.Dictionary.save( corpora_dict, corpora_dict_path )

    corpus_path = os.path.join( dir_name, 'corpus.lda-c' )
    corpora.BleiCorpus.serialize( corpus_path, corpus )
    return corpus_path, corpora_dict_path

In [19]:
# save the dictionary and the corpus inside `dir_name`,
# then load the serialized corpus back from its path
dir_name = 'model'
corpus_path, corpora_dict_path = save_corpus_and_corpora_dict(
    corpus = corpus, corpora_dict = corpora_dict, dir_name = dir_name )
corpus_loaded = corpora.BleiCorpus(corpus_path)

## Sentiment

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer

In [21]:
# turn the gensim corpus into a sparse matrix; the result
# has to be transposed so that every row is one data point
X = gensim.matutils.corpus2csc(corpus_loaded).transpose()
X.shape

(100000, 38407)

In [22]:
# reviews with 3 stars or fewer are considered negative and are
# labeled as 0, reviews with more than 3 stars are labeled as 1
cursor.rewind()
stars = np.array([ review['stars'] for review in cursor ])
y = ( stars > 3 ).astype(stars.dtype)
y.shape

(100000,)

In [23]:
# fit the tf-idf transformer on the count matrix
# and then re-weight that same matrix with it
tfidf = TfidfTransformer()
tfidf.fit(X)
X_tfidf = tfidf.transform(X)
X_tfidf.shape

(100000, 38407)

In [24]:
# fit a logistic regression on the tf-idf features and the 0/1 labels
logreg = LogisticRegression(n_jobs = -1)
logreg.fit( X = X_tfidf, y = y )

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
# NOTE: X_tfidf and y are the very same data that the model was fitted on,
# so this number is the training accuracy and not a held-out estimate
y_pred = logreg.predict(X_tfidf)
accuracy_score( y_true = y, y_pred = y_pred )

0.88675000000000004

## Reference

- [Blog: Predicting what user reviews are about with LDA and gensim](http://www.vladsandulescu.com/topic-prediction-lda-user-reviews/)
- [Github to the blog post above](https://github.com/vladsandulescu/topics)
- [Gensim Tutorial: Corpora and Vector Spaces](https://radimrehurek.com/gensim/tut1.html)

- [StackOverflow: Stemmers vs Lemmatizers](http://stackoverflow.com/questions/17317418/stemmers-vs-lemmatizers)