# NLP: Analyzing Yelp Reviews

1. Use [**spaCy**](https://spacy.io) to process text: Tokenization, text normalization, sentence detection 
1. Apply phrase modeling: Look for words that appear one after another as multi-word concepts
1. Discover topics with LDA: Find topics for each review based on the words used
1. Visualize the topic model with [**pyLDAvis**](https://pyldavis.readthedocs.io/en/latest/readme.html) library
1. Describe text with LDA: Each review might look like Topic A and Topic B with some probabilities
1. Train a word vector model with word2vec using [**gensim**](https://radimrehurek.com/gensim/index.html) library: Predict which word will appear in a given context
1. Visualize word vectors by reducing dimensionality with t-Distributed Stochastic Neighbor Embedding or [TSNE](http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)

In [3]:
import os
import codecs

# Root folder of the Yelp dataset. Set the YELP_DATA_DIR environment variable to
# point somewhere else; without it the original hard-coded location is used.
data_directory = os.environ.get(
    'YELP_DATA_DIR',
    os.path.join('C:/Users/andre/Documents/', 'yelp_dataset_challenge_round9'))

businesses_filepath = os.path.join(data_directory, 'yelp_academic_dataset_business.json')

# Read only the first line (one JSON record) to see what a business entry looks like
with codecs.open(businesses_filepath, encoding='utf_8') as f:
    first_business_record = f.readline()

print (first_business_record)

{"business_id":"0DI8Dt2PJp07XkVvIElIcQ","name":"Innovative Vapors","neighborhood":"","address":"227 E Baseline Rd, Ste J2","city":"Tempe","state":"AZ","postal_code":"85283","latitude":33.3782141,"longitude":-111.936102,"stars":4.5,"review_count":17,"is_open":0,"attributes":["BikeParking: True","BusinessAcceptsBitcoin: False","BusinessAcceptsCreditCards: True","BusinessParking: {'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}","DogsAllowed: False","RestaurantsPriceRange2: 2","WheelchairAccessible: True"],"categories":["Tobacco Shops","Nightlife","Vape Shops","Shopping"],"hours":["Monday 11:0-21:0","Tuesday 11:0-21:0","Wednesday 11:0-21:0","Thursday 11:0-21:0","Friday 11:0-22:0","Saturday 10:0-22:0","Sunday 11:0-18:0"],"type":"business"}



In [4]:
# Raw reviews file: one JSON record per line
review_json_filepath = os.path.join(
    data_directory,
    'yelp_academic_dataset_review.json',
)

# Read just the first record to inspect the fields of a review
with codecs.open(review_json_filepath, encoding='utf_8') as f:
    first_review_record = f.readline()

print (first_review_record)

{"review_id":"NxL8SIC5yqOdnlXCg18IBg","user_id":"KpkOkG6RIf4Ra25Lhhxf1A","business_id":"2aFiy99vNLklCx3T_tGS9A","stars":5,"date":"2011-10-10","text":"If you enjoy service by someone who is as competent as he is personable, I would recommend Corey Kaplan highly. The time he has spent here has been very productive and working with him educational and enjoyable. I hope not to need him again (though this is highly unlikely) but knowing he is there if I do is very nice. By the way, I'm not from El Centro, CA. but Scottsdale, AZ.","useful":0,"funny":0,"cool":0,"type":"review"}



In [396]:
import json
import numpy as np

# Categories that disqualify a business even when it is tagged 'Health & Medical'
excluded_categories = (u'Restaurants', u'Food', u'Pets')

# Collect ids straight into a set: duplicates collapse and membership tests are fast
healthcare_ids = set()

# open the businesses file
with codecs.open(businesses_filepath, encoding='utf_8') as f:

    # iterate through each line (json record) in the file
    for business_json in f:

        # convert the json record to a Python dict
        business = json.loads(business_json)
        categories = business[u'categories']

        # if this business has no categories or is not a target entity, skip to the next one
        if categories is None or u'Health & Medical' not in categories:
            continue
        # Remove businesses in BW, Germany.
        # Compare the state code for equality: a substring test would also match
        # longer codes containing 'BW' and raises TypeError when the state is None.
        if business[u'state'] == u'BW':
            continue
        # Remove businesses that are restaurants, food and pets
        if any(category in categories for category in excluded_categories):
            continue

        # add the business id to our healthcare_ids set
        healthcare_ids.add(business[u'business_id'])

# print the number of unique ids in the dataset
print ('{:,}'.format(len(healthcare_ids)), u'health & medical entities in the dataset.')

10,211 health & medical entities in the dataset.


In [397]:
# Folder that holds every derived artifact (text files, models, corpora)
intermediate_directory = os.path.join(data_directory, 'intermediate')
# Later cells open files inside this folder in write mode, which fails if the
# folder does not exist yet; exist_ok makes re-running this cell harmless.
os.makedirs(intermediate_directory, exist_ok=True)

# Text of the selected reviews, one review per line
review_txt_filepath = os.path.join(intermediate_directory,
                                   'review_text_all.txt')

In [398]:
# Create a new file that contains only the text from reviews about healthcare entities.
# One review per line in this new file.

# Leave the condition true to (re)create the text file; make it false to only
# count the reviews in a file that was prepared earlier.
if 1 == 1:

    review_count = 0

    # create & open a new file in write mode
    with codecs.open(review_txt_filepath, 'w', encoding='utf_8') as review_txt_file:

        # open the existing review json file
        with codecs.open(review_json_filepath, encoding='utf_8') as review_json_file:

            # loop through all reviews in the existing file and convert to dict
            for review_json in review_json_file:
                review = json.loads(review_json)

                # if this review is not in the target set, skip to the next one
                if review[u'business_id'] not in healthcare_ids:
                    continue

                # write each review as a line in the new file
                # escape newline characters in the original review text
                review_txt_file.write(review[u'text'].replace('\n', '\\n') + '\n')
                review_count += 1

    print (u'''Text from {:,} healthcare reviews
              written to the new txt file.'''.format(review_count))

else:

    # Count the lines directly. This also works for an empty file (count 0) and
    # leaves review_count holding the number of reviews, as the branch above does.
    with codecs.open(review_txt_filepath, encoding='utf_8') as review_txt_file:
        review_count = sum(1 for _ in review_txt_file)

    print (u'Text from {:,} healthcare reviews in the txt file.'.format(review_count))

Text from 114,556 healthcare reviews
              written to the new txt file.


In [399]:
import spacy
import pandas as pd
import itertools as it

# Load spaCy's 'en' model; later cells call `nlp.pipe` on the review text
nlp = spacy.load('en')

In [400]:
# Pull one review (zero-based line 9005 of the text file) to use as an example
with codecs.open(review_txt_filepath, encoding='utf_8') as f:
    # islice skips to the wanted line; taking just that one avoids building a
    # list of the following ~1000 lines only to keep its first element
    sample_review = next(it.islice(f, 9005, 9006))
    # restore the line breaks that were escaped when the file was written
    sample_review = sample_review.replace('\\n', '\n')

print (sample_review)

Great dentist for kids...been going here for years. The practice is well run and managed and the dentist is super careful with the kids...they never feel any pain.



In [401]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [402]:
def punct_space(token):
    """Tell whether a token is pure punctuation or pure whitespace.

    Reads the token's `is_punct` and `is_space` attributes; callers use the
    result to decide which tokens to drop.
    """
    is_punctuation = token.is_punct
    return is_punctuation if is_punctuation else token.is_space

def person(token):
    """Tell whether a token's entity type (`ent_type_`) is 'PERSON'.

    Callers use the result to drop such tokens from the text.
    """
    entity_label = token.ent_type_
    return entity_label == 'PERSON'

def line_review(filename):
    """Lazily yield the reviews stored in `filename`, one per line.

    Each line has its escaped newlines (the two characters backslash + n)
    turned back into real line breaks, so nothing is held in memory beyond
    the current review.
    """
    with codecs.open(filename, encoding='utf_8') as review_file:
        yield from (
            escaped_review.replace('\\n', '\n')
            for escaped_review in review_file
        )

def lemmatized_sentence_corpus(filename):
    """Parse the reviews in `filename` with spaCy and yield one string per sentence.

    Each yielded string is the space-joined lemmas of a sentence, leaving out
    tokens flagged by `punct_space` or `person`.
    """
    parsed_reviews = nlp.pipe(line_review(filename), batch_size=10000, n_threads=4)
    for parsed_review in parsed_reviews:
        for sent in parsed_review.sents:
            kept_lemmas = []
            for token in sent:
                if punct_space(token) or person(token):
                    continue
                kept_lemmas.append(token.lemma_)
            yield u' '.join(kept_lemmas)

In [403]:
# Output file for the lemmatized sentences (one sentence per line)
unigram_sentences_filepath = os.path.join(
    intermediate_directory,
    'unigram_sentences_all.txt',
)

In [404]:
# Split every review into sentences, normalize the text, and stream the result
# to disk one sentence per line so the whole corpus never has to sit in RAM
if 1 == 1:
    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        lemmatized_sentences = lemmatized_sentence_corpus(review_txt_filepath)
        for sentence in lemmatized_sentences:
            f.write(sentence + '\n')

In [405]:
# gensim's LineSentence class takes the format: one sentence = one line,
# with words already preprocessed and separated by whitespace.
# The resulting object is iterated several times in the cells below.
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [406]:
# Spot-check three unigram sentences (zero-based positions 2710-2712)
unigram_sample = it.islice(unigram_sentences, 2710, 2713)
for unigram_sentence in unigram_sample:
    print (unigram_sentence)

['doc', 'walk', 'in', 'and', '-PRON-', 'realize', '-PRON-', 'work', 'on', '-PRON-', 'awhile', 'back', 'when', '-PRON-', 'be', 'both', 'ad']
['-PRON-', 'be', 'a', 'major', 'then']
['small', 'world']


In [407]:
# Where the trained bigram phrase model is saved and reloaded from
bigram_model_filepath = os.path.join(
    intermediate_directory,
    'bigram_model_all',
)

In [408]:
# Phrase model that links two-word phrases together.
# Flip the flag to False to reuse the model already saved on disk.
train_bigram_model = True
if not train_bigram_model:
    bigram_model = Phrases.load(bigram_model_filepath)
else:
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(bigram_model_filepath)

In [409]:
# Output file for sentences after the bigram model has been applied
bigram_sentences_filepath = os.path.join(
    intermediate_directory,
    'bigram_sentences_all.txt',
)

In [410]:
# Run every unigram sentence through the bigram model and write the result
# to a new text file, one sentence per line
if 1 == 1:
    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf-8') as f:
        for unigram_sentence in unigram_sentences:
            bigram_tokens = bigram_model[unigram_sentence]
            bigram_sentence = u' '.join(bigram_tokens)
            f.write(bigram_sentence + '\n')

In [411]:
# Sentence stream over the bigram-transformed text file (one sentence per line)
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [412]:
# Spot-check the same three positions (2710-2712) after bigram modeling
bigram_sample = it.islice(bigram_sentences, 2710, 2713)
for bigram_sentence in bigram_sample:
    print(bigram_sentence)

['doc', 'walk', 'in', 'and', '-PRON-', 'realize', '-PRON-', 'work', 'on', '-PRON-', 'awhile', 'back', 'when', '-PRON-', 'be', 'both', 'ad']
['-PRON-', 'be', 'a', 'major', 'then']
['small', 'world']


In [413]:
# Where the trained trigram phrase model is saved and reloaded from
trigram_model_filepath = os.path.join(
    intermediate_directory,
    'trigram_model_all',
)

In [414]:
# Second-order phrase model, trained on the bigram sentences.
# Flip the flag to False to reuse the model already saved on disk.
train_trigram_model = True
if not train_trigram_model:
    trigram_model = Phrases.load(trigram_model_filepath)
else:
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(trigram_model_filepath)

In [415]:
# Output file for sentences after the trigram model has been applied
trigram_sentences_filepath = os.path.join(
    intermediate_directory,
    'trigram_sentences_all.txt',
)

In [416]:
# Run every bigram sentence through the trigram model and write the result
# to a new text file, one sentence per line
if 1 == 1:
    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for bigram_sentence in bigram_sentences:
            trigram_tokens = trigram_model[bigram_sentence]
            trigram_sentence = u' '.join(trigram_tokens)
            f.write(trigram_sentence + '\n')

In [417]:
# Sentence stream over the trigram-transformed text file (one sentence per line)
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [418]:
# Spot-check the same three positions (2710-2712) after trigram modeling
trigram_sample = it.islice(trigram_sentences, 2710, 2713)
for trigram_sentence in trigram_sample:
    print(trigram_sentence)

['doc', 'walk', 'in', 'and', '-PRON-', 'realize', '-PRON-', 'work', 'on', '-PRON-', 'awhile', 'back', 'when', '-PRON-', 'be', 'both', 'ad']
['-PRON-', 'be', 'a', 'major', 'then']
['small', 'world']


In [419]:
# Output file for the fully transformed reviews (one review per line)
trigram_reviews_filepath = os.path.join(
    intermediate_directory,
    'trigram_transformed_reviews_all.txt',
)

In [420]:
# Write a transformed text into a new file, with one review per line
if 1 == 1:
    # Resolve the stop-word collection once; it does not change between reviews
    stop_words = spacy.en.language_data.STOP_WORDS
    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf-8') as f:
        for parsed_review in nlp.pipe(line_review(review_txt_filepath), batch_size=10000, n_threads=4):
            # Lemmatize the text, removing punctuation, whitespace and PERSON entities
            unigram_review = [token.lemma_ for token in parsed_review if not (punct_space(token) or person(token))]
            # Apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_terms = trigram_model[bigram_review]
            # Remove any remaining stopwords (done after phrase modeling)
            trigram_terms = [term for term in trigram_terms if term not in stop_words]
            # Write the transformed review as a new line
            trigram_review = u' '.join(trigram_terms)
            f.write(trigram_review + '\n')

In [421]:
# Show the review at zero-based position 50 before and after the transformation
print (u'Original:' + u'\n')
original_sample = it.islice(line_review(review_txt_filepath), 50, 51)
for review in original_sample:
    print(review)

print(u'Transformed:' + u'\n')
with codecs.open(trigram_reviews_filepath, encoding='utf-8') as f:
    transformed_sample = it.islice(f, 50, 51)
    for review in transformed_sample:
        print(review)

Original:

Buyer beware! This company apparently cares far more about their office's revenue than their patients' mental health. 

I recently needed a prescription refilled. According to the last conversation I had w/ my therapist (whom I adore, pity she is associated with this place), she indicated I would not need a return visit - unless I needed to see her, our sessions were wrapped up and the prescription would continue.

I requested a refill via my pharmacy on Monday 3/17, which was 'authorization required', and the pharmacy sent the request to DV Counseling the same day. I needed to get this refilled prior to Thurs. 3/20 because I was going out of town on 3/21. When I hadn't heard anything by the 20th, I followed up with their office THREE TIMES that day - morning, mid day, mid afternoon, and let them know of the urgency.  On 3/24, I still had not received a call back or a prescription refill, so I called to inquire again and left yet another message. And again after no call back

In [422]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis, pyLDAvis.gensim
import warnings
import pickle

In [423]:
# Where the gensim Dictionary built from the transformed reviews is stored
trigram_dictionary_filepath = os.path.join(
    intermediate_directory,
    'trigram_dict_all.dict',
)

In [448]:
# Learn the full vocabulary of the corpus, using gensim's Dictionary class
if 1 == 1:
    trigram_reviews = LineSentence(trigram_reviews_filepath)
    # Learn the dictionary by iterating over all the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    # Filter out tokens that are too rare or too common
    trigram_dictionary.filter_extremes(no_below=200, no_above=0.3)
    # Candidate extra stopwords that carry little meaning in reviews.
    # NOTE: the filter_tokens call below is commented out, so these ids are only
    # collected here; the words are NOT removed from the dictionary.
    stoplist = set('feel tell come like'.split())
    # Only look up words that survived filter_extremes; indexing token2id with a
    # word that was already dropped would raise KeyError.
    stop_ids = [trigram_dictionary.token2id[stopword]
                for stopword in stoplist
                if stopword in trigram_dictionary.token2id]
    #trigram_dictionary.filter_tokens(stop_ids)
    # Reassign integer ids
    trigram_dictionary.compactify()
    trigram_dictionary.save(trigram_dictionary_filepath)

# Always continue from the copy stored on disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

In [449]:
# Where the serialized bag-of-words corpus is stored
trigram_bow_filepath = os.path.join(
    intermediate_directory,
    'trigram_bow_corpus_all.mm',
)

In [450]:
def trigram_bow_generator(filepath):
    """Yield a bag-of-words representation for each review in `filepath`.

    Reviews are read through `LineSentence` (one review per line) and turned
    into bag-of-words form with the module-level `trigram_dictionary`.
    """
    yield from (
        trigram_dictionary.doc2bow(review_tokens)
        for review_tokens in LineSentence(filepath)
    )

In [451]:
# Build the bag-of-words corpus on disk, then open it for the modeling steps
if 1 == 1:
    # Stream a bag-of-words representation of every review into the matrix file
    bow_stream = trigram_bow_generator(trigram_reviews_filepath)
    MmCorpus.serialize(trigram_bow_filepath, bow_stream)

trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

In [452]:
# Where the trained LDA model is saved and reloaded from
lda_model_filepath = os.path.join(
    intermediate_directory,
    'lda_model_all',
)

In [453]:
# To build the topic model, pass the bag-of-words matrix and Dictionary to LdaMulticore as inputs
if 1 == 1:
    # Training settings, gathered in one place so they are easy to tune
    lda_settings = dict(
        num_topics=30,
        id2word=trigram_dictionary,
        workers=3,  # workers = # of cores minus one
        iterations=300,
        passes=5,
        random_state=0,
    )
    # Silence warnings raised while the model is being trained
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(trigram_bow_corpus, **lda_settings)
    lda.save(lda_model_filepath)

# Always continue from the copy stored on disk
lda = LdaMulticore.load(lda_model_filepath)

In [454]:
def explore_topic(topic_number, topn=10):
    """Print the `topn` top terms of one topic of the module-level `lda` model.

    Output is a header line, a blank line, one 'term  frequency' row per term
    (frequency shown with three decimals), and a closing line holding a space.
    """
    header = u'{:20} {}'.format(u'Topic ' + str(topic_number), u'Frequency')
    print(header + u'\n')

    top_terms = lda.show_topic(topic_number, topn)
    for term, frequency in top_terms:
        row = u'{:20} {:.3f}'.format(term, round(frequency, 3))
        print(row)

    print(u' ')

In [455]:
# Where the prepared pyLDAvis data is pickled
LDAvis_data_filepath = os.path.join(
    intermediate_directory,
    'ldavis_prepared',
)

In [456]:
# Prepare the pyLDAvis data and cache it on disk; make the condition false to
# reuse the cached copy instead of recomputing it.
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus, trigram_dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    # Without this branch LDAvis_prepared would be undefined when the toggle is off.
    # Only unpickle files this notebook wrote itself: pickle can execute arbitrary code.
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

In [457]:
# Render the prepared topic-model visualization inline in the notebook
pyLDAvis.display(LDAvis_prepared)

In [458]:
# Also export the visualization to an HTML file (relative path, so it lands in the working directory)
pyLDAvis.save_html(LDAvis_prepared, 'LDAvis30a.html')

In [459]:
# List the topics with their three highest-weighted terms each (30 topics requested)
lda.show_topics(30,3)

[(0, '0.080*"friendly" + 0.045*"great" + 0.042*"professional"'),
 (1, '0.125*"review" + 0.038*"experience" + 0.030*"read"'),
 (2, '0.100*"clinic" + 0.048*"dr" + 0.021*"med"'),
 (3, '0.024*"process" + 0.019*"visit" + 0.019*"test"'),
 (4, '0.082*"son" + 0.069*"child" + 0.067*"question"'),
 (5, '0.034*"place" + 0.027*"like" + 0.022*"people"'),
 (6, '0.092*"insurance" + 0.064*"pay" + 0.045*"bill"'),
 (7, '0.041*"tell" + 0.035*"office" + 0.034*"doctor"'),
 (8, '0.026*"baby" + 0.022*"like" + 0.021*"know"'),
 (9, '0.102*"doctor" + 0.065*"dr." + 0.047*"office"'),
 (10, '0.078*"treatment" + 0.027*"groupon" + 0.023*"money"'),
 (11, '0.112*"massage" + 0.029*"place" + 0.021*"feel"'),
 (12, '0.061*"hospital" + 0.060*"nurse" + 0.029*"er"'),
 (13, '0.076*"pain" + 0.023*"dr." + 0.022*"feel"'),
 (14, '0.037*"tell" + 0.017*"ask" + 0.015*"leave"'),
 (15, '0.073*"eye" + 0.067*"glass" + 0.030*"frame"'),
 (16, '0.103*"husband" + 0.048*"family" + 0.048*"friend"'),
 (17, '0.075*"surgery" + 0.070*"dr." + 0.052

In [377]:
# Look up the integer id the dictionary assigned to the token 'question'
trigram_dictionary.token2id['question']

499

In [379]:
# Reverse lookup: map an integer id back to its token.
# 499 is the id shown for 'question' in the lookup above; it will differ if the dictionary is rebuilt.
trigram_dictionary[499]

'question'