In [104]:
import pandas as pd
import numpy as np
import sqlalchemy as sqla
import random
import spacy
from spacy.util import minibatch, compounding
import time

# Sentiment Analysis

**It's important to use sentiments to evaluate the review of business. Although 'stars' seems to be a sufficient estimator, it lacks objectivity. A person can give different stars under his/her certain mood even though the actual quality of the business is constant. Thus, bringing up sentiment into account is a good way of measuring the actually quality of the business**

In [2]:
# construct connection to database
conn = sqla.create_engine('sqlite:///yelp.sqlite')

In [3]:
# query data from database
test = pd.read_sql_query("select * from review limit 20",conn)

In [156]:
test['text'][0]

"The pizza was okay. Not the best I've had. I prefer Biaggio's on Flamingo / Fort Apache. The chef there can make a MUCH better NY style pizza. The pizzeria @ Cosmo was over priced for the quality and lack of personality in the food. Biaggio's is a much better pick if youre going for italian - family owned, home made recipes, people that actually CARE if you like their food. You dont get that at a pizzeria in a casino. I dont care what you say..."

## Textblob classifier

In [93]:
%%time
# Default estimator
m = test['text'][2]
import textblob
from textblob import TextBlob
senti = TextBlob(m)
print(senti.sentiment)

Sentiment(polarity=-0.5333333333333333, subjectivity=0.8)
CPU times: user 1.4 ms, sys: 1.02 ms, total: 2.42 ms
Wall time: 3.86 ms


In [6]:
%%time
# NaiveBayes estimator:
from textblob.sentiments import NaiveBayesAnalyzer
senti = TextBlob(m,analyzer=NaiveBayesAnalyzer())
print(senti.sentiment)

Sentiment(classification='pos', p_pos=0.9136346853028313, p_neg=0.08636531469717702)
CPU times: user 4.15 s, sys: 514 ms, total: 4.66 s
Wall time: 5.23 s


**textblob doesn't have a acceptable accuracy especially for food review. Naivebayes is slow and inaccurate. I did a little research on how textblob calculates the sentiment. It turns out they have a XML file that contains polarity score for each words, and the overall polarity score is just the average of polarity scores of each word (Link: https://planspace.org/20150607-textblob_sentiment/ ). This is a poor way of estimating the sentiment score. Consider training my own NLP model.**

# SpaCy Text Categorizer    
I will train the model with Amazon's food review

In [9]:
amz_review = pd.read_csv('Amazon_Reviews.csv')
amz_review.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [17]:
# rescale score between -1 and 1
norm_score = np.interp(amz_review.Score.values,(amz_review.Score.values.min(),amz_review.Score.values.max()),(-1,+1))

In [18]:
amz_review['norm_score'] = norm_score

In [64]:
amz_review['train_tuple'] = amz_review.apply(lambda row: (row['Text'],row['Score']),axis=1)
train = amz_review['train_tuple'].tolist()
train[0]

('I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
 5)

In [70]:
#functions from spacy documentation
def load_data(limit=0, split=0.8):
    train_data = train
    np.random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': y >= 3} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}


In [71]:
#("Number of texts to train from","t" , int)
n_texts=300000
#You can increase texts count if you have more computational power.
#("Number of training iterations", "n", int))
n_iter=10

In [72]:
nlp = spacy.load('en_core_web_sm')  # create english Language class

In [73]:
# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe('textcat')

# add label to text classifier
textcat.add_label('POSITIVE')

# load the dataset
print("Loading food reviews data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
print("Using {} examples ({} training, {} evaluation)"
      .format(n_texts, len(train_texts), len(dev_texts)))
train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))

Loading food reviews data...
Using 30000 examples (24000 training, 6000 evaluation)


In [77]:
%%time
# Training
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    for i in range(n_iter):
        start = time.time()
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        stop = time.time()
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))
        print('time for one iteration is {}'.format(stop-start))


Training the model...
LOSS 	  P  	  R  	  F  
204.768	0.910	0.983	0.945
time for one iteration is 155.03080892562866
123.836	0.925	0.975	0.949
time for one iteration is 178.083074092865
94.492	0.930	0.973	0.951
time for one iteration is 176.36578583717346
84.068	0.934	0.969	0.951
time for one iteration is 181.42677283287048
74.010	0.934	0.969	0.951
time for one iteration is 185.3597228527069
68.926	0.933	0.969	0.950
time for one iteration is 188.23954820632935
64.172	0.935	0.966	0.950
time for one iteration is 177.8659210205078
60.054	0.935	0.968	0.951
time for one iteration is 180.7558081150055
63.498	0.935	0.968	0.951
time for one iteration is 178.3223419189453
57.024	0.936	0.968	0.951
time for one iteration is 174.05425381660461
CPU times: user 48min 14s, sys: 1min 54s, total: 50min 9s
Wall time: 29min 35s


In [92]:
doc = nlp(test['text'][2])
doc.cats

{'POSITIVE': 0.0005693367565982044}

In [80]:
test_text2 = 'This tea is fun to watch as the flower expands in the water. Very smooth taste and can be used again and again in the same day. If you love tea, you gotta try these "flowering teas"'
doc2 = nlp(test_text2)
doc2.cats

{'POSITIVE': 0.9758155345916748}

**The trained model doesn't perform well as well. Have to train on more datasets and see the result. Now, try google NLP api**

# Google NLP API

In [98]:
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

In [101]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/Users/jixingwei/Desktop/STA_141b_project/yelp_review.json"

In [103]:
# Instantiates a client
client = language.LanguageServiceClient()

# The text to analyze
text = u'Hello, world!'
document = types.Document(
    content=text,
    type=enums.Document.Type.PLAIN_TEXT)

# Detects the sentiment of the text
sentiment = client.analyze_sentiment(document=document).document_sentiment

print('Text: {}'.format(text))
print('Sentiment: {}, {}'.format(sentiment.score, sentiment.magnitude))

Text: Hello, world!
Sentiment: 0.30000001192092896, 0.30000001192092896


In [106]:
# Instantiates a client
client = language.LanguageServiceClient()

def AnalyseSentiment(text):
    document = types.Document(content=text,type=enums.Document.Type.PLAIN_TEXT)
    # Detects the sentiment of the text
    sentiment = client.analyze_sentiment(document=document).document_sentiment
    return (sentiment.score, sentiment.magnitude)

Load 3500 samples

In [105]:
df = pd.read_sql_query("select * from review limit 3500",conn)
df.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,iCQpiavjjPzJ5_3gPD5Ebg,0,2011-02-25 00:00:00.000000,0,x7mDIiDB3jEiPGPHOmDzyw,2,The pizza was okay. Not the best I've had. I p...,0,msQe1u7Z_XuqjGoqhB0J5g
1,pomGBqfbxcqPv14c3XH-ZQ,0,2012-11-13 00:00:00.000000,0,dDl8zu1vWPdKGihJrwQbpw,5,I love this place! My fiance And I go here atl...,0,msQe1u7Z_XuqjGoqhB0J5g
2,jtQARsP6P-LbkyjbO1qNGg,1,2014-10-23 00:00:00.000000,1,LZp4UX5zK3e-c5ZGSeo3kA,1,Terrible. Dry corn bread. Rib tips were all fa...,3,msQe1u7Z_XuqjGoqhB0J5g
3,elqbBhBfElMNSrjFqW3now,0,2011-02-25 00:00:00.000000,0,Er4NBWCmCD4nM8_p1GRdow,2,Back in 2005-2007 this place was my FAVORITE t...,2,msQe1u7Z_XuqjGoqhB0J5g
4,Ums3gaP2qM3W1XcA5r6SsQ,0,2014-09-05 00:00:00.000000,0,jsDu6QEJHbwP2Blom1PLCA,5,Delicious healthy food. The steak is amazing. ...,0,msQe1u7Z_XuqjGoqhB0J5g


In [110]:
AnalyseSentiment(df['text'][0])

(-0.10000000149011612, 3.200000047683716)

In [114]:
df['score'] = df['text'].apply(lambda text: AnalyseSentiment(text)[0])

In [119]:
df.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,test_score,score
0,iCQpiavjjPzJ5_3gPD5Ebg,0,2011-02-25 00:00:00.000000,0,x7mDIiDB3jEiPGPHOmDzyw,2,The pizza was okay. Not the best I've had. I p...,0,msQe1u7Z_XuqjGoqhB0J5g,-2,-0.1
1,pomGBqfbxcqPv14c3XH-ZQ,0,2012-11-13 00:00:00.000000,0,dDl8zu1vWPdKGihJrwQbpw,5,I love this place! My fiance And I go here atl...,0,msQe1u7Z_XuqjGoqhB0J5g,-5,0.7
2,jtQARsP6P-LbkyjbO1qNGg,1,2014-10-23 00:00:00.000000,1,LZp4UX5zK3e-c5ZGSeo3kA,1,Terrible. Dry corn bread. Rib tips were all fa...,3,msQe1u7Z_XuqjGoqhB0J5g,-1,-0.3
3,elqbBhBfElMNSrjFqW3now,0,2011-02-25 00:00:00.000000,0,Er4NBWCmCD4nM8_p1GRdow,2,Back in 2005-2007 this place was my FAVORITE t...,2,msQe1u7Z_XuqjGoqhB0J5g,-2,-0.2
4,Ums3gaP2qM3W1XcA5r6SsQ,0,2014-09-05 00:00:00.000000,0,jsDu6QEJHbwP2Blom1PLCA,5,Delicious healthy food. The steak is amazing. ...,0,msQe1u7Z_XuqjGoqhB0J5g,-5,0.7


In [None]:
df.drop(columns=['test_score'],inplace=True)

In [122]:
df.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,score
0,iCQpiavjjPzJ5_3gPD5Ebg,0,2011-02-25 00:00:00.000000,0,x7mDIiDB3jEiPGPHOmDzyw,2,The pizza was okay. Not the best I've had. I p...,0,msQe1u7Z_XuqjGoqhB0J5g,-0.1
1,pomGBqfbxcqPv14c3XH-ZQ,0,2012-11-13 00:00:00.000000,0,dDl8zu1vWPdKGihJrwQbpw,5,I love this place! My fiance And I go here atl...,0,msQe1u7Z_XuqjGoqhB0J5g,0.7
2,jtQARsP6P-LbkyjbO1qNGg,1,2014-10-23 00:00:00.000000,1,LZp4UX5zK3e-c5ZGSeo3kA,1,Terrible. Dry corn bread. Rib tips were all fa...,3,msQe1u7Z_XuqjGoqhB0J5g,-0.3
3,elqbBhBfElMNSrjFqW3now,0,2011-02-25 00:00:00.000000,0,Er4NBWCmCD4nM8_p1GRdow,2,Back in 2005-2007 this place was my FAVORITE t...,2,msQe1u7Z_XuqjGoqhB0J5g,-0.2
4,Ums3gaP2qM3W1XcA5r6SsQ,0,2014-09-05 00:00:00.000000,0,jsDu6QEJHbwP2Blom1PLCA,5,Delicious healthy food. The steak is amazing. ...,0,msQe1u7Z_XuqjGoqhB0J5g,0.7


In [124]:
df.to_csv('review_with_score.csv')

In [166]:
df[df.score < 0].head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,score
0,iCQpiavjjPzJ5_3gPD5Ebg,0,2011-02-25 00:00:00.000000,0,x7mDIiDB3jEiPGPHOmDzyw,2,The pizza was okay. Not the best I've had. I p...,0,msQe1u7Z_XuqjGoqhB0J5g,-0.1
2,jtQARsP6P-LbkyjbO1qNGg,1,2014-10-23 00:00:00.000000,1,LZp4UX5zK3e-c5ZGSeo3kA,1,Terrible. Dry corn bread. Rib tips were all fa...,3,msQe1u7Z_XuqjGoqhB0J5g,-0.3
3,elqbBhBfElMNSrjFqW3now,0,2011-02-25 00:00:00.000000,0,Er4NBWCmCD4nM8_p1GRdow,2,Back in 2005-2007 this place was my FAVORITE t...,2,msQe1u7Z_XuqjGoqhB0J5g,-0.2
5,vgfcTvK81oD4r50NMjU2Ag,0,2011-02-25 00:00:00.000000,0,pfavA0hr3nyqO61oupj-lA,1,This place sucks. The customer service is horr...,2,msQe1u7Z_XuqjGoqhB0J5g,-0.6
11,UBv8heCQR0RPnUQG0zkXIQ,0,2016-09-23 00:00:00.000000,0,HkYqGb0Gplmmk-xlHTRBoA,1,The score should be negative. Its HORRIBLE. Th...,0,NhOc64RsrTT1Dls50yYW8g,-0.5


A good review is the review with more than 2 useful upvote

In [136]:
df_good_review = pd.read_sql_query("select * from review where useful>2",conn)

In [137]:
df_good_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 926606 entries, 0 to 926605
Data columns (total 9 columns):
business_id    926606 non-null object
cool           926606 non-null int64
date           926606 non-null object
funny          926606 non-null int64
review_id      926606 non-null object
stars          926606 non-null int64
text           926606 non-null object
useful         926606 non-null int64
user_id        926606 non-null object
dtypes: int64(4), object(5)
memory usage: 63.6+ MB


In [138]:
df_good_review_4000 = df_good_review.loc[:4000,]

In [139]:
df_good_review_4000.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4001 entries, 0 to 4000
Data columns (total 9 columns):
business_id    4001 non-null object
cool           4001 non-null int64
date           4001 non-null object
funny          4001 non-null int64
review_id      4001 non-null object
stars          4001 non-null int64
text           4001 non-null object
useful         4001 non-null int64
user_id        4001 non-null object
dtypes: int64(4), object(5)
memory usage: 281.4+ KB


In [150]:
df_good_review_4000['score'],df_good_review_4000['score_magnitude'] = zip(*df_good_review_4000['text'].apply(lambda text: AnalyseSentiment(text)))
df_good_review_4000.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,score,score_magnitude
0,jtQARsP6P-LbkyjbO1qNGg,1,2014-10-23 00:00:00.000000,1,LZp4UX5zK3e-c5ZGSeo3kA,1,Terrible. Dry corn bread. Rib tips were all fa...,3,msQe1u7Z_XuqjGoqhB0J5g,-0.3,2.0
1,3Mx4renubPRnjHUw1n2UkA,0,2015-09-30 00:00:00.000000,1,MTrzrLQT_LK2VLK9xantHw,1,If I could leave zero stars I would. This rude...,4,eAavwM32i2h7sNNer--kGw,-0.3,4.9
2,qdC2h3MqCNussc5lhzL7gg,0,2015-11-05 00:00:00.000000,0,AJ0IglpYAp2D0daOlVGIdg,1,This place doesn't even deserve 1 star. Horrib...,18,Ka7TIp74VHSa90rfdg4_2g,-0.7,4.4
3,brVIRXT4AubkUbe6P-NAAA,0,2016-05-01 00:00:00.000000,0,IRw-bVzY_9ASqmeDLzXPig,3,"Laser maze is a blast for the kids, check Grou...",3,0pf5VuzE4_1pwj5NJHG5TQ,0.2,2.9
4,Wc9UpJhOcdSj7olZkz7SJA,0,2016-04-11 00:00:00.000000,0,dbfzieXNqQLsZjE7BSRZXg,3,"First time here started off with no line, by t...",6,0pf5VuzE4_1pwj5NJHG5TQ,0.0,3.2


In [151]:
df_good_review_4000.to_csv('good_review.csv')

**google's NLP api is expensive, can't afford it. Must try to something else**

# Comparison textblob with Google's NLP api

In [163]:
df_good_review_4000['textblob_score'] = df_good_review_4000['text'].apply(lambda text: TextBlob(text).sentiment[0]*TextBlob(text).sentiment[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [165]:
df_good_review_4000[['textblob_score','score']].head()

Unnamed: 0,textblob_score,score
0,-0.426667,-0.3
1,-0.003218,-0.3
2,-0.52,-0.7
3,0.278819,0.2
4,0.11385,0.0
