In [262]:
import numpy as np
import pandas as pd
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
import matplotlib.pyplot as plt
import stanfordnlp
from nltk.stem import WordNetLemmatizer 
from senticnet.senticnet import SenticNet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from nltk.corpus import wordnet
from itertools import product
from operator import itemgetter
from textblob import TextBlob
import progressbar
import warnings
warnings.filterwarnings('ignore')
import pickle


Initialize sentiment and sentence parser

In [21]:
# VADER analyzer; used below to score the adjective in "noun is adjective" constructions.
nltk_sent = SentimentIntensityAnalyzer()
# SenticNet lexicon; used below to look up the polarity of adjective and verb modifiers.
sen = SenticNet()
# StanfordNLP pipeline built with its default arguments; called on each review's text.
nlp = stanfordnlp.Pipeline()

Read in the restaurant reviews and businesses DataFrames. Both are stored externally and come from the publicly available Yelp challenge data set.

In [None]:
# NOTE(review): absolute path on an external volume, so this cell only runs on the
# author's machine; consider a configurable data directory. Unpickling executes
# arbitrary code, so only load this file if it comes from a trusted source.
rest_reviews=pd.read_pickle('/Volumes/DrewM_External/DataScience/Incubator_Challenge/Yelp_data/yelp_dataset/restaurant_reviews')

In [5]:
# NOTE(review): absolute path on an external volume, so this cell only runs on the
# author's machine; consider a configurable data directory.
business=pd.read_feather('/Volumes/DrewM_External/DataScience/Incubator_Challenge/Yelp_data/yelp_dataset/business')

Grab only restaurants 

In [7]:
# Keep the businesses whose category string mentions "Restaurants";
# rows with a missing category are treated as non-matching (na=False).
restaurants = business.loc[business['categories'].str.contains("Restaurants", na=False)]

Find frequent users (defined as users with more than 250 reviews) and grab their reviews

In [258]:
# A user must have written MORE than this many reviews to count as "frequent".
FREQ_USER_MIN_REVIEWS = 250

# NOTE(review): `user_id_count` was used here without being defined in any cell of
# this notebook, so the cell failed on a fresh kernel. It is reconstructed as the
# number of restaurant reviews per user -- confirm this matches the original
# definition (it may have been computed over a different review table).
user_id_count = rest_reviews['user_id'].value_counts()

# IDs of every user above the threshold.
freq_users = user_id_count[user_id_count > FREQ_USER_MIN_REVIEWS].index.to_list()

In [280]:
# Restrict the restaurant reviews to those written by the frequent users.
reviews_for_freq_user = rest_reviews.loc[rest_reviews['user_id'].isin(freq_users)]

Merge with the business DataFrame to get each business's name, categories, and stars

In [284]:
# Left-join the frequent users' reviews onto the restaurant table so every review
# keeps its row even when the business is not in `restaurants`.
# `stars_x` / `stars_y` are the suffixed `stars` columns produced by the merge
# (presumably review rating and business rating respectively -- confirm).
reviews_for_freq_user_merge = (
    reviews_for_freq_user
    .merge(restaurants, how='left', on='business_id')
    [['user_id', 'business_id', 'name', 'stars_x', 'categories', 'stars_y']]
)

In [285]:
# Persist the merged frame. The context manager guarantees the file handle is
# flushed and closed even if pickling fails part-way through.
with open("reviews_for_freq_user_merge.p", "wb") as merged_file:
    pickle.dump(reviews_for_freq_user_merge, merged_file)

The StanfordNLP pipeline parses each sentence into words annotated with XPOS and UPOS tags and a governor (the index of the word that governs it)

In [286]:
'''
XPOS Cheat-sheet
	1.	CC	Coordinating conjunction
	2.	CD	Cardinal number
	3.	DT	Determiner
	4.	EX	Existential there
	5.	FW	Foreign word
	6.	IN	Preposition or subordinating conjunction
	7.	JJ	Adjective
	8.	JJR	Adjective, comparative
	9.	JJS	Adjective, superlative
	10.	LS	List item marker
	11.	MD	Modal
	12.	NN	Noun, singular or mass
	13.	NNS	Noun, plural
	14.	NNP	Proper noun, singular
	15.	NNPS	Proper noun, plural
	16.	PDT	Predeterminer
	17.	POS	Possessive ending
	18.	PRP	Personal pronoun
	19.	PRP$	Possessive pronoun
	20.	RB	Adverb
	21.	RBR	Adverb, comparative
	22.	RBS	Adverb, superlative
	23.	RP	Particle
	24.	SYM	Symbol
	25.	TO	to
	26.	UH	Interjection
	27.	VB	Verb, base form
	28.	VBD	Verb, past tense
	29.	VBG	Verb, gerund or present participle
	30.	VBN	Verb, past participle
	31.	VBP	Verb, non-3rd person singular present
	32.	VBZ	Verb, 3rd person singular present
	33.	WDT	Wh-determiner
	34.	WP	Wh-pronoun
	35.	WP$	Possessive wh-pronoun
	36.	WRB	Wh-adverb
'''

# XPOS tags grouped by coarse part of speech (see the cheat-sheet above).
verb = ["VBD", "VB", "VBG", "VBN", "VBP", "VBZ"]
noun = ["NN", "NNS", "NNP", "NNPS"]
adverb = ["RB", "RBR", "RBS"]
adjective = ["JJ", "JJR", "JJS"]
# Surface forms treated as auxiliary verbs, including the clitics "'ve" and "n't".
# The original list contained "might" twice; each form now appears exactly once.
auxiliary_verb = ["be", "am", "are", "is", "was", "being", "can", "could", "do", "did", "does", "doing", "have", "had",
         "has", "having", "may", "might", "must", "shall", "should", "will", "'ve", "n't", "were"]

Use set rules to extract aspect sentiment pairs:
1. Noun is Adj
2. Noun Adj pair
3. Evaluative Verb -> Noun Object

Output: a list of dictionaries, one per extracted pair, each holding the aspect, its modifier and the sentiment

In [138]:
def _head_word(words, word):
    """Return the word that governs `word` in `words`, or None if `word` is the root.

    Governor values are used as 1-based positions within the sentence; a value
    of 0 (or a missing value) marks the root, which has no head. Without this
    check, `words[0 - 1]` would silently resolve to the LAST word of the sentence.
    """
    governor_idx = word.governor
    if not governor_idx:
        return None
    return words[governor_idx - 1]


def extract_asp_sent_pairs(doc, vader=None, senticnet=None):
    """Extract aspect / modifier / sentiment triples from a parsed document.

    Three rules are applied to every word of every sentence:

    1. "<noun> is <adjective>": the word's lemma is ``be``, the preceding word
       is a NOUN and the word's head is an ADJ. The adjective is scored with
       VADER; a pair is emitted when its ``pos`` (or ``neg``) score equals 1.
    2. Adjective governed by a noun: the polarity comes from SenticNet.
    3. Noun that is the ``obj`` of a verb: the verb's polarity comes from SenticNet.

    Parameters
    ----------
    doc : parsed document
        Must expose ``doc.sentences[i].words[j]`` with the attributes ``lemma``,
        ``upos``, ``governor`` and ``dependency_relation``.
    vader : optional
        Object with ``polarity_scores(text) -> dict``. Defaults to the
        notebook-level ``nltk_sent``.
    senticnet : optional
        Object with ``polarity_value(concept)``. Defaults to the notebook-level
        ``sen``.

    Returns
    -------
    list of dict
        Each dict has the keys ``'aspect'``, ``'modifier'`` and ``'sentiment'``.
        Rule 1 always yields ``'positive'`` / ``'negative'``; rules 2 and 3
        store whatever ``polarity_value`` returns.
        NOTE(review): downstream counting compares against the strings
        'positive' / 'negative' -- confirm the installed SenticNet version
        returns labels rather than numeric strings from ``polarity_value``.
    """
    vader = nltk_sent if vader is None else vader
    senticnet = sen if senticnet is None else senticnet

    asp_sent_pairs = []
    for sentence in doc.sentences:
        words = sentence.words
        for j, word in enumerate(words):
            # Rule 1: noun "is" adjective.
            # `j > 0` prevents words[j - 1] from wrapping around to the last word
            # when the sentence starts with a form of "be".
            if word.lemma == 'be' and j > 0 and words[j - 1].upos == 'NOUN':
                head = _head_word(words, word)
                if head is not None and head.upos == 'ADJ':
                    # Score once and reuse for both polarity checks.
                    scores = vader.polarity_scores(head.lemma)
                    if scores['pos'] == 1:
                        asp_sent_pairs.append({'aspect': words[j - 1].lemma,
                                               'modifier': head.lemma,
                                               'sentiment': 'positive'})
                    if scores['neg'] == 1:
                        asp_sent_pairs.append({'aspect': words[j - 1].lemma,
                                               'modifier': head.lemma,
                                               'sentiment': 'negative'})

            # Rule 2: adjective governed by a noun.
            if word.upos == 'ADJ':
                head = _head_word(words, word)
                if head is not None and head.upos == 'NOUN':
                    try:
                        asp_sent_pairs.append({'aspect': head.lemma,
                                               'modifier': word.lemma,
                                               'sentiment': senticnet.polarity_value(word.lemma)})
                    except Exception:
                        # Best effort: modifiers the lexicon cannot score are skipped.
                        # Catching Exception (not a bare except) keeps Ctrl-C working
                        # during the long-running batch loop.
                        pass

            # Rule 3: evaluative verb with a noun object.
            if word.dependency_relation == 'obj' and word.upos == 'NOUN':
                head = _head_word(words, word)
                if head is not None and head.upos == 'VERB':
                    try:
                        asp_sent_pairs.append({'aspect': word.lemma,
                                               'modifier': head.lemma,
                                               'sentiment': senticnet.polarity_value(head.lemma)})
                    except Exception:
                        # Same best-effort policy as rule 2.
                        pass
    return asp_sent_pairs

Define the categories that aspects will be sorted into and get their WordNet synsets

In [199]:
# Category labels that every extracted aspect is mapped onto.
catagories = ['food', 'drink', 'ambience', 'price', 'service']
# For each label, the list of its WordNet noun synsets.
catagory_net_dict = {
    label: wordnet.synsets(label, pos=wordnet.NOUN)
    for label in catagories
}

Given a dictionary holding an aspect, its modifier, and its sentiment, assign the aspect to a category using the Wu-Palmer similarity (wup_similarity) between their WordNet synsets

In [230]:
def assign_catagory(ABSA_dict, catagory_net_dict, cat_sent_dict):
    """Label an aspect with its closest category and tally its sentiment.

    The aspect's WordNet noun synsets are compared with every synset of every
    category using ``wordnet.wup_similarity``; the category owning the
    highest-scoring pair wins (on ties the later category wins, as before).

    Parameters
    ----------
    ABSA_dict : dict
        Must contain ``'aspect'`` and ``'sentiment'``. Mutated in place:
        ``ABSA_dict['catagory']`` is set to the winning category, or ``None``
        when the aspect has no noun synsets or no comparable synset pair.
    catagory_net_dict : dict
        Maps category name -> list of WordNet synsets.
    cat_sent_dict : dict
        Counters keyed ``'pos_<category>'`` / ``'neg_<category>'``. The counter
        matching the winning category and the sentiment label ('positive' or
        'negative') is incremented in place. Any other sentiment value, or a
        category without a counter in this dict, leaves it untouched.

    Returns
    -------
    None
    """
    ABSA_dict['catagory'] = None
    aspect_nets = wordnet.synsets(ABSA_dict['aspect'], pos=wordnet.NOUN)
    if not aspect_nets:
        return

    best_score = 0
    best_catagory = None
    for catagory, catagory_nets in catagory_net_dict.items():
        for asp_net, cat_net in product(aspect_nets, catagory_nets):
            score = wordnet.wup_similarity(asp_net, cat_net)
            # wup_similarity can return None when two synsets share no ancestor;
            # comparing None with a number would raise TypeError.
            if score is not None and score >= best_score:
                best_score = score
                best_catagory = catagory

    ABSA_dict['catagory'] = best_catagory
    if best_catagory is None:
        return

    # Only the two label strings are counted; anything else is ignored.
    prefix = {'positive': 'pos', 'negative': 'neg'}.get(ABSA_dict['sentiment'])
    if prefix is None:
        return
    counter_key = prefix + '_' + best_catagory
    if counter_key in cat_sent_dict:
        cat_sent_dict[counter_key] += 1

For all aspect-sentiment pairs in a review, count the number of positive and negative pairs in each category, and return a per-review dictionary containing those counts along with the stars given

In [256]:
def get_user_cat_sent_dict(user, rest_reviews_df, n_samples=150, random_state=None):
    """Build per-review category sentiment counts for a sample of one user's reviews.

    Parameters
    ----------
    user : hashable
        Value matched against the ``'user_id'`` column.
    rest_reviews_df : pandas.DataFrame
        Must contain the columns ``'user_id'``, ``'text'`` and ``'stars'``.
    n_samples : int, default 150
        Maximum number of reviews to sample. If the user has fewer reviews,
        all of them are used (previously ``DataFrame.sample`` raised ValueError).
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a seed for reproducible runs.
        The default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    list of dict
        One dict per sampled review holding a ``pos_<category>`` and
        ``neg_<category>`` counter for every key of the notebook-level
        ``catagory_net_dict``, plus ``'stars'`` (the review's rating).

    Relies on the notebook-level ``nlp``, ``extract_asp_sent_pairs``,
    ``assign_catagory`` and ``catagory_net_dict``.
    """
    user_reviews = rest_reviews_df[rest_reviews_df['user_id'] == user][['text', 'stars']]
    # Never request more rows than exist.
    sample_size = min(n_samples, len(user_reviews))
    reviews_of_freq_user = user_reviews.sample(sample_size, random_state=random_state)

    cat_sent_dict_list = []
    for _, review in reviews_of_freq_user.iterrows():
        doc = nlp(review.text)
        ABSA_dict_list = extract_asp_sent_pairs(doc)

        # Fresh zeroed counters for this review, in category order (pos before neg),
        # followed by the target value.
        cat_sent_dict = {}
        for catagory in catagory_net_dict:
            cat_sent_dict['pos_' + catagory] = 0
            cat_sent_dict['neg_' + catagory] = 0
        cat_sent_dict['stars'] = review.stars

        for ABSA_dict in ABSA_dict_list:
            assign_catagory(ABSA_dict, catagory_net_dict, cat_sent_dict)
        cat_sent_dict_list.append(cat_sent_dict)
    return cat_sent_dict_list

Fit a linear regression that uses the category counts as features to predict the star value given, and return the fitted coefficients as a measure of how much each category impacts the score

In [238]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
from sklearn.linear_model import LinearRegression
# Single shared estimator: get_user_coef refits this same instance on every call.
lr = LinearRegression()

In [246]:
def get_user_coef(cat_sent_dict_list):
    """Regress star ratings on the sentiment counters and return the coefficients.

    `cat_sent_dict_list` is a list of per-review dicts. Every key except
    'stars' is used as a feature column and 'stars' is the target. The
    notebook-level estimator `lr` is refit in place, and its `coef_` array
    (one value per feature column, in column order) is returned.
    """
    review_frame = pd.DataFrame(cat_sent_dict_list)
    feature_columns = [column for column in review_frame.columns if column != 'stars']
    lr.fit(review_frame[feature_columns], review_frame['stars'])
    return lr.coef_

Run everything and build a dictionary mapping each frequent user's user_id to their coefficient array (takes a long time)

In [264]:
# Fit one regression per frequent user. Each iteration parses a sample of that
# user's reviews, so the whole loop is very slow.
freq_users_coef = {}
for user in progressbar.progressbar(freq_users):
    cat_sent_dict_list = get_user_cat_sent_dict(user,rest_reviews)
    coef_array = get_user_coef(cat_sent_dict_list)
    freq_users_coef[user] = coef_array

# Persist the results. The context manager closes the file handle even if
# pickling fails, so the output of the long run is not left in an unflushed file.
with open("freq_users_coef.p", "wb") as coef_file:
    pickle.dump(freq_users_coef, coef_file)

100% (320 of 320) |#####################| Elapsed Time: 16:53:42 Time: 16:53:42


In [265]:
# Show the fitted coefficients per user. Each array follows the feature column order:
# pos_food, neg_food, pos_drink, neg_drink, pos_ambience, neg_ambience,
# pos_price, neg_price, pos_service, neg_service.
# NOTE(review): this displays every user's array; consider showing only a few entries.
freq_users_coef

{'CxDOIDnH8gp9KXzpBHJYXw': array([ 0.03268888, -0.06539061,  0.02263805, -0.10860524,  0.13095062,
        -0.15844953, -0.0447858 ,  0.09530167,  0.04670657, -0.05586259]),
 'bLbSNkLggFnqwNNzzq-Ijw': array([ 0.08291738, -0.10810695,  0.09109293, -0.17995121, -0.05453009,
        -0.12453603,  0.07974153, -0.10600097,  0.02614277, -0.13684697]),
 'ELcQDlf69kb-ihJfxZyL0A': array([ 0.06984315, -0.23956638,  0.04753956, -0.22096698,  0.14715724,
        -0.50403123, -0.08234751, -0.10624931,  0.01816444, -0.17859739]),
 'd_TBs6J3twMy9GChqUEXkg': array([ 0.0360954 , -0.01840295,  0.0629943 ,  0.03819664,  0.01418193,
        -0.04125322, -0.07565513, -0.02935869, -0.03399542, -0.05013343]),
 'DK57YibC5ShBmqQl97CKog': array([ 0.04564412, -0.04971494, -0.01904002, -0.08012976,  0.0448509 ,
        -0.0443734 ,  0.00279451, -0.10218805,  0.02071919, -0.13359454]),
 'U4INQZOPSUaj8hMjLlZ3KA': array([ 0.029421  , -0.04880277,  0.06897768, -0.16267494,  0.01651383,
        -0.05376672, -0.0188546

In [263]:
# Writes the same file as the dump at the end of the fitting loop. The context
# manager makes sure the file handle is closed.
with open("freq_users_coef.p", "wb") as coef_file:
    pickle.dump(freq_users_coef, coef_file)