# This file generates the feature summary and topic scores for all businesses.
## Usage: this file takes review_train.csv as its initial input, writes the intermediate sentence data to data_sentence.txt, reads it back, computes per-sentence sentiment scores, and finally writes business_scores.csv.

### 1. Import modules :

In [None]:
## import modules
import warnings
warnings.filterwarnings('ignore')
import json
import re
import pandas as pd
import nltk
import math
from math import sqrt
import sklearn
import statsmodels.api as sm
from nltk.stem import WordNetLemmatizer 
from nltk import FreqDist
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from collections import Counter


from tqdm import tqdm
import collections
from gensim.models import Word2Vec
from gensim.models import word2vec
import gensim
from gensim import corpora
from gensim import models
from gensim.models.coherencemodel import CoherenceModel
import multiprocessing
import time
import inflect
from textblob import TextBlob
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


## define stopwords
## NLTK English stop-word list; 'not' is removed from it so negations survive the cleaning step
sr = stopwords.words('english')
sr.remove('not')
## inflect engine, used to spell digit tokens out as words
p = inflect.engine()
## WordNet lemmatizer applied to every cleaned token
wnl = WordNetLemmatizer() 
## translation table that deletes every ASCII punctuation character
table = str.maketrans('', '', string.punctuation)
## Porter stemmer (not referenced anywhere in the visible part of this file)
porter = nltk.PorterStemmer()
## spaCy pipeline used for part-of-speech tagging
## NOTE(review): "en" is a model shortcut name -- confirm it resolves in the target environment
nlp = spacy.load("en")
## short alias for the gensim LDA model class
LDA = gensim.models.ldamodel.LdaModel
## render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()


### 2. Read in the reviews data for gym business as pandas dataframe and let the name be data_review.

In [None]:
## load the reviews and give every review a sequential id
data_review = pd.read_csv("review_train.csv")
data_review["review_number"] = range(len(data_review))
## distinct business ids (set order, i.e. arbitrary)
idset = list(set(data_review['business_id']))

### 3. Main step: clean data---LDA model---create topic-terms list---get dominant topics---get sentence scores---get business scores and feature summary.

### 3.1 Create data_sentence dictionary and save it

In [None]:
def dataprocess(dataset):
    """Split every review into sentences and build the per-sentence columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs a 'text' column (the review text) and a 'review_number' column.

    Returns
    -------
    dict
        Four parallel lists, one entry per kept sentence:
        'review_number' (id of the review the sentence came from),
        'sentence' (the raw sentence), 'tokenized_sentence' (cleaned,
        lower-cased, lemmatized words) and 'nouns' (lemmas of the cleaned
        words that spaCy tags as NOUN). All four keys are always present,
        so an input without usable sentences yields four empty lists.
    """
    data_sentence = {
        'review_number': [],
        'sentence': [],
        'tokenized_sentence': [],
        'nouns': [],
    }

    ## membership tests against a set are constant time; the stop-word list itself is left untouched
    stop_words = set(sr)

    ## pull both columns out once instead of building a row object per review;
    ## tolist() also hands back plain Python scalars, which keeps the later
    ## str()/eval() round trip of this dictionary simple
    texts = dataset['text'].tolist()
    review_numbers = dataset['review_number'].tolist()

    for xx, review_number in tqdm(zip(texts, review_numbers), total=len(texts)):

        ## dealing with \n and \n\n things
        xx = re.sub(r' \n\n', '.', xx)
        xx = re.sub(r'\n', '', xx)
        xx = re.sub(r'\.\.', '. ', xx)
        ## make sure a full stop is followed by a space so sent_tokenize can split on it
        xx = re.sub(r'(\.)(\S)', r'\1 \2', xx)

        ## for each review, tokenize it into sentences
        for sent in sent_tokenize(xx):
            ## skip fragments of three characters or fewer
            if len(sent) <= 3:
                continue

            data_sentence['review_number'].append(review_number)
            data_sentence['sentence'].append(sent)

            ## clean this sentence into "tokenized_sentence"
            x = re.sub(r'n\'t', ' not', sent)
            ## split into words
            x = word_tokenize(x)
            ## remove punctuation (tokens starting with "not_" are kept intact)
            x = [w.translate(table) if not re.match(r'not_.*', w) else w for w in x]
            ## change numbers into words
            x = [p.number_to_words(w) if w.isdigit() else w for w in x]
            ## remove not alphabetic
            x = [w for w in x if w.isalpha() or re.match(r'not_.*', w)]
            ## convert to lower case
            x = [w.lower() for w in x]
            ## remove stop words
            x = [w for w in x if w not in stop_words]
            ## lemmatization
            x = [wnl.lemmatize(w) for w in x]
            data_sentence['tokenized_sentence'].append(x)

            ## POS: run spaCy over the cleaned words and keep the noun lemmas
            sent_nlp = nlp(" ".join(x))
            nolis = [token.lemma_ for token in sent_nlp if token.pos_ == "NOUN"]
            data_sentence['nouns'].append(nolis)

    return data_sentence

In [None]:
## build the per-sentence dictionary from all reviews
data_sentence = dataprocess(data_review)

In [None]:
## save data into data_sentence.txt
## the context manager closes the file even if the write raises
with open("data_sentence.txt", 'w+') as fw:
    fw.write(str(data_sentence))

In [None]:
## read data_sentence.txt
## NOTE(review): eval() executes whatever the file contains, so this is only
## safe while data_sentence.txt is the file written by this notebook; consider
## a data-only format (e.g. json) if the file can ever come from elsewhere.
with open("data_sentence.txt", 'r') as fr:
    data_sentence_dic = eval(fr.read())

### 3.2 Create frequency and tfidf matrix for nouns

In [None]:
def getcorpus(datadict):
    """Build the TF-IDF corpus and the gensim dictionary from the sentence nouns.

    Reads datadict["nouns"] (one list of nouns per sentence) and returns the
    pair (tfidf-weighted corpus, gensim Dictionary).
    """
    noun_docs = datadict["nouns"]

    ## vocabulary over all nouns
    vocab = corpora.Dictionary(noun_docs)

    ## bag-of-words (frequency) representation of every sentence
    bow_corpus = [vocab.doc2bow(doc) for doc in noun_docs]

    ## fit tfidf on the frequency matrix, then re-weight that same matrix
    tfidf_model = gensim.models.TfidfModel(bow_corpus)

    return tfidf_model[bow_corpus], vocab

In [None]:
## tfidf corpus and dictionary built from the nouns of the reloaded sentence data
corpus_tfidf, dictionary = getcorpus(data_sentence_dic)

### 3.3 Apply LDA topic model to tfidf matrix and visualization and evaluation

In [None]:
## apply LDA model on tfidf matrix
## seven topics; random_state is fixed so repeated runs use the same seed
lda_model = LDA(corpus_tfidf, id2word=dictionary, num_topics=7, random_state=100,chunksize=10000, passes=50)

In [None]:
## save model
## NOTE(review): presumably the "ldamodel" directory must already exist -- confirm before running
lda_model.save("ldamodel/lda_7topics.model")

## load model
## reloading from disk lets later cells run from the saved file
lda_model=  models.LdaModel.load('ldamodel/lda_7topics.model')

In [None]:
## print all topics
## (the cell output is the return value of show_topics with its default arguments)
lda_model.show_topics()

In [None]:
## visualize the topics
## prepare the pyLDAvis data for the fitted model; leaving plot_lda as the last expression displays it
plot_lda = pyLDAvis.gensim.prepare(lda_model, corpus_tfidf, dictionary)
plot_lda

In [None]:
## model evaluation

## compute Perplexity
## NOTE(review): gensim documents log_perplexity as returning a per-word likelihood
## bound rather than the perplexity itself -- confirm in which direction this number should be compared.
print('\nPerplexity: ', lda_model.log_perplexity(corpus_tfidf))  # value returned by log_perplexity on the tfidf corpus

## compute Coherence Score
## c_v coherence computed from the noun lists of all sentences
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_sentence_dic["nouns"], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### 3.4 Decide final topic term lists for all seven topics

In [None]:
## hand-picked key terms for each of the seven topics (list index == topic index)
topic_0_terms = ['equipment', 'machine', 'facility', 'weight', 'cardio', 'pool', 'studio', 'floor', 'wall', 'stuff']
## 'schedule' was previously misspelled and therefore could never match that word
topic_1_terms = ['class', 'training', 'fun', 'group', 'session', 'yoga', 'schedule', 'program']
topic_2_terms = ['service', 'everything', 'locker', 'customer', 'kid', 'shower', 'sauna', 'desk', 'change',
                 'child', 'girl', 'treatment', 'staff', 'bathroom']
topic_3_terms = ['year', 'member', 'membership', 'money', 'contract', 'family', 'deal', 'rate']
topic_4_terms = ['trainer', 'instructor', 'help', 'coach']
topic_5_terms = ['time', 'day', 'month', 'morning', 'night', 'hour', 'door']
topic_6_terms = ['anything', 'music', 'planet', 'house']

topic_terms = [topic_0_terms, topic_1_terms, topic_2_terms, topic_3_terms,
               topic_4_terms, topic_5_terms, topic_6_terms]

### 3.5 Enlarge the topic term lists using synsets manually

In [None]:
## enlarged term lists (original key terms plus manually chosen related words), list index == topic index
## NOTE(review): some terms are repeated inside a list ('cardio' and 'machine' in
## topic 0, 'hour' in topic 5); code that iterates a list term by term sees them
## twice -- confirm whether that extra weight is intended.
topic_0_terms_en = ['equipment', 'machine', 'facility', 'weight', 'cardio', 'pool', 'studio', 'floor', 'wall',
                    'stuff', 'tools', 'treadmill', 'barbell', 'dumbbell', 'elliptical', 'ball', 'band',
                    'spinning', 'stepper', 'back', 'adductor', 'cycling', 'step', 'cardio', 'machine',
                    'cycle', 'climber', 'beach', 'bike']
## 'schedule' was previously misspelled and therefore could never match that word
topic_1_terms_en = ['class', 'training', 'fun', 'group', 'session', 'yoga', 'schedule', 'program', 'course',
                    'strength', 'aerobics', 'workout', 'boxing', 'crunch', 'squat', 'raw', 'entrench',
                    'press', 'fitball', 'abs', 'karate', 'zumba', 'speed', 'trial', 'programming', 'level',
                    'crossfit']
topic_2_terms_en = ['service', 'everything', 'locker', 'customer', 'kid', 'shower', 'sauna', 'desk', 'change',
                    'child', 'girl', 'treatment', 'staff', 'bathroom', 'dryer']
topic_3_terms_en = ['year', 'member', 'membership', 'money', 'contract', 'family', 'deal', 'rate', 'bill',
                    'month', 'season', 'flexibility']
topic_4_terms_en = ['trainer', 'instructor', 'help', 'coach', 'teacher', 'advisor', 'mentor', 'helper', 'tutor']
topic_5_terms_en = ['time', 'day', 'month', 'morning', 'night', 'hour', 'door', 'minute', '24hr', 'hour']
topic_6_terms_en = ['anything', 'music', 'planet', 'house', 'environment', 'capability', 'layout', 'design',
                    'smell']

topic_terms_en = [topic_0_terms_en, topic_1_terms_en, topic_2_terms_en, topic_3_terms_en,
                  topic_4_terms_en, topic_5_terms_en, topic_6_terms_en]

### 3.6 Create dominant topic in each sentence

In [None]:
## Use term detection to find the dominant topic of every sentence.
## For each topic we count how many of its terms appear among the sentence's
## nouns; the topic with the highest count wins. If several topics tie, the one
## with the smaller index is chosen (the rationale being that a smaller index
## means a more important topic), which of course causes some inaccuracy.
## Sentences that match no topic term get the catch-all index right after the
## last real topic.
## NOTE(review): the pre-check below is a case-sensitive substring test on the
## raw sentence, while the count uses the lower-cased nouns -- confirm that
## skipping e.g. capitalized mentions is intended.
other_topic = len(topic_terms_en)
dominant_topics = []
for num in tqdm(range(len(data_sentence_dic["review_number"]))):
    sent = data_sentence_dic["sentence"][num]
    nouns_list = data_sentence_dic["nouns"][num]
    count_topic = []
    for topic_term_num in topic_terms_en:
        count = 0
        ## only count nouns when at least one topic term occurs in the raw sentence text
        if any(element in sent for element in topic_term_num):
            count = sum(1 for term in topic_term_num if term in nouns_list)
        count_topic.append(count)

    if sum(count_topic) == 0:
        dominant_topics.append(other_topic)
    else:
        ## list.index returns the first maximum, i.e. the smallest topic index on ties
        dominant_topics.append(count_topic.index(max(count_topic)))

## assign the whole column at once so re-running this cell cannot lengthen it
data_sentence_dic["dominant_topic"] = dominant_topics
        

### 3.7 Read sentiment dictionary

In [None]:
## import another positive and negative words dictionary 
## each file is read as a single column named 'word' and turned into a set for fast membership tests
## NOTE(review): presumably one word per line; any header/comment lines in the files would end up in the sets -- verify
positive_dic = set(pd.read_csv('positive-words.txt',names = ['word'])['word'])
negative_dic = set(pd.read_csv('negative-words.txt', encoding = "ISO-8859-1",names = ['word'])['word'])

In [None]:
## short aliases for the positive / negative word sets (same objects, not copies)
pos_dict = positive_dic
neg_dict = negative_dic
## negation words: an odd number of them in front of a sentiment word flips its sign
deny_dict = ["not","aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
             "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
             "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
             "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
             "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
             "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
             "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
             "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]

## degree adverbs, strongest group first. Some words appear in more than one
## group; 'awfully' and 'bitterly' are two separate entries here, as are
## 'extraordinarily' and 'extremely' in very_dict (each pair was previously
## fused into one string by a missing comma).
most_dict = ['absolute','absolutely','alarmingly','amazingly','astonishingly','awfully','bitterly','completely',
             'deep-rooted','deep-seated','deeply','definitely','disastrously','downright','entirely','exceedingly',
             'excessively','extreme','extremely','fully','greatest','greatly','heinous','hundred-percent','immensely',
             'immoderate','incomparably','ingrained','matchlessly','monstrous','most','outstanding','outstandingly',
             'right-down','sharply','sheer','superb','terribly','totally','towering','unusually','utmost','utterly','most']
very_dict = ['awfully','badly','better','considerably','deep','disastrously','especially','extraordinarily','extremely','greatly',
             'how','however','indeed','much','particularly','really','terribly','unusually','very']
more_dict = ['comparatively','further','increasingly','more','plus','relatively','so','such']

ish_dict = ['bit','fairly','passably','pretty','quite','rather','slightly','some','somewhat']


### 3.8 Get sentence score for each sentence use sentiment dictionary

In [None]:
def _signed_modifier_weight(window):
    """Return the signed weight of one sentiment word given the words in front of it.

    The weight starts at 1 and is multiplied by 4 / 3 / 2 / 0.5 for every word
    of `window` found in most_dict / very_dict / more_dict / ish_dict (checked
    in that order). An odd number of deny_dict words flips the sign.
    """
    weight = 1
    deny_num = 0
    for w in window:
        if w in most_dict:
            weight *= 4.0
        elif w in very_dict:
            weight *= 3.0
        elif w in more_dict:
            weight *= 2.0
        elif w in ish_dict:
            weight *= 0.5
        elif w in deny_dict:
            deny_num += 1
    if deny_num % 2 != 0:
        ## odd number of negations: the sentiment is inverted
        weight *= -1.0
    return weight


def _word_level_counts(tokenized_sent):
    """Return one [pos_count, neg_count] pair per word of a tokenized sentence."""
    counts = []
    adj_loc = 0  ## position right after the previous sentiment word
    poscount_final = 0  ## running positive tally
    negcount_final = 0  ## running negative tally
    for word_loc, word in enumerate(tokenized_sent):
        ## the modifiers of a sentiment word are the words since the previous sentiment word
        if word in pos_dict:
            poscount_final += _signed_modifier_weight(tokenized_sent[adj_loc:word_loc])
            adj_loc = word_loc + 1
        elif word in neg_dict:
            negcount_final += _signed_modifier_weight(tokenized_sent[adj_loc:word_loc])
            adj_loc = word_loc + 1
        else:
            ## a word that carries no sentiment resets both running tallies
            poscount_final = 0
            negcount_final = 0

        ## a negative tally on one side is evidence for the other side
        if poscount_final < 0 and negcount_final > 0:
            pos_count = 0
            neg_count = negcount_final - poscount_final
        elif negcount_final < 0 and poscount_final > 0:
            pos_count = poscount_final - negcount_final
            neg_count = 0
        elif poscount_final < 0 and negcount_final < 0:
            ## both sides negated: swap them, using the running tallies
            neg_count = -poscount_final
            pos_count = -negcount_final
        else:
            pos_count = poscount_final
            neg_count = negcount_final

        counts.append([pos_count, neg_count])
    return counts


def get_sentence_score(data_sentence_dic_sent):
    """Add a sentiment score for every sentence to the sentence dictionary.

    Parameters
    ----------
    data_sentence_dic_sent : dict
        Sentence dictionary with the parallel lists 'review_number',
        'dominant_topic' and 'tokenized_sentence'.

    Returns
    -------
    dict
        The same dictionary object, with a new list 'sentiment_score_topic'
        (one score per sentence). A score is the mean positive count minus the
        mean negative count over the words of the sentence, each mean rounded
        to one decimal. Sentences with an empty token list score 0.

    Only sentences whose dominant topic is 0-6 are scored; every other
    sentence keeps the initial score 0.
    NOTE(review): that leaves sentences of the catch-all topic 7 unscored --
    confirm this is intended.
    """
    data_sentence_dic_sent['sentiment_score_topic'] = [0] * len(data_sentence_dic_sent['review_number'])
    for t in range(7):
        ## positions of the sentences whose dominant topic is t
        index_topic_t = [i for i, topic in enumerate(data_sentence_dic_sent['dominant_topic']) if topic == t]

        ## word-level [pos, neg] pairs of every sentence under this topic
        count_topic = []
        for index in tqdm(index_topic_t):
            count_topic.append(_word_level_counts(data_sentence_dic_sent['tokenized_sentence'][index]))

        for sent_index, sent_s in tqdm(zip(index_topic_t, count_topic), total=len(index_topic_t)):
            if len(sent_s) != 0:
                score_array = np.array(sent_s)
                ## average positive / negative count per word, both rounded to one decimal
                pos_score = float('%.1f' % np.mean(score_array[:, 0]))
                neg_score = float('%.1f' % np.mean(score_array[:, 1]))
                data_sentence_dic_sent['sentiment_score_topic'][sent_index] = pos_score - neg_score
            else:
                ## if tokenized_sentence is empty list
                data_sentence_dic_sent['sentiment_score_topic'][sent_index] = 0

    return data_sentence_dic_sent  ## return to data_sentence_dic with sentence sentiment score

In [None]:
## score every sentence; get_sentence_score adds 'sentiment_score_topic' to the dictionary it is given and returns it
result=get_sentence_score(data_sentence_dic)

### 3.9 Evaluate business use sentences scores

In [None]:
def get_business_summary(business_id):
    """Count positive, negative and neutral sentences per topic for one business.

    Parameters
    ----------
    business_id
        Value matched against the 'business_id' column of data_review.

    Returns
    -------
    list
        32 integers, four per topic t in 0..7, stored at indices 4*t .. 4*t+3:
        number of sentences with that dominant topic, then how many of them
        have a score > 0, < 0, and neither (neutral). Topics are ordered
        Facilities, Courses, Service and Accessories, Membership and Price,
        Trainers, Time, Environment, Other.
    """
    ## a set makes the per-sentence membership test constant time
    review_number_id = set(data_review.loc[data_review['business_id'] == business_id]["review_number"])

    ## one pass over all sentences: keep (topic, score) of this business's sentences
    topic_scores = [
        (topic, score)
        for review_number, topic, score in zip(
            data_sentence_dic["review_number"],
            data_sentence_dic['dominant_topic'],
            data_sentence_dic["sentiment_score_topic"],
        )
        if review_number in review_number_id
    ]

    business_summary = []
    for t in range(8):
        sentence_score = [score for topic, score in topic_scores if topic == t]
        topic_pos_num = sum(1 for s in sentence_score if s > 0)
        topic_neg_num = sum(1 for s in sentence_score if s < 0)
        ## everything that is neither positive nor negative counts as neutral
        topic_neu_num = len(sentence_score) - topic_pos_num - topic_neg_num
        business_summary.extend([len(sentence_score), topic_pos_num, topic_neg_num, topic_neu_num])

    return business_summary



In [None]:
## generate eight topic scores for each business

def get_business_score(business_id):
    """Return the average sentence score of one business for each of the eight topics.

    Parameters
    ----------
    business_id
        Value matched against the 'business_id' column of data_review.

    Returns
    -------
    list
        Eight values, index == topic index (7 is the catch-all topic): the
        mean 'sentiment_score_topic' of the business's sentences with that
        dominant topic, or 0 when the business has no such sentence.
    """
    ## a set makes the per-sentence membership test constant time
    review_number_id = set(data_review.loc[data_review['business_id'] == business_id]["review_number"])

    ## one pass over all sentences: keep (topic, score) of this business's sentences
    topic_scores = [
        (topic, score)
        for review_number, topic, score in zip(
            data_sentence_dic["review_number"],
            data_sentence_dic['dominant_topic'],
            data_sentence_dic["sentiment_score_topic"],
        )
        if review_number in review_number_id
    ]

    business_score = []
    for t in range(8):
        sentence_score = [score for topic, score in topic_scores if topic == t]
        if sentence_score:
            business_score.append(sum(sentence_score) / len(sentence_score))
        else:
            business_score.append(0)

    return business_score


## create weighted_stars 

def get_review_weight(rev_num):
    """Return how many nouns of review `rev_num` are topic terms.

    Every noun of every sentence belonging to the review is checked against
    the terms of all lists in topic_terms_en; each matching noun occurrence
    adds one to the weight.
    """
    ## all topic terms in one flat list
    all_terms = [term for terms in topic_terms_en for term in terms]

    count_rev = 0
    for review_number, nouns in zip(data_sentence_dic['review_number'], data_sentence_dic['nouns']):
        if review_number != rev_num:
            continue
        count_rev += sum(1 for noun in nouns if noun in all_terms)
    return count_rev

def get_business_weighted_stars(business_id):
    """Return the star rating of a business, weighted by topic-term counts.

    Each review of the business contributes its 'stars' value with the weight
    returned by get_review_weight for that review.

    Parameters
    ----------
    business_id
        Value matched against the 'business_id' column of data_review.

    Returns
    -------
    float
        sum(weight * stars) / sum(weight), or NaN when the total weight is 0
        (no review of the business mentions any topic term, or the business
        has no reviews).
    """
    ## filter once, then walk review number and stars of the same row together
    ## instead of looking the stars up by index label
    business_reviews = data_review.loc[data_review['business_id'] == business_id]

    numerator = 0
    denominator = 0
    for review_number, star in zip(business_reviews["review_number"], business_reviews['stars']):
        weight = get_review_weight(review_number)
        numerator += weight * star
        denominator += weight

    if denominator == 0:
        ## no weight at all: the weighted mean is undefined
        return float('nan')
    return numerator / denominator


In [None]:
## one entry per business in every column: the eight topic scores, the weighted
## stars and the 32 summary counts
topic_columns = [
    'topic_1_Facilities',
    'topic_2_Courses',
    'topic_3_Service_and_Accessories',
    'topic_4_Membership_and_Price',
    'topic_5_Trainer',
    'topic_6_Time',
    'topic_7_Environment',
    'topic_8_Other',
]

business_scores = {}
for d in tqdm(idset):
    scores = get_business_score(d)
    summary = get_business_summary(d)
    business_scores.setdefault('business_id', []).append(d)
    ## topic columns follow the order of the score list
    for position, column in enumerate(topic_columns):
        business_scores.setdefault(column, []).append(scores[position])
    business_scores.setdefault('weighted_stars', []).append(get_business_weighted_stars(d))
    for i in range(32):
        business_scores.setdefault('summary_' + str(i), []).append(summary[i])


In [None]:
## create business_scores.csv
## one row per business, without the DataFrame index column
pd.DataFrame(business_scores).to_csv(path_or_buf='business_scores.csv', index=False)