# This is the official jupyter notebook for goal 1, our aim is to develop a shiny app which can give owners of gym business information on their position in gym business, suggestions on how to improve their ratings in yelp from both aspects of reviews and attributes.

### 1. We import python modules we need.

In [16]:
## import modules
import warnings
warnings.filterwarnings('ignore')
import json
import re
import pandas as pd
import nltk
import math
from math import sqrt
import sklearn
import statsmodels.api as sm
from nltk.stem import WordNetLemmatizer 
from nltk import FreqDist
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from collections import Counter


from tqdm import tqdm
import collections
from gensim.models import Word2Vec
from gensim.models import word2vec
import gensim
from gensim import corpora
from gensim import models
import multiprocessing
import time
import inflect
from textblob import TextBlob
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer #该类会将文本中的词语转换为词频矩阵，矩阵元素a[i][j] 表示j词在i类文本下的词频
from sklearn.feature_extraction.text import TfidfTransformer
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


## define stopwords
sr = stopwords.words('english')
p = inflect.engine()
wnl = WordNetLemmatizer() 
table = str.maketrans('', '', string.punctuation)
porter = nltk.PorterStemmer()
nlp = spacy.load("en")
LDA = gensim.models.ldamodel.LdaModel
pyLDAvis.enable_notebook()


Using TensorFlow backend.


### 2. Read in the reviews data for gym business as pandas dataframe and let the name be data_review.

In [17]:
## read review data
data_review=pd.read_csv("review_train.csv")
data_review["review_number"] = range(len(data_review))

### 3. The main mathod we use is LDA topic model. Here are summary of our analysis.
1. Divide each review into sentences, create a pd dataframe named sentence_train with columns are review number and sentence. The review number will help us merge information from data_sentence and data_review.
2. For each sentence, do general data cleaning steps and get a list of words as column3 in data_sentence.
3. For each sentence, filter nouns as a list  to become column 4 in data_sentence
4. Create frequency matrix and tfidf matrix for nouns 
5. Apply LDA topic model on TFIDF matrix.
6. Use some method to pick unique topic for each feature words and get our topic-term list.
7. For each sentence and each topic, pick all the adjectives describing this topic's terms,  Search for the pre defined dictionary of positive and negative words, get this sentence's score at this topic equal to (number of positive words - number of negative words) / (number of positive words + number of negative words) 
8. Define a function of business_id, first get corresponding reviews' numbers, and in data_sentece, get relevant rows. Compute average of sentence_score at each topic as this business_score at each topic
9. After getting all scores on each topic for all business, we can get distribution of scores on different topics. So we can output the position of given business.
10. However, in order to give quantative and actionable suggestions on improving ratings, we want to do some statistical analysis with respect to reviews. For each business, we use weighted average stars of reviews as our response variable with weight equal to the number of feature words in each review. And T topic scores as our predictors to do linear regression （or grouped lasso????). As a result, we can get quantative interpretation on which topic is more important, and help owners to pick the most efficient way to achieve their goal.
11. About the attributes, due to the large amount of missing values, we use GUIDE to build decision trees. This method can give us interpretable results and can deal with missing values. After GUIDE, we may use some statistical methods like ANOVA to find the reasons to interpret the relation between attributes  and stars.

### 3.1 Create data_sentence dictionary and save it

In [111]:
def dataprocess(dataset):
    data_sentence = {}
    
    for i in tqdm(range(len(dataset))):
        
        ## dealing with \n and \n\n things
        xx = dataset.iloc[i]['text']
        xx = re.sub(r' \n\n','.',xx)
        xx = re.sub(r'\n','',xx)
        xx = re.sub(r'\.\.','. ',xx)
        xx = re.sub(r'(\.)(\S)',r'\1 \2',xx)
        
        ## for each review, tokenize it into sentences, saved in sent_set
        sent_set = sent_tokenize(xx)
    
        ## for each sentence in sent_set, tokenized and cleaning into tokenized_sentence
        for j in range(len(sent_set)):
        
            sent = sent_set[j]
            if len(sent) > 3:
                ## get "review_number" for this sentence
                data_sentence.setdefault('review_number', []).append(dataset.iloc[i]['review_number'])
        
                ## assign this sentence into column "sentence" and sentence_list
                data_sentence.setdefault('sentence', []).append(sent)

        
                ## clean this sentence into "tokenized_sentence"
                x = re.sub(r'n\'t',' not',sent)
                ## change not adj into not_adj
                x = re.sub(r'not ','not_',x)
                ## split into words
                x = word_tokenize(x)
                ## remove punctuation
                x = [w.translate(table) if not re.match(r'not_.*', w) else w for w in x]
                ## change numbers into words
                x = [p.number_to_words(w) if w.isdigit() else w  for w in x ]
                ## remove not alphabetic
                x = [w for w in x if w.isalpha() or re.match(r'not_.*',w)]
                ## convert to lower case
                x = [w.lower() for w in x]
                ## remove stop words
                x = [w for w in x if not w in sr]
                ## lemmatization
                x = [wnl.lemmatize(w) for w in x]
            
                ## assign cleaned sentence words to "tokenized_sentence" and tokenized_sentence_list
                data_sentence.setdefault('tokenized_sentence', []).append(x)


                
                ## POS
                ## change cleaned words into nlp format
                sent_nlp = nlp(" ".join(x))
                ## get nouns for each sentence and saved into nouns_list
                nolis = [token.lemma_ for token in sent_nlp if token.pos_ == "NOUN"]
                ## assign nouns to "nouns"
                data_sentence.setdefault('nouns', []).append(nolis)
            
    return data_sentence

In [112]:
data_sentence = dataprocess(data_review)




  0%|          | 0/35033 [00:00<?, ?it/s][A[A[A


  0%|          | 2/35033 [00:00<31:33, 18.50it/s][A[A[A


  0%|          | 3/35033 [00:00<50:30, 11.56it/s][A[A[A


  0%|          | 5/35033 [00:00<47:08, 12.38it/s][A[A[A


  0%|          | 6/35033 [00:00<55:19, 10.55it/s][A[A[A


  0%|          | 7/35033 [00:00<1:03:19,  9.22it/s][A[A[A


  0%|          | 9/35033 [00:00<54:00, 10.81it/s]  [A[A[A


  0%|          | 11/35033 [00:00<48:56, 11.93it/s][A[A[A


  0%|          | 14/35033 [00:01<49:09, 11.87it/s][A[A[A


  0%|          | 16/35033 [00:01<46:58, 12.42it/s][A[A[A


  0%|          | 20/35033 [00:01<40:45, 14.32it/s][A[A[A


  0%|          | 22/35033 [00:01<42:48, 13.63it/s][A[A[A


  0%|          | 24/35033 [00:01<43:03, 13.55it/s][A[A[A


  0%|          | 26/35033 [00:01<40:01, 14.58it/s][A[A[A


  0%|          | 29/35033 [00:02<38:04, 15.32it/s][A[A[A


  0%|          | 31/35033 [00:02<52:51, 11.04it/s][A[A[A


  0%|          |

KeyboardInterrupt: 

In [2]:
# save data into data_sentence.txt
fw = open("data_sentence.txt",'w+')
fw.write(str(data_sentence))
fw.close()

NameError: name 'test' is not defined

In [4]:
# read data_sentence.txt
fr = open("data_sentence.txt",'r+')
data_sentence_dic = eval(fr.read()) 


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [22]:
data_sentence_df = pd.DataFrame(data_sentence_dic)

### 3.2 Create frequency and tfidf matrix for nouns

In [102]:
def getcorpus(datadict):
    
    nouns_list = datadict["nouns"]
    
    ## define dictionary for "nouns"
    dictionary = corpora.Dictionary(nouns_list)

    ## create frequency matrix
    frequency_matrix = [dictionary.doc2bow(n) for n in nouns_list]
                    
    ## create tfidf matrix
    tfidf = gensim.models.TfidfModel(frequency_matrix)
    corpus_tfidf = tfidf[frequency_matrix]
    
    return corpus_tfidf, dictionary

In [104]:
corpus_tfidf, dictionary = getcorpus(data_sentence_dic)

### 3.3 Apply LDA topic model to tfidf matrix and visualization and evaluation

In [None]:
## apply LDA model on tfidf matrix                                                        
lda_model = LDA(corpus_tfidf, id2word=dictionary, num_topics=7, random_state=100,chunksize=1000, passes=50)                      

In [38]:
# visualize the topics
vis = pyLDAvis.gensim.prepare(lda_model, corpus_tfidf, dictionary)
vis

In [None]:
## model evaluation

## compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus_tfidf))  # a measure of how good the model is. lower the better.

## compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_sentence["nouns"], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### 3.4 Create topic term lists

In [None]:
topic_0_terms = []
topic_1_terms = []
topic_2_terms = []
topic_3_terms = []
topic_4_terms = []
topic_5_terms = []
topic_6_terms = []

topic_terms = [topic_0_term,topic_1_term,topic_2_term,topic_3_term,topic_4_term,topic_5_term,topic_6_term]

### 3.4 Create dominant topic in each sentence

In [None]:
## get main topic in each sentence
for i, row in enumerate(lda_model[corpus_tfidf]):
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    ## save the dominant_topic, percage_contribution for each sentence to data_sentence_dic
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:  # => dominant topic
            data_sentence_dic["dominant_topic"] = topic_num
            data_sentence_dic["percentage_contribution"] = round(prop_topic,4)
            break

### 3.5 Get scores of each topic for all data_sentence_dic

In [None]:
def get_sentence_score(topic_num):
    ## get this topic_term list
    topic_term_num = topic_terms[topic_num]
    
    ## define positive dictionary and negative dictionary
    positive_dic = []   ## 放上positive和negative的库
    negative_dic = []
    
    
    ## define count variable
    positive_order = 0
    negative_order = 0
    
    ## define sub_adj_list for each sentence
    sub_adj_list = []


    for num in len(data_sentence_dic["review_number"]):
        sent = data_sentence_dic["sentence"][num]
        nouns_list = data_sentence_dic["nouns"][num]
        if any(element in sent for element in topic_term_num):
            for term in topic_term_num:
                if term in nouns_list:
                    for child in list(term.children):
                        if child.pos_ == "ADJ":
                            adj = child.string.strip()
                            sub_adj_list.append(adj)
                                if adj in positive_dic:
                                    positive_order+=1
                                elif adj in negative_dic:
                                    negative_order+=1
          
            data_sentence_dic.setdefault["adj_list"+str(topic_num),[]].append(sub_adj_list)
            data_sentence_dic.setdefault["num_positive"+str(topic_num),[]].append(positive_order)
            data_sentence_dic.setdefault["num_negative"+str(topic_num),[]].append(negative_order)
            data_sentence_dic.setdefault["score"+str(topic_num),[]].append((positive_order-negative_order)/(positive_order+negative_order))
        else:
            data_sentence_dic.setdefault["adj_list"+str(topic_num),[]].append([])
            data_sentence_dic.setdefault["num_positive"+str(topic_num),[]].append(0)
            data_sentence_dic.setdefault["num_negative"+str(topic_num),[]].append(0)
            data_sentence_dic.setdefault["score"+str(topic_num),[]].append(0)
            
    return data_sentence_dic
        

In [None]:
## get all topics score for each sentence
data_sentence_dic=get_sentence_score(0)
data_sentence_dic=get_sentence_score(1)
data_sentence_dic=get_sentence_score(2)
data_sentence_dic=get_sentence_score(3)
data_sentence_dic=get_sentence_score(4)
data_sentence_dic=get_sentence_score(5)
data_sentence_dic=get_sentence_score(6)

### 3.6 Get all topic scores for given business

In [None]:
def business_score(business_id):
    business_score = []
    review_number_id = list(data_review.loc[data_review['business_id']==business_id]["review_number"])
    for t in range(6):
        ##topic number - 1
        sentence_score= [data_sentence_dic["score"+str(t)][k] for k in range(len(data_sentence_dic["review_number"])) if data_sentence_dic["review_number"][k] in review_number_id]
        denominator = 0
        numerator = 0
        for i in sentence_score:
            if i !=0:
                denominator+=1
                numerator+=i
        business_score.append(numerator/denominator) 
    return business_score

In [34]:
## get seven topic scorebusiness_id 
business_score(1)

### 3.7 Decide best number of topics


In [1]:
def test_num_topics():
    samlist = range(len(data_sentence_dic["review_number"]))
    test = random.sample(samlist, 62508)  #从list中随机获取5个元素，作为一个片断返回  
    train = [x for x in samlist if x not in test]
    data_sentence_train = 
    data_sentence_test = 
    corpus_tfidf_train, dictionary_train = getcorpus(data_sentence_train)
    corpus_tfidf_test, dictionary_test = getcorpus(data_sentence_test)
    for num_topics in [3, 5, 10, 30, 50, 100, 150, 200, 300]:
        start_time = datetime.datetime.now()
        lda_model = LDA(corpus_tfidf_train, num_topics=num_topics, id2word=dictionary_train,random_state=100,chunksize=1000, passes=50)
        end_time = datetime.datetime.now()
        print("total running time = ", end_time - start_time)
        # Compute Perplexity
        print('\nPerplexity with num_topics=%d : ' % num_topics, lda_model.log_perplexity(corpus_tfidf_test))  # a measure of how good the model is. lower the better.
        # Compute Coherence Score
        coherence_model_lda = CoherenceModel(model=lda_model, texts=data_sentence_test["nouns"], dictionary=dictionary_test, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score with num_topics=%d : ' % num_topics, coherence_lda)
 

SyntaxError: invalid syntax (<ipython-input-1-69ae4f0a11a4>, line 4)