In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from gensim.parsing.preprocessing import strip_punctuation, strip_tags, strip_numeric
from nltk.stem.wordnet import WordNetLemmatizer   
from nltk.corpus import stopwords
import string

Step 1 : Read zomato reviews

In [2]:
#This review text is propreitary information of zomato, do not reuse
filename = 'reviews.txt'
corpus0 = open(filename).readlines()

**Step 2 - Preprocessing text**

Clean all text, remove stop words etc

In [3]:
lemma = WordNetLemmatizer()

# NLTK Stop words extended
stop_words = stopwords.words('english')
stop_words.extend(['zomato', 'swiggy', 'restaurant', 'delivery', 'food'])

def textClean(text0):
    text1 = [strip_punctuation(doc) for doc in text0]
    text1 = [strip_tags(doc) for doc in text1]
    text1 = [strip_numeric(doc) for doc in text1]
    text1 = [[" ".join([i for i in doc.lower().split() if i not in stop_words])] for doc in text1]
    text2 = [[word for word in ' '.join(doc).split()] for doc in text1]
    normalized = [[" ".join([lemma.lemmatize(word) for word in ' '.join(doc).split()])] for doc in text1]
    return normalized

corpus1 = textClean(corpus0)

**DTM Building and LDA**

In [4]:
## DTM building etc via gensim
corpus2 = [[word for word in ' '.join(doc).split()] for doc in corpus1]

# bigrams and trigrams
bigram = gensim.models.Phrases(corpus2, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[corpus2], threshold=100)  

# get bigrams and trigrams models
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Create Dictionary, Building gensim corpus. TF DTM creation.
id2word = corpora.Dictionary(corpus2)
corpus = [id2word.doc2bow(text) for text in corpus2]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 2), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 2), (42, 1), (43, 1), (44, 1), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1)]]


**Step 4 - Running the LDA in gensim.**

In [5]:
# Build LDA model for (say) K=6 topics
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

**Check word document matrix using beta**

In [6]:
## def func for beta_df
import pandas as pd

## obtain the factor matrices - beta
def build_beta_df(lda_model=lda_model, id2word=id2word):
    beta = lda_model.get_topics()
    beta_df = pd.DataFrame(data=beta)

    # convert colnames in beta_df 2 tokens
    token2col = list(id2word.token2id)
    beta_df.columns = token2col
    rowNames=['topic' + format(x+1, '02d') for x in range(beta_df.shape[0])]
    rowNames_series = pd.Series(rowNames)
    beta_df.rename(index=rowNames_series, inplace=True)
    return(beta_df)

beta_df = build_beta_df(lda_model=lda_model, id2word=id2word)
beta_df.iloc[:10, :10]

Unnamed: 0,ambiance,another,around,attention,caught,chef,constantly,counter,deeply,display
topic01,0.000671,0.000672,0.008671,0.000672,0.000672,0.000671,0.000672,0.004669,0.000672,0.000672
topic02,0.000859,0.000859,0.000859,0.000859,0.000859,0.000859,0.000859,0.000859,0.000859,0.000859
topic03,0.000986,0.000986,0.000986,0.000986,0.000986,0.000986,0.000986,0.006905,0.000986,0.000986
topic04,0.004616,0.000672,0.00067,0.000674,0.000668,0.000674,0.000667,0.000668,0.000673,0.000665
topic05,0.000629,0.000634,0.000634,0.000632,0.000632,0.000632,0.000632,0.000631,0.000632,0.000633
topic06,0.00444,0.004447,0.004448,0.004447,0.004453,0.004447,0.004453,0.008293,0.004448,0.004454


In [7]:
def build_gamma_df(lda_model, corpus0):
    gamma_doc = []
    num_topics = lda_model.get_topics().shape[0]
    
    for doc in range(len(corpus0)):
        doc1 = corpus0[doc].split()
        bow_doc = id2word.doc2bow(doc1)
        gamma_doc0 = [0]*num_topics
        gamma_doc1 = lda_model.get_document_topics(bow_doc)
        gamma_doc2_x = [x for (x,y) in gamma_doc1]
        gamma_doc2_y = [y for (x,y) in gamma_doc1]
        for i in range(len(gamma_doc1)):
            x = gamma_doc2_x[i]
            y = gamma_doc2_y[i]
            gamma_doc0[x] = y
        gamma_doc.append(gamma_doc0)
        
    gamma_df = pd.DataFrame(data=gamma_doc)
    topicNames=['topic' + format(x+1, '02d') for x in range(num_topics)]
    topicNames_series = pd.Series(topicNames)
    gamma_df.rename(columns=topicNames_series, inplace=True)
    return(gamma_df)

# now apply func
gamma_df = build_gamma_df(lda_model=lda_model, corpus0=corpus0)

Unnamed: 0,topic01,topic02,topic03,topic04,topic05,topic06
0,0.0,0.0,0.0,0.0,0.0,0.991468
1,0.0,0.0,0.961538,0.0,0.0,0.0
2,0.010535,0.0,0.949478,0.010186,0.011748,0.0
3,0.015347,0.012789,0.012754,0.928484,0.017114,0.013511
4,0.015347,0.01279,0.012754,0.014839,0.930759,0.013511
5,0.028254,0.023546,0.023481,0.868337,0.031508,0.024874
6,0.0,0.0,0.0,0.974704,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.97197


In [9]:
#Check first 25 reviews
gamma_df.iloc[:25, :]

Unnamed: 0,topic01,topic02,topic03,topic04,topic05,topic06
0,0.0,0.0,0.0,0.0,0.0,0.991468
1,0.0,0.0,0.961538,0.0,0.0,0.0
2,0.010535,0.0,0.949478,0.010186,0.011748,0.0
3,0.015347,0.012789,0.012754,0.928484,0.017114,0.013511
4,0.015347,0.01279,0.012754,0.014839,0.930759,0.013511
5,0.028254,0.023546,0.023481,0.868337,0.031508,0.024874
6,0.0,0.0,0.0,0.974704,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.97197
8,0.010535,0.0,0.0,0.010186,0.95247,0.0
9,0.010535,0.949502,0.0,0.010186,0.011748,0.0


## Now if you review closely, it clearly shows that
- topic3 - talks about place (probably location ?)
- topic4 - cuisines
- topic6 - ambience

Now, we can develop some kind of algorithm here to rate each of the reviews based on 6 different factors 