In [12]:
import nltk
import pandas as pd
import re
import string
pd.set_option('display.max_colwidth', 100)
stopwords = nltk.corpus.stopwords.words('english')

In [13]:
data = pd.read_csv('yelp.csv')

In [14]:
data.columns

Index(['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id',
       'cool', 'useful', 'funny'],
      dtype='object')

In [15]:
data = data.drop(['business_id', 'date', 'review_id', 'type', 'user_id'], axis=1)

In [16]:
data.head()

Unnamed: 0,stars,text,cool,useful,funny
0,5,My wife took me here on my birthday for breakfast and it was excellent. The weather was perfect...,2,5,0
1,5,"I have no idea why some people give bad reviews about this place. It goes to show you, you can p...",0,0,0
2,4,love the gyro plate. Rice is so good and I also dig their candy selection :),0,1,0
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!! It's very convenient and surrounded by a lot of ...",1,2,0
4,5,"General Manager Scott Petello is a good egg!!! Not to go into detail, but let me assure you if y...",0,0,0


## Clean text

In [22]:
def clean_text(text):
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

In [24]:
data['clean_text'] = data['text'].apply(lambda x: clean_text(x.lower()))
data.head()

Unnamed: 0,stars,text,cool,useful,funny,clean_text
0,5,My wife took me here on my birthday for breakfast and it was excellent. The weather was perfect...,2,5,0,"[wife, took, birthday, breakfast, excellent, weather, perfect, made, sitting, outside, overlooki..."
1,5,"I have no idea why some people give bad reviews about this place. It goes to show you, you can p...",0,0,0,"[idea, people, give, bad, reviews, place, goes, show, please, everyone, probably, griping, somet..."
2,4,love the gyro plate. Rice is so good and I also dig their candy selection :),0,1,0,"[love, gyro, plate, rice, good, also, dig, candy, selection, ]"
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!! It's very convenient and surrounded by a lot of ...",1,2,0,"[rosie, dakota, love, chaparral, dog, park, convenient, surrounded, lot, paths, desert, xeriscap..."
4,5,"General Manager Scott Petello is a good egg!!! Not to go into detail, but let me assure you if y...",0,0,0,"[general, manager, scott, petello, good, egg, go, detail, let, assure, issues, albeit, rare, spe..."


## stemmed text

In [25]:
from nltk import PorterStemmer

ps = PorterStemmer()

def stemming_text(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

In [26]:
data['stemmed_clean_text'] = data['clean_text'].apply(lambda x: stemming_text(x))
data.head()

Unnamed: 0,stars,text,cool,useful,funny,clean_text,stemmed_clean_text
0,5,My wife took me here on my birthday for breakfast and it was excellent. The weather was perfect...,2,5,0,"[wife, took, birthday, breakfast, excellent, weather, perfect, made, sitting, outside, overlooki...","[wife, took, birthday, breakfast, excel, weather, perfect, made, sit, outsid, overlook, ground, ..."
1,5,"I have no idea why some people give bad reviews about this place. It goes to show you, you can p...",0,0,0,"[idea, people, give, bad, reviews, place, goes, show, please, everyone, probably, griping, somet...","[idea, peopl, give, bad, review, place, goe, show, pleas, everyon, probabl, gripe, someth, fault..."
2,4,love the gyro plate. Rice is so good and I also dig their candy selection :),0,1,0,"[love, gyro, plate, rice, good, also, dig, candy, selection, ]","[love, gyro, plate, rice, good, also, dig, candi, select, ]"
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!! It's very convenient and surrounded by a lot of ...",1,2,0,"[rosie, dakota, love, chaparral, dog, park, convenient, surrounded, lot, paths, desert, xeriscap...","[rosi, dakota, love, chaparr, dog, park, conveni, surround, lot, path, desert, xeriscap, basebal..."
4,5,"General Manager Scott Petello is a good egg!!! Not to go into detail, but let me assure you if y...",0,0,0,"[general, manager, scott, petello, good, egg, go, detail, let, assure, issues, albeit, rare, spe...","[gener, manag, scott, petello, good, egg, go, detail, let, assur, issu, albeit, rare, speak, sco..."


## lemmetized text

In [27]:
from nltk import WordNetLemmatizer

wn = WordNetLemmatizer()

def lemmetizing_text(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

## converting text to bag of words

In [28]:
import gensim

In [29]:
dictionary = gensim.corpora.Dictionary(documents=data['stemmed_clean_text'])

In [35]:
bow_corpus = [dictionary.doc2bow(doc) for doc in data['stemmed_clean_text']]

In [37]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 8, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [38]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.009*"time" + 0.009*"get" + 0.008*"go" + 0.007*"one" + 0.006*"like" + 0.006*"would" + 0.006*"place" + 0.006*"us" + 0.006*"servic" + 0.005*"back"


Topic: 1 
Words: 0.007*"go" + 0.007*"drink" + 0.007*"coffe" + 0.007*"like" + 0.006*"great" + 0.006*"love" + 0.006*"one" + 0.006*"get" + 0.006*"good" + 0.005*"would"


Topic: 2 
Words: 0.018*"dog" + 0.006*"get" + 0.006*"like" + 0.005*"go" + 0.004*"dont" + 0.004*"movi" + 0.004*"would" + 0.003*"hot" + 0.003*"one" + 0.003*"think"


Topic: 3 
Words: 0.007*"place" + 0.007*"like" + 0.006*"park" + 0.006*"get" + 0.005*"go" + 0.005*"even" + 0.004*"game" + 0.004*"phoenix" + 0.004*"say" + 0.004*"stadium"


Topic: 4 
Words: 0.016*"pizza" + 0.011*"place" + 0.009*"great" + 0.008*"go" + 0.008*"one" + 0.007*"best" + 0.006*"ive" + 0.006*"like" + 0.006*"store" + 0.006*"get"


Topic: 5 
Words: 0.015*"place" + 0.014*"good" + 0.011*"like" + 0.010*"food" + 0.009*"go" + 0.009*"breakfast" + 0.009*"order" + 0.008*"time" + 0.007*"get" + 0.006*"wait"


In [39]:
# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(clean_text('The movie is worst among all availabe on Amazon.'))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.7076854705810547	 Topic: 0.007*"go" + 0.007*"drink" + 0.007*"coffe" + 0.007*"like" + 0.006*"great"
Score: 0.04181838408112526	 Topic: 0.009*"time" + 0.009*"get" + 0.008*"go" + 0.007*"one" + 0.006*"like"
Score: 0.0418088324368	 Topic: 0.015*"place" + 0.014*"good" + 0.011*"like" + 0.010*"food" + 0.009*"go"
Score: 0.041772469878196716	 Topic: 0.023*"food" + 0.020*"place" + 0.015*"good" + 0.014*"great" + 0.011*"like"
Score: 0.04174960404634476	 Topic: 0.018*"dog" + 0.006*"get" + 0.006*"like" + 0.005*"go" + 0.004*"dont"
Score: 0.041727788746356964	 Topic: 0.012*"good" + 0.009*"order" + 0.007*"like" + 0.007*"place" + 0.007*"food"
Score: 0.041718896478414536	 Topic: 0.016*"pizza" + 0.011*"place" + 0.009*"great" + 0.008*"go" + 0.008*"one"
Score: 0.0417185053229332	 Topic: 0.007*"place" + 0.007*"like" + 0.006*"park" + 0.006*"get" + 0.005*"go"


In [41]:
data['stemmed_clean_text'][100]

['admit',
 'find',
 'think',
 'like',
 'mine',
 'tall',
 'dark',
 'russian',
 'nice',
 'coffe',
 'shop',
 'near',
 'that',
 'starbuck',
 'good',
 'coffe',
 'reason',
 'price',
 'far',
 'stick',
 'hous',
 'brew',
 'smooth',
 'bitter',
 'yet',
 'sweet',
 'sinc',
 'fanci',
 'coffe',
 'drink',
 'usual',
 'mani',
 'calori',
 'need',
 'tri',
 'blueberri',
 'muffinquit',
 'yummi',
 'well',
 'chocol',
 'chip',
 'cooki',
 'delici',
 'cafe',
 'attach',
 'great',
 'indoor',
 'peopl',
 'watch',
 'brows',
 'thru',
 'design',
 'book',
 'take',
 'leisur',
 'stroll',
 'design',
 'center',
 'great',
 'way',
 'unwind',
 'perk',
 'choic',
 'caffein',
 'beverag']