In [29]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

import gensim

In [7]:
data = pd.read_csv('../../yelp-data/new_data/final_data/manip_data.csv')

In [8]:
cv = CountVectorizer(binary=False,
                     stop_words='english',
                     min_df=3)

docs = cv.fit_transform(data.text.dropna())

# Build a mapping of numerical ID to word
id2word = dict(enumerate(cv.get_feature_names()))

In [9]:
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus

In [10]:
# convert our word-matrix into gensim's format
corpus = Sparse2Corpus(docs, documents_columns = False)

In [11]:
# fit an LDA model
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=15)

# need to explicitly specify the number of topics we want the model to uncover
# num_topics = 10
# lda_model = LdaModel(corpus=corpus, 
#                     id2word=dict(enumerate(vectorizer.get_feature_names())), 
#                     num_topics=num_topics)

Did we learn reasonable topics?  
Do the words that make up a topic make sense?  
Is this topic helpful towards our goal?  
We can evaluate fit by viewing the top words in each topic. Some topics will be clearer than others.

In [9]:
num_topics = 15
num_words_per_topic = 10
for ti, topic in enumerate(lda_model.show_topics(num_topics = num_topics, num_words = num_words_per_topic)):
    print "Topic: %d" % (ti)
    print topic
    print

Topic: 0
(0, u'0.016*"like" + 0.014*"place" + 0.011*"old" + 0.008*"new" + 0.007*"just" + 0.007*"don" + 0.006*"ve" + 0.005*"know" + 0.005*"steak" + 0.005*"feel"')

Topic: 1
(1, u'0.034*"fries" + 0.027*"cheese" + 0.026*"burger" + 0.022*"steak" + 0.022*"sandwich" + 0.018*"good" + 0.013*"philly" + 0.013*"place" + 0.009*"like" + 0.009*"burgers"')

Topic: 2
(2, u'0.039*"chicken" + 0.032*"pork" + 0.029*"fried" + 0.018*"ribs" + 0.015*"sauce" + 0.015*"bbq" + 0.014*"beef" + 0.010*"belly" + 0.009*"sweet" + 0.009*"rolls"')

Topic: 3
(3, u'0.028*"breakfast" + 0.027*"eggs" + 0.023*"sushi" + 0.022*"steak" + 0.018*"fried" + 0.013*"chicken" + 0.013*"brunch" + 0.012*"good" + 0.011*"place" + 0.010*"roll"')

Topic: 4
(4, u'0.043*"great" + 0.039*"food" + 0.028*"place" + 0.026*"service" + 0.020*"good" + 0.014*"steak" + 0.011*"staff" + 0.011*"amazing" + 0.011*"restaurant" + 0.010*"time"')

Topic: 5
(5, u'0.017*"food" + 0.012*"table" + 0.011*"time" + 0.011*"minutes" + 0.010*"came" + 0.009*"service" + 0.009*"a

#### https://github.com/RaRe-Technologies/gensim/issues/484

#### Fewer topics...

In [25]:
from gensim import corpora

In [28]:
# fit an LDA model
lda = LdaModel(corpus=corpus, id2word=id2word, num_topics=5)

In [9]:
num_topics = 5
num_words_per_topic = 10
for ti, topic in enumerate(lda.show_topics(num_topics = num_topics, num_words = num_words_per_topic)):
    print "Topic: %d" % (ti)
    print topic
    print

Topic: 0
(0, u'0.019*"great" + 0.017*"steak" + 0.014*"service" + 0.012*"amazing" + 0.012*"food" + 0.011*"best" + 0.009*"delicious" + 0.009*"restaurant" + 0.008*"good" + 0.008*"excellent"')

Topic: 1
(1, u'0.014*"food" + 0.013*"ordered" + 0.012*"came" + 0.012*"steak" + 0.010*"table" + 0.009*"time" + 0.009*"server" + 0.009*"service" + 0.009*"got" + 0.008*"just"')

Topic: 2
(2, u'0.033*"tacos" + 0.017*"steak" + 0.016*"taco" + 0.015*"burrito" + 0.014*"mexican" + 0.013*"salsa" + 0.012*"chips" + 0.007*"beans" + 0.007*"die" + 0.007*"chicken"')

Topic: 3
(3, u'0.020*"steak" + 0.019*"good" + 0.013*"chicken" + 0.011*"cheese" + 0.010*"like" + 0.009*"ordered" + 0.009*"fries" + 0.008*"sauce" + 0.008*"just" + 0.007*"really"')

Topic: 4
(4, u'0.024*"food" + 0.022*"place" + 0.017*"great" + 0.014*"good" + 0.011*"steak" + 0.011*"service" + 0.008*"like" + 0.008*"time" + 0.008*"bar" + 0.007*"just"')



In [20]:
import pyLDAvis.gensim as gensimvis
import pyLDAvis
from gensim.corpora import Dictionary, MmCorpus

In [30]:
docs = data['text']
texts = [[i for i in doc.lower().split()] for doc in docs]

dictionary = corpora.Dictionary(texts)
# gensim.corpora.dictionary.save_as_text(steak_dict)
# dictionary = open('steak_dict.txt').read()

In [36]:
from ast import literal_eval

In [34]:
def nltk_stopwords():
    return set(nltk.corpus.stopwords.words('english'))

def prep_corpus(texts, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(texts)
    # remove stopwords 
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    # get ids for short words len(word)<=3
    shortword_ids = [tokenid for tokenid, word in dictionary.iteritems() if len(word.split('/')[0])<= 3]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    # get ids for short words len(word)<=3
    shortword_ids = [tokenid for tokenid, word in dictionary.iteritems() if len(word.split('/')[0])<= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()
    # remove words that appear only once
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems()if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    # filter extreme values 
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in texts]

    return dictionary, corpus

In [37]:
dictionary ,corpus = prep_corpus([literal_eval(doc) for doc in texts])

ValueError: malformed string

In [31]:
vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)

IndexError: index 234551 is out of bounds for axis 1 with size 40733

In [None]:
pyLDAvis.save_html(vis_data, 'data/model/vis_data.html')