# Get topics by Gensim

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import json

In [3]:
review_data_file = '../lexicon/data/yelp_sample.json'
with open(review_data_file, 'r') as infile:
    R = json.load(infile)

In [4]:
from nltk.tokenize import word_tokenize
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [6]:
R = R[:4000]
lemmatized_docs = [] 
for text in tqdm(R):
    content = nlp(text['content'])
    tokens = []
    for sent in content.sents:
        for token in sent:
            if token.pos_ in {'NOUN', 'PNOUN'}:
                tokens.append(token.lemma_.lower())
    lemmatized_docs.append(tokens)

  0%|          | 0/4000 [00:00<?, ?it/s]

In [7]:
lemmatized_docs[0][:6]

['salad', 'yum', 'addition', 'menu', 'location', 'service']

## LDA by Gensim

In [8]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import common_texts

In [9]:
common_dictionary = Dictionary(lemmatized_docs)
common_corpus = [common_dictionary.doc2bow(text) for text in lemmatized_docs]

In [10]:
lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)

In [11]:
lda.get_topics().shape

(10, 7062)

In [14]:
for topic in range(10):
    print("Topic {}".format(topic))
    print([x for x, y in lda.show_topic(topic, topn=6)])

Topic 0
['food', 'burger', 'restaurant', 'service', 'table', 'time']
Topic 1
['food', 'place', 'time', 'restaurant', 'room', 'service']
Topic 2
['time', 'food', 'service', 'day', 'place', 'hair']
Topic 3
['food', 'place', 'chicken', 'time', 'price', 'sauce']
Topic 4
['service', 'time', 'place', 'food', 'location', 'experience']
Topic 5
['time', 'place', 'cheese', 'year', 'pizza', 'restaurant']
Topic 6
['place', 'food', 'time', 'drink', 'menu', 'salad']
Topic 7
['time', 'service', 'food', 'room', 'place', 'customer']
Topic 8
['service', 'car', 'time', 'place', 'store', 'day']
Topic 9
['cake', 'time', 'day', 'drink', 'place', 'chocolate']


In [19]:
for doc in lemmatized_docs[:4]:
    dist = lda.get_document_topics(common_dictionary.doc2bow(doc))
    print(doc[:10])
    print(dist)
    print()

['salad', 'yum', 'addition', 'menu', 'location', 'service', 'food', 'temp', 'kid', 'pizza']
[(6, 0.9549691)]

['momo', 'crawl', 'lot', 'restaurant', 'mutton', 'thali']
[(0, 0.014326497), (1, 0.8710648), (2, 0.014325845), (3, 0.014325832), (4, 0.0143260555), (5, 0.014326038), (6, 0.014326456), (7, 0.014326368), (8, 0.014326025), (9, 0.014326103)]

['night', 'people', 'pizza', 'attitude']
[(0, 0.020003648), (1, 0.020006752), (2, 0.020003483), (3, 0.020008855), (4, 0.02000376), (5, 0.020006001), (6, 0.020005068), (7, 0.81995654), (8, 0.020004053), (9, 0.02000187)]

['brisket', 'sandwich', 'gas', 'hipster', 'bbq', 'lunch', 'spot\\/', 'store', 'carwash']
[(0, 0.010009057), (1, 0.0100082755), (2, 0.010007902), (3, 0.010008967), (4, 0.9099241), (5, 0.010007616), (6, 0.010007954), (7, 0.0100078685), (8, 0.0100091), (9, 0.010009191)]

