# Get topics by Gensim

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import json

In [3]:
review_data_file = '../lexicon/data/yelp_sample.json'
with open(review_data_file, 'r') as infile:
    R = json.load(infile)

In [4]:
from nltk.tokenize import word_tokenize
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [6]:
R = R[:6000]
lemmatized_docs = [] 
for text in tqdm(R):
    content = nlp(text['content'])
    tokens = []
    for sent in content.sents:
        for token in sent:
            if token.pos_ in {'NOUN', 'PNOUN'}:
                tokens.append(token.lemma_.lower())
    lemmatized_docs.append(tokens)

  0%|          | 0/6000 [00:00<?, ?it/s]

In [7]:
lemmatized_docs[0][:6]

['salad', 'yum', 'addition', 'menu', 'location', 'service']

## LDA by Gensim

In [37]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.lsimodel import LsiModel
from gensim.test.utils import common_texts

In [38]:
common_dictionary = Dictionary(lemmatized_docs)
common_corpus = [common_dictionary.doc2bow(text) for text in lemmatized_docs]

In [44]:
lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)

In [45]:
lda.get_topics().shape

(10, 8764)

In [46]:
for topic in range(10):
    print("Topic {}".format(topic))
    print([x for x, y in lda.show_topic(topic, topn=6)])

Topic 0
['nail', 'time', 'place', 'job', 'salon', 'hair']
Topic 1
['food', 'place', 'room', 'time', 'staff', 'service']
Topic 2
['food', 'service', 'burger', 'order', 'place', 'fry']
Topic 3
['place', 'time', 'service', 'people', 'food', 'year']
Topic 4
['class', 'store', 'place', 'dress', 'gym', 'job']
Topic 5
['food', 'time', 'restaurant', 'service', 'place', 'table']
Topic 6
['time', 'service', 'car', 'customer', 'place', 'shop']
Topic 7
['time', 'place', 'service', 'store', 'dog', 'food']
Topic 8
['place', 'drink', 'food', 'service', 'time', 'area']
Topic 9
['food', 'place', 'time', 'ice', 'cream', 'service']


In [47]:
for doc in lemmatized_docs[:4]:
    dist = lda.get_document_topics(common_dictionary.doc2bow(doc))
    print(doc[:10])
    print(dist)
    print()

['salad', 'yum', 'addition', 'menu', 'location', 'service', 'food', 'temp', 'kid', 'pizza']
[(3, 0.49349156), (5, 0.4664632)]

['momo', 'crawl', 'lot', 'restaurant', 'mutton', 'thali']
[(0, 0.0143885305), (1, 0.01439218), (2, 0.014390268), (3, 0.014389338), (4, 0.014387952), (5, 0.014396155), (6, 0.014388528), (7, 0.8704889), (8, 0.014388934), (9, 0.014389274)]

['night', 'people', 'pizza', 'attitude']
[(0, 0.020004109), (1, 0.020004153), (2, 0.020005535), (3, 0.819947), (4, 0.020002637), (5, 0.020007875), (6, 0.02001265), (7, 0.020008035), (8, 0.020003235), (9, 0.020004736)]

['brisket', 'sandwich', 'gas', 'hipster', 'bbq', 'lunch', 'spot\\/', 'store', 'carwash']
[(0, 0.010019705), (1, 0.010022195), (2, 0.010022508), (3, 0.010020465), (4, 0.010019288), (5, 0.010022131), (6, 0.18740547), (7, 0.7324268), (8, 0.010019697), (9, 0.010021674)]



## TfIdf over the topic terms

In [48]:
from collections import defaultdict

In [49]:
I = defaultdict(lambda: defaultdict(lambda: 0))
for topic in range(10):
    for x, y in lda.show_topic(topic, topn=20):
        I[topic][x] += y
Idf = pd.DataFrame(I).T.fillna(0)

In [25]:
IDF = {}
for word in Idf.columns:
    col = Idf[word]
    n = len([x for x, y in col.items() if y > 0])
    IDF[word] = np.log(Idf.shape[0] / n)

In [26]:
IDFs = pd.Series(IDF)

In [28]:
IDFs.sort_values(ascending=False)

piece        2.302585
nail         2.302585
breakfast    2.302585
lunch        2.302585
rice         2.302585
               ...   
staff        0.356675
food         0.356675
service      0.000000
time         0.000000
place        0.000000
Length: 90, dtype: float64

In [36]:
(Idf.loc[7] * IDFs).sort_values(ascending=False)

dish         0.022714
salad        0.021602
flavor       0.021082
sauce        0.018902
bit          0.016056
               ...   
selection    0.000000
friend       0.000000
lot          0.000000
bar          0.000000
parking      0.000000
Length: 90, dtype: float64

In [34]:
Idf.loc[1].sort_values(ascending=False)

food       0.044006
place      0.037613
service    0.024877
time       0.021346
pizza      0.012704
             ...   
stylist    0.000000
care       0.000000
bike       0.000000
pain       0.000000
parking    0.000000
Name: 1, Length: 90, dtype: float64