In [1]:
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint
import numpy as np
import re
import seaborn as sns
import matplotlib.colors as mcolors
from collections import Counter
from matplotlib.patches import Rectangle
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
from matplotlib.ticker import FuncFormatter
from tqdm import tqdm

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['br','from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
import time

In [3]:
# Load the review dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/imdb.csv')
# Alternative input (20 Newsgroups subset), kept for reference only:
#df = pd.read_json('data/20newsgroup.json')
#df = df.loc[df.target_names.isin(['soc.religion.christian', 'rec.sport.hockey', 'talk.politics.mideast', 'rec.motorcycles']) , :]
print(df.shape)  #> (50000, 5) for data/imdb.csv
df.head()

(50000, 5)


Unnamed: 0,sno,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [4]:
def sent_to_words(sentences):
    """Lazily turn raw documents into lists of word tokens.

    Each document is coerced to ``str``, stripped of e-mail-like tokens,
    whitespace runs and single quotes, and then tokenized with
    ``gensim.utils.simple_preprocess(..., deacc=True)``.

    Args:
        sentences: iterable of raw documents. Non-string items (for example a
            NaN from an empty CSV cell) are converted with ``str()`` before
            any regex is applied.

    Yields:
        list[str]: the tokens of one document, in input order. A tqdm
        progress bar is advanced once per document.
    """
    for sent in tqdm(sentences):
        # Coerce up front: re.sub raises TypeError on non-string input, so the
        # conversion has to happen before the substitutions, not after them.
        sent = str(sent)
        # Raw strings keep '\S' / '\s' as regex escapes; in a normal string
        # literal they are invalid escape sequences and trigger warnings.
        sent = re.sub(r'\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub(r'\s+', ' ', sent)  # collapse newlines / whitespace runs
        sent = re.sub(r"'", "", sent)  # remove single quotes
        yield gensim.utils.simple_preprocess(sent, deacc=True)

# Pull the review text out of the frame as a plain Python list, tokenize every
# review, and print the first tokenized document as a spot check.
# (The 20 Newsgroups input keeps its text in `content` rather than `review`.)
data = df['review'].tolist()
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[:1])

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [00:55<00:00, 907.34it/s]


[['once', 'again', 'mr', 'costner', 'has', 'dragged', 'out', 'movie', 'for', 'far', 'longer', 'than', 'necessary', 'aside', 'from', 'the', 'terrific', 'sea', 'rescue', 'sequences', 'of', 'which', 'there', 'are', 'very', 'few', 'just', 'did', 'not', 'care', 'about', 'any', 'of', 'the', 'characters', 'most', 'of', 'us', 'have', 'ghosts', 'in', 'the', 'closet', 'and', 'costners', 'character', 'are', 'realized', 'early', 'on', 'and', 'then', 'forgotten', 'until', 'much', 'later', 'by', 'which', 'time', 'did', 'not', 'care', 'the', 'character', 'we', 'should', 'really', 'care', 'about', 'is', 'very', 'cocky', 'overconfident', 'ashton', 'kutcher', 'the', 'problem', 'is', 'he', 'comes', 'off', 'as', 'kid', 'who', 'thinks', 'hes', 'better', 'than', 'anyone', 'else', 'around', 'him', 'and', 'shows', 'no', 'signs', 'of', 'cluttered', 'closet', 'his', 'only', 'obstacle', 'appears', 'to', 'be', 'winning', 'over', 'costner', 'finally', 'when', 'we', 'are', 'well', 'past', 'the', 'half', 'way', 'poi

In [5]:
# Learn phrase (collocation) models: bigrams from the tokenized reviews, then
# trigrams from the bigram-transformed stream. The whole step is timed.
start = time.time()
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,  # higher threshold fewer phrases.
)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Wrap both trained models in Phraser objects; these are what process_words applies.
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(model) for model in (bigram, trigram)
)
print(f"{int(time.time() - start)} seconds")

def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove stop words, merge phrases and lemmatize a collection of documents.

    Pipeline, in order:
      1. re-tokenize each document with ``simple_preprocess`` and drop stop words;
      2. apply the module-level ``bigram_mod`` and then ``trigram_mod``;
      3. lemmatize with spaCy's ``en_core_web_sm`` (parser and NER disabled),
         keeping only tokens whose coarse POS tag is in ``allowed_postags``;
      4. re-tokenize the lemmas and drop stop words once more.

    Args:
        texts: sequence of documents; each may be a raw string or a list of
            tokens (anything ``str()`` can render).
        stop_words: iterable of words to discard. Defaults to the notebook's
            module-level list as it was when this function was defined.
        allowed_postags: spaCy ``pos_`` values to keep.

    Returns:
        list[list[str]]: one list of cleaned lemmas per input document.
    """
    # Sets give O(1) membership tests; the stop-word list is probed once per
    # token, which adds up to millions of lookups on a large corpus.
    stop_set = set(stop_words)
    keep_pos = set(allowed_postags)

    def _tokenize(doc):
        # str(doc) lets token lists and raw strings go through the same path:
        # the list's repr is simply re-tokenized by simple_preprocess.
        return [word for word in simple_preprocess(str(doc)) if word not in stop_set]

    texts = [_tokenize(doc) for doc in tqdm(texts)]
    texts = [bigram_mod[doc] for doc in tqdm(texts)]
    # NOTE(review): bigram_mod is applied a second time here, on tokens that
    # are already bigram-merged. Kept as-is so existing outputs do not change.
    texts = [trigram_mod[bigram_mod[doc]] for doc in tqdm(texts)]

    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    # nlp.pipe processes documents in batches, which is considerably faster
    # than calling nlp() once per document and yields the same Doc objects.
    joined = (" ".join(sent) for sent in texts)
    texts_out = [
        [token.lemma_ for token in doc if token.pos_ in keep_pos]
        for doc in tqdm(nlp.pipe(joined), total=len(texts))
    ]

    # Lemmatization can reintroduce stop words (e.g. an inflected form that
    # lemmatizes to one), so filter a second time.
    texts_out = [_tokenize(doc) for doc in tqdm(texts_out)]
    return texts_out

# Run the cleaning pipeline over the tokenized reviews; data_ready is the list
# that process_words returns (one list of kept lemmas per document).
data_ready = process_words(data_words)  # processed Text Data!

182 seconds


100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [00:53<00:00, 929.05it/s]
100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [00:21<00:00, 2292.29it/s]
100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [00:42<00:00, 1186.06it/s]
100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [07:27<00:00, 111.69it/s]
100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [00:31<00:00, 1584.02it/s]


In [6]:
# Vocabulary: assign an integer id to every distinct token in the cleaned documents.
id2word = corpora.Dictionary(data_ready)

# Bag-of-words corpus (term/document frequencies): one doc2bow result per document.
corpus = list(map(id2word.doc2bow, data_ready))

In [7]:
# Sanity check: the corpus should hold one bag-of-words entry per input document.
print(len(corpus))

50000


In [9]:
# Build LDA model
# Sweep the number of topics from 13 to 25, training one model per setting and
# reporting its perplexity bound, c_v coherence and topic keywords.
#
# Each iteration is expensive, so the scores are also kept in
# `lda_sweep_scores`; they survive an interrupted run and can be compared or
# plotted afterwards instead of being re-read from the printed output.
#
# `lda_model` is rebound on every iteration: once the loop ends (or is
# interrupted) it is the last model that finished training, and that is the
# model the later cells work with.
lda_sweep_scores = []
for i in tqdm(range(13,26)):
    lda_model =  gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                         id2word=id2word,
                                                         num_topics=i, 
                                                         random_state=100,
                                                         chunksize=100,
                                                         passes=10,
                                                         alpha='symmetric',
                                                         iterations=10,
                                                         workers=2,
                                                         per_word_topics=True)

    # log_perplexity() returns the per-word likelihood bound, not the
    # perplexity itself (gensim reports perplexity as 2 ** (-bound)); the
    # printed label is kept unchanged for continuity with earlier runs.
    perplexity_bound = lda_model.log_perplexity(corpus)
    print('Perplexity: ',i, perplexity_bound)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_ready, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: ', i, coherence_lda)
    # Record before the (long) topic dump so the scores are stored even if the
    # run is interrupted while printing.
    lda_sweep_scores.append({'num_topics': i,
                             'log_perplexity': perplexity_bound,
                             'coherence': coherence_lda})
    pprint(lda_model.print_topics())
    print("END\n") 


  0%|                                                                                           | 0/13 [00:00<?, ?it/s]

Perplexity:  13 -10.376986692033551
Coherence Score:  13 0.40675528720812026
[(0,
  '0.023*"people" + 0.016*"war" + 0.015*"american" + 0.014*"world" + '
  '0.009*"show" + 0.008*"documentary" + 0.008*"history" + 0.008*"country" + '
  '0.006*"black" + 0.006*"live"'),
 (1,
  '0.030*"story" + 0.026*"character" + 0.013*"life" + 0.009*"movie" + '
  '0.009*"feel" + 0.009*"way" + 0.009*"love" + 0.008*"well" + 0.007*"book" + '
  '0.007*"time"'),
 (2,
  '0.029*"play" + 0.019*"role" + 0.018*"performance" + 0.017*"great" + '
  '0.014*"actor" + 0.013*"well" + 0.013*"cast" + 0.012*"star" + 0.008*"song" + '
  '0.007*"give"'),
 (3,
  '0.121*"movie" + 0.032*"watch" + 0.022*"time" + 0.021*"great" + '
  '0.018*"really" + 0.017*"love" + 0.014*"funny" + 0.011*"enjoy" + '
  '0.011*"people" + 0.010*"well"'),
 (4,
  '0.019*"family" + 0.018*"girl" + 0.018*"young" + 0.017*"woman" + '
  '0.017*"father" + 0.014*"man" + 0.014*"life" + 0.012*"child" + '
  '0.012*"friend" + 0.011*"son"'),
 (5,
  '0.148*"film" + 0.01


  8%|██████                                                                         | 1/13 [31:29<6:17:53, 1889.45s/it]

Perplexity:  14 -10.74313907598339
Coherence Score:  14 0.39356793006200835
[(0,
  '0.020*"thing" + 0.018*"really" + 0.015*"guy" + 0.015*"look" + 0.015*"end" + '
  '0.013*"people" + 0.013*"scene" + 0.012*"way" + 0.011*"something" + '
  '0.010*"happen"'),
 (1,
  '0.035*"story" + 0.029*"character" + 0.012*"life" + 0.011*"love" + '
  '0.010*"book" + 0.009*"feel" + 0.008*"way" + 0.006*"end" + 0.006*"time" + '
  '0.006*"find"'),
 (2,
  '0.020*"song" + 0.017*"music" + 0.014*"play" + 0.014*"star" + 0.012*"dance" '
  '+ 0.010*"year" + 0.009*"musical" + 0.009*"match" + 0.007*"big" + '
  '0.007*"number"'),
 (3,
  '0.054*"show" + 0.032*"series" + 0.032*"funny" + 0.028*"comedy" + '
  '0.023*"episode" + 0.018*"watch" + 0.015*"tv" + 0.013*"time" + 0.013*"laugh" '
  '+ 0.012*"first"'),
 (4,
  '0.024*"family" + 0.023*"young" + 0.022*"love" + 0.020*"woman" + '
  '0.020*"girl" + 0.020*"father" + 0.018*"life" + 0.017*"child" + '
  '0.015*"friend" + 0.015*"man"'),
 (5,
  '0.146*"film" + 0.012*"time" + 0.0


 15%|███████████▊                                                                 | 2/13 [1:02:49<5:45:53, 1886.67s/it]

Perplexity:  15 -10.92100915179716
Coherence Score:  15 0.36864711616231666
[(0,
  '0.023*"people" + 0.011*"world" + 0.007*"film" + 0.007*"point" + 0.007*"way" '
  '+ 0.007*"fact" + 0.006*"thing" + 0.006*"documentary" + 0.006*"live" + '
  '0.005*"life"'),
 (1,
  '0.033*"story" + 0.031*"character" + 0.018*"life" + 0.017*"love" + '
  '0.013*"movie" + 0.010*"feel" + 0.009*"book" + 0.009*"well" + 0.008*"way" + '
  '0.007*"great"'),
 (2,
  '0.030*"play" + 0.025*"great" + 0.020*"role" + 0.017*"performance" + '
  '0.014*"actor" + 0.013*"star" + 0.013*"cast" + 0.012*"well" + 0.011*"music" '
  '+ 0.011*"song"'),
 (3,
  '0.094*"funny" + 0.085*"comedy" + 0.054*"laugh" + 0.032*"humor" + '
  '0.026*"joke" + 0.026*"hilarious" + 0.026*"animation" + 0.019*"fun" + '
  '0.016*"cartoon" + 0.015*"animate"'),
 (4,
  '0.021*"girl" + 0.018*"woman" + 0.018*"man" + 0.012*"friend" + 0.011*"find" '
  '+ 0.011*"young" + 0.010*"wife" + 0.010*"end" + 0.009*"old" + 0.008*"boy"'),
 (5,
  '0.138*"film" + 0.013*"well" 


 23%|█████████████████▊                                                           | 3/13 [1:35:19<5:17:37, 1905.76s/it]

Perplexity:  16 -10.857529714715296
Coherence Score:  16 0.4246900433582756
[(0,
  '0.027*"people" + 0.009*"point" + 0.009*"fact" + 0.009*"way" + '
  '0.008*"character" + 0.008*"thing" + 0.007*"american" + 0.007*"real" + '
  '0.007*"understand" + 0.007*"show"'),
 (1,
  '0.083*"book" + 0.040*"version" + 0.040*"read" + 0.034*"novel" + '
  '0.027*"story" + 0.017*"adaptation" + 0.012*"passion" + 0.012*"king" + '
  '0.011*"base" + 0.010*"page"'),
 (2,
  '0.033*"play" + 0.021*"role" + 0.015*"performance" + 0.015*"star" + '
  '0.010*"well" + 0.010*"actor" + 0.010*"cast" + 0.010*"great" + 0.008*"man" + '
  '0.007*"year"'),
 (3,
  '0.089*"funny" + 0.080*"comedy" + 0.040*"laugh" + 0.031*"humor" + '
  '0.025*"hilarious" + 0.024*"animation" + 0.024*"joke" + 0.019*"fun" + '
  '0.016*"animate" + 0.016*"disney"'),
 (4,
  '0.012*"scene" + 0.010*"end" + 0.010*"guy" + 0.010*"look" + 0.009*"man" + '
  '0.008*"time" + 0.008*"back" + 0.008*"thing" + 0.008*"way" + 0.007*"start"'),
 (5,
  '0.134*"film" + 0.0


 31%|███████████████████████▋                                                     | 4/13 [2:08:21<4:49:16, 1928.45s/it]

Perplexity:  17 -11.015495975571977
Coherence Score:  17 0.42074109985767644
[(0,
  '0.021*"people" + 0.015*"world" + 0.011*"life" + 0.009*"american" + '
  '0.008*"live" + 0.007*"show" + 0.006*"time" + 0.006*"history" + 0.005*"way" '
  '+ 0.005*"country"'),
 (1,
  '0.042*"character" + 0.036*"story" + 0.019*"love" + 0.016*"life" + '
  '0.012*"feel" + 0.010*"book" + 0.009*"end" + 0.008*"way" + 0.008*"movie" + '
  '0.007*"well"'),
 (2,
  '0.031*"play" + 0.020*"role" + 0.017*"great" + 0.015*"performance" + '
  '0.014*"star" + 0.012*"actor" + 0.011*"cast" + 0.011*"song" + 0.010*"love" + '
  '0.009*"well"'),
 (3,
  '0.102*"funny" + 0.090*"comedy" + 0.046*"laugh" + 0.036*"humor" + '
  '0.028*"joke" + 0.028*"hilarious" + 0.018*"fun" + 0.011*"funniest" + '
  '0.011*"moment" + 0.010*"comic"'),
 (4,
  '0.017*"man" + 0.012*"find" + 0.010*"friend" + 0.009*"back" + 0.009*"end" + '
  '0.009*"father" + 0.009*"woman" + 0.008*"old" + 0.008*"wife" + 0.008*"home"'),
 (5,
  '0.141*"film" + 0.017*"well" + 0


 38%|█████████████████████████████▌                                               | 5/13 [2:43:03<4:23:16, 1974.51s/it]

Perplexity:  18 -11.399158629807612
Coherence Score:  18 0.41704924052497844
[(0,
  '0.020*"people" + 0.014*"world" + 0.011*"show" + 0.010*"american" + '
  '0.008*"life" + 0.007*"live" + 0.006*"man" + 0.006*"war" + 0.006*"year" + '
  '0.006*"history"'),
 (1,
  '0.049*"story" + 0.035*"character" + 0.030*"love" + 0.020*"life" + '
  '0.015*"book" + 0.014*"feel" + 0.010*"find" + 0.009*"beautiful" + '
  '0.009*"way" + 0.008*"true"'),
 (2,
  '0.023*"song" + 0.023*"music" + 0.014*"dance" + 0.014*"voice" + '
  '0.013*"animation" + 0.011*"musical" + 0.010*"star" + 0.009*"band" + '
  '0.009*"play" + 0.008*"year"'),
 (3,
  '0.112*"funny" + 0.101*"comedy" + 0.067*"laugh" + 0.039*"humor" + '
  '0.033*"joke" + 0.032*"hilarious" + 0.019*"fun" + 0.013*"funniest" + '
  '0.012*"humour" + 0.011*"moment"'),
 (4,
  '0.024*"family" + 0.020*"father" + 0.020*"girl" + 0.019*"kid" + '
  '0.019*"young" + 0.019*"child" + 0.017*"boy" + 0.015*"friend" + 0.014*"old" '
  '+ 0.013*"son"'),
 (5,
  '0.125*"film" + 0.011


 46%|███████████████████████████████████▌                                         | 6/13 [3:18:41<3:56:06, 2023.74s/it]

Perplexity:  19 -11.321038929621038
Coherence Score:  19 0.39235633645194384
[(0,
  '0.026*"people" + 0.015*"world" + 0.014*"life" + 0.012*"show" + 0.009*"real" '
  '+ 0.008*"way" + 0.007*"live" + 0.006*"man" + 0.006*"fact" + '
  '0.006*"documentary"'),
 (1,
  '0.043*"character" + 0.038*"story" + 0.013*"end" + 0.013*"well" + '
  '0.011*"feel" + 0.010*"much" + 0.010*"book" + 0.010*"plot" + 0.009*"way" + '
  '0.007*"scene"'),
 (2,
  '0.034*"play" + 0.024*"role" + 0.023*"performance" + 0.018*"actor" + '
  '0.017*"great" + 0.015*"cast" + 0.015*"star" + 0.015*"well" + 0.010*"give" + '
  '0.007*"work"'),
 (3,
  '0.080*"show" + 0.042*"series" + 0.034*"comedy" + 0.033*"funny" + '
  '0.030*"episode" + 0.019*"tv" + 0.015*"laugh" + 0.014*"watch" + '
  '0.013*"season" + 0.013*"first"'),
 (4,
  '0.032*"girl" + 0.031*"woman" + 0.028*"man" + 0.018*"scene" + 0.016*"guy" + '
  '0.013*"sex" + 0.012*"look" + 0.008*"car" + 0.008*"end" + 0.007*"play"'),
 (5,
  '0.010*"time" + 0.010*"scene" + 0.010*"work" +


 54%|█████████████████████████████████████████▍                                   | 7/13 [3:55:12<3:27:22, 2073.73s/it]

Perplexity:  20 -11.466219946778793
Coherence Score:  20 0.41766601789063545
[(0,
  '0.063*"war" + 0.045*"american" + 0.023*"japanese" + 0.021*"country" + '
  '0.021*"soldier" + 0.017*"german" + 0.015*"indian" + 0.014*"history" + '
  '0.014*"english" + 0.013*"battle"'),
 (1,
  '0.038*"love" + 0.036*"life" + 0.031*"story" + 0.023*"character" + '
  '0.019*"family" + 0.013*"young" + 0.012*"father" + 0.011*"book" + '
  '0.010*"relationship" + 0.010*"beautiful"'),
 (2,
  '0.034*"play" + 0.024*"role" + 0.021*"performance" + 0.019*"great" + '
  '0.018*"actor" + 0.016*"star" + 0.015*"cast" + 0.014*"well" + 0.009*"give" + '
  '0.009*"year"'),
 (3,
  '0.097*"funny" + 0.094*"comedy" + 0.058*"laugh" + 0.034*"humor" + '
  '0.028*"joke" + 0.028*"hilarious" + 0.018*"fun" + 0.012*"funniest" + '
  '0.010*"moment" + 0.009*"comic"'),
 (4,
  '0.015*"man" + 0.013*"find" + 0.012*"end" + 0.011*"woman" + 0.010*"girl" + '
  '0.010*"friend" + 0.010*"old" + 0.009*"house" + 0.009*"back" + 0.009*"home"'),
 (5,
  '


 62%|███████████████████████████████████████████████▍                             | 8/13 [5:38:29<4:35:53, 3310.62s/it]

KeyboardInterrupt: 

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a per-document table of dominant topic, its weight and its keywords.

    Args:
        ldamodel: trained gensim LDA model. It is indexed with ``corpus`` and
            must expose ``per_word_topics`` and ``show_topic``.
        corpus: bag-of-words corpus, one entry per document. Defaults to the
            module-level ``corpus`` as it was when this function was defined.
        texts: per-document content to attach as the last column. Defaults to
            the module-level ``data`` as it was at definition time.

    Returns:
        pandas.DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution``
        (rounded to 4 decimals), ``Topic_Keywords`` (comma-separated words from
        ``show_topic``) and a final unnamed column (label ``0``) holding
        ``texts``. Row ``k`` always describes document ``k``; a document for
        which the model returns no topic gets NaN / NaN / '' so that later
        rows do not shift against ``texts``.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled the model returns a tuple whose first
        # element is the (topic_id, probability) list for the document.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep a placeholder row so the table stays aligned with `texts`.
            records.append((float('nan'), float('nan'), ''))
            continue

        # Dominant topic = highest probability; max() keeps the first of any
        # ties, matching a stable descending sort.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic in the number of documents.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# Dominant topic per document for the current lda_model.
# texts=data_ready is passed here, so the attached text column contains the
# cleaned token lists, not the raw review strings.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)

# Format: turn the row index into an explicit document-number column and give
# all five columns presentation names.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

In [None]:
# 1. Wordcloud of Top N words in each topic
# Draws one word cloud per topic on a 2x2 grid (the first four topic ids).
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# color_func reads the loop variable `i` at draw time, so every word in a
# panel gets that panel's colour. It must only be invoked inside the loop below.
cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

# num_topics=-1 returns every topic in id order. The default (10) returns an
# arbitrary subset when the model has more topics, so topics[i] would not
# necessarily be topic i and the panel titles could be wrong.
topics = lda_model.show_topics(num_topics=-1, formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    if i < len(topics):
        topic_id, word_weights = topics[i]
        cloud.generate_from_frequencies(dict(word_weights), max_font_size=300)
        ax.imshow(cloud)
        # Title comes from the topic's own id rather than the panel position.
        ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))
    # Panels without a topic (model smaller than the grid) are just blanked.
    ax.axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# Keyword weights for every topic. num_topics=-1 returns all topics in id
# order; the default returns an arbitrary subset of 10 when the model is
# larger, which can leave the `topic_id == i` filters below empty.
topics = lda_model.show_topics(num_topics=-1, formatted=False)
data_flat = [w for w_list in data_ready for w in w_list]
counter = Counter(data_flat)  # corpus-wide frequency of every cleaned token

# One row per (topic, keyword): topic weight plus corpus-wide count.
out = [[word, topic_id, weight, counter[word]]
       for topic_id, topic in topics
       for word, weight in topic]

# NOTE(review): this rebinds `df`, which held the raw reviews loaded earlier;
# re-run the loading cell before re-running anything that reads df.review.
df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])        

# Plot Word Count and Weights of Topic Keywords
fig, axes = plt.subplots(2, 2, figsize=(16,10), sharey=True, dpi=160)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

# Axis limits come from the topics actually drawn. Fixed limits (3500 counts,
# 0.030 weight) clip every bar once the corpus or the weights are larger.
shown = df.loc[df.topic_id < axes.size, :]
count_top = 1.1 * shown.word_count.max() if len(shown) else 1
weight_top = 1.1 * shown.importance.max() if len(shown) else 1

for i, ax in enumerate(axes.flatten()):
    topic_df = df.loc[df.topic_id==i, :]
    if topic_df.empty:
        # Fewer topics than panels: leave the spare panel blank.
        ax.axis('off')
        continue
    ax.bar(x='word', height="word_count", data=topic_df, color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    ax_twin = ax.twinx()  # second y-axis so weights get their own scale
    ax_twin.bar(x='word', height="importance", data=topic_df, color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i])
    ax_twin.set_ylim(0, weight_top); ax.set_ylim(0, count_top)
    ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_df['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)    
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)    
plt.show()

In [None]:
# Plot
# Two bar charts side by side: documents per dominant topic (left) and the
# per-topic 'count' column of df_topic_weightage_by_doc (right).
# NOTE(review): df_dominant_topic_in_each_doc, df_top3words and
# df_topic_weightage_by_doc are not created in the cells shown above; they must
# already exist in the kernel. Confirm the cell that builds them runs first.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=120, sharey=True)


def _topic_tick_label(x, pos):
    """Label a tick as 'Topic <x>' plus that topic's words from df_top3words."""
    label = 'Topic ' + str(x)
    words = df_top3words.loc[df_top3words.topic_id==x, 'words'].values
    # A tick with no matching topic gets the bare label instead of raising
    # IndexError from .values[0].
    if len(words) == 0:
        return label
    return label + '\n' + words[0]


tick_formatter = FuncFormatter(_topic_tick_label)

# Topic Distribution by Dominant Topics
ax1.bar(x='Dominant_Topic', height='count', data=df_dominant_topic_in_each_doc, width=.5, color='firebrick')
ax1.set_xticks(range(len(df_dominant_topic_in_each_doc.Dominant_Topic.unique())))
ax1.xaxis.set_major_formatter(tick_formatter)
ax1.set_title('Number of Documents by Dominant Topic', fontdict=dict(size=10))
ax1.set_ylabel('Number of Documents')
# No fixed y-limit: a hard-coded ceiling of 1000 clips the bars on a
# 50,000-document corpus. The shared y-axis autoscales to fit both panels.

# Topic Distribution by Topic Weights
ax2.bar(x='index', height='count', data=df_topic_weightage_by_doc, width=.5, color='steelblue')
ax2.set_xticks(range(len(df_topic_weightage_by_doc.index.unique())))
ax2.xaxis.set_major_formatter(tick_formatter)
ax2.set_title('Number of Documents by Topic Weightage', fontdict=dict(size=10))

plt.show()