---
# Topic Modeling

### Sources

https://radimrehurek.com/gensim/models/ldaseqmodel.html

https://markroxor.github.io/gensim/static/notebooks/ldaseqmodel.html

https://www.youtube.com/watch?v=7BMsuyBPx90 <-- Dave Blei's Google talk on Dynamic Topic Modelling

https://towardsdatascience.com/exploring-the-un-general-debates-with-dynamic-topic-models-72dc0e307696 

^ Explains why each new paragraph should be treated as a separate document in DTM

In [1]:
# When True, load the saved dictionary, corpus and per-document paragraph
# counts from model/ instead of rebuilding them from the text files.
LOAD_DICTIONARY = True
# When True, load the saved LDA model instead of training a new one.
LOAD_LDA = True
# When True, load the saved DTM (LdaSeqModel) instead of training a new one.
LOAD_DTM = True
# Number of topics for both models; also part of the saved model and
# visualization file names.
num_topics = 10

In [2]:
from collections import Counter
from gensim import models
from gensim.corpora import Dictionary, bleicorpus
from gensim.models import ldaseqmodel, ldamodel
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim.test.utils import datapath
import pyLDAvis
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 
import numpy as np
import os
import pandas as pd
import pickle
import re
import time
import winsound

C:\Users\Akio\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.CSRRD7HKRKC3T3YXA7VY7TAZGLSWDKW6.gfortran-win_amd64.dll
C:\Users\Akio\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.IPBC74C7KURV7CB2PKT5Z5FNR3SIBV4J.gfortran-win_amd64.dll
  stacklevel=1)
  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""


In [3]:
def extract_para(raw, raw_parser, len_list):
    """
    Lowercase a document and break it into paragraphs.

    A paragraph boundary is a period, a single space, and one or
    more newlines; the boundary text itself is dropped. The number
    of paragraphs found is appended to 'len_list' as a side effect.

    Parameters
    ----------
    raw : string
        text of the document
    raw_parser : bool
        whether to read PDF or txt. Not currently implemented;
        the value is ignored.
    len_list : list
        running list of per-document paragraph counts; it is
        mutated in place

    Returns
    -------
    list
        the lowercased paragraphs of 'raw'

    """
    paragraph_break = re.compile(r'\.[ ][\n]+')
    paragraphs = paragraph_break.split(raw.lower())
    len_list.append(len(paragraphs))
    return paragraphs

In [4]:
def beeper():
    '''
    Plays an audible signal: three 250 ms beeps at 392 Hz
    followed by one 1000 ms beep at 311 Hz
    '''

    short_ms, long_ms = 250, 1000
    high_hz, low_hz = 392, 311

    # (frequency, duration) pairs in playback order
    notes = [(high_hz, short_ms)] * 3 + [(low_hz, long_ms)]
    for frequency, duration in notes:
        winsound.Beep(frequency, duration)

## Get/Set common_corpus and common_dictionary

In [5]:
%%time

# Either reload the dictionary, bag-of-words corpus and per-document paragraph
# counts saved by a previous run, or rebuild them from the text files and save
# them. Which path runs is decided by the LOAD_DICTIONARY flag.
if LOAD_DICTIONARY:
    print("loading Dictionary, corpus, and len documents")
    common_dictionary = Dictionary.load('model/common_dictionary')
    # NOTE: pickle.load executes arbitrary code from the file; these files are
    # expected to be caches written by the else-branch below, not external data.
    with open('model/common_corpus_pickled', 'rb') as f:
        common_corpus = pickle.load(f)
    with open('model/len_docs_pickled', 'rb') as f:
        lengths_of_docs = pickle.load(f)

else:
    print("Dictionary, corpus, and len documents not found; initializing")

    # Not used in this cell; kept because it is a notebook-level name.
    list_of_string = []
    list_of_list_of_string = []
    lengths_of_docs = []

    path = "data/"
    # os.listdir() returns names in arbitrary order. The order in which
    # documents are read defines lengths_of_docs, which is later used as the
    # DTM time_slice, so sort the names to make that order deterministic.
    dirs = sorted(os.listdir(path))
    for each_pdf in dirs:
        print(each_pdf)
        # The text of data/<name> is read from txt/<name>.txt
        with open('txt/{}.txt'.format(each_pdf), 'r', encoding='utf8') as f:
            text = f.read()
        # Each paragraph becomes its own document; extract_para also appends
        # this file's paragraph count to lengths_of_docs.
        list_of_paragraphs = extract_para(text, False, lengths_of_docs)
        list_of_list_of_string.extend(
            preprocess(paragraph, False) for paragraph in list_of_paragraphs
        )

    # Create a corpus from a list of texts
    common_dictionary = Dictionary(list_of_list_of_string)
    common_corpus = [common_dictionary.doc2bow(text) for text in list_of_list_of_string]

    # Save everything so later runs can take the loading branch.
    common_dictionary.save('model/common_dictionary')
    with open('model/common_corpus_pickled', 'wb') as f:
        pickle.dump(common_corpus, f)
    with open('model/len_docs_pickled', 'wb') as f:
        pickle.dump(lengths_of_docs, f)

loading Dictionary, corpus, and len documents
Wall time: 99.3 ms


## NMF

In [6]:
# nmf = models.Nmf(common_corpus, num_topics=10)

## Baseline: LDA

In [7]:
%%time
home = os.getcwd()
# Location of the LDA model file; the topic count is part of the file name.
lda_path = os.path.join(home, 'model/LDAmodel_{}'.format(num_topics))

if not LOAD_LDA:
    print("LDAmodel not found; initializing ldamodel")
    lda = ldamodel.LdaModel(
        corpus=common_corpus,
        id2word=common_dictionary,
        num_topics=num_topics,
        update_every=1,
        passes=1,
    )

    print("saving")
    lda.save(lda_path)
else:
    print("loading LDAmodel")
    lda = ldamodel.LdaModel.load(lda_path)

loading LDAmodel
Wall time: 25 ms


In [8]:
# Print the word mixture of each LDA topic, followed by a blank line.
# A plain loop is used because print() is a side effect: a list comprehension
# here only builds a throwaway list of None that the notebook then echoes.
for _topic_id, topic_words in lda.print_topics(num_topics=20, num_words=10):
    print("{}\n".format(topic_words))

0.014*"people" + 0.010*"number" + 0.009*"total" + 0.007*"disaster" + 0.007*"refugee" + 0.007*"reported" + 0.006*"report" + 0.005*"world" + 0.005*"affected" + 0.004*"country"

0.030*"disaster" + 0.012*"society" + 0.011*"tel" + 0.010*"mail" + 0.010*"fax" + 0.009*"world" + 0.007*"development" + 0.006*"see" + 0.006*"crescent" + 0.006*"risk"


0.015*"www" + 0.012*"available" + 0.011*"disaster" + 0.010*"world" + 0.010*"org" + 0.008*"http" + 0.007*"report" + 0.007*"online" + 0.006*"humanitarian" + 0.006*"health"

0.030*"see" + 0.018*"box" + 0.012*"humanitarian" + 0.009*"society" + 0.008*"http" + 0.008*"www" + 0.008*"org" + 0.007*"crescent" + 0.007*"fax" + 0.006*"tel"

0.013*"people" + 0.013*"per" + 0.012*"cent" + 0.011*"disaster" + 0.009*"aid" + 0.008*"number" + 0.008*"total" + 0.008*"reported" + 0.007*"country" + 0.007*"world"

0.010*"disaster" + 0.007*"child" + 0.007*"risk" + 0.005*"world" + 0.005*"hiv" + 0.005*"community" + 0.004*"many" + 0.004*"health" + 0.004*"humanitarian" + 0.004*"aid"

[None, None, None, None, None, None, None, None, None, None]

## DTM

In [9]:
%%time
home = os.getcwd()
# Build the model path once so that loading and saving always refer to the
# same file (the same os.path.join form the LDA model cell uses).
dtm_path = os.path.join(home, 'model/DTMmodel_{}'.format(num_topics))

if LOAD_DTM:
    print("loading DTMmodel")
    ldaseq = ldaseqmodel.LdaSeqModel.load(dtm_path)
else:
    print("DTMmodel not found; initializing DTMmodel")
    # Also write the corpus to disk with BleiCorpus next to the model files.
    bc = bleicorpus.BleiCorpus.serialize(fname=os.path.join(home, 'model/blei_{}'.format(num_topics)), corpus=common_corpus)
    # lengths_of_docs (paragraph count per source file) is used as time_slice,
    # so each source file is one time step.
    ldaseq = ldaseqmodel.LdaSeqModel(corpus=common_corpus, id2word=common_dictionary, time_slice=lengths_of_docs, num_topics=num_topics)

    print("saving")
    ldaseq.save(dtm_path)

loading DTMmodel
Wall time: 1.05 s


In [10]:
def output_dtm_csv():
    '''
    Saves output of DTM as csv

    Writes 'dtm_output.csv' with one row per (time slice, topic, term):
    the 1-based topic number, the time slice index, the term and its
    weight, for the top 15 terms of every topic at every time slice.
    Reads the notebook-level 'ldaseq' model and 'lengths_of_docs'.
    '''
    # Imported here because the notebook's import cell does not import csv;
    # without it the function fails with a NameError.
    import csv

    # utf8 matches the encoding used when the source texts are read, and
    # avoids encode errors on terms outside the platform default encoding.
    with open('dtm_output.csv', 'w', newline='', encoding='utf8') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(['topic', 'time', 'term', 'topic_importance'])
        # One time slice per entry of lengths_of_docs
        for t in range(len(lengths_of_docs)):
            topics_at_t = ldaseq.print_topics(time=t, top_terms=15)
            for topic_number, topic_terms in enumerate(topics_at_t, start=1):
                csv_writer.writerows(
                    [topic_number, t, term, weight] for term, weight in topic_terms
                )

In [11]:
def display_viz():
    '''
    Builds the pyLDAvis visualization of the DTM at time slice 0

    The visualization is saved to 'viz/DTM<num_topics>_viz.html' and the
    displayable object is returned, so that calling this function as the
    last expression of a notebook cell renders it. Previously the result
    of pyLDAvis.display() was discarded inside the function.
    '''
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(time=0, corpus=common_corpus)
    vis_wrapper = pyLDAvis.prepare(
        topic_term_dists=topic_term,
        doc_topic_dists=doc_topic,
        doc_lengths=doc_lengths,
        vocab=vocab,
        term_frequency=term_frequency,
    )
    # Save first so the HTML file is written even if inline display fails
    pyLDAvis.save_html(vis_wrapper, 'viz/DTM{}_viz.html'.format(num_topics))
    return pyLDAvis.display(vis_wrapper)

Setting lambda = 1 results in the familiar ranking of terms in decreasing order of their topic-specific probability, while setting lambda = 0 ranks terms solely by their lift.

lift: the ratio of a term’s probability within a topic to its marginal probability across the corpus.

In [12]:
def plot_specific_topic(term='hiv', topic=0, top_terms=100):
    '''
    Plots the weight of one term within one DTM topic across time slices

    Parameters
    ----------
    term : string
        the term to follow over time
    topic : int
        index of the topic to inspect (default 0)
    top_terms : int
        how many top terms to request per time slice (default 100);
        the term is only found if it is among them

    A time slice in which the term is not among the requested top terms is
    plotted as a gap (NaN) rather than skipped, so the x position of every
    point stays equal to its time slice index.
    '''
    val_list = []
    for terms_at_t in ldaseq.print_topic_times(topic=topic, top_terms=top_terms):
        # Each entry is a (term, weight) pair
        weights = dict(terms_at_t)
        val_list.append(weights.get(term, float('nan')))
    plt.figure(figsize=(9, 7))
    plt.plot(val_list)
    plt.show()