# Topic Modeling

Akio Kakishima

### Sources

https://radimrehurek.com/gensim/models/ldaseqmodel.html

https://markroxor.github.io/gensim/static/notebooks/ldaseqmodel.html

https://www.youtube.com/watch?v=7BMsuyBPx90 <-- Dave Blei's Google talk on Dynamic Topic Modelling

https://towardsdatascience.com/exploring-the-un-general-debates-with-dynamic-topic-models-72dc0e307696 

^ Explains why each new paragraph should be treated as a separate document in DTM


### Notes
This .ipynb file runs topic modeling. Make sure to have parsed the PDFs before running this analysis. 

In [10]:
# When True, preprocess() emits underscore-joined bigrams instead of single tokens.
BIGRAMS = False
# When True, load the saved dictionary/corpora from model/ instead of rebuilding them from txt/.
LOAD_DICTIONARY = True
# When True, load the saved LDA model instead of training a new one.
LOAD_LDA = True
# When True, load the saved DTM (LdaSeqModel) instead of training a new one.
LOAD_DTM = True
# Number of topics for both the LDA baseline and the DTM; also part of the saved model file names.
num_topics = 50
# Tokens occurring this many times or fewer within one preprocess() input are dropped.
threshold = 25

In [11]:
from collections import Counter
from gensim import models
from gensim.corpora import Dictionary, bleicorpus
from gensim.models import ldaseqmodel, ldamodel
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim.test.utils import datapath
import pyLDAvis
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 
import numpy as np
import os
import pandas as pd # ver 0.24.2
import pickle
import re
import time
import winsound

In [12]:
def remove_threshold(input_list, thresh):
    """Return the set of items that occur at most `thresh` times in `input_list`."""
    frequencies = Counter(input_list)
    return {item for item, freq in frequencies.items() if freq <= thresh}

In [13]:
def preprocess(raw, BIGRAM):
    """
    Lowercases a string, repairs line-break hyphenation, and tokenizes it.
    Tokens are runs of at least 4 ASCII letters, so numbers and
    punctuation never survive. English stopwords (nltk 'stopwords'
    corpus) and a short list of report-specific words are removed, the
    rest is lemmatized, and finally every token that occurs `threshold`
    times or fewer in this input is dropped (`threshold` is the
    module-level setting).

    Parameters
    ----------
    raw : string
        text to tokenize (the callers pass one paragraph of a parsed PDF)
    BIGRAM : bool
        if truthy, return underscore-joined bigrams of the surviving
        tokens instead of the tokens themselves

    Returns
    -------
    list
        A list of tokens (or bigrams), stopword and frequency filtered

    """

    lowered = raw.lower()
    # Re-join words split by a hyphen followed by non-word characters
    # (e.g. a hyphen at a line break).
    fixed = re.sub(r'[\-]\W+', '', lowered)

    # [a-z], not [A-z]: the latter range also matches [ \ ] ^ _ and `,
    # which let tokens such as "foo_bar" through as a single "word".
    tokenizer = RegexpTokenizer(r'[a-z]{4,}')
    additional_stopwords = {'also', 'ndr', 'red', 'crescent', 'cross', 'world', 'disaster', 'report', 'chap', 'page', 'ifrc', 'unhcr'}
    # Build the stopword set once; stopwords.words() was previously
    # re-evaluated (and scanned as a list) for every single token.
    english_stopwords = set(stopwords.words('english'))

    tokenized = [word for word in tokenizer.tokenize(fixed) if word not in additional_stopwords]

    lemmatizer = WordNetLemmatizer()
    lemmed = [lemmatizer.lemmatize(word) for word in tokenized if word not in english_stopwords]

    remove_set = remove_threshold(lemmed, threshold)
    lemmed_filtered = [token for token in lemmed if token not in remove_set]

    if BIGRAM:
        return ['_'.join(pair) for pair in nltk.bigrams(lemmed_filtered)]
    return lemmed_filtered

In [14]:
def extract_para(raw, len_list):
    """
    Split a text into lowercased paragraphs.

    A paragraph break is a full stop, one space, and one or more
    newlines. The number of pieces produced by the split is appended
    to `len_list` as a side effect.

    Parameters
    ----------
    raw : string
        text of the PDF file
    len_list : list
        running list of paragraph counts; one entry is appended per call

    Returns
    -------
    list
        the lowercased paragraphs, in document order

    """

    paragraph_break = r'\.[ ][\n]+'
    paragraphs = re.split(paragraph_break, raw.lower())
    len_list.append(len(paragraphs))
    return paragraphs

In [15]:
def determine_year(pdf_title):
    """
    Return the first 19xx/20xx year found in a file name, as an int.

    Parameters
    ----------
    pdf_title : string
        file name expected to contain a four-digit year

    Returns
    -------
    int
        the year

    Raises
    ------
    ValueError
        if no 19xx or 20xx year occurs in `pdf_title` (previously this
        surfaced as an UnboundLocalError on the return statement)

    """
    match = re.search(r'20[0-9]{2}|19[0-9]{2}', pdf_title)
    if match is None:
        raise ValueError("no 19xx/20xx year found in {!r}".format(pdf_title))
    return int(match.group())

In [16]:
def beeper():
    '''
    Plays three short G notes followed by one long E-flat via winsound.
    '''
    short_ms, long_ms = 250, 1000
    g_hz, e_flat_hz = 392, 311

    melody = [(g_hz, short_ms)] * 3 + [(e_flat_hz, long_ms)]
    for frequency, duration in melody:
        winsound.Beep(frequency, duration)

## Get/Set common_corpus and common_dictionary

In [17]:
%%time

def _tokenize_by_year(texts_by_year, lengths):
    """Tokenize every paragraph of every year, in ascending year order.

    Returns the list of token lists. Appends one entry per year to
    `lengths`: the number of documents actually produced for that year, so
    that sum(lengths) always equals the number of documents returned.
    """
    docs = []
    for year in sorted(texts_by_year):
        print("year: {}".format(year))
        kept = 0
        for paragraph in extract_para(texts_by_year[year], lengths):
            if len(paragraph) > 3:
                docs.append(preprocess(paragraph, BIGRAMS))
                kept += 1
        # extract_para() recorded the raw paragraph count, which includes the
        # fragments skipped above; replace it with the real document count.
        lengths[-1] = kept
    return docs


if LOAD_DICTIONARY:
    print("loading Dictionary, corpus, and len documents")
    common_dictionary = Dictionary.load('model/common_dictionary')
    # NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
    with open('model/common_corpus_pickled', 'rb') as f:
        common_corpus = pickle.load(f)
    with open('model/len_docs_pickled', 'rb') as f:
        lengths_of_docs = pickle.load(f)
    with open('model/common_corpus_test_pickled', 'rb') as f:
        common_corpus_test = pickle.load(f)
    with open('model/len_docs_test_pickled', 'rb') as f:
        lengths_of_docs_test = pickle.load(f)
else:
    print("Dictionary, corpus, and len documents not found; initializing")

    test_pdf = 'WDR-2018-EN-LR.pdf'  # our testset
    path = "data/"
    dirs = os.listdir(path)

    print('trainset')
    text_library = {}
    lengths_of_docs = []

    for each_pdf in dirs:
        if each_pdf == test_pdf:
            continue
        print(each_pdf)
        with open('txt/{}.txt'.format(str(each_pdf)), 'r', encoding='utf8') as f:
            text = f.read()
        text_year = determine_year(each_pdf)
        # Several PDFs can share a year: concatenate their texts.
        if text_year in text_library:
            text_library[text_year] = "{}\n{}".format(text_library[text_year], text)
        else:
            text_library[text_year] = text

    list_of_list_of_string = _tokenize_by_year(text_library, lengths_of_docs)

    # Create a corpus from a list of texts
    common_dictionary = Dictionary(list_of_list_of_string)
    common_corpus = [common_dictionary.doc2bow(text) for text in list_of_list_of_string]

    with open('model/common_corpus_pickled', 'wb') as f:
        pickle.dump(common_corpus, f)
    with open('model/len_docs_pickled', 'wb') as f:
        pickle.dump(lengths_of_docs, f)

    print('testset')
    text_library_test = {}
    lengths_of_docs_test = []

    for each_pdf in dirs:
        if each_pdf != test_pdf:
            continue
        print(each_pdf)
        with open('txt/{}.txt'.format(str(each_pdf)), 'r', encoding='utf8') as f:
            text_library_test[determine_year(each_pdf)] = f.read()

    list_of_list_of_string_test = _tokenize_by_year(text_library_test, lengths_of_docs_test)

    # Extend the dictionary with test-only tokens (existing ids are kept),
    # then encode the test documents with it.
    common_dictionary.add_documents(list_of_list_of_string_test)
    common_corpus_test = [common_dictionary.doc2bow(text) for text in list_of_list_of_string_test]

    common_dictionary.save('model/common_dictionary')

    with open('model/common_corpus_test_pickled', 'wb') as f:
        pickle.dump(common_corpus_test, f)
    with open('model/len_docs_test_pickled', 'wb') as f:
        pickle.dump(lengths_of_docs_test, f)

loading Dictionary, corpus, and len documents
Wall time: 3 ms


## NMF

In [18]:
# nmf = models.Nmf(common_corpus, num_topics=10)

## Baseline: LDA

In [19]:
%%time
home = os.getcwd()

# Baseline LDA: either train and save a fresh model, or load the saved one for `num_topics`.
if not LOAD_LDA:
    for num_topics_sub in [num_topics]:
        print("LDAmodel not found; initializing ldamodel")
        lda = ldamodel.LdaModel(
            corpus=common_corpus,
            id2word=common_dictionary,
            num_topics=num_topics_sub,
            update_every=1,
            passes=1,
        )

        print("saving")
        lda_path = os.path.join(home, 'model/LDAmodel_{}'.format(num_topics_sub))
        lda.save(lda_path)
else:
    print("loading LDAmodel")
    lda_path = os.path.join(home, 'model/LDAmodel_{}'.format(num_topics))
    lda = ldamodel.LdaModel.load(lda_path)

loading LDAmodel
Wall time: 23 ms


In [20]:
def pretty_print_topic(n_w = len(common_dictionary.values())):
    """
    Flatten the LDA model's printed topics into a DataFrame.

    Each printed topic is split on '+' into weighted terms and each of those
    on '*' into (weight, term); the term is stripped of the characters
    removed by the regex below (e.g. quotes and spaces).

    Parameters
    ----------
    n_w : int
        number of terms to request per topic; the default, evaluated once
        when the function is defined, is the whole vocabulary

    Returns
    -------
    DataFrame
        columns 'epoch' (position of the topic in the printed list),
        'topic' (the term) and 'topic_importance' (the weight, still a string)

    """
    rows = []
    printed_topics = lda.print_topics(num_topics=num_topics, num_words=n_w)
    for topic_position, (_, topic_repr) in enumerate(printed_topics):
        for weighted_term in topic_repr.split('+'):
            weight, raw_term = weighted_term.split("*")
            rows.append((topic_position, re.sub(r'[^A-z0-9]+', '', raw_term), weight))

    data = {
        'epoch': [row[0] for row in rows],
        'topic': [row[1] for row in rows],
        'topic_importance': [row[2] for row in rows],
    }
    return pd.DataFrame.from_dict(data)

lda_df = pretty_print_topic(n_w=5)

# Show the top terms of each topic as its own Series.
for each_epoch in lda_df['epoch'].unique():
    top_terms = lda_df.loc[lda_df['epoch'] == each_epoch, 'topic']
    display(top_terms)

0          forced
1         country
2    displacement
3          people
4     development
Name: topic, dtype: object

5     humanitarian
6           people
7    international
8      information
9         disaster
Name: topic, dtype: object

10       radio
11      medium
12      people
13         aid
14    disaster
Name: topic, dtype: object

15          society
16             mail
17         redcross
18             http
19    international
Name: topic, dtype: object

20    humanitarian
21      technology
22          people
23        disaster
24       community
Name: topic, dtype: object

25          disaster
26            people
27    discrimination
28         community
29             total
Name: topic, dtype: object

30           people
31         disaster
32    international
33         reported
34           number
Name: topic, dtype: object

35          united
36      technology
37    displacement
38          forced
39         country
Name: topic, dtype: object

40      early
41     people
42     health
43        aid
44    country
Name: topic, dtype: object

45     humanitarian
46    international
47           people
48      development
49         disaster
Name: topic, dtype: object

50      migration
51        country
52    development
53         people
54        refugee
Name: topic, dtype: object

55        mail
56     society
57        http
58      people
59    redcross
Name: topic, dtype: object

60              aid
61           people
62    international
63          country
64         disaster
Name: topic, dtype: object

65          asylum
66      technology
67         migrant
68      earthquake
69    humanitarian
Name: topic, dtype: object

70        mail
71     society
72        http
73      people
74    redcross
Name: topic, dtype: object

75         community
76    discrimination
77            people
78             woman
79     international
Name: topic, dtype: object

80            people
81    discrimination
82          disaster
83              risk
84     international
Name: topic, dtype: object

85      climate
86       change
87       people
88      country
89    community
Name: topic, dtype: object

90       early
91     malaria
92      people
93         aid
94    disaster
Name: topic, dtype: object

95    capacity
96    building
97       local
98      people
99      agency
Name: topic, dtype: object

100            people
101    discrimination
102             woman
103     international
104          disaster
Name: topic, dtype: object

105        early
106          aid
107       people
108     disaster
109    community
Name: topic, dtype: object

110             data
111           people
112         disaster
113     humanitarian
114    international
Name: topic, dtype: object

115       people
116     disaster
117       relief
118     recovery
119    community
Name: topic, dtype: object

120          disaster
121            people
122     international
123    discrimination
124         community
Name: topic, dtype: object

125      refugee
126      country
127       person
128       figure
129    stateless
Name: topic, dtype: object

130          risk
131     community
132        people
133      disaster
134    government
Name: topic, dtype: object

135         aid
136      people
137     country
138      health
139    disaster
Name: topic, dtype: object

140         people
141       disaster
142        country
143      community
144    development
Name: topic, dtype: object

145     urban
146      city
147    forced
148     mercy
149      corp
Name: topic, dtype: object

150           people
151           agency
152              aid
153      information
154    international
Name: topic, dtype: object

155            people
156             woman
157    discrimination
158         community
159      humanitarian
Name: topic, dtype: object

160      people
161       total
162      number
163    reported
164    disaster
Name: topic, dtype: object

165             total
166            people
167          disaster
168          reported
169    discrimination
Name: topic, dtype: object

170            woman
171     humanitarian
172    international
173         disaster
174           people
Name: topic, dtype: object

175           person
176           figure
177    international
178           people
179     humanitarian
Name: topic, dtype: object

180            people
181             woman
182    discrimination
183         community
184               aid
Name: topic, dtype: object

185            person
186            people
187          disaster
188    discrimination
189     international
Name: topic, dtype: object

190    international
191       technology
192     humanitarian
193         disaster
194             food
Name: topic, dtype: object

195        data
196      return
197    disaster
198         aid
199      health
Name: topic, dtype: object

200       people
202        local
203    hurricane
204     reported
Name: topic, dtype: object

205          person
206          people
207    humanitarian
208        disaster
209          figure
Name: topic, dtype: object

210      relief
211        rise
212      people
213    disaster
214    recovery
Name: topic, dtype: object

215      health
216       nepal
217      people
218       woman
219    disaster
Name: topic, dtype: object

220    stateless
221       person
222       figure
223       people
224          aid
Name: topic, dtype: object

225     humanitarian
226           agency
227           people
228         disaster
229    international
Name: topic, dtype: object

230           people
231             risk
232              aid
233         disaster
234    international
Name: topic, dtype: object

235     million
236      people
237        cent
238      locust
239    reported
Name: topic, dtype: object

240          society
241    international
242         disaster
243             mail
244           united
Name: topic, dtype: object

245         disaster
246             loss
247           people
248              aid
249    international
Name: topic, dtype: object

## DTM

In [21]:
%%time
home = os.getcwd()

# Dynamic topic model: either train and save a fresh LdaSeqModel, or load the saved one.
if not LOAD_DTM:
    for num_topics_sub in [num_topics]:
        print(num_topics)
        print("DTMmodel not found; initializing DTMmodel")
        blei_path = os.path.join(home, 'model/blei_{}'.format(num_topics_sub))
        bc = bleicorpus.BleiCorpus.serialize(fname=blei_path, corpus=common_corpus)
        ldaseq = ldaseqmodel.LdaSeqModel(
            corpus=common_corpus,
            id2word=common_dictionary,
            time_slice=lengths_of_docs,
            num_topics=num_topics_sub,
        )

        print("saving")
        dtm_path = os.path.join(home, 'model/DTMmodel_{}'.format(num_topics_sub))
        ldaseq.save(dtm_path)
else:
    print("loading DTMmodel")
    ldaseq = ldaseqmodel.LdaSeqModel.load('model/DTMmodel_{}'.format(num_topics))

loading DTMmodel
Wall time: 154 ms


In [22]:
def output_dtm(as_csv = False):
    '''
    Export the DTM's per-time-slice topics.

    Parameters
    ----------
    as_csv : bool
        if True, write the top 15 terms of every topic at every time slice
        to 'dtm_output.csv' (columns: topic, time, term, topic_importance)
        and return None. Topics are numbered from 1 in the CSV.
        if False (default), return a DataFrame covering the whole vocabulary.

    Returns
    -------
    DataFrame or None
        columns 'epoch', 'topic_number' (0-based), 'topic_terms' and
        'topic_importance', one row per (time slice, topic, term)
    '''
    n_epochs = len(lengths_of_docs)

    if as_csv:
        # csv is not imported at file level; without this the branch raised NameError.
        import csv

        with open('dtm_output.csv', 'w', newline='') as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow(['topic', 'time', 'term', 'topic_importance'])
            for t in range(n_epochs):
                topics = ldaseq.print_topics(time=t, top_terms=15)
                for topic_number, topic_terms in enumerate(topics, start=1):
                    for term, importance in topic_terms:
                        csv_writer.writerow([topic_number, t, term, importance])
        return None

    # Loop-invariant: ask for every term of the vocabulary.
    vocab_size = len(common_dictionary.values())
    rows = []
    for each_epoch in range(n_epochs):
        topics = ldaseq.print_topics(time=each_epoch, top_terms=vocab_size)
        for each_topic, topic_terms in enumerate(topics):
            for term, importance in topic_terms:
                rows.append([each_epoch, each_topic, term, importance])
    return pd.DataFrame(rows, columns=['epoch', 'topic_number', 'topic_terms', 'topic_importance'])

In [23]:
# Build the long-format DTM frame via the DataFrame branch of output_dtm(), then preview it.
dtm_df = output_dtm()
dtm_df.head()

Unnamed: 0,epoch,topic_number,topic_terms,topic_importance
0,0,0,rise,0.96638
1,0,0,mail,7e-05
2,0,0,society,7e-05
3,0,0,http,7e-05
4,0,0,redcross,7e-05


In [14]:
def display_viz(save = True, time = 0):
    """
    Render the DTM at one time slice with pyLDAvis.

    Parameters
    ----------
    save : bool
        if truthy, also write the visualisation to
        'viz/DTM<num_topics>_viz.html'
    time : int
        time slice passed to ldaseq.dtm_vis

    Returns
    -------
    the object returned by pyLDAvis.display for the prepared visualisation

    """
    vis_inputs = ldaseq.dtm_vis(time=time, corpus = common_corpus)
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = vis_inputs

    vis_wrapper = pyLDAvis.prepare(
        topic_term_dists=topic_term,
        doc_topic_dists=doc_topic,
        doc_lengths=doc_lengths,
        vocab=vocab,
        term_frequency=term_frequency,
    )
    if save:
        html_path = 'viz/DTM{}_viz.html'.format(num_topics)
        pyLDAvis.save_html(vis_wrapper, html_path)
    return pyLDAvis.display(vis_wrapper)

In [15]:
# Show the first time slice inline without writing the HTML file.
display_viz(save= False, time=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


lambda = 1 ranks terms in the familiar decreasing order of their topic-specific probability; lambda = 0 ranks terms solely by their lift.

lift: the ratio of a term’s probability within a topic to its marginal probability across the corpus.

# Predict distributions

In [24]:
# Print every (id, token) pair. A plain loop avoids building, and echoing, a list of None.
for id_token_pair in common_dictionary.items():
    print(id_token_pair)

(0, 'data')
(1, 'disaster')
(2, 'total')
(3, 'http')
(4, 'international')
(5, 'mail')
(6, 'redcross')
(7, 'society')
(8, 'people')
(9, 'recovery')
(10, 'relief')
(11, 'community')
(12, 'volunteer')
(13, 'affected')
(14, 'annual')
(15, 'average')
(16, 'cent')
(17, 'development')
(18, 'human')
(19, 'killed')
(20, 'natural')
(21, 'number')
(22, 'reported')
(23, 'based')
(24, 'challenge')
(25, 'chapter')
(26, 'cost')
(27, 'flood')
(28, 'government')
(29, 'hazard')
(30, 'local')
(31, 'many')
(32, 'mitigation')
(33, 'preparedness')
(34, 'programme')
(35, 'project')
(36, 'reduction')
(37, 'risk')
(38, 'vulnerability')
(39, 'year')
(40, 'rise')
(41, 'change')
(42, 'climate')
(43, 'country')
(44, 'island')
(45, 'pacific')
(46, 'earthquake')
(47, 'accountability')
(48, 'actor')
(49, 'agency')
(50, 'assessment')
(51, 'assistance')
(52, 'capacity')
(53, 'committee')
(54, 'conflict')
(55, 'could')
(56, 'cred')
(57, 'crisis')
(58, 'decade')
(59, 'emergency')
(60, 'even')
(61, 'example')
(62, 'figure

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [25]:
def testset():
    """
    Train an LDA model on the test corpus and flatten its topics.

    The printed topics are parsed the same way as in pretty_print_topic:
    split on '+' into weighted terms, then on '*' into (weight, term).

    Returns
    -------
    DataFrame
        columns 'topic_number', 'topic' (the term) and 'topic_importance'
        (the weight, still a string), covering the whole vocabulary

    """
    vocab_size = len(common_dictionary.values())
    lda = ldamodel.LdaModel(
        corpus=common_corpus_test,
        id2word=common_dictionary,
        num_topics=num_topics,
        update_every=1,
        passes=1,
    )

    topic_numbers, terms, importances = [], [], []
    printed_topics = lda.print_topics(num_topics=num_topics, num_words=vocab_size)
    for topic_position, printed_topic in enumerate(printed_topics):
        for weighted_term in printed_topic[1].split('+'):
            weight, raw_term = weighted_term.split("*")
            terms.append(re.sub(r'[^A-z0-9]+', '', raw_term))
            importances.append(weight)
            topic_numbers.append(topic_position)

    data = {'topic_number': topic_numbers, 'topic': terms, 'topic_importance': importances}
    return pd.DataFrame.from_dict(data)

In [26]:
# Train the test-set LDA and collect its per-topic term weights.
testset_df = testset()
# Attach the dictionary's integer id to every term in both frames.
testset_df['id']= testset_df['topic'].map(common_dictionary.token2id)
dtm_df['id']= dtm_df['topic_terms'].map(common_dictionary.token2id)
# Cast ids to int and the importances (strings coming out of testset()) to float.
testset_df['id'] = testset_df['id'].apply(int)
testset_df['topic_importance'] = testset_df['topic_importance'].apply(float)
# NOTE(review): 18 is presumably the slice right after the training epochs - confirm.
testset_df['epoch'] = 18
# Order rows by topic, then by term id.
testset_df = testset_df.sort_values(['topic_number', 'id'])

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [27]:
# Preview the first rows of the test-set frame.
testset_df.head()

Unnamed: 0,topic_number,topic,topic_importance,id,epoch
0,0,data,0.002,0,18
334,0,disaster,0.002,1,18
333,0,total,0.002,2,18
332,0,http,0.002,3,18
331,0,international,0.002,4,18


In [28]:
# Cast the DTM frame's ids to int and importances to float, then order rows
# by topic, epoch and term id.
dtm_df['id'] = dtm_df['id'].apply(int)
dtm_df['topic_importance'] = dtm_df['topic_importance'].apply(float)
dtm_df = dtm_df.sort_values(['topic_number', 'epoch', 'id'])
dtm_df.head()

Unnamed: 0,epoch,topic_number,topic_terms,topic_importance,id
38,0,0,data,7e-05,0
9,0,0,disaster,7e-05,1
82,0,0,total,7e-05,2
3,0,0,http,7e-05,3
8,0,0,international,7e-05,4


In [29]:
# Number of distinct epochs in the DTM frame.
dtm_df['epoch'].nunique()

18

In [30]:
# Report how many distinct term ids each epoch contains.
n_epochs = len(dtm_df['epoch'].unique())
for each_epoch in range(n_epochs):
    ids_in_epoch = dtm_df.loc[dtm_df['epoch'] == each_epoch, 'id']
    print("{}: {}".format(each_epoch, len(ids_in_epoch.unique())))

0: 484
1: 484
2: 484
3: 484
4: 484
5: 484
6: 484
7: 484
8: 484
9: 484
10: 484
11: 484
12: 484
13: 484
14: 484
15: 484
16: 484
17: 484


In [31]:
# Term ids that occur in the test set but not in epoch 1 of the DTM frame.
testset_id_set = set(testset_df['id'].unique())
epoch_one_ids = dtm_df.loc[dtm_df['epoch'] == 1, 'id']
dtm_df_id_set = set(epoch_one_ids.unique())

missing_in_dtm = testset_id_set - dtm_df_id_set
print(missing_in_dtm)

{484, 485, 486, 487, 488, 489, 490, 491}


In [32]:
# For every (epoch, topic) pair, add a near-zero-importance row for each term id
# listed in missing_in_dtm.
additional_data = {}
# Continue numbering after the existing rows instead of hard-coding 435599+1.
row_num = len(dtm_df)
for each_epoch in range(len(dtm_df['epoch'].unique())):
    for each_tn in range(num_topics):
        for i in missing_in_dtm:
            # epoch, topic_number, topic_terms, topic_importance, id
            # common_dictionary[i] resolves the token without relying on the
            # lazily built id2token map having been populated by an earlier cell.
            additional_data[row_num] = [each_epoch, each_tn, common_dictionary[i], 0.0000001, i]
            row_num += 1
additional_data

{435600: [0, 0, 'file', 1e-07, 484],
 435601: [0, 0, 'content', 1e-07, 485],
 435602: [0, 0, 'site', 1e-07, 486],
 435603: [0, 0, 'uploads', 1e-07, 487],
 435604: [0, 0, 'default', 1e-07, 488],
 435605: [0, 0, 'icrc', 1e-07, 489],
 435606: [0, 0, 'reliefweb', 1e-07, 490],
 435607: [0, 0, 'unocha', 1e-07, 491],
 435608: [0, 1, 'file', 1e-07, 484],
 435609: [0, 1, 'content', 1e-07, 485],
 435610: [0, 1, 'site', 1e-07, 486],
 435611: [0, 1, 'uploads', 1e-07, 487],
 435612: [0, 1, 'default', 1e-07, 488],
 435613: [0, 1, 'icrc', 1e-07, 489],
 435614: [0, 1, 'reliefweb', 1e-07, 490],
 435615: [0, 1, 'unocha', 1e-07, 491],
 435616: [0, 2, 'file', 1e-07, 484],
 435617: [0, 2, 'content', 1e-07, 485],
 435618: [0, 2, 'site', 1e-07, 486],
 435619: [0, 2, 'uploads', 1e-07, 487],
 435620: [0, 2, 'default', 1e-07, 488],
 435621: [0, 2, 'icrc', 1e-07, 489],
 435622: [0, 2, 'reliefweb', 1e-07, 490],
 435623: [0, 2, 'unocha', 1e-07, 491],
 435624: [0, 3, 'file', 1e-07, 484],
 435625: [0, 3, 'content', 

In [33]:
# Turn the padding rows into a frame with the same columns as dtm_df (keys become the index).
new_data = pd.DataFrame.from_dict(additional_data, orient='index', columns=['epoch', 'topic_number', 'topic_terms', 'topic_importance', 'id'])
new_data.head()

Unnamed: 0,epoch,topic_number,topic_terms,topic_importance,id
435600,0,0,file,1e-07,484
435601,0,0,content,1e-07,485
435602,0,0,site,1e-07,486
435603,0,0,uploads,1e-07,487
435604,0,0,default,1e-07,488


In [34]:
# Append the padding rows; ignore_index=True renumbers the combined frame from 0.
dtm_df = pd.concat([dtm_df, new_data], ignore_index=True)
dtm_df.tail()

Unnamed: 0,epoch,topic_number,topic_terms,topic_importance,id
442795,17,49,uploads,1e-07,487
442796,17,49,default,1e-07,488
442797,17,49,icrc,1e-07,489
442798,17,49,reliefweb,1e-07,490
442799,17,49,unocha,1e-07,491


In [35]:
# Re-check the number of distinct term ids per epoch.
n_epochs = len(dtm_df['epoch'].unique())
for each_epoch in range(n_epochs):
    ids_in_epoch = dtm_df.loc[dtm_df['epoch'] == each_epoch, 'id']
    print("{}: {}".format(each_epoch, len(ids_in_epoch.unique())))

0: 492
1: 492
2: 492
3: 492
4: 492
5: 492
6: 492
7: 492
8: 492
9: 492
10: 492
11: 492
12: 492
13: 492
14: 492
15: 492
16: 492
17: 492


In [36]:
# One list of topic importances per epoch, keyed by the epoch's position
# in order of first appearance.
new_dict = {
    position: list(dtm_df.loc[dtm_df['epoch'] == each_epoch, 'topic_importance'])
    for position, each_epoch in enumerate(dtm_df['epoch'].unique())
}

In [37]:
# Number of entries in new_dict (one per distinct epoch in dtm_df).
len(new_dict)

18

In [38]:
# Same layout for the test set: one list of topic importances per epoch.
new_dict_testset_df = {
    position: list(testset_df.loc[testset_df['epoch'] == each_epoch, 'topic_importance'])
    for position, each_epoch in enumerate(testset_df['epoch'].unique())
}
len(new_dict_testset_df)

1

In [39]:
# Wide layout: one row per key of new_dict, one column per list position.
train_df = pd.DataFrame.from_dict(new_dict, orient='index')
train_df.head()
# NOTE: this rebinds testset_df from the long-format frame to the wide layout.
testset_df = pd.DataFrame.from_dict(new_dict_testset_df, orient='index')
testset_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24590,24591,24592,24593,24594,24595,24596,24597,24598,24599
0,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Sklearn

In [40]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import GridSearchCV

In [42]:
from sklearn.neural_network import MLPRegressor

# Because of the transposes, each column of train_df becomes one sample whose
# 17 features are that column's values in 17 consecutive rows.
# Test target: the flattened test-set frame.
y_test = testset_df.to_numpy().flatten()
# Test features: the last 17 rows of train_df.
X_test = train_df.tail(17).to_numpy().T
# Training target: the final row of train_df.
y_train = (train_df.tail(1).to_numpy()).flatten()
# Training features: the first 17 rows of train_df.
X_train = (train_df.head(17).to_numpy()).T

print("y_test: {}\ny_train: {}\nX_test: {}\nX_train: {}".format(y_test.shape, y_train.shape, X_test.shape, X_train.shape))

# Scaling: fit on the training features only, then apply the same transform to both sets
scaler = StandardScaler() 
scaler.fit(X_train)  
X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)


# Single-point grid: the search only wraps 5-fold cross-validation around one configuration.
param_grid = [
  {'hidden_layer_sizes': [(50000,)], 'alpha': [0.00001]}]

mlp = MLPRegressor()
clf = GridSearchCV(mlp, param_grid=param_grid, cv=5, n_jobs=-1)
clf.fit(X_train, y_train)

# Report the refit estimator and its mean squared error on the test split.
print(clf.best_estimator_)
print(mean_squared_error(y_test, clf.predict(X_test)))

y_test: (24600,)
y_train: (24600,)
X_test: (24600, 17)
X_train: (24600, 17)
MLPRegressor(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
0.0002394107241213776


In [141]:
# Samples are the columns of train_df; features are 17 consecutive rows.
X_train = train_df.iloc[:17].to_numpy().T
y_train = train_df.iloc[-1:].to_numpy().flatten()
X_test = train_df.iloc[-17:].to_numpy().T
y_test = testset_df.to_numpy().flatten()

# Scaling (statistics come from the training features only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Dummy Regressor (baseline)
clf_dumb = DummyRegressor()
clf_dumb.fit(X_train, y_train)

# NN Regressor
clf = KNeighborsRegressor(n_neighbors=20, n_jobs=-1)
clf.fit(X_train, y_train)

# Check error
dummy_mse = mean_squared_error(y_test, clf_dumb.predict(X_test))
knn_mse = mean_squared_error(y_test, clf.predict(X_test))
print("dummy MSE: ", dummy_mse)
print("KNN mse: ", knn_mse)

StandardScaler(copy=True, with_mean=True, with_std=True)

DummyRegressor(constant=None, quantile=None, strategy='mean')

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
          weights='uniform')

dummy MSE:  0.00013591573214964906
KNN mse:  0.0005788273348822681


In [143]:
# Inspect the fitted regressor's raw predictions for the test features.
clf.predict(X_test)

array([6.62430774e-05, 6.62504441e-05, 6.62311721e-05, ...,
       1.00000000e-07, 1.00000000e-07, 1.00000000e-07])

# Plots

In [None]:
def plot_specific_topic(term = 'geneva', topic = 0):
    """
    Plot how the importance of one term within one DTM topic changes over time.

    Parameters
    ----------
    term : string
        vocabulary term to track
    topic : int
        index of the DTM topic to inspect; defaults to 0, which was
        previously hard-coded

    Returns
    -------
    None
        the figure is shown with plt.show()

    """
    # Request the whole vocabulary so the term is found whatever its rank.
    vocab_size = len(common_dictionary.values())
    val_list = [
        importance
        for terms_at_time in ldaseq.print_topic_times(topic=topic, top_terms=vocab_size)
        for word, importance in terms_at_time
        if word == term
    ]

    plt.figure(figsize=(9, 7))
    plt.title("'{}' in topic {}".format(term, topic))
    plt.xlabel('time slice')
    plt.ylabel('term importance')
    plt.plot(val_list)
    plt.show()

In [None]:
# Plot the importance of 'refugee' over the time slices.
plot_specific_topic('refugee')