In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import timeit
import spacy
import copy

import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, HdpModel, LdaModel, LdaMulticore
from nltk.corpus import stopwords
import helper as he

# Read the document-index pickle. Its payload is unpacked into exactly three
# names: texts, INITIAL_DOC_SIZE and DOC_TEMPORAL_INCREMENT.
# NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
with open('../../data/preprocessed_data/doc_indexes/aadhar.pkl','rb') as index_fh:
    index_payload = pickle.load(index_fh)
texts, INITIAL_DOC_SIZE, DOC_TEMPORAL_INCREMENT = index_payload

# Read the corpus pickle. It must also hold three items; only the first one is
# kept (as data_lemmatized), the remaining two are discarded.
with open('../../data/preprocessed_data/corpus_dict/aadhar_corp.pkl', 'rb') as corpus_fh:
    corpus_payload = pickle.load(corpus_fh)
data_lemmatized, _, _ = corpus_payload

In [2]:
# Set Data State to that of existing model in simulation.

# Settings for the starting model, named so they can be tuned in one place.
NUM_TOPICS = 35
LDA_WORKERS = 3
LDA_CHUNKSIZE = 2000
LDA_PASSES = 10
# Fixed seed so that re-running the notebook on a fresh kernel starts from the
# same random initialisation.
# NOTE(review): with several worker processes the result may still vary
# slightly between runs - confirm against the gensim documentation.
RANDOM_SEED = 42

# Only the first INITIAL_DOC_SIZE documents are visible to the starting model.
data = data_lemmatized[:INITIAL_DOC_SIZE]

# The dictionary is built over the *whole* document list (not just `data`), so
# documents from later batches can be converted with the same id mapping.
id2word = Dictionary(documents=data_lemmatized)
corpus = [id2word.doc2bow(doc) for doc in data]

# Building for the first time - to be considered as the starting/existing
# model in the simulation.
lda = LdaMulticore(corpus, num_topics=NUM_TOPICS, id2word=id2word,
                   workers=LDA_WORKERS, chunksize=LDA_CHUNKSIZE,
                   passes=LDA_PASSES, batch=False, random_state=RANDOM_SEED)

# Baseline Model: deep copies, so later changes to `corpus` and `lda` cannot
# alter the baseline snapshot.
corpus_baseline = copy.deepcopy(corpus)
lda_baseline = copy.deepcopy(lda)

In [None]:
# Simulate the arrival of new documents in batches. Each entry of
# DOC_TEMPORAL_INCREMENT is the number of documents in one batch. For every
# batch the existing model is updated with the new documents only, and is then
# compared against the untouched baseline model on the cumulative corpus.
doc_size = []        # batch sizes, in arrival order
positive_arr = []    # one he.calc_confusion_matrix result per batch

count = 0  # stays 0 if DOC_TEMPORAL_INCREMENT is empty
doc_size_counter = INITIAL_DOC_SIZE  # index of the first not-yet-seen document
print('Total Corpus Length:',len(data_lemmatized))

# The context manager guarantees the results file is flushed and closed even
# if a model update or the comparison raises part-way through the run.
with open('../../data/temp/aadhar_confusion.pkl', 'wb') as f2:
    for count, batch_size in enumerate(DOC_TEMPORAL_INCREMENT, start=1):
        # new_docs is the list of `batch_size` new documents which have arrived
        new_docs = data_lemmatized[doc_size_counter:doc_size_counter + batch_size]
        doc_size_counter += batch_size

        # Converting documents to doc2bow format so that they can be fed to models
        new_corpus = [id2word.doc2bow(doc) for doc in new_docs]

        print('MODEL NO:'+str(count))
        lda.update(new_corpus)
        print('MODEL DONE')

        # Grow the cumulative corpus. Concatenation builds a new outer list,
        # so no deep copy of the whole history is needed on every batch.
        corpus = corpus + new_corpus

        doc_size.append(batch_size)
        positive_arr.append(he.calc_confusion_matrix(
            lda_baseline, lda, corpus))

        # Checkpoint: every iteration appends a full cumulative snapshot to
        # the same stream, so the last pickled object holds the complete run.
        # NOTE(review): a single pickle.load() on this file returns only the
        # first snapshot - confirm that readers load until EOF.
        pickle.dump((positive_arr, doc_size), f2)

Total Corpus Length: 13908
MODEL NO:1
MODEL DONE
Corpus 1 Length -  39
MODEL NO:2
MODEL DONE
Corpus 1 Length -  62
MODEL NO:3
MODEL DONE
Corpus 1 Length -  90
MODEL NO:4
MODEL DONE
Corpus 1 Length -  93
MODEL NO:5
MODEL DONE
Corpus 1 Length -  115
MODEL NO:6
MODEL DONE
Corpus 1 Length -  120
MODEL NO:7
MODEL DONE
Corpus 1 Length -  128
MODEL NO:8
MODEL DONE
Corpus 1 Length -  132
MODEL NO:9
MODEL DONE
Corpus 1 Length -  143
MODEL NO:10
MODEL DONE
Corpus 1 Length -  155
MODEL NO:11
MODEL DONE
Corpus 1 Length -  171
MODEL NO:12
MODEL DONE
Corpus 1 Length -  174
MODEL NO:13
MODEL DONE
Corpus 1 Length -  189
MODEL NO:14
MODEL DONE
Corpus 1 Length -  195
MODEL NO:15
MODEL DONE
Corpus 1 Length -  207
MODEL NO:16
MODEL DONE
Corpus 1 Length -  241
MODEL NO:17
MODEL DONE
Corpus 1 Length -  252
MODEL NO:18
MODEL DONE
Corpus 1 Length -  285
MODEL NO:19
MODEL DONE
Corpus 1 Length -  301
MODEL NO:20
MODEL DONE
Corpus 1 Length -  324
MODEL NO:21
MODEL DONE
Corpus 1 Length -  342
MODEL NO:22
MODEL DO