In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import timeit
import spacy
import copy

import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, HdpModel, LdaModel, LdaMulticore
from nltk.corpus import stopwords
import helper as he
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid") 

with open('../data/preprocessed_data/doc_indexes/gst.pkl','rb') as f:
    texts,INITIAL_DOC_SIZE, DOC_TEMPORAL_INCREMENT = pickle.load(f)

with open('../data/preprocessed_data/corpus_dict/gst_corp.pkl', 'rb') as f:
    data_lemmatized, _, _ = pickle.load(f)

In [2]:
print('Building Dictionary')
#Dictionary is built over entire set of documents
id2word = Dictionary(documents=data_lemmatized)

Building Dictionary


In [3]:
#Golden Standard Model
print('Building Golden Corpus')
golden_corpus = [id2word.doc2bow(doc) for doc in data_lemmatized]
golden_lda = LdaMulticore(golden_corpus, num_topics=35, id2word=id2word,
                   workers=3, chunksize=2000, passes=10, batch=False)
print('Finished Building Golden Corpus')

Building Golden Corpus
Finished Building Golden Corpus


Building Initial Model
Finished Building Initial Model


In [5]:
def get_positives_arr(initial_doc_size, doc_increment):
    
    # Set Data State to that of existing model in simulation
    data = data_lemmatized[:initial_doc_size]
    corpus = [id2word.doc2bow(doc) for doc in data]

    print('Building Initial Model')
    # Building for the first time - To be considered as the starting/existing model in simulation.
    running_lda = LdaMulticore(corpus, num_topics=35, id2word=id2word,
                       workers=3, chunksize=2000, passes=10, batch=False)
    print('Finished Building Initial Model')
    
    
    count = initial_doc_size
    positive_arr=[]
    no=0
    total_len=len(data_lemmatized)
    for i in doc_increment:
        no+=1
        new_docs = data_lemmatized[count:count+i]
        count+=i
        print('Progress upto Document no.',count,'/',total_len)
        print('No. of New Docs:',i)

        new_corp = [id2word.doc2bow(doc) for doc in new_docs]

        positive_arr.append(calc_confusion_matrix(golden_lda, running_lda, new_corp))
        
        if(i!=doc_increment[-1]):
            print('MODEL NO:'+str(no))
            running_lda.update(new_corp)
            print('MODEL DONE')
    return positive_arr

In [6]:
def get_doc_parameters(initial,k):
    arr = []
    arr.append(INITIAL_DOC_SIZE)
    arr.extend(DOC_TEMPORAL_INCREMENT)
    for i in range(1,len(arr)):
        arr[i]=arr[i]+arr[i-1]
        
    doc_sizes = []
    count=1
    for i in range(initial+1,len(arr)):
        if(count%k==0):
            doc_sizes.append(arr[i])
            count=1
        count+=1
        
    doc_intervals = [arr[initial],doc_sizes[0]-arr[initial]]
    for i in range(1,len(doc_sizes)):
        doc_intervals.append(doc_sizes[i]-doc_sizes[i-1])
    
    return doc_intervals

In [7]:
def add_row_to_df(df,positive_arr,k):
    length = len(df)
    count=length
    for i in positive_arr:
        df.loc[count]=[i,count-length,k]
        count+=1
    return df

In [8]:
def calc_confusion_matrix(model1, model2, corpus1, doc_max=True):
    lda_corpus_1 = [max(prob, key=lambda y:y[1])
                    for prob in model1[corpus1]]
    lda_corpus_2 = [max(prob, key=lambda y:y[1])
                    for prob in model2[corpus1]]
    positive = 0
    negative = 0
    upper_limit = len(lda_corpus_1)
    total_permutations = upper_limit * \
        (upper_limit-1)/2  # nC2 combinations
    for i in range(upper_limit):
        for j in range(i+1, upper_limit):
            if(lda_corpus_1[i][0] == lda_corpus_1[j][0] and lda_corpus_2[i][0] == lda_corpus_2[j][0]):
                positive = positive+1
            elif(lda_corpus_1[i][0] != lda_corpus_1[j][0] and lda_corpus_2[i][0] == lda_corpus_2[j][0]):
                negative = negative+1
            elif(lda_corpus_1[i][0] == lda_corpus_1[j][0] and lda_corpus_2[i][0] != lda_corpus_2[j][0]):
                negative = negative+1
            elif(lda_corpus_1[i][0] != lda_corpus_1[j][0] and lda_corpus_2[i][0] != lda_corpus_2[j][0]):
                positive = positive+1
    answers_positive = (round(positive*100/total_permutations, 2))
    return answers_positive

In [14]:
f = open('temp.pkl','wb')
for i in range(10,150):
# for i in range(2,3):
    df = pd.DataFrame(columns=['positive_value','index','k'])
    for j in range(1,30):
#     for j in range(32,34):
        print('Loop Parameters-',(i,j))
        doc_params = get_doc_parameters(i,j)
        positive = get_positives_arr(doc_params[0],doc_params[1:])
        pickle.dump(((i,j),positive),f)
        df = add_row_to_df(df,positive,j)
        
        fig, ax = plt.subplots()
        sns_plot = sns.violinplot(x = 'k', y = 'positive_value', data = df)
        sns_plot = sns.pointplot(x='k', y='positive_value', data=df.groupby('k', as_index=False).median(), ax=ax, color='k', linestyle='--')

        plt.savefig("./temp/initial_"+str(i)+".eps")
        
        print('-----------------------------------------------')
    df.to_csv(r'./temp/inital_'+str(i)+'.csv')
f.close()

Loop Parameters- (10, 1)
Progress upto Document no. 599 / 22179
No. of New Docs: 64
MODEL NO:1
MODEL DONE
Progress upto Document no. 632 / 22179
No. of New Docs: 33
MODEL NO:2
MODEL DONE
Progress upto Document no. 677 / 22179
No. of New Docs: 45
MODEL NO:3
MODEL DONE
Progress upto Document no. 695 / 22179
No. of New Docs: 18
MODEL NO:4
MODEL DONE
Progress upto Document no. 711 / 22179
No. of New Docs: 16
MODEL NO:5
MODEL DONE
Progress upto Document no. 733 / 22179
No. of New Docs: 22
MODEL NO:6
MODEL DONE
Progress upto Document no. 771 / 22179
No. of New Docs: 38
MODEL NO:7
MODEL DONE
Progress upto Document no. 808 / 22179
No. of New Docs: 37
MODEL NO:8
MODEL DONE
Progress upto Document no. 836 / 22179
No. of New Docs: 28
MODEL NO:9
MODEL DONE
Progress upto Document no. 861 / 22179
No. of New Docs: 25
MODEL NO:10
MODEL DONE
Progress upto Document no. 1012 / 22179
No. of New Docs: 151
MODEL NO:11
MODEL DONE
Progress upto Document no. 1179 / 22179
No. of New Docs: 167
MODEL NO:12
MODEL D

  del sys.path[0]


-----------------------------------------------
Loop Parameters- (10, 22)
Progress upto Document no. 1615 / 22179
No. of New Docs: 1080
MODEL NO:1
MODEL DONE
Progress upto Document no. 2241 / 22179
No. of New Docs: 626
MODEL NO:2
MODEL DONE
Progress upto Document no. 3131 / 22179
No. of New Docs: 890
MODEL NO:3
MODEL DONE
Progress upto Document no. 4336 / 22179
No. of New Docs: 1205
MODEL NO:4
MODEL DONE
Progress upto Document no. 6922 / 22179
No. of New Docs: 2586
MODEL NO:5
MODEL DONE
Progress upto Document no. 13321 / 22179
No. of New Docs: 6399
MODEL NO:6
MODEL DONE
Progress upto Document no. 19013 / 22179
No. of New Docs: 5692
MODEL NO:7
MODEL DONE
Progress upto Document no. 20942 / 22179
No. of New Docs: 1929
-----------------------------------------------
Loop Parameters- (10, 23)
Progress upto Document no. 1641 / 22179
No. of New Docs: 1106
MODEL NO:1
MODEL DONE
Progress upto Document no. 2303 / 22179
No. of New Docs: 662
MODEL NO:2
MODEL DONE
Progress upto Document no. 3319 / 

Process ForkPoolWorker-13721:
Process ForkPoolWorker-13719:
Process ForkPoolWorker-13720:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()


KeyboardInterrupt: 

  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/p

Error in callback <function flush_figures at 0x13bfe55f0> (for post_execute):


Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python/3.7.5/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  

Model built on 1 month data <br>
Incremental updates done on 30*2 weeks data