In [None]:
import glob
import json
import gensim
import pickle
import collections
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

from gensim.parsing.preprocessing import preprocess_string

In [None]:
# Render matplotlib figures inline in the notebook output.
# run_line_magic replaces the long-deprecated InteractiveShell.magic() call.
get_ipython().run_line_magic('matplotlib', 'inline')

In [None]:
# Path to the corpus text file, relative to the notebook's working directory
pathToData = 'data/aristo-mini-corpus/Aristo-Mini-Corpus-Dec2016.txt'

In [None]:
def plotTopicProjections(model,dictionary,scale=False,plotNegative=False,nTerms=5):
    '''
    Convenience function to plot term importances in topics
    Draws one horizontal bar chart per topic showing the nTerms highest and
    the nTerms lowest weighted terms, each labelled with its term and weight

    @model is LDA/LSI gensim model object; only model.get_topics() is used and
    its result is indexed as a 2-D array of [topic, term id]
    @dictionary maps a term id to its token through dictionary[id]
    @scale is either fixed at [-1,1] or autoscaled based on largest importance
    (largest absolute value anywhere in the model plus 10% headroom, so every
    topic is drawn on the same axis)
    @plotNegative is for models that return -ve term importances; when False
    the x axis starts at 0 instead of being symmetric about 0
    @nTerms is how many terms to show at each end of the ranking
    '''

    topicProjections=model.get_topics()

    # One extent for the whole model, computed once rather than per topic.
    # The size check avoids reducing an empty array.
    if scale and topicProjections.size>0:
        maxExtent=1.1*max(abs(topicProjections.max()),abs(topicProjections.min()))
    else:
        maxExtent=1
    xMin=-1*maxExtent if plotNegative else 0

    # Guard against nTerms<=0: a slice such as [-0:] would return every term
    nShown=max(nTerms,0)

    for n in range(topicProjections.shape[0]):
        topicTerm=collections.Counter({dictionary[j]:p for j,p in
                                       enumerate(topicProjections[n,:])})

        # Sort the vocabulary once, highest importance first
        ranked=topicTerm.most_common()
        # Reversed so the largest importance ends up as the top bar
        most=ranked[:nShown][::-1]
        least=ranked[-nShown:] if nShown else []

        plt.xlim(xMin,maxExtent)

        # Bar count follows the terms actually found, which can be fewer than
        # nTerms when the vocabulary is small
        plt.barh(range(len(most)),[m[1] for m in most])
        for i,m in enumerate(most):
            plt.annotate('{:s} ({:.3f})'.format(m[0],m[1]),
                         xy=(0.1*maxExtent,i-0.1),xycoords='data',fontsize=20)

        # Labels for the lowest terms are anchored at negative x, i.e. left of
        # the visible range whenever the axis starts at 0
        plt.barh(range(len(least)),[l[1] for l in least])
        for i,l in enumerate(least):
            plt.annotate('{:s} ({:.3f})'.format(l[0],l[1]),
                         xy=(-0.1*maxExtent,i-0.1),xycoords='data',ha='right',fontsize=20)

        plt.axvline(color='grey')
        plt.title('Topic {:d}'.format(n))
        plt.yticks([],[])
        plt.xlabel('Projection')
        plt.show()

In [None]:
class textGen():
    '''
    Object to iterate over text out of memory
    Generator: Yields one preprocessed line (result of preprocess_string)
    at a time; blank lines are skipped
    @n is number of lines to read, -1 means all lines
    (blank lines count towards n even though they are not yielded)
    @path is the text file to read; None (the default) means the
    module-level pathToData, looked up each time iteration starts
    '''
    def __init__(self,n=-1,path=None):
        print('Initialising textgenerator...')
        self.n=n
        self.path=path

    def __iter__(self):
        path=pathToData if self.path is None else self.path
        with open(path,'r',errors='ignore') as inFile:
            for nLine,line in enumerate(inFile):

                # nLine is 0-based, so stop as soon as n lines were consumed
                if self.n>-1 and nLine>=self.n:
                    break

                # Lines keep their trailing newline, so test the stripped text
                if line.strip():
                    yield preprocess_string(line)

In [None]:
# Line limit for the corpus readers: passed as n to textGen and newsCorpus below
holdout = 50000

In [None]:
# Build the token<->id mapping from the preprocessed lines yielded by textGen
%time dictionary = gensim.corpora.Dictionary(textGen(n=holdout))

In [None]:
# Vocabulary size before filtering
len(dictionary)

In [None]:
# Prune the vocabulary in place using filter_extremes' default thresholds
%time dictionary.filter_extremes()

In [None]:
# Vocabulary size after filtering
len(dictionary)

In [None]:
class newsCorpus():
    '''
    Class wrapper for reading the corpus file as bag-of-words documents
    Generator: Yields indexed documents one at a time, i.e. the result of
    dictionary.doc2bow on each preprocessed, non-blank line of pathToData
    @n is number of lines to read, -1 means all lines
    (counted from the top of the file, so together with @start only
    lines start..n-1 are returned)
    @start is line number to start returning data
    (for creating holdout set)

    Relies on the module-level names pathToData and dictionary, both looked
    up when iteration starts. A fresh file handle is opened on every
    iteration, so the object can be iterated more than once.
    '''
    def __init__(self,n=-1,start=0):
        print('Initialising corpus...')
        self.n=n
        self.start=start

    def __iter__(self):
        with open(pathToData,'r',errors='ignore') as inFile:
            for nLine,line in enumerate(inFile):

                # nLine is 0-based, so stop as soon as n lines were consumed
                if self.n>-1 and nLine>=self.n:
                    break

                # Lines before start are left for the other split
                if nLine<self.start:
                    continue

                # Lines keep their trailing newline, so test the stripped text
                if line.strip():
                    tokens=preprocess_string(line)
                    yield dictionary.doc2bow(tokens)

### LDA

In [None]:
# 4-topic LDA on the bag-of-words corpus; note n=450000 here rather than holdout
%time resLda_4topics = gensim.models.ldamulticore.LdaMulticore(newsCorpus(n=450000), num_topics=4, id2word=dictionary)

In [None]:
# 10-topic LDA limited to n=holdout; not referenced again in the cells shown
%time resLda_10topics = gensim.models.ldamulticore.LdaMulticore(newsCorpus(n=holdout), num_topics=10, id2word=dictionary)

In [None]:
# Top/bottom 10 terms per topic of the 4-topic model, x axis autoscaled
plotTopicProjections(resLda_4topics, dictionary, scale=True, nTerms=10)

In [None]:
# Topic distribution the 4-topic model assigns to a short example string
resLda_4topics.get_document_topics(dictionary.doc2bow(preprocess_string('This is Sparta')))

In [None]:
# TF-IDF model initialised from the dictionary rather than from a corpus
tfidf =  gensim.models.TfidfModel(dictionary=dictionary)

In [None]:
# 4-topic LDA on the TF-IDF transformed corpus
%time res_tfidf = gensim.models.ldamulticore.LdaMulticore(tfidf[newsCorpus(n=holdout)], num_topics=4, id2word=dictionary)

In [None]:
# Top/bottom 7 terms per topic of the TF-IDF LDA model, x axis autoscaled
plotTopicProjections(res_tfidf, dictionary, scale=True, nTerms=7)

### LSI

In [None]:
# 4-topic LSI on the bag-of-words corpus
%time resLsi = gensim.models.lsimodel.LsiModel(newsCorpus(n=holdout), num_topics=4, id2word=dictionary)

In [None]:
# plotNegative=True gives a symmetric axis; scale is left at its default, so the axis is fixed at [-1,1]
plotTopicProjections(resLsi, dictionary, plotNegative=True, nTerms=10)

In [None]:
%time resLsiTfidf = gensim.models.lsimodel.LsiModel(tfidf_corpus[newsCorpus(n=holdout)],num_topics=4,id2word=dictionary)