In [2]:
import logging
from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary, bleicorpus, textcorpus
import numpy as np
from gensim.matutils import hellinger
import time
import pickle
import pyLDAvis
import matplotlib.pyplot as plt
from scipy.stats import entropy
import pandas as pd
from numpy.linalg import norm


In [3]:
# Load the processed DTM output and the hand-picked topic names.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open('output/dtm_processed_output.p', 'rb') as dtm_file:
    alldata_new = pickle.load(dtm_file)

# Number of documents per year, turned into slice boundaries:
# documents of year i occupy rows doc_ids[i]:doc_ids[i+1].
doc_year = alldata_new['docs_per_year']
doc_ids = [0] + list(np.cumsum(doc_year))

term_topic = alldata_new['term_topic']  # per the original note: n_years * n_topics * n_terms
terms = alldata_new['terms']

# Document-topic rows for all years, concatenated.
doc_topicyrs = alldata_new['doc_topic']

# Split the concatenated rows into one chunk per year
# (original note: n_years * n_docs-in-that-year * n_topics).
doc_topic = []
for year in range(len(term_topic)):
    doc_topic.append(doc_topicyrs[doc_ids[year]:doc_ids[year + 1]])

# Topics are referred to by hand-picked names stored in a separate pickle.
with open('topicnames.p', 'rb') as names_file:
    topic_labels = pickle.load(names_file)
    

In [4]:
def totvar(p,q):
    """Return the largest element-wise absolute gap between ``p`` and ``q``.

    Returns a two-element list ``[max_gap, index_of_max_gap]``.

    Note: despite the name, this is the maximum single-coordinate
    difference, not half the L1 distance between the two vectors.
    The inputs are used as given (no normalisation is applied here).
    """
    gap = abs(p - q)
    return [np.max(gap), np.argmax(gap)]
def JSD(P, Q):
    """Jensen-Shannon divergence between two non-negative weight vectors.

    Both inputs are first rescaled by their L1 norm, so they do not need
    to sum to 1. The result is the average of the two KL divergences to
    the midpoint mixture, computed with ``scipy.stats.entropy`` (natural
    log by default). No square root is taken, so this is the divergence
    rather than the Jensen-Shannon distance.
    """
    p_hat = P / norm(P, ord=1)
    q_hat = Q / norm(Q, ord=1)
    mixture = 0.5 * (p_hat + q_hat)
    kl_p_to_mix = entropy(p_hat, mixture)
    kl_q_to_mix = entropy(q_hat, mixture)
    return 0.5 * (kl_p_to_mix + kl_q_to_mix)

In [5]:
# Change within a topic -- which topic's term distribution has moved most over the years.
# Jensen-Shannon is used instead of plain KL to (1) avoid asymmetry and (2) avoid inf.

def _as_distribution(weights):
    """Return ``weights`` as a float array rescaled to sum to 1."""
    weights = np.asarray(weights, dtype=float)
    return weights / weights.sum()


epsilon = 1e-15  # not referenced in this cell; kept in case other cells rely on it
n_years = len(term_topic)
# Derived from the data instead of hard-coding 20
# (assumes term_topic[year] holds one row per topic, as the loading cell notes).
ntopics = len(term_topic[0])

topicdelta = np.empty((ntopics, n_years))     # JSD versus the previous year
topicshift = np.empty(ntopics)                # JSD between the first and the last year
topicdelta_tv = np.empty((ntopics, n_years))  # max abs term gap versus the previous year (totvar)
topicshift_tv = np.empty(ntopics)             # max abs term gap between the first and the last year (totvar)

deltaterm = []  # per topic: for every year, the term that changed most versus the previous year
shiftterm = []  # per topic: the term that changed most between the first and the last year
for k in range(ntopics):
    # Normalise every year's vector once so that BOTH sides of each comparison
    # are proper distributions (previously only the current year was rescaled
    # before being handed to totvar).
    yearly = [_as_distribution(term_topic[iyear][k]) for iyear in range(n_years)]

    sftterms = []
    for iyear in range(n_years):
        topic = yearly[iyear]
        # The first year has no predecessor and is compared with itself (delta = 0).
        previous = yearly[max(iyear - 1, 0)]
        topicdelta[k, iyear] = JSD(topic, previous)
        topicdelta_tv[k, iyear], maxterm = totvar(topic, previous)
        sftterms.append(terms[maxterm])

    topicshift[k] = JSD(yearly[-1], yearly[0])
    topicshift_tv[k], maxterm = totvar(yearly[-1], yearly[0])
    shiftterm.append(terms[maxterm])
    deltaterm.append(sftterms)

In [9]:
# Inspect the topic at index 4: one entry per year, each the term with the
# largest change versus the previous year (the first year is compared with itself).
deltaterm[4]

['color',
 'estimate',
 'estimate',
 'estimate',
 'estimate',
 'dot',
 'dot',
 'dot',
 'judgment',
 'trial',
 'trial',
 'trial',
 'trial',
 'trial',
 'trial',
 'trial',
 'trial',
 'bias']

In [None]:
# Topics ranked by first-to-last-year Jensen-Shannon divergence, largest first.
shiftidx = np.argsort(-topicshift)
for idx in shiftidx:
    label, jsd_value = topic_labels[idx], topicshift[idx]
    print(label + ': %.3f' % jsd_value)

# Same ranking for the totvar measure, together with the term that moved most.
print('total variance:')
shiftidx = np.argsort(-topicshift_tv)
for idx in shiftidx:
    label, tv_value, moved_term = topic_labels[idx], topicshift_tv[idx], shiftterm[idx]
    print(label + ': %.3f' % tv_value + ' max shift word:' + moved_term)

In [None]:
# TODO: get the rising and falling terms for each topic (just copy the other code); set the JSD as titles

In [None]:
# Topic mixture per year: sum the document-topic rows of each year and rescale
# to a distribution (should correspond to the topic evolution trend; that code
# could not be located when this was written).
ntopics = len(topic_labels)
n_years = len(term_topic)
ptop_years, entrop_years = [], []
for iyear in range(n_years):
    topic_mass = np.zeros(ntopics)
    for doc_weights in doc_topic[iyear]:
        topic_mass += doc_weights
    topic_share = topic_mass / sum(topic_mass)
    ptop_years.append(topic_share)
    # Shannon entropy of the yearly mixture (scipy default: natural log)
    entrop_years.append(entropy(topic_share))
print(entrop_years)

# Entropy of the topic mixture across years (first year is labelled 2000)
years = 2000 + np.arange(n_years)
plt.plot(years, entrop_years, '-o')
plt.title('entropy of topic distribution')
plt.xlabel('year')
plt.show()

# Possible follow-ups: find the paper with the highest / lowest entropy;
# find the topic with the highest / lowest entropy.

In [None]:
# KL divergence between yearly topic mixtures that are `gap` years apart.
gap = 1
# Direction computed here: entropy(later year, earlier year). This is the
# reversed direction, run as a sanity check; per the original note the result
# was not different from entropy(earlier year, later year).
kl_years = [
    entropy(ptop_years[start + gap], ptop_years[start])
    for start in range(len(term_topic) - gap)
]

plt.plot(years[gap:], kl_years, '-o')
plt.title('KL div with the previous year')
plt.xlabel('year')
plt.show()

# TODO: eyeball the yearly distributions overlaid on one plot

**Tentative conclusions**
- The diversity of topics seems to increase over the years.
- 2002 has a relatively less diverse topic distribution, while 2013 was quite diverse.

- The year-to-year difference has been decreasing over the years, as if the field were changing more slowly. This doesn't make sense to me...

In [None]:
# Entropy of each topic's term distribution, listed per year from lowest to highest.
for iyear in range(len(term_topic)):
    print('\n Year='+str(years[iyear]))
    # Rows are used as stored (original note: already normalized).
    entrop_terms = [entropy(term_topic[iyear][k]) for k in range(ntopics)]
    idx = np.argsort(entrop_terms)
    sorted_H = np.sort(entrop_terms)
    for rank, topic_id in enumerate(idx):
        print(topic_labels[topic_id]+':'+str(sorted_H[rank]))
# Observation from the original run: the entropy ranking is pretty stable over the years.

In [None]:
# Sanity check: total weight of the topic at index 3 in the last year
# (expected to be ~1 if the term-topic rows are normalized).
# Indexes the last year explicitly instead of relying on the loop variable
# `iyear` left over from an earlier cell.
sum(term_topic[-1][3])