# Explore Topics

Notebook for inspecting the keywords of each topic, for a given LDA run.

In [1]:
import gzip
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import sys

sys.path.append("../src")
from process import *
from fitting_util import *
from plotting import *

import statistics as stat
from scipy.stats import sem

In [2]:
# Configuration -- do not change

# LPT used for stable topics; available values are -2.0, -2.5 and -5.0
# (only rec1-rec5, however)
Lpt = -2.5

# Type of content to model (alternative: 'whitepapers')
contentType = 'decadal_report'

# Stable-topic thresholds to examine
thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Gravity Wave topics to ignore, keyed by LDA run: these relate to phenomena
# on earth, not the sky
ignore_topics = {
    '19_2_125_rec1': [65],
    '19_2_125_rec2': [112],
    '19_2_125_rec3': [41],
    '19_2_125_rec4': [74],
    '19_2_125_rec5': [5],
    '19_2_125_rec6': [93],
    '19_2_125_rec7': [112],
    '19_2_125_rec8': [1],
    '19_2_125_rec9': [6],
    '19_2_125_rec10': [95],
}

# -------------------------------
# Which CAGR statistic to use; alternatives are 'CAGR_2_year_rolling_mean'
# and 'CAGR_model_best_fit'
which_cagr = 'CAGR'

# If True, min_cagr is ignored in favor of the minimum CAGR found within the
# given dataset run
flex_min_cagr = False

# For the calculation of RI: the minimum CAGR we expect to have to handle
# (previously tried: -0.041)
min_cagr = -0.05

# Ignore inferences in paragraphs below this value
# (previously tried: 0.01, 0.1, 1./125)
paragraph_inference_threshold = 0.0

# Use the Spearman correlation coefficient instead of Pearson
use_spearman = False

# PARAM no longer used: min_top_topic_score_sum, the minimum score_sum of the
# top topic inference below which a result was discarded (0.5 == 50%).
# Values tried were 0.8 and 0.0; it had no beneficial effect, see slides from
# 1-4-2021.

# Used to normalize the (TCS) score we get. It should be the value of the
# inference if all paragraphs/units/etc. embody only a single topic.
document_max_score = {
    'decadal_report': 1009,
    'whitepapers': 274,
}

# The values below come from Bootstrap Error estimation (see the respective
# notebooks).

# TCS Literature (1998-2010) error
# (see the Bootstrap_Estimation_1998-2010_TCS notebook)
tcs_lit_mean_err = 18.3

# TCS_CAGR error (see the CAGR_Bootstrap_Estimation notebook)
cagr_mean_err = 0.004

# RI mean error (see the RI_Bootstrap_Estimation notebook)
ri_mean_err = 1.3

In [3]:
def has_ignore_topics (ig_dict:dict)->bool:
    """Report whether any entry of ``ig_dict`` holds at least one item.

    Args:
        ig_dict: mapping whose values are sized collections (anything
            supporting ``len``).

    Returns:
        True as soon as one value is non-empty; False when every value is
        empty or the mapping itself is empty.
    """
    return any(len(topics) > 0 for topics in ig_dict.values())

# Suffix flagging that at least one run has topics configured to be ignored
if has_ignore_topics(ignore_topics):
    filtered = "_filtered"
else:
    filtered = ""
print(filtered)

_filtered


In [4]:

def process_tdata(version:str, contentType:str, lda_viz_data:pd.DataFrame, timeseries_data:pd.DataFrame, topic_distrib_data:pd.DataFrame, ignore_topics:list=None):
    """Build the per-topic dataset of document inference (TCS) score vs CAGR.

    The document inference data is turned into a per-topic TCS score with
    ``doc_tcs_by_topic``; the result is then handed to ``create_dataset``
    together with the time series data, the LDA visualisation data and the
    notebook-level settings ``which_cagr``, ``flex_min_cagr`` and ``min_cagr``.

    Args:
        version: identifier of the LDA run. Currently unused by this function;
            kept so existing callers keep working.
        contentType: key into ``document_max_score`` selecting the value passed
            to ``create_dataset`` as ``max_doc_score``.
        lda_viz_data: forwarded unchanged to ``create_dataset``.
        timeseries_data: forwarded unchanged to ``create_dataset``.
        topic_distrib_data: document inference data passed to
            ``doc_tcs_by_topic``.
        ignore_topics: topics forwarded to ``create_dataset`` as its ignore
            list. ``None`` (the default) means an empty list.

    Returns:
        Whatever ``create_dataset`` returns (presumably a DataFrame with one
        row per topic -- confirm against ``create_dataset``).

    Raises:
        KeyError: if ``contentType`` is not a key of ``document_max_score``.
    """
    # Use a fresh list per call: a literal ``[]`` default would be one shared
    # object, so any mutation downstream would leak into later calls.
    if ignore_topics is None:
        ignore_topics = []

    # Look the normalisation constant up first so that an unknown content type
    # fails before the scoring work is done.
    max_doc_score = document_max_score[contentType]

    # Process document inference data to get the TCS score.
    # NOTE(review): an earlier comment described a "top 5 topics sum to 0.5 or
    # greater" heuristic; the only filter visible here is
    # paragraph_inference_threshold -- confirm against doc_tcs_by_topic.
    doc_tcs = doc_tcs_by_topic(topic_distrib_data, threshold=paragraph_inference_threshold)

    # Hand over the ignore list and the CAGR settings to build the dataset of
    # summed inference in the document vs CAGR.
    return create_dataset(doc_tcs, timeseries_data, lda_viz_data, which_cagr, ignore_topics,
                          flex_min_cagr=flex_min_cagr, min_cagr=min_cagr,
                          max_doc_score=max_doc_score)

In [5]:
# Test run against a single LDA version
version = '19_2_125_rec1'

# Open the basic data files for that version
(lda_viz_data,
 timeseries_data,
 topic_distrib_data,
 stable_topics) = open_datafiles(version, lpt=Lpt, contentType=contentType)

# Process them, passing the ignore list configured for this version
doc_scores = process_tdata(
    version=version,
    contentType=contentType,
    lda_viz_data=lda_viz_data,
    timeseries_data=timeseries_data,
    topic_distrib_data=topic_distrib_data,
    ignore_topics=ignore_topics[version],
)

Data MIN_CAGR: -0.0220487315718552


In [6]:
# Preview the first three rows of the processed scores
doc_scores.head(3)

Unnamed: 0,topic,raw_doc_tcs,doc_tcs,tcs,cagr,keywords,ri
0,0,12.946581,0.012831,333.510897,0.026101,"massive black hole, solar abundance, massive s...",25.380453
1,1,8.899732,0.00882,727.68061,0.023061,"shock wave, photometric observation, particle ...",53.165325
2,2,22.136611,0.021939,1138.389202,0.083674,"high redshift, low redshift, z >, < z <, star-...",152.172629


In [7]:
def get_topic_keywords(topic_num, doc_scores: pd.DataFrame) -> pd.DataFrame:
    """Select the row(s) of ``doc_scores`` belonging to one topic.

    Despite the name, the whole matching row is returned (every column of
    ``doc_scores``, not just a keywords string).

    Args:
        topic_num: topic identifier; compared by its ``str()`` form, so
            ``23`` and ``'23'`` select the same rows.
        doc_scores: frame with a ``topic`` column.

    Returns:
        The subset of ``doc_scores`` whose ``topic`` equals ``topic_num``;
        empty when no row matches.
    """
    topic_label = str(topic_num)
    # Compare on the string form of the column so that the lookup works
    # whether ``topic`` was loaded as text or as integers.
    return doc_scores[doc_scores['topic'].astype(str) == topic_label]

In [8]:
# Show the scores row for topic 23
get_topic_keywords(topic_num=23, doc_scores=doc_scores)

Unnamed: 0,topic,raw_doc_tcs,doc_tcs,tcs,cagr,keywords,ri
23,23,5.930148,0.005877,542.681655,-0.009913,"elliptical galaxy, globular cluster, hubble sp...",21.754692


In [9]:
# Show the scores row for topic 7
get_topic_keywords(topic_num=7, doc_scores=doc_scores)

Unnamed: 0,topic,raw_doc_tcs,doc_tcs,tcs,cagr,keywords,ri
7,7,6.996014,0.006934,455.102846,0.05408,"polycyclic aromatic hydrocarbon, infrared emis...",47.366895
