# Signal Thresholds

The steps for establishing a threshold are as follows:

1. Select two topics
2. Modify the alpha between them
3. Measure the distance between the modified topics
4. Generate documents/topics from modified topics

In [3]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local helpers take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# UTILITIES
import util
import pandas as pd
import numpy as np
from metrics_model import MetricsModel
# NOTE(review): later cells call `fm.*` and `metrics.*`, but neither name is
# bound by any import or assignment visible in this section of the notebook.
# Confirm where they come from; on a fresh kernel those cells would raise
# NameError unless they are defined elsewhere.
# CORPORA
from nltk.corpus import brown
# PLOTTING
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models.widgets import Panel, Tabs
from bokeh.models import (HoverTool,
                          Legend,
                          ColumnDataSource,
                          LinearColorMapper,
                          BasicTicker,
                          PrintfTickFormatter,
                          ColorBar)
from bokeh.palettes import Spectral10
# INTERACTIONS
from ipywidgets import (HBox,VBox,Dropdown,IntSlider,
                        FloatSlider,Checkbox,Button,
                        interact_manual,Output)
# Render Bokeh plots inline in the notebook output cells.
output_notebook()

# Select a Corpus

We use the Brown corpus, which has 15 pre-defined categories.

In [5]:
# Show the category labels that the Brown corpus reader exposes.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [6]:
# Number of files in the Brown corpus; each file becomes one document below.
len(brown.fileids())

500

In [None]:
# One document per Brown file: the file's tokens joined with single spaces.
data_samples = [' '.join(tokens) for tokens in map(brown.words, brown.fileids())]

In [None]:
# Persist the joined documents under the name 'brown_data_samples' so a later
# cell can reload them via util.load_pickle instead of re-reading the corpus.
util.dump_pickle(data_samples,'brown_data_samples')

# Extract Topics from the Corpus

We're using a fixed model, LDA, to extract the topics.

In [7]:
# Reload the cached documents saved under 'brown_data_samples'.
data_samples = util.load_pickle('brown_data_samples')

In [8]:
# Fit the topic model to the documents. Later cells read the 'topic_word' and
# 'features' entries of the returned mapping.
# NOTE(review): `fm` is not defined in the visible part of this notebook - confirm its origin.
model_components = fm.get_model(data_samples)

# Choose Signal & Noise Topics

Pick two topics from the ones extracted above. To help in the decision making, this interactive plot shows how each topic compares to the rest for the various metrics we have available. You can hover to see the top 3 words that represent each topic.

In [9]:
# Compute the per-topic metrics. The result is iterated by key below, with
# each key used as one bar-chart column and tab title.
topic_metrics = util.calc_metrics(model_components)

Calculating rank1...done in 0.040s
Calculating average word length...done in 0.084s
Calculating effective size...done in 0.635s
Calculating exclusivity...done in 0.723s
Calculating distance from uniform...done in 1.310s
Calculating distance from corpus...done in 2.907s


In [10]:
# Top-ranked words per topic, used for the hover text.
top_words = fm.get_top_words(model_components)

# Shared data source: every metric column, the topic number on 'x', and the
# three leading words of each topic for the tooltip.
source_dict = topic_metrics.copy()
source_dict['x'] = range(fm.K)
source = ColumnDataSource(source_dict)
source.data['top_three'] = [' '.join(top_words[topic][:3]) for topic in range(fm.K)]
hover = HoverTool(tooltips=[('top words','@top_three')])

# One bar chart (and one tab) per metric, all backed by the same source.
metric_tabs = []
for metric_name in topic_metrics.keys():
    fig = figure(toolbar_location='above',
                 width=600,
                 height=600,
                 x_axis_label='Topic Number',
                 y_axis_label=metric_name)
    fig.add_tools(hover)
    fig.vbar(x='x',top=metric_name,width=0.75,source=source)
    metric_tabs.append(Panel(child=fig, title=metric_name))

In [11]:
distance_algorithms = ["jensen_shannon_divergence",
                       "kullback_leibler_divergence",
                       "cosine_distance"]

# Loop-invariant values: looked up once instead of once per topic pair.
topic_word = model_components['topic_word']
topic_ids = list(range(fm.K))
TOOLS = "hover,save"

# Build one heatmap tab per distance function. Cell (TopicA, TopicB) is
# coloured by func(topic_word[TopicA], topic_word[TopicB]).
for dist_alg in distance_algorithms:
    # Resolve the distance function by name on the `metrics` object.
    func = getattr(metrics,dist_alg)

    # Pairwise distance matrix, filled cell by cell.
    df = pd.DataFrame(columns=topic_ids,index=topic_ids)
    for i in topic_ids:
        for j in topic_ids:
            df.at[i,j] = func(topic_word[i],topic_word[j])
    df.index.name="TopicA"
    df.columns.name="TopicB"
    topicsA = list(df.index)
    topicsB = list(df.columns)

    # Long format: one row per (TopicA, TopicB) pair with its distance.
    p_df = pd.DataFrame(df.stack(),columns=['dist']).reset_index()
    # NOTE(review): the +0.5 offset presumably centres each rect on its
    # categorical factor - confirm against the installed Bokeh version.
    p_df['x'] = p_df["TopicA"] + 0.5
    p_df['y'] = p_df["TopicB"] + 0.5

    # NOTE(review): the colour scale is fixed to [0, 1]. Kullback-Leibler
    # divergence is not bounded by 1, so larger values will all share the
    # top colour - confirm this is intended.
    mapper = LinearColorMapper(palette='Spectral10',low=0,high=1)
    color_bar = ColorBar(color_mapper=mapper,
                         major_label_text_font_size='10pt',
                         border_line_color=None,
                         location=(0,0))
    source = ColumnDataSource(p_df)
    p = figure(title=dist_alg,
               x_range=[str(i) for i in topicsA],y_range=[str(i) for i in topicsB],
               tools=TOOLS, toolbar_location='above')
    # Strip grid and axis decoration so only the coloured cells remain.
    p.grid.grid_line_color=None
    p.axis.axis_line_color=None
    p.axis.major_tick_line_color=None
    p.axis.major_label_text_font_size='10pt'
    p.axis.major_label_standoff=0
    p.rect(x="x",y="y",width=1,height=1,source=source,
           fill_color={'field':'dist','transform':mapper},line_color=None)
    p.add_layout(color_bar, 'right')
    p.select_one(HoverTool).tooltips = [
        ('Topics','@TopicA and @TopicB'),
        (dist_alg,'@dist')
    ]
    metric_tabs.append(Panel(child=p,title=dist_alg))

In [12]:
# Combine every panel collected in metric_tabs into one tabbed layout and
# render it inline.
tabs = Tabs(tabs=metric_tabs)
show(tabs,notebook_handle=True)

The plots above let us explore the topics that were already present in our dataset. That way, we can choose an interesting pair to explore thresholds for.

# Alpha Thresholds

Choosing two topics, we're going to modify the distance between them until their metrics change enough that we can't tell them apart. There are two approaches to this. The first is with no intervention by an LDA model. That is, we change the topics and directly observe how their metrics change. The second is with LDA intervention. We change the topics, generate sample documents from them, fit a model to those sample documents, and observe the metrics from that model.

In [114]:
# Alpha values swept by the experiments: 0.1, 0.2, ..., 0.9.
ALPHA_RANGE = [pct/100 for pct in range(10,100,10)]

# Pieces of the fitted model that the experiment callbacks read.
features = model_components['features']
topic_word = model_components['topic_word']

# BUILD THE DROPDOWN SELECTIONS
# Each entry reads "Topic <n>: <three leading words>".
top_words = fm.get_top_words(model_components)
top_three = [' '.join(top_words[t][:3]) for t in range(fm.K)]
select_list = ['Topic {0}: {1}'.format(t,label) for t,label in enumerate(top_three)]

# Filled by run_experiment: one entry per "<signal>_<noise>" topic pair.
no_intervention_results = {}

def run_experiment(b):
    """Sweep alpha for the selected topic pair and record the distances directly.

    For every value in ALPHA_RANGE the signal topic is passed through
    ``util.alpha_separate`` together with the noise topic, and the three
    distance metrics between the result and the noise topic are stored.
    No model is fitted in this variant.

    The previous version read ``num_docs_slider`` and ``doc_len_slider``,
    which are not defined anywhere in the visible notebook, so every click
    raised NameError. Their values were never used, so the reads are removed.

    Args:
        b: The widget passed by ``Button.on_click``; unused.

    Side effects:
        Stores the result lists in ``no_intervention_results`` under the key
        ``"<signal index>_<noise index>"``, replacing any earlier run for
        the same pair.
    """
    # EXPERIMENT SETTINGS
    signal_idx = signal_selector.index
    noise_idx = noise_selector.index

    # The noise topic is the same for every alpha, so look it up once.
    noise_ = topic_word[noise_idx]

    res_dict = {'Kullback-Leibler Divergence':[],
                'Jensen-Shannon Divergence':[],
                'Cosine Distance':[]}

    for a in ALPHA_RANGE:
        signal_ = util.alpha_separate(topic_word[signal_idx],noise_,a)
        res_dict['Kullback-Leibler Divergence'].append(
            metrics.kullback_leibler_divergence(signal_,noise_))
        res_dict['Jensen-Shannon Divergence'].append(
            metrics.jensen_shannon_divergence(signal_,noise_))
        res_dict['Cosine Distance'].append(
            metrics.cosine_distance(signal_,noise_))

    no_intervention_results['{}_{}'.format(signal_idx,noise_idx)] = res_dict
    
# Container intended for the results of the runs that go through LDA.
intervention_results = {}

# Change model to 2-topics
# NOTE(review): this mutates the shared `fm` object as soon as the cell runs.
# If update_model also changes fm.K, re-running any cell that iterates
# range(fm.K) (the dropdown labels above, the metric plots) would then see
# only two topics - confirm, and consider restoring the model afterwards.
fm.update_model(2)

def run_intervention(b):
    """Sweep alpha for the selected topic pair, measuring topics re-fitted by the model.

    For every value in ALPHA_RANGE the signal topic is passed through
    ``util.alpha_separate`` with the noise topic, synthetic documents are
    generated from the resulting two-topic model, ``fm.get_model`` is fitted
    to those documents, and the three distance metrics are taken between the
    two fitted topics.

    Fixes relative to the previous version:
      * ``num_docs_slider`` / ``doc_len_slider`` are not defined anywhere in
        the visible notebook, so every click raised NameError; their values
        were never used, so the reads are removed.
      * Results were stored in ``no_intervention_results`` under a key built
        from the fitted topic arrays. They now go to ``intervention_results``
        under ``"<signal index>_<noise index>"``, matching ``run_experiment``.

    Args:
        b: The widget passed by ``Button.on_click``; unused.

    Side effects:
        Calls ``fm.get_model`` once per alpha and writes one entry into
        ``intervention_results``.
    """
    # EXPERIMENT SETTINGS
    signal_idx = signal_selector.index
    noise_idx = noise_selector.index
    num_docs = 1000        # rows in the synthetic doc_topic matrix
    # NOTE(review): second argument of util.generate; the original comment
    # only said "Each document" - confirm what it controls.
    generate_size = 1000

    # The noise topic is the same for every alpha, so look it up once.
    noise_ = topic_word[noise_idx]

    res_dict = {'Kullback-Leibler Divergence':[],
                'Jensen-Shannon Divergence':[],
                'Cosine Distance':[]}

    for a in ALPHA_RANGE:
        signal_ = util.alpha_separate(topic_word[signal_idx],noise_,a)

        # Two-topic generative model: every document mixes both topics equally.
        new_components = {}
        new_components['topic_word'] = np.array([signal_,noise_])
        new_components['features'] = model_components['features']
        new_components['doc_topic'] = [[1/2,1/2] for _ in range(num_docs)]
        data_samples = util.generate(new_components,generate_size)

        # Fit the model to the synthetic corpus and compare its two topics.
        fitted_components = fm.get_model(data_samples)
        t0 = fitted_components['topic_word'][0]
        t1 = fitted_components['topic_word'][1]
        res_dict['Kullback-Leibler Divergence'].append(
            metrics.kullback_leibler_divergence(t0,t1))
        res_dict['Jensen-Shannon Divergence'].append(
            metrics.jensen_shannon_divergence(t0,t1))
        res_dict['Cosine Distance'].append(
            metrics.cosine_distance(t0,t1))

    intervention_results['{}_{}'.format(signal_idx,noise_idx)] = res_dict

# Topic pickers. The experiment callbacks read `.index` from these two
# module-level dropdowns when a button is clicked.
signal_selector = Dropdown(description="SIGNAL", options=select_list, index=1)
noise_selector = Dropdown(description="NOISE", options=select_list, index=2)

# NOTE: the variable names are the reverse of what the buttons do.
# `inter_button` starts the run WITHOUT LDA intervention and
# `no_inter_button` the run WITH it. The visible labels are accurate.
inter_button = Button(description="Run Experiment",
                      tooltip="Run Experiment")
no_inter_button = Button(description="Run Experiment (with intervention)",
                         tooltip="Run Experiment (with intervention)")
inter_button.on_click(run_experiment)
no_inter_button.on_click(run_intervention)

# Layout: dropdowns on the first row, buttons on the second.
display(VBox([
    HBox([signal_selector, noise_selector]),
    HBox([inter_button, no_inter_button]),
]))

A Jupyter Widget

In [110]:
def get_tabs(results):
    """Build one Bokeh panel per distance metric from stored experiment runs.

    Args:
        results: Mapping of run label to a mapping of metric name to the
            sequence of values plotted against ALPHA_RANGE.

    Returns:
        A list of three panels (Jensen-Shannon, Kullback-Leibler, Cosine),
        each holding a figure with one line per run in ``results``. Line
        colours cycle through the Spectral10 palette.
    """
    palette = Spectral10[:]
    panels = []
    # BUILD A PANEL FOR EACH METRIC
    for metric in ('Jensen-Shannon Divergence',
                   'Kullback-Leibler Divergence',
                   'Cosine Distance'):
        # Alpha on 'x', plus one column per run holding this metric's values.
        columns = {'x':ALPHA_RANGE}
        columns.update((run_key, run[metric]) for run_key, run in results.items())
        source = ColumnDataSource(columns)
        fig = figure(toolbar_location='above',
                     width=700,
                     height=600,
                     x_axis_label='Alpha',
                     y_axis_label=metric)
        for position, run_key in enumerate(results):
            fig.line(x='x',y=run_key,
                     source=source,legend='Topics: {}'.format(run_key),
                     color=palette[position%len(palette)],line_width=2)
        panels.append(Panel(child=fig,title=metric))
    return panels

# Plot whatever is stored in no_intervention_results at the time this cell
# runs, so re-run it after clicking the experiment button to see new data.
show(Tabs(tabs=get_tabs(no_intervention_results)),notebook_handle=True)

## With LDA Intervention

Now we'll re-run the same experiment, this time with LDA stepping in to predict topics before they get measured with the divergence metrics above.

In [None]:


# Results of the LDA-intervention runs, keyed "<signal index>_<noise index>".
intervention_results = {}

def run_intervention_experiment(b):
    """Sweep alpha for the selected topic pair, measuring topics re-fitted by the model.

    For every value in ALPHA_RANGE the signal topic is passed through
    ``util.alpha_separate`` with the noise topic, synthetic documents are
    generated from the resulting two-topic model, ``fm.get_model`` is fitted
    to them, and the three distance metrics are taken between the two fitted
    topics.

    Fixes relative to the previous version:
      * ``num_docs_slider`` / ``doc_len_slider`` are not defined anywhere in
        the visible notebook, so every call raised NameError; the reads are
        removed.
      * The result was written to ``results``, which is never defined; it is
        now written to ``intervention_results``.
      * The body was a copy of the no-intervention sweep and never fitted a
        model, contradicting the "With LDA Intervention" section. It now
        generates documents and fits the model for each alpha.

    NOTE(review): this assumes ``fm`` has already been switched to a
    two-topic model (``fm.update_model(2)``) - confirm before running.

    Args:
        b: The widget passed by ``Button.on_click``; unused.

    Side effects:
        Calls ``fm.get_model`` once per alpha and writes one entry into
        ``intervention_results``.
    """
    # EXPERIMENT SETTINGS
    signal_idx = signal_selector.index
    noise_idx = noise_selector.index
    num_docs = 1000        # rows in the synthetic doc_topic matrix
    # NOTE(review): second argument of util.generate - confirm what it controls.
    generate_size = 1000

    # The noise topic is the same for every alpha, so look it up once.
    noise_ = topic_word[noise_idx]

    res_dict = {'Kullback-Leibler Divergence':[],
                'Jensen-Shannon Divergence':[],
                'Cosine Distance':[]}

    for a in ALPHA_RANGE:
        signal_ = util.alpha_separate(topic_word[signal_idx],noise_,a)

        # Two-topic generative model: every document mixes both topics equally.
        new_components = {
            'topic_word': np.array([signal_,noise_]),
            'features': model_components['features'],
            'doc_topic': [[1/2,1/2] for _ in range(num_docs)],
        }
        generated_docs = util.generate(new_components,generate_size)

        # Fit the model to the synthetic corpus and compare its two topics.
        fitted_components = fm.get_model(generated_docs)
        t0 = fitted_components['topic_word'][0]
        t1 = fitted_components['topic_word'][1]
        res_dict['Kullback-Leibler Divergence'].append(
            metrics.kullback_leibler_divergence(t0,t1))
        res_dict['Jensen-Shannon Divergence'].append(
            metrics.jensen_shannon_divergence(t0,t1))
        res_dict['Cosine Distance'].append(
            metrics.cosine_distance(t0,t1))

    intervention_results['{}_{}'.format(signal_idx,noise_idx)] = res_dict


# Topic pickers for the LDA-intervention run.
# NOTE: these rebind the module-level `signal_selector` / `noise_selector`
# names, so any callback that reads those globals uses these dropdowns from
# now on, including buttons displayed by earlier cells.
signal_selector = Dropdown(
                    description="SIGNAL",
                    options=select_list,
                    index=1
                  )
noise_selector  = Dropdown(
                    description="NOISE",
                    options=select_list,
                    index=2
                  )
run_button      = Button(
                    description="Run Experiment",
                    tooltip="Run Experiment"
                  )
# This section is the LDA-intervention variant, so the button must trigger
# run_intervention_experiment; it was previously wired to run_experiment.
run_button.on_click(run_intervention_experiment)
display(VBox([HBox([signal_selector,noise_selector]),run_button]))

In [None]:
# Plot the LDA-intervention results: one tab per metric, one line per run.
# This cell previously read an undefined `results` name; the intervention
# runs are stored in `intervention_results`.
mets = ['Jensen-Shannon Divergence',
        'Kullback-Leibler Divergence',
        'Cosine Distance']
colors = Spectral10[:]
tabs = []
# BUILD A PANEL FOR EACH METRIC
for m in mets:
    # Alpha on 'x', plus one column per run holding this metric's values.
    source_dict = {'x':ALPHA_RANGE}
    for k in intervention_results.keys():
        source_dict[k] = intervention_results[k][m]
    source = ColumnDataSource(source_dict)
    fig = figure(x_axis_label='Alpha',
                 y_axis_label=m,
                 height=400,
                 width=700,
                 toolbar_location='above')
    for i, k in enumerate(intervention_results):
        line = fig.line(x='x',y=k,
                        source=source,legend='Topics: {}'.format(k),
                        color=colors[i%len(colors)],line_width=2)
        # One hover tool per line, bound to that line's own column. The old
        # tooltip referenced a column named after the metric, which does not
        # exist in the source (columns are keyed by run label). The braces
        # keep the field reference valid for any run label.
        fig.add_tools(HoverTool(renderers=[line],
                                tooltips=[('Topics',k),
                                          ('Alpha','@x'),
                                          (m,'@{' + k + '}')]))
    tabs.append(Panel(child=fig,title=m))
show(Tabs(tabs=tabs),notebook_handle=True)