# Signal Thresholds

The steps for establishing a threshold are as follows:

1. Select two topics
2. Modify the alpha between them
3. Measure the distance between them
4. Generate documents/topics from the modified topics

In [16]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Select a Corpus

We use the Brown corpus, which comes with 15 pre-defined categories.

In [3]:
# NLTK's reader for the Brown corpus.
from nltk.corpus import brown
# Display the category labels the corpus defines (15 in the recorded output).
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
# Number of files in the corpus; each file becomes one document further down.
len(brown.fileids())

500

In [6]:
# One document per corpus file: join the file's tokens into a single
# space-separated string.
data_samples = [' '.join(brown.words(fileid)) for fileid in brown.fileids()]
# Sanity check: there should be exactly one sample per file id.
len(data_samples)

500

In [9]:
# Import the module by name rather than `from util import *`, so it is clear
# where dump_pickle comes from and the notebook namespace is not flooded.
import util
# Store the documents under the name 'brown_data_samples'.
# NOTE(review): presumably this pickles to disk for later reuse; confirm in util.
util.dump_pickle(data_samples, 'brown_data_samples')

True

# Extract Topics from the Corpus

We're using a fixed model, LDA, to extract the topics.

In [30]:
import fixed_model as fm
# Rebuild the documents (one space-joined string per Brown file) so this cell
# only needs `brown` to be in scope, then hand them to the fixed model.
data_samples = [' '.join(tokens) for tokens in map(brown.words, brown.fileids())]
model_components = fm.get_model(data_samples)

# Choose Signal & Noise Topic(s)

Pick two topics from the ones extracted above. To help with the decision, this interactive plot shows how each topic compares to the rest on each of the metrics we have available. You can hover over a bar to see the top three words that represent that topic.

In [38]:
import util
# Compute the per-topic metrics from the fitted model components. The result is
# keyed by metric name and is used as column data for the plots below.
topic_metrics = util.calc_metrics(model_components)

Calculating rank1...done in 0.020s
Calculating average word length...done in 0.043s
Calculating effective size...done in 0.305s
Calculating exclusivity...done in 0.345s
Calculating distance from uniform...done in 0.625s
Calculating distance from corpus...done in 0.976s


In [59]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models.widgets import Panel, Tabs
from bokeh.models import HoverTool, Legend, ColumnDataSource
output_notebook()

# Column data for the bar charts: one column per metric, plus the topic number
# ('x') and the hover text ('top_three'). Work on a copy so topic_metrics keeps
# only the metric columns, which the loop below iterates over.
top_words = fm.get_top_words(model_components)
source_dict = topic_metrics.copy()
# Use a concrete list rather than a lazy range object: column data is expected
# to be a list/array, and this matches how later cells build topic ids.
source_dict['x'] = list(range(fm.K))
source_dict['top_three'] = [' '.join(top_words[t][:3]) for t in range(fm.K)]
source = ColumnDataSource(source_dict)

hover = HoverTool(tooltips=[('top words','@top_three')])

# One tab per metric, each a bar chart of that metric over the topic numbers.
metric_tabs = []
for metric_name in topic_metrics:
    fig = figure(x_axis_label='Topic Number',
                 y_axis_label=metric_name,
                 height=600,
                 width=800,
                 toolbar_location='above')
    fig.add_tools(hover)
    fig.vbar(x='x',top=metric_name,width=0.75,source=source)
    metric_tabs.append(Panel(child=fig, title=metric_name))

tabs = Tabs(tabs=metric_tabs)
show(tabs,notebook_handle=True)

In [74]:
import pandas as pd
import metrics

from scipy.spatial.distance import cosine

from bokeh.io import show
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LinearColorMapper,
    BasicTicker,
    PrintfTickFormatter,
    ColorBar
)
# Imported here as well so this cell does not silently rely on an earlier one.
from bokeh.models.widgets import Panel, Tabs

from bokeh.plotting import figure


def pairwise_distance_frame(vectors, topic_ids, distance):
    """Return a square DataFrame holding distance(vectors[i], vectors[j]).

    Rows (index name 'TopicA') and columns (name 'TopicB') are both labelled
    with ``topic_ids``. Building the frame from a nested list gives it a
    numeric dtype instead of the object dtype of an empty, cell-filled frame.
    """
    rows = [[distance(vectors[i], vectors[j]) for j in topic_ids]
            for i in topic_ids]
    frame = pd.DataFrame(rows, index=topic_ids, columns=topic_ids)
    frame.index.name = 'TopicA'
    frame.columns.name = 'TopicB'
    return frame


def to_heatmap_cells(frame, value_col):
    """Reshape a square distance frame into one row per (TopicA, TopicB) cell.

    The result has columns 'TopicA', 'TopicB', ``value_col`` and the cell
    centre coordinates 'x' and 'y'. The +0.5 offset places each numeric
    coordinate in the middle of its slot on the categorical axes.
    """
    cells = frame.stack().rename(value_col).reset_index()
    cells['x'] = cells['TopicA'] + 0.5
    cells['y'] = cells['TopicB'] + 0.5
    return cells


def make_heatmap(cells, value_col, title, value_label,
                 x_factors, y_factors, mapper, tools):
    """Build one heatmap figure from long-format distance cells.

    Returns ``(figure, source, color_bar)`` so the caller can keep handles to
    all three objects.
    """
    source = ColumnDataSource(cells)
    color_bar = ColorBar(color_mapper=mapper,major_label_text_font_size='10pt',
                         border_line_color=None, location=(0,0))
    fig = figure(title=title,
                 x_range=list(x_factors),y_range=list(y_factors),
                 tools=tools, toolbar_location='above')
    fig.grid.grid_line_color=None
    fig.axis.axis_line_color=None
    fig.axis.major_tick_line_color=None
    fig.axis.major_label_text_font_size='10pt'
    fig.axis.major_label_standoff=0
    fig.rect(x="x",y="y",width=1,height=1,source=source,
             fill_color={'field':value_col,'transform':mapper},line_color=None)
    fig.add_layout(color_bar, 'right')
    fig.select_one(HoverTool).tooltips = [
        ('Topics','@TopicA and @TopicB'),
        (value_label,'@' + value_col)
    ]
    return fig, source, color_bar


# Build the distance matrices between every pair of topics.
topic_ids = list(range(fm.K))
topic_word = model_components['topic_word']
cos_df = pairwise_distance_frame(topic_word, topic_ids, cosine)
jsd_df = pairwise_distance_frame(topic_word, topic_ids,
                                 metrics.jensen_shannon_divergence)
topicsA = list(cos_df.index)
topicsB = list(cos_df.columns)

# The rect glyph needs one row per heatmap cell, so convert to long format.
# Each metric gets its own value column name ('cos' / 'jsd').
p_df = to_heatmap_cells(cos_df, 'cos')
q_df = to_heatmap_cells(jsd_df, 'jsd')

# Plotting objects shared by both panels.
TOOLS = "hover,save"
mapper = LinearColorMapper(palette='Spectral10',low=0,high=1)
x_factors = [str(i) for i in topicsA]
y_factors = [str(i) for i in topicsB]

# COSINE DISTANCE PANEL
p, source_p, color_bar_p = make_heatmap(
    p_df, 'cos', "Cosine Distance Between Topics", 'Cosine Distance',
    x_factors, y_factors, mapper, TOOLS)
# JENSEN SHANNON DIVERGENCE PANEL
q, source_q, color_bar_q = make_heatmap(
    q_df, 'jsd', "Jensen-Shannon Divergence Between Topics", 'JSD',
    x_factors, y_factors, mapper, TOOLS)

distance_tabs = [Panel(child=p,title="Cosine"),
                 Panel(child=q,title="JSD")]
tabs = Tabs(tabs=distance_tabs)
show(tabs,notebook_handle=True)

After exploring the topics in the graphs above, we pick two of them for which to find an alpha threshold. We'll go with:

In [77]:
# Topic numbers chosen from the plots above: one "signal" topic and one
# "noise" topic.
SIGNAL, NOISE = 3, 8
# Show the top words of each chosen topic as a quick sanity check.
print("Signal: ", ' '.join(top_words[SIGNAL]))
print("Noise: ", ' '.join(top_words[NOISE]))

Signal:  af temperature anode surface used cells degrees lines line volume
Noise:  said time new like man did years way just long


# Alpha Threshold

Using the two topics chosen above, we're going to modify alpha until their metrics change enough that we can no longer tell them apart.

Two ways:
- without LDA intervention
- with LDA intervention

In [80]:
from ipywidgets import interact

def update_metrics_plot(alpha):
    """Placeholder for refreshing the metrics plot for ``alpha``.

    Not implemented yet: the function ignores ``alpha`` and returns None.
    """

def update_distance_plot(alpha):
    """Placeholder for refreshing the distance plot for ``alpha``.

    Not implemented yet: the function ignores ``alpha`` and returns None.
    """

def update_plots(alpha):
    """Refresh the metrics plot and then the distance plot for ``alpha``.

    Used as the callback for the interactive alpha slider.
    """
    for refresh in (update_metrics_plot, update_distance_plot):
        refresh(alpha)


    
    
# Render a slider for alpha (min 0, max 1, step 0.05) and call update_plots
# with the new value each time the slider moves.
interact(update_plots,alpha=(0,1,0.05))

A Jupyter Widget

<function __main__.update_plots>