# Signal Thresholds

The steps for establishing a threshold are as follows:

1. Select two topics
2. Modify the alpha between them
3. Measure the distance between them
4. Generate documents/topics from modified topics

In [1]:
%load_ext autoreload
%autoreload 2

In [32]:
# UTILITIES
import metrics,util
import pandas as pd
import numpy as np
import fixed_model as fm
from scipy.spatial.distance import cosine
# CORPORA
from nltk.corpus import brown
# PLOTTING
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models.widgets import Panel, Tabs
from bokeh.models import (HoverTool,
                          Legend,
                          ColumnDataSource,
                          LinearColorMapper,
                          BasicTicker,
                          PrintfTickFormatter,
                          ColorBar)
# INTERACTIONS
from ipywidgets import (HBox,VBox,Dropdown,IntSlider,
                        FloatSlider,Checkbox,Button,
                        interact_manual,Output)
output_notebook()

# Select a Corpus

We use the Brown corpus, which comes with 15 pre-defined categories.

In [3]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
len(brown.fileids())

500

In [None]:
# Build one document string per Brown file by joining that file's tokens with
# single spaces (one entry per file id, i.e. 500 documents for this corpus).
data_samples = [' '.join(brown.words(f)) for f in brown.fileids()]

In [None]:
# Store the joined documents under the name 'brown_data_samples' so the corpus
# does not have to be re-read on every run.
# NOTE(review): presumably util.dump_pickle pickles to disk — confirm in util.
util.dump_pickle(data_samples,'brown_data_samples')

# Extract Topics from the Corpus

We're using a fixed model, LDA, to extract the topics.

In [5]:
# Load the documents stored under 'brown_data_samples', the same name the
# util.dump_pickle cell above writes to.
# NOTE(review): presumably that dump cell must have been run at least once on
# this machine before this cell can succeed — confirm.
data_samples = util.load_pickle('brown_data_samples')

In [6]:
# Run the fixed (LDA) model on the corpus. The returned object is indexed by
# 'topic_word' and 'features' further down in the notebook.
model_components = fm.get_model(data_samples)

# Choose Signal & Noise Topics

Pick two topics from the ones extracted above. To help with the decision, this interactive plot shows how each topic compares to the rest on the various metrics we have available. You can hover to see the top three words that represent each topic.

In [7]:
# Compute the per-topic metrics named in the log output below. The result is
# used as a mapping later on: it is copied, given an extra 'x' entry, and its
# keys name the metrics that get plotted.
topic_metrics = util.calc_metrics(model_components)

Calculating rank1...done in 0.044s
Calculating average word length...done in 0.091s
Calculating effective size...done in 0.700s
Calculating exclusivity...done in 0.789s
Calculating distance from uniform...done in 1.428s
Calculating distance from corpus...done in 3.258s


In [None]:
# Top words per topic; the first three of each feed the hover tooltip.
top_words = fm.get_top_words(model_components)
hover = HoverTool(tooltips=[('top words','@top_three')])

# A single data source backs every bar chart: all metric columns, the topic
# number on 'x', and the tooltip text in 'top_three'.
source_dict = topic_metrics.copy()
source_dict['x'] = range(fm.K)
source = ColumnDataSource(source_dict)
source.data['top_three'] = [' '.join(top_words[topic][:3])
                            for topic in range(fm.K)]

# Layout options shared by all metric figures.
figure_options = dict(x_axis_label='Topic Number',
                      height=600,
                      width=600,
                      toolbar_location='above')

# One tab per metric. Each figure draws the metric as vertical bars and is given
# the same HoverTool instance.
metric_tabs = []
for m in topic_metrics:
    fig = figure(y_axis_label=m, **figure_options)
    fig.add_tools(hover)
    fig.vbar(x='x', top=m, width=0.75, source=source)
    metric_tabs.append(Panel(child=fig, title=m))

In [None]:
# Names of the pairwise distance functions, looked up by name on `metrics`.
distance_algorithms = ["jensen_shannon_divergence",
                       "kullback_leibler_divergence",
                       "cosine_distance"]

# Loop-invariant values: fetch/build them once rather than once per algorithm
# (or, for the metric lookup below, once per topic pair).
topic_word = model_components['topic_word']
topic_ids = list(range(fm.K))
TOOLS = "hover,save"

# Append one heatmap tab per distance function to the existing metric_tabs list.
for dist_alg in distance_algorithms:
    func = getattr(metrics,dist_alg)
    # Build the whole K x K table in one constructor call so pandas can infer a
    # numeric dtype, instead of filling an empty object-dtype frame cell by cell.
    df = pd.DataFrame([[func(topic_word[i],topic_word[j]) for j in topic_ids]
                       for i in topic_ids],
                      index=topic_ids,columns=topic_ids)
    df.index.name="TopicA"
    df.columns.name="TopicB"
    topicsA = list(df.index)
    topicsB = list(df.columns)
    # Long format: one row per (TopicA, TopicB) pair with its distance in 'dist'.
    p_df = df.stack().reset_index(name='dist')
    # Offset by 0.5 so each unit square is centred on its topic's slot.
    p_df['x'] = p_df["TopicA"] + 0.5
    p_df['y'] = p_df["TopicB"] + 0.5
    # NOTE(review): with low=0/high=1, any distance above 1 (KL divergence is
    # unbounded) maps to the last palette colour — confirm that is intended.
    mapper = LinearColorMapper(palette='Spectral10',low=0,high=1)
    color_bar = ColorBar(color_mapper=mapper,
                         major_label_text_font_size='10pt',
                         border_line_color=None,
                         location=(0,0))
    source = ColumnDataSource(p_df)
    p = figure(title=dist_alg,
               x_range=[str(i) for i in topicsA],y_range=[str(i) for i in topicsB],
               tools=TOOLS, toolbar_location='above')
    # Strip grid and axis decoration so only the coloured cells remain.
    p.grid.grid_line_color=None
    p.axis.axis_line_color=None
    p.axis.major_tick_line_color=None
    p.axis.major_label_text_font_size='10pt'
    p.axis.major_label_standoff=0
    p.rect(x="x",y="y",width=1,height=1,source=source,
           fill_color={'field':'dist','transform':mapper},line_color=None)
    p.add_layout(color_bar, 'right')
    p.select_one(HoverTool).tooltips = [
        ('Topics','@TopicA and @TopicB'),
        (dist_alg,'@dist')
    ]
    metric_tabs.append(Panel(child=p,title=dist_alg))

In [None]:
# Gather every panel accumulated in metric_tabs (the per-metric bar charts and
# the pairwise-distance heatmaps) into one tabbed layout and render it in the
# notebook.
tabs = Tabs(tabs=metric_tabs)
show(tabs,notebook_handle=True)

After exploring the topics in the graphs above, we pick the two to find an alpha-threshold for. We'll go with:

# Alpha-Metric Thresholds

Using the two above topics, we're going to modify alpha until their metrics change enough that we can't tell them apart.

In [40]:
# Pull these two entries out once; run_experiment reads both as globals.
features = model_components['features']
topic_word = model_components['topic_word']

# FOR THE DROPDOWNS
# Each option reads "Topic <n>: <first three top words>".
top_words = fm.get_top_words(model_components)
top_three = [' '.join(top_words[topic][:3]) for topic in range(fm.K)]
select_list = ['Topic {0}: {1}'.format(topic,label)
               for topic,label in enumerate(top_three)]

def run_experiment(b):
    """Button callback: rebuild and display the metric tabs for the current settings.

    Reads the selected signal/noise topics and the ALPHA, D and N values from
    the module-level widgets, alpha-separates the signal topic from the noise
    topic, builds components from the resulting two-topic matrix, generates
    documents from them, fits the fixed model on those documents, and shows the
    resulting metric tabs inside the `out` widget.

    Parameters
    ----------
    b : object
        The argument supplied by ``Button.on_click``; it is not used.

    Side effects
    ------------
    Prints a progress line before each stage, clears `out`, and rebinds the
    module-level name `tabs` to the newly built ``Tabs`` object.

    NOTE(review): the "Priors?" checkbox (`priors_check`) is never read here.
    """
    global tabs
    # EXTRACT THE SETTINGS
    print("Extracting the settings...")
    signal_idx, noise_idx = signal_selector.index, noise_selector.index
    alpha = alpha_slider.value
    num_docs, doc_len = num_docs_slider.value, doc_len_slider.value
    # SEPARATE THE TOPICS
    print("Separating the topics...")
    separated_signal = util.alpha_separate(topic_word[signal_idx],
                                           topic_word[noise_idx],
                                           alpha)
    # Row 0 is the modified signal topic, row 1 the untouched noise topic.
    pair_topic_word = np.array([separated_signal,topic_word[noise_idx]])
    # BUILD MODEL COMPONENTS
    print("Building the components...")
    synthetic_components = util.build_components(pair_topic_word,features,
                                                 num_docs,doc_len)
    # GENERATE DOCUMENTS
    print("Generating the documents...")
    gen_docs = util.generate(synthetic_components,doc_len)
    # FIT THE MODEL
    print("Fitting the model...")
    fitted_components = fm.get_model(gen_docs)
    # BUILD THE PLOT
    print("Building the plots...")
    metric_tabs = util.build_metrics_tabs(fitted_components)
    # SHOW THE PLOT
    out.clear_output()
    with out:
        tabs = Tabs(tabs=metric_tabs)
        show(tabs,notebook_handle=True)
    
# Topic pickers; both list the same "Topic n: words" labels from select_list.
signal_selector = Dropdown(description="SIGNAL", options=select_list, index=1)
noise_selector = Dropdown(description="NOISE", options=select_list, index=2)

# ALPHA in [0, 1]; only reports a new value once the handle is released.
alpha_slider = FloatSlider(description="ALPHA", value=0.5, min=0, max=1,
                           step=0.05, continuous_update=False)

# The two integer sliders that run_experiment reads as D and N.
num_docs_slider = IntSlider(description="D", value=100, min=50, max=1000, step=10)
doc_len_slider = IntSlider(description="N", value=100, min=50, max=1000, step=10)

priors_check = Checkbox(value=False, description="Priors?")

# Clicking the button invokes run_experiment.
run_button = Button(description="Run Experiment", tooltip='Run Experiment')
run_button.on_click(run_experiment)

# Output area that run_experiment clears and draws the tabs into.
out = Output()

# Stack the controls in four rows above the output area.
row_1 = HBox([signal_selector, noise_selector])
row_2 = HBox([num_docs_slider, doc_len_slider])
row_3 = HBox([alpha_slider, priors_check])
row_4 = HBox([run_button])
display(VBox([row_1, row_2, row_3, row_4, out]))

A Jupyter Widget

(2, 42090)

In [51]:
# Sum tw along axis 1, giving one total per row.
# NOTE(review): tw is not assigned in any visible cell — it looks like leftover
# kernel state; confirm where it is defined before relying on this cell.
row_sums = tw.sum(axis=1)

[autoreload of util failed: Traceback (most recent call last):
  File "/Users/cassiancorey/anaconda/lib/python3.5/site-packages/IPython/extensions/autoreload.py", line 246, in check
    superreload(m, reload, self.old_objects)
  File "/Users/cassiancorey/anaconda/lib/python3.5/site-packages/IPython/extensions/autoreload.py", line 369, in superreload
    module = reload(module)
  File "/Users/cassiancorey/anaconda/lib/python3.5/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/Users/cassiancorey/anaconda/lib/python3.5/importlib/__init__.py", line 166, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 626, in _exec
  File "<frozen importlib._bootstrap_external>", line 658, in exec_module
  File "<frozen importlib._bootstrap_external>", line 764, in get_code
  File "<frozen importlib._bootstrap_external>", line 724, in source_to_code
  File "<frozen importlib._bootstrap>", line 222, in _call_with_frames_removed
  File "/Users

In [52]:
tw

array([[ 0.0525033 ,  0.16015236,  0.04656324, ...,  0.05079402,
         0.04631932,  0.04509932],
       [ 0.04658838,  0.06972765,  0.04584494, ...,  0.04760961,
         0.04592824,  0.04420352]])

In [53]:
tw.sum(axis=1)

array([ 3550.35212708,  1972.02522653])

In [54]:
row_sums.shape

(2,)

In [55]:
# Divide every row of tw by that row's total. row_sums is 1-D (shape (2,)), so
# the added trailing axis lets it broadcast across the columns of each row.
norm_topic_word = tw/row_sums[:,np.newaxis]

In [None]:
noise_ = 