In [1]:
# Visualization of Topic Modeling Results

### https://nbviewer.org/github/bmabey/pyLDAvis/blob/master/notebooks/pyLDAvis_overview.ipynb#topic=2&lambda=1&term=

In [2]:
import numpy as np
import os
import pyLDAvis
import warnings
warnings.filterwarnings('ignore')

# Set directory to load data from
out = os.path.join('..', 'out')

# Question Categories
column_names = {'from_idea_to_product': 'From Idea To Product', 'idea_backstory_motivation':
                'Idea, Backstory & Motivation', 'marketing_strategies': 'Marketing Strategies',
                'background_and_current_work': 'Background and Current Work',
                'challenges_obstacles_mistakes': 'Challenges, Obstacles & Mistakes',
                'main_lessons_and_advice': 'Main Lessons & Advice'}

In [3]:
# Define Helper Functions for Visualization

def get_filename(out, model, column, C, C_Topics, num_topics, max_df, max_features):
    # hstm-all.result.split0.setting=('marketing_strategies', 1e-05, 5e-05, 10, 0.8, 2000).npy
    fname = model + '.visualization.split0' + '.setting=' + str((column, C, C_Topics, num_topics, max_df, max_features)) + '.npy'
    return os.path.join(out, fname)

def load_data(fname):
    return np.load(fname, allow_pickle=True).item()

def print_shapes(model_data):
    print('Topic-Term shape: %s' % str(np.array(model_data['topic_term_dists']).shape))
    print('Doc-Topic shape: %s' % str(np.array(model_data['doc_topic_dists']).shape))


def prepare_visualization(model_data):
    return pyLDAvis.prepare(model_data['topic_term_dists'],
                            model_data['doc_topic_dists'],
                            np.sum(model_data['term_frequency'], axis=1),
                            model_data['vocab'],
                            np.sum(model_data['term_frequency'], axis=0))

def print_topic_with_biggest_influence(topic_weights):
    topic_most_influential_pos = topic_weights.argmax()
    topic_most_influential_neg = topic_weights.argmin()
    print('Topics with biggest influence: ' +  str(topic_most_influential_pos + 1) + ', ' +  str(topic_most_influential_neg + 1))
    print('Weights: ' + str(topic_model['topic_weights'][topic_most_influential_pos]) + ', ' + str(topic_model['topic_weights'][topic_most_influential_neg]))
    
def print_num_topics_by_influence(topic_weights):
    num_pos = len(list(filter(lambda x: (x > 0), topic_weights)))
    num_neg = len(list(filter(lambda x: (x < 0), topic_weights)))
    print('Number of positively influencing topics: ' + str(num_pos))
    print('Number of negatively influencing topics: ' + str(num_neg))

In [4]:
# Main Visualization function
def visualize_topic_model(out, model, column, C, C_Topics, num_topics, max_df, max_features):

    # 1. load_model
    fname = get_filename(out, model, column, C, C_Topics, num_topics, max_df, max_features)
    topic_model = load_data(fname)
    print_shapes(topic_model)

    # 2. prepare visualization
    visual_data = prepare_visualization(topic_model)

    # 3. enable visualization
    return visual_data

In [5]:
# Load Best Hyperparameter Combinations for Each Column

In [6]:
fname_best_results = "../out/hyperparam_combinations.npy"
fname_best_results_npmi = "../out/hyperparam_combinations_npmi.npy"
best_combinations = np.load(fname_best_results, allow_pickle=True)[0]
best_combinations_npmi = np.load(fname_best_results_npmi, allow_pickle=True)[0]
print(best_combinations)
print("-----")
print(best_combinations_npmi)

[['challenges_obstacles_mistakes' '0.74' '0.84' '1.47' '-0.33'
  (1e-05, 0.0001, 20, 0.8, 2000)]
 ['idea_backstory_motivation' '0.44' '0.62' '1.6' '-0.33'
  (1e-06, 0.0001, 20, 0.5, 2000)]
 ['background_and_current_work' '0.5' '0.67' '1.42' '-0.35'
  (0.0001, 5e-06, 20, 0.5, 2000)]
 ['main_lessons_and_advice' '0.62' '0.76' '1.33' '-0.35'
  (1e-06, 0.0001, 10, 0.5, 2000)]
 ['from_idea_to_product' '0.68' '0.81' '1.74' '-0.27'
  (5e-05, 1e-06, 10, 0.5, 2000)]
 ['marketing_strategies' '0.57' '0.72' '1.65' '-0.32'
  (0.0001, 1e-06, 10, 0.5, 2000)]]
-----
[['from_idea_to_product' '0.68' '0.81' '2.37' '-0.11'
  (0.0001, 1e-06, 10, 0.6, 1000)]
 ['main_lessons_and_advice' '0.65' '0.77' '1.96' '-0.12'
  (1e-06, 5e-05, 20, 0.6, 500)]
 ['idea_backstory_motivation' '0.48' '0.63' '2.84' '-0.02'
  (1e-05, 5e-05, 20, 0.5, 500)]
 ['background_and_current_work' '0.5' '0.67' '1.72' '-0.22'
  (0.0001, 1e-06, 10, 0.6, 1000)]
 ['challenges_obstacles_mistakes' '0.78' '0.86' '2.38' '-0.03'
  (1e-05, 0.0001, 3

In [7]:
# PyLDAVisualizations

In [8]:
(c, c_topics, num_topics, max_df, max_features) = best_combinations[0][5]
column = best_combinations[0][0]
fname = get_filename(out, 'hstm-all', column, c, c_topics, num_topics, max_df, max_features)
topic_model = load_data(fname)
print(column_names[column])
print_topic_with_biggest_influence(topic_model['topic_weights'])
print_num_topics_by_influence(topic_model['topic_weights'])
visual_data = prepare_visualization(topic_model)
pyLDAvis.display(visual_data)

Challenges, Obstacles & Mistakes
Topics with biggest influence: 1, 18
Weights: 0.23083389, -0.21097124
Number of positively influencing topics: 11
Number of negatively influencing topics: 9


In [9]:
(c, c_topics, num_topics, max_df, max_features) = best_combinations_npmi[0][5]
column = best_combinations_npmi[0][0]
fname = get_filename(out, 'hstm-all', column, c, c_topics, num_topics, max_df, max_features)
topic_model = load_data(fname)
print(column_names[column])
print_topic_with_biggest_influence(topic_model['topic_weights'])
print_num_topics_by_influence(topic_model['topic_weights'])
visual_data = prepare_visualization(topic_model)
pyLDAvis.display(visual_data)

From Idea To Product
Topics with biggest influence: 3, 7
Weights: 0.30700335, -0.21592718
Number of positively influencing topics: 6
Number of negatively influencing topics: 4


In [10]:
(c, c_topics, num_topics, max_df, max_features) = best_combinations[1][5]
column = best_combinations[1][0]
fname = get_filename(out, 'hstm-all', column, c, c_topics, num_topics, max_df, max_features)
topic_model = load_data(fname)
print(column_names[column])
print_topic_with_biggest_influence(topic_model['topic_weights'])
print_num_topics_by_influence(topic_model['topic_weights'])
visual_data = prepare_visualization(topic_model)
pyLDAvis.display(visual_data)

Idea, Backstory & Motivation
Topics with biggest influence: 1, 18
Weights: 0.23083383, -0.21297146
Number of positively influencing topics: 11
Number of negatively influencing topics: 9


In [11]:
(c, c_topics, num_topics, max_df, max_features) = best_combinations_npmi[1][5]
column = best_combinations_npmi[1][0]
fname = get_filename(out, 'hstm-all', column, c, c_topics, num_topics, max_df, max_features)
topic_model = load_data(fname)
print(column_names[column])
print_topic_with_biggest_influence(topic_model['topic_weights'])
print_num_topics_by_influence(topic_model['topic_weights'])
visual_data = prepare_visualization(topic_model)
pyLDAvis.display(visual_data)

Main Lessons & Advice
Topics with biggest influence: 17, 16
Weights: 0.23058762, -0.21273659
Number of positively influencing topics: 12
Number of negatively influencing topics: 8


In [12]:
(c, c_topics, num_topics, max_df, max_features) = best_combinations[2][5]
column = best_combinations[2][0]
fname = get_filename(out, 'hstm-all', column, c, c_topics, num_topics, max_df, max_features)
topic_model = load_data(fname)
print(column_names[column])
print_topic_with_biggest_influence(topic_model['topic_weights'])
visual_data = prepare_visualization(topic_model)
pyLDAvis.display(visual_data)

Background and Current Work
Topics with biggest influence: 1, 18
Weights: 0.23083389, -0.21097127


In [13]:
(c, c_topics, num_topics, max_df, max_features) = best_combinations_npmi[2][5]
column = best_combinations_npmi[2][0]
fname = get_filename(out, 'hstm-all', column, c, c_topics, num_topics, max_df, max_features)
topic_model = load_data(fname)
print(column_names[column])
print_topic_with_biggest_influence(topic_model['topic_weights'])
visual_data = prepare_visualization(topic_model)
pyLDAvis.display(visual_data)

Idea, Backstory & Motivation
Topics with biggest influence: 17, 16
Weights: 0.22658756, -0.21273656


In [14]:
(c, c_topics, num_topics, max_df, max_features) = best_combinations[3][5]
column = best_combinations[3][0]
fname = get_filename(out, 'hstm-all', column, c, c_topics, num_topics, max_df, max_features)
topic_model = load_data(fname)
print(column_names[column])
print_topic_with_biggest_influence(topic_model['topic_weights'])
print_num_topics_by_influence(topic_model['topic_weights'])
visual_data = prepare_visualization(topic_model)
pyLDAvis.display(visual_data)

Main Lessons & Advice
Topics with biggest influence: 7, 5
Weights: 0.29571918, -0.3013557
Number of positively influencing topics: 4
Number of negatively influencing topics: 6


In [15]:
(c, c_topics, num_topics, max_df, max_features) = best_combinations_npmi[3][5]
column = best_combinations_npmi[3][0]
fname = get_filename(out, 'hstm-all', column, c, c_topics, num_topics, max_df, max_features)
topic_model = load_data(fname)
print(column_names[column])
print_topic_with_biggest_influence(topic_model['topic_weights'])
print_num_topics_by_influence(topic_model['topic_weights'])
visual_data = prepare_visualization(topic_model)
pyLDAvis.display(visual_data)

Background and Current Work
Topics with biggest influence: 3, 7
Weights: 0.30700335, -0.21592721
Number of positively influencing topics: 6
Number of negatively influencing topics: 4


In [16]:
(c, c_topics, num_topics, max_df, max_features) = best_combinations[4][5]
column = best_combinations[4][0]
fname = get_filename(out, 'hstm-all', column, c, c_topics, num_topics, max_df, max_features)
topic_model = load_data(fname)
print(column_names[column])
print_topic_with_biggest_influence(topic_model['topic_weights'])
print_num_topics_by_influence(topic_model['topic_weights'])
visual_data = prepare_visualization(topic_model)
pyLDAvis.display(visual_data)

From Idea To Product
Topics with biggest influence: 7, 5
Weights: 0.29571918, -0.29935572
Number of positively influencing topics: 5
Number of negatively influencing topics: 5


In [17]:
(c, c_topics, num_topics, max_df, max_features) = best_combinations_npmi[4][5]
column = best_combinations_npmi[4][0]
fname = get_filename(out, 'hstm-all', column, c, c_topics, num_topics, max_df, max_features)
topic_model = load_data(fname)
print(column_names[column])
print_topic_with_biggest_influence(topic_model['topic_weights'])
print_num_topics_by_influence(topic_model['topic_weights'])
visual_data = prepare_visualization(topic_model)
pyLDAvis.display(visual_data)

Challenges, Obstacles & Mistakes
Topics with biggest influence: 23, 1
Weights: 0.17212455, -0.15971263
Number of positively influencing topics: 17
Number of negatively influencing topics: 13


In [18]:
(c, c_topics, num_topics, max_df, max_features) = best_combinations[5][5]
column = best_combinations[5][0]
fname = get_filename(out, 'hstm-all', column, c, c_topics, num_topics, max_df, max_features)
topic_model = load_data(fname)
print(column_names[column])
print_topic_with_biggest_influence(topic_model['topic_weights'])
print_num_topics_by_influence(topic_model['topic_weights'])
visual_data = prepare_visualization(topic_model)
pyLDAvis.display(visual_data)

Marketing Strategies
Topics with biggest influence: 7, 5
Weights: 0.29771915, -0.29935572
Number of positively influencing topics: 5
Number of negatively influencing topics: 5


In [19]:
(c, c_topics, num_topics, max_df, max_features) = best_combinations_npmi[5][5]
column = best_combinations_npmi[5][0]
fname = get_filename(out, 'hstm-all', column, c, c_topics, num_topics, max_df, max_features)
topic_model = load_data(fname)
print(column_names[column])
print_topic_with_biggest_influence(topic_model['topic_weights'])
print_num_topics_by_influence(topic_model['topic_weights'])
visual_data = prepare_visualization(topic_model)
pyLDAvis.display(visual_data)

Marketing Strategies
Topics with biggest influence: 3, 29
Weights: 0.16181436, -0.16708112
Number of positively influencing topics: 19
Number of negatively influencing topics: 11


In [139]:
    # Explain why it doesn't make sense to use SHAP values (number of features)
    # Don't forget to reiterate that not only topic but also words and heterogeneity influence outcome (only shows one part of the model)