In [13]:
from lda2vec import preprocess, Corpus
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

# seaborn is optional: importing it only restyles matplotlib figures.
# Catch ImportError specifically so that a missing package is tolerated
# without also swallowing KeyboardInterrupt or unrelated errors.
try:
    import seaborn
except ImportError:
    pass

You must be using a very recent version of pyLDAvis to use the lda2vec outputs. 
As of this writing, anything past Jan 6 2016 or this commit `14e7b5f60d8360eb84969ff08a1b77b365a5878e` should work.
You can do this quickly by installing it directly from master like so:


In [None]:
# pip install -U git+https://github.com/bmabey/pyLDAvis.git@master#egg=pyLDAvis

In [2]:
import pyLDAvis
# Turn on pyLDAvis' notebook integration so prepared visualizations can be
# rendered inline by the display cells further down.
pyLDAvis.enable_notebook()

### Reading in the saved model story topics

After running the `lda2vec_run.py` script in the `examples/hacker_news/lda2vec` directory, `topics.story.pyldavis.npz` and `topics.author.pyldavis.npz` will be created; they contain the topic-to-word probabilities and frequencies. What's left is to visualize each topic and label it from its prevalent words.

In [11]:
# Load the story-level topic arrays saved by the lda2vec run into a plain dict.
# Passing the path lets numpy open the archive in binary mode, and the `with`
# block closes it once every array has been read (previously the file was
# opened in text mode and the handle was never closed).
with np.load('topics.story.pyldavis.npz') as npz:
    # `npz.files` lists the stored array names on both Python 2 and 3,
    # whereas `iteritems()` is a Python 2-era API.
    dat = {k: npz[k] for k in npz.files}
# Keep the vocabulary as a plain Python list rather than an ndarray.
dat['vocab'] = dat['vocab'].tolist()

In [12]:
# Print the `top_n` highest-probability words of every story topic and keep
# them in `topic_to_topwords`, keyed by the 0-based topic index.
top_n = 10
topic_to_topwords = {}
for j, topic_to_word in enumerate(dat['topic_term_dists']):
    # Indices of the top_n largest probabilities, most probable first.
    top = np.argsort(topic_to_word)[::-1][:top_n]
    # Each vocabulary entry is stripped and trimmed to at most 35 characters.
    top_words = [dat['vocab'][i].strip()[:35] for i in top]
    # Topics are displayed 1-based, while the dict key below stays 0-based.
    msg = 'Topic %i ' % (j + 1) + ' '.join(top_words)
    # A parenthesised single-argument print works on both Python 2 and 3.
    print(msg)
    topic_to_topwords[j] = top_words

Topic 1 housing costs rent control more housing affordable housing new housing housing prices taxis house prices gentrifying housing
Topic 2 thinkpad xmonad macbook air xubuntu qwerty x220 optical drive trackpoint window manager external monitor
Topic 3 algebra reading speed calculus common core linear algebra meditation sucrose meditations mathematics deliberate practice
Topic 4 electric cars model s trader joe's broder theaters starz b&#38;n foxconn charging stations top gear
Topic 5 bing g+ bing toolbar cuil google+. ddg google+ rss ddg. duck duck go
Topic 6 tesla&#x27;s hyperloop lyft km&#x2f;h electric cars concorde autonomous vehicles driving cars f-35 autonomous cars
Topic 7 dawkins ebert sorkin atheists gladwell gender roles sexuality ramanujan dolphins marie curie
Topic 8 btc bitcoins mtgox bitcoin denominated mt. gox capital gains bitcoin&#x27;s index fund tax liability
Topic 9 college experience graduates mba program top school cs degrees good school idea guy business skills

### Visualize story topics

In [5]:
import warnings

# Silence all warnings from this point on (the filter is global, so it also
# affects every cell executed after this one).
warnings.filterwarnings('ignore')

# Build the pyLDAvis data structure for the story topics.
# `* 1.0` promotes integer counts to floats before they are handed over.
# sort_topics=False is passed so that pyLDAvis does not reorder the topics —
# presumably to keep the numbering aligned with the printed topic list; confirm.
prepared_data_story = pyLDAvis.prepare(
    topic_term_dists=dat['topic_term_dists'],
    doc_topic_dists=dat['doc_topic_dists'],
    doc_lengths=dat['doc_lengths'] * 1.0,
    vocab=dat['vocab'],
    term_frequency=dat['term_frequency'] * 1.0,
    sort_topics=False,
)

In [6]:
# Render the interactive visualization prepared from the story topics.
pyLDAvis.display(prepared_data_story)

I spent an hour looking through the visualization above and manually labeled each topic, which is reproduced below.

In [164]:
# Hand-written labels for the story topics, one per topic and in topic order
# (labels[i] is looked up by topic index when the top topics are printed).
# Stored as a numpy array so it can be indexed with numpy integer indices.
# Spelling fixed in two labels ("exercise", "constitutionality"), and the
# dangling ", " removed from the latter.
labels = np.array([
    'housing social issues, affordability, rent',
    'computer hardware and monitors',
    'math, language, meditation and education',
    'cars and entertainment',
    'bing, google, facebook, search engines',
    'transportation and military',
    'technology in the media and society',
    'finance and bitcoin',
    'higher education, business and grad schools',
    'sleep, stimulants, and exercise',
    'programming (introductory)',
    'interviews, severance, salaries, reviews',
    'health, dieting and nutrition',
    'civil rights, gay rights, sexual harassment, free speech',
    'internet security, passwords, authentication',
    'physics and computer science',
    'academic success, testing, grades',
    'privacy, FBI, wiretapping',
    'internet media, streaming, advertising, communication',
    'job posting (remote)',
    'online payments, banking, domain registration, user accounts',
    'programming frameworks, stacks, ecosystems, OSs',
    'programming (advanced)',
    'job posting (general)',
    'freelancing, salary, equity',
    'design, typography, user experience',
    'tech culture, stem workers, bootcamps',
    'mental health, introversion, therapy, work/life balance',
    'karma, votes, comments, stories, rss',
    'desktop environments, linux, xp, gnome',
    'programming (theory)',
    'job posting (general)',
    'energy, public policy',
    'programming (browser)',
    'job posting (general)',
    'software patents, patent trolls, patent law',
    'games, gaming hardware and displays',
    'terrorism, surveillance, constitutionality',
    'code editors, programming fonts, terminals',
    'cloud technology, docker, AWS',
])

### Article Features

In [106]:
# Load the Hacker News comment table (path is relative to the notebook's
# working directory; presumably one row per comment — confirm).
# Columns read by later cells: comment_author, story_id, story_time, story_url.
features = pd.read_csv('../data/hacker_news_comments.csv', encoding='utf8')

In [108]:
# Downcast every int64 column of `features` to int32 (modifies `features` in place).
# The dtype is compared with `==` rather than `is`: two dtype objects can
# describe the same type without being the same Python object, so an identity
# test may silently skip columns.
# NOTE(review): values outside the int32 range would not survive the cast —
# assumes all ids/timestamps fit; confirm.
for col, dtype in zip(features.columns, features.dtypes):
    if dtype == np.dtype('int64'):
        features[col] = features[col].astype('int32')

In [110]:
# Tunable thresholds. Of these, only `min_author_comments` is referenced by
# the cells visible in this notebook; `max_length` and `nrows` are not used
# below (NOTE(review): presumably carried over from the training script —
# confirm or remove).
max_length = 250   # Limit of 250 words per comment
min_author_comments = 50  # Exclude authors with fewer comments
nrows = None  # Number of rows of file to read; None reads in full file

In [111]:
# Extract numpy arrays over the fields we want covered by topics and
# convert them to categorical (integer-coded) variables.

# Authors with fewer than `min_author_comments` comments are merged into a
# single 'infrequent_author' bucket before the author codes are assigned.
author_counts = features['comment_author'].value_counts()
to_remove = author_counts[author_counts < min_author_comments].index
mask = features['comment_author'].isin(to_remove).values
author_name = features['comment_author'].values.copy()
author_name[mask] = 'infrequent_author'
features['comment_author'] = author_name
authors = pd.Categorical(features['comment_author'])
author_id = authors.codes
# From here on `author_name` holds the category labels, so that
# author_name[code] gives the author for a given code.
author_name = authors.categories
story_id = pd.Categorical(features['story_id']).codes
# Chop timestamps into whole days elapsed since the earliest story
story_time = pd.to_datetime(features['story_time'], unit='s')
days_since = (story_time - story_time.min()) / pd.Timedelta('1 day')
time_id = days_since.astype('int32')
features['story_id_codes'] = story_id
# Fixed: this column previously received `story_id`, duplicating
# 'story_id_codes' instead of storing the author codes.
features['author_id_codes'] = author_id
features['time_id_codes'] = time_id

#### Individual documents

In [144]:
# Story URLs ordered by how many rows reference them, most frequent first.
top_urls = features['story_url'].value_counts().index
# Select the rows of the entry at position 1, i.e. the second most frequent URL.
mask = features['story_url'] == top_urls[1]
# All selected rows share one URL; take the code and URL from the first of them.
story_id_code = features.loc[mask, 'story_id_codes'].values[0]
story_id_url = features.loc[mask, 'story_url'].values[0]

In [145]:
# Show which story URL was selected.
story_id_url

u'http://googleblog.blogspot.com/2013/03/a-second-spring-of-cleaning.html'

In [153]:
# Topic weights of the selected story: the row of the document-topic matrix
# at the story's integer code.
# NOTE(review): assumes rows of doc_topic_dists are ordered by story code — confirm.
topics = dat['doc_topic_dists'][story_id_code]

In [170]:
# Print the five highest-weighted topics of the selected story as "NN% label".
msg = "{fraction:02d}% {text:s}"
for idx in np.argsort(topics)[::-1][:5]:
    # int() truncates the percentage toward zero; the parenthesised print
    # call works on both Python 2 and 3.
    print(msg.format(fraction=int(100.0 * topics[idx]), text=labels[idx]))

27% bing, google, facebook, search engines
15% karma, votes, comments, stories, rss
08% online payments, banking, domain registration, user accounts
07% internet security, passwords, authentication
05% computer hardware and monitors


Looking at these topics and then reading the [HN article comments](http://googleblog.blogspot.com/2013/03/a-second-spring-of-cleaning.html), this story is about Google Reader shutting down -- it's appropriate that the top topic is about Google itself and the second topic is about RSS.

#### Plots of topics vs time

#### Topics vs points

### Vector Math

In [23]:
from lda2vec_model import LDA2Vec

In [None]:
# NOTE(review): this binds the LDA2Vec class itself, not an instance (nothing
# is called here) — looks like an unfinished placeholder; confirm intent.
model = LDA2Vec

### Visualize Author Topics

Unfortunately, this is a failed experiment! Looking at the user-level topics just generates nonsense. There might be one or two coherent topics in the bunch, but for the most part it makes very little sense.

In [None]:
# Build the pyLDAvis data structure for the *author* topics.
# The author archive is loaded right here: when the notebook is run top to
# bottom, `dat` still holds the story topics at this point, so relying on it
# would silently visualize the story model again.
with np.load('topics.author.pyldavis.npz') as author_npz:
    author_dat = {k: author_npz[k] for k in author_npz.files}
author_dat['vocab'] = author_dat['vocab'].tolist()
# `* 1.0` promotes integer counts to floats before they are handed over.
prepared_data_author = pyLDAvis.prepare(author_dat['topic_term_dists'], author_dat['doc_topic_dists'],
                                        author_dat['doc_lengths'] * 1.0, author_dat['vocab'],
                                        author_dat['term_frequency'] * 1.0)

In [7]:
# Load the author-level topic arrays and print the top words of each topic.
# NOTE: this rebinds `dat`, `top_n` and `topic_to_topwords`, replacing the
# story-level values from earlier cells.
# The archive is opened by path (binary mode) and closed by the `with` block;
# `npz.files` is used instead of the Python 2-era `iteritems()`.
with np.load('topics.author.pyldavis.npz') as npz:
    dat = {k: npz[k] for k in npz.files}
dat['vocab'] = dat['vocab'].tolist()
top_n = 10
topic_to_topwords = {}
for j, topic_to_word in enumerate(dat['topic_term_dists']):
    # Indices of the top_n largest probabilities, most probable first.
    top = np.argsort(topic_to_word)[::-1][:top_n]
    # Each vocabulary entry is stripped and trimmed to at most 35 characters.
    top_words = [dat['vocab'][i].strip()[:35] for i in top]
    # Author topics are printed with their 0-based index.
    msg = 'Topic %i ' % j + ' '.join(top_words)
    # A parenthesised single-argument print works on both Python 2 and 3.
    print(msg)
    topic_to_topwords[j] = top_words

Topic 0 out_of_vocabulary submitted article <SKIP> cognate work-sample test href="http://norvig.com/experiment xtopdf learners democratic self-government p.
Topic 1 out_of_vocabulary  <SKIP> wasen&#x27;t  probally twiddla --and huge engagement &quot;rent
Topic 2  out_of_vocabulary <SKIP> alot ie- ve wasen&#x27;t realy nt bad product
Topic 3 portfolio:<p extensive experience building e-com heta resourceful.<p>reach tax experts music artists here&#x27;re href="http://github.com/sidmitra rel="nofollow">http:&#x2f;&#x2f;www rel="nofollow">http:&#x2f;&#x2f;git
Topic 4 rel="nofollow">http:&#x2f;&#x2f;tur accountants.<p courses.<p tax experts intuit]<p rel="nofollow">http:&#x2f;&#x2f;git href="http://github.com/sidmitra rel="nofollow">http:&#x2f;&#x2f;www music artists out_of_vocabulary
Topic 5 submitted article great web software substantial annual turnover july 2007 limited-time online sales current name-brand goods homeware lifestyle categories tricky integration test postdoctoral fellows


In [9]:
# Render the interactive visualization prepared from the author topics.
pyLDAvis.display(prepared_data_author)