## Visualizing high-dimensional data with Bokeh and PCA/t-SNE

This notebook demonstrates how to use PCA & t-SNE to project high-dimensional data down to two dimensions, allowing us to reasonably visualize and explore how the data is distributed (**assuming the projections actually worked the way we expected them to**). Rather than the toy digits dataset available from scikit-learn, the data here are Doc2Vec vectors trained on NSF abstracts. This also uses Bokeh, a library that lets us interact with the visualizations from within the notebook. 

In [None]:
import bokeh
from bokeh.io import output_notebook, show
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool, BoxSelectTool, BoxZoomTool, WheelZoomTool
import numpy as np
import gensim
import random
from gensim.models.doc2vec import Doc2Vec, FAST_VERSION, LabeledSentence
import csv
import pickle
import nltk
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
from numpy.linalg import norm
import re
from matplotlib import colors
import six

# Stop early unless gensim reports FAST_VERSION > -1; per the message below,
# training would be very slow otherwise.
assert FAST_VERSION > -1, "this will be painfully slow otherwise"

# Send Bokeh output to the notebook, using the INLINE resources setting.
output_notebook(bokeh.resources.INLINE)



### Loading Data

In [None]:

# CSV file holding the abstracts that the loading cell below reads.
text_fpath = '/Users/Bartley/Desktop/NSF_abstracts_subset.csv'

# Load NLTK's English Punkt sentence tokenizer, then register extra
# abbreviations with it.
# NOTE(review): this reaches into the tokenizer's private `_params`, and the
# visible cells never call `sentence_tokenizer` afterwards -- confirm it is
# still needed.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
extra_abbreviations = ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e', 'e.g', 'ph.d']
sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)




In [None]:

# Tokens shorter than this (after trailing punctuation is stripped) are
# replaced by an empty string when documents are built.
min_len = 2

# Trailing punctuation stripped from every token: commas, periods, underscores
# and hyphens. The hyphen is placed last so it is a literal character; between
# '.' and '_' it would form a range that also matches digits, ':', '/' and
# other symbols, while not matching '-' itself.
patt = r'[,._-]+$'

size = 100       # passed to Doc2Vec as `size`
min_count = 50   # passed to Doc2Vec as `min_count`
num_abs = 500    # bound on how many abstracts the loading loop keeps

# Per-abstract containers filled by the CSV pass below; entries at the same
# position describe the same abstract.
docs = []          # one LabeledSentence per kept abstract
div_dir = []       # row[14:16] per kept abstract (used as the hover label)
abs_uid_map = {}   # not populated in this cell; kept for any other user of the name
directorates = []  # row[14] per kept abstract (used to pick the point colour)
texts = []         # ASCII-escaped abstract text
uids = []          # integer id per kept abstract; its string form is the doc tag

# NOTE(review): the decode/encode step below assumes csv yields byte strings
# (Python 2). Columns 0 / 2 / 14-15 are taken to be id / abstract text /
# directorate fields -- confirm against the CSV header.
with open(text_fpath, 'r') as fp:
    csvReader = csv.reader(fp)

    # Pass 1: collect every abstract id (column 0) after the header row.
    next(csvReader, None)
    all_ids = [row[0] for row in csvReader]

    # Keep a random subset of at most num_abs ids. The subset must be smaller
    # than the pool: testing membership against the full (merely shuffled)
    # list would accept every row and always keep the first rows of the file.
    # A set also makes each membership test O(1).
    rand_abs = set(random.sample(all_ids, min(num_abs, len(all_ids))))

    # Pass 2: rewind and build one tagged document per sampled abstract.
    fp.seek(0)
    next(csvReader, None)  # skip the header explicitly on the second pass too
    abs_ix = 0
    uid = 0
    for row in csvReader:
        abs_id = row[0]
        if abs_id not in rand_abs:
            continue

        # Normalise to pure ASCII; non-ASCII characters become backslash escapes.
        abs_ = row[2].decode('latin-1', 'backslashreplace').encode('ascii', 'backslashreplace')
        texts.append(abs_)

        # Lower-case, split on whitespace and strip trailing punctuation.
        # Tokens that end up shorter than min_len are kept as '' placeholders.
        # NOTE(review): confirm the '' placeholders are wanted rather than
        # dropping short tokens altogether.
        tokens = []
        for raw_tok in abs_.lower().split():
            tok = re.sub(patt, '', raw_tok)
            tokens.append(tok if len(tok) >= min_len else '')
        docs.append(LabeledSentence(words=tokens, tags=['%d' % (uid)]))

        uids.append(uid)
        div_dir.append(row[14:16])
        directorates.append(row[14])

        uid += 1
        abs_ix += 1
        # Duplicate ids in the file could match more than num_abs rows, so cap
        # explicitly; `>=` keeps exactly num_abs abstracts rather than one extra.
        if abs_ix >= num_abs:
            break


# Sorted so each directorate gets the same position (and hence the same plot
# colour) on every run.
directorates_uniq = sorted(set(directorates))

In [None]:
# Construct the Doc2Vec model from the tagged abstracts, using the vector size
# and minimum word count set in the parameter cell.
model = Doc2Vec(
    docs,
    size=size,
    window=5,
    min_count=min_count,
    dm=0,
    dm_concat=0,
    workers=4,
    dbow_words=1,
    sample=1e-5,
)

# Report the vocabulary size. The call form of print behaves the same on
# Python 2 and 3, whereas the statement form is a syntax error on Python 3.
print(len(model.vocab))



In [None]:
# Fifteen further training passes, reshuffling the documents before each one.
# random.shuffle reorders `docs` in place, so afterwards `docs` no longer
# lines up index-for-index with `texts`, `uids` or `directorates`.
# `range` is used because `xrange` does not exist on Python 3; for a loop this
# small the two are equivalent on Python 2.
for epoch in range(0, 15):
    random.shuffle(docs)
    model.train(docs)

In [None]:
# Look up the model's document vectors for the ids collected in `uids`.
docvecs = model.docvecs[uids]

In [None]:
# Show the document vectors as this cell's output.
docvecs

### Using PCA & t-SNE to reduce the dimensionality of the data

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [None]:
# Input matrix for the PCA step that follows: the document vectors, unchanged.
raw_X = docvecs

In [None]:
# Fit PCA with 25 components on the document vectors and keep the projection.
pca = PCA(n_components=25)
pca_X = pca.fit_transform(raw_X)

In [None]:
# Show the shape of the PCA output as this cell's output.
pca_X.shape

In [None]:
# Run t-SNE with its default settings on the PCA output; columns 0 and 1 of
# the result are used as the plot coordinates later on.
tsne = TSNE()
tsne_X = tsne.fit_transform(pca_X)

In [None]:
# Show the shape of the t-SNE output as this cell's output.
tsne_X.shape

### Using Bokeh to plot the low-dimensional data

In [None]:
%matplotlib inline
# Palette indexed by a directorate's position in `directorates_uniq` when the
# points are coloured. Every entry carries the leading '#' so that each one is
# a well-formed hex colour string.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5",
    "#ffff00", "#ff00ff", "#ff0000", "#00ff00", "#0000ff",
    "#00ffff", "#00ccff", "#00ffcc", "#ff00cc",
])


# Hover card: the point's label and index, followed by the abstract text
# (the text column is truncated to 500 characters when the source is built).
hover = HoverTool(tooltips="""
        <div>
            <span style="font-size: 17px; font-weight: bold; width:400px; display:block;">@label</span>
            <span style="font-size: 15px; color: #966; width:400px; display:block;">[$index]</span>
        </div>
        <div>
            <span style="font-size: 15px;">Text</span>
            <span style="font-size: 8px; color: #696; display:block; width:400px; word-break: break-all;">(@text)</span>
        </div>
""")


p = figure(plot_width=1000, plot_height=1000, tools=[hover, BoxZoomTool(), WheelZoomTool()])

# One palette slot per distinct directorate. The modulo wraps around instead
# of raising IndexError when there are more directorates than colours, and the
# dict avoids a linear list.index() scan for every point.
palette = [str(c) for c in colormap]
color_of = {name: palette[ix % len(palette)]
            for ix, name in enumerate(directorates_uniq)}

# Every per-point column lives in the data source and the glyph refers to the
# columns by name; handing literal arrays to the glyph together with `source=`
# is rejected by later Bokeh releases.
source = ColumnDataSource(data={
    "x": tsne_X[:, 0],
    "y": tsne_X[:, 1],
    "color": [color_of[d] for d in directorates],
    "label": div_dir,
    "text": [text[:500] for text in texts],
})

p.scatter(x="x", y="y", color="color", source=source)

show(p) # show the results