# Topic Modeling on NIPS Dataset Using Gaussian LDA w/ Word-Embeddings

In [86]:
import numpy as np
import os
from operator import itemgetter
from collections import Counter
import scipy.stats as stat
from gensim.models import Word2Vec
from nltk import corpus
import FastGaussianLDA2

#### Loading the word-vector model with Gensim

In [97]:
# Load the pre-trained word vectors from a word2vec-style *text* file
# (binary=False). The file name suggests 50-dimensional GloVe vectors.
# NOTE(review): gensim's word2vec text reader expects a "<vocab_size> <dim>"
# header line -- confirm this GloVe file has been converted accordingly.
glove_path = "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt"
wvmodel = Word2Vec.load_word2vec_format(glove_path, binary=False)

# `vector_size` is an integer attribute, not a method: calling it raises
# TypeError ("'int' object is not callable").
print("word-vector dimension: {}".format(wvmodel.vector_size))



#### Sets of vocab to filter on: NLTK stop words and GloVe vocab

In [108]:
# Import the stop-word reader directly instead of going through the name
# `corpus`: the document-cleaning cell rebinds `corpus` to a list, after which
# `corpus.stopwords` would raise AttributeError if this cell were re-run.
from nltk.corpus import stopwords

# Every word the embedding model has a vector for (sets give O(1) membership tests).
wv_vocab = set(wvmodel.vocab)
# English stop words shipped with NLTK.
stops = set(stopwords.words(fileids="english"))

#### Document cleaning
- Tokenizing just on spaces
- no lemmatization or stemming
- removing words that contain non-ASCII characters
- removing stop words
- removing words not in Glove vocab
- removing non-alpha words (e.g. numbers and symbols)
- removing short words (2 characters or fewer)
- lowercasing all words

In [109]:
def _clean_document(raw_text, stopwords, vocab):
    """Return the space-joined tokens of ``raw_text`` that survive filtering.

    The text is split on whitespace and each token is lowercased. A token is
    kept only if it

    * consists solely of ASCII characters,
    * is not in ``stopwords``,
    * is purely alphabetic (drops numbers and symbols),
    * is longer than two characters, and
    * is present in ``vocab``.

    Parameters
    ----------
    raw_text : str
        Full contents of one document.
    stopwords : container of str
        Words to discard.
    vocab : container of str
        Words that are allowed through (here: those with a word vector).

    Returns
    -------
    str
        The kept tokens, in original order, joined by single spaces.
    """
    kept = []
    for token in raw_text.split():
        token = token.lower()
        # Require *every* character to be ASCII. The previous filter returned
        # a list that was truthy as soon as any one character was ASCII, so it
        # never rejected mixed tokens.
        if not all(ord(ch) < 128 for ch in token):
            continue
        if token in stopwords or not token.isalpha():
            continue
        if len(token) > 2 and token in vocab:
            kept.append(token)
    return ' '.join(kept)


# NOTE: this rebinds the name `corpus` (imported from nltk at the top of the
# notebook) to the list of cleaned documents; later cells read this list.
corpus = []
nips_path = "/Users/michael/Documents/GaussianLDA/data/"
# os.listdir returns entries in arbitrary order, so slicing off the first
# entry is not a reliable way to skip stray files such as .DS_Store. Keep only
# real directories, and sort so the document order is reproducible.
for folder in sorted(os.listdir(nips_path)):
    folder_path = os.path.join(nips_path, folder)
    if not os.path.isdir(folder_path):
        continue
    for doc in sorted(os.listdir(folder_path)):
        with open(os.path.join(folder_path, doc), 'r') as f:
            corpus.append(_clean_document(f.read(), stops, wv_vocab))

In [None]:
# Sanity check: report how many cleaned documents were collected in `corpus`.
print("Number of documents in corpus: {}".format(len(corpus)))

In [None]:
# Re-import the sampler module so that edits made to FastGaussianLDA2 since the
# first import are picked up without restarting the kernel.
reload(FastGaussianLDA2)

# Run configuration. Within this cell `dim` and `run_num` are only used to
# label the output file; `topics` is also handed to the model constructor.
topics = 50
dim = 50
run_num = 1
outputfile = "/Users/michael/Documents/GaussianLDA/output/NIPS_{}_{}T_{}D_".format(run_num, topics, dim)

# Build the model on the cleaned documents and the loaded word vectors.
lda = FastGaussianLDA2.Gauss_LDA(
    topics,
    corpus,
    word_vector_model=wvmodel,
    alpha=.5,
    outputfile=outputfile,
)
lda.fit(50)  # Number of samples to run

Done processing corpus with 111 documents
There are 9007 words that could be converted to word vectors in your corpus 
There are 0 words that could NOT be converted to word vectors
getting cluster centroids
[   425.   2899.    465.   1933.    560.   3618.   1019.    840.    752.
   4657.    413.   1823.    818.   2918.   2437.    626.   2464.   2074.
   5779.    579.   2841.    419.    451.    193.   4595.   2902.   2167.
   1466.    537.   6669.   1570.   2227.    720.   1901.    541.    934.
    494.   3562.    728.  10024.   9659.   3327.    446.   5769.   2181.
   3879.   3630.    732.    411.   4745.]
Initialization complete
Starting fit
print topic means
TOPIC 0: (u'lurching', u'wobbling', u'awkwardly', u'veering', u'momentarily', u'nimbly', u'gingerly', u'stealthily', u'yanking')
TOPIC 1: (u'consider', u'decision', u'considering', u'whether', u'accept', u'step', u'possibility', u'should', u'would')
TOPIC 2: (u'maximally', u'describable', u'axiomatically', u'biochemically', u'red