# Comparing My Output to Das Java Output 
## NIPS Dataset with 50 Topics

In [1]:
import numpy as np
import os
from operator import itemgetter
from collections import Counter
import scipy.stats as stat
from gensim.models import Word2Vec
from nltk import corpus

#### Using 50-dim word vectors trained on wikipedia

In [2]:
wvmodel = Word2Vec.load_word2vec_format(
    "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt", binary=False)
print "word-vector dimension: {}".format(wvmodel.vector_size)



word-vector dimension: 50


#### Routine cleaning

stopwords and words in language model

In [3]:
wv_vocab = set(wvmodel.vocab.keys())
stops = set(corpus.stopwords.words(fileids="english"))

In [4]:
corpus = []
nips_path = "/Users/michael/Documents/GaussianLDA/data/"
for folder in os.listdir(nips_path)[1:]:
    for doc in os.listdir(nips_path + folder):
        with open(nips_path + folder + "/" + doc, 'r') as f:
            txt = f.read().split()
            txt = map(lambda x: x.lower(), txt)  # Lowercasing each word
            txt = filter(lambda word: [letter for letter in word if ord(letter) < 128 ], txt)  # Checking each word for ascci error
            txt = filter(lambda x: x not in stops, txt)  # Removing stop words
            txt = filter(lambda x: x.isalpha(), txt)  # Removing non-letter words (eg numbers and symbols)
            txt = filter(lambda x: len(x) > 2, txt)  # removing super short words and single letters
            txt = filter(lambda x: x in wv_vocab, txt)    
            txt = ' '.join(txt)
            corpus.append(txt)
            f.close()

In [5]:
words = []
for doc in corpus:
    for word in doc.split():
        words.append(word)
c = Counter(words)
vocab = set(words)

#### Removing very common words

In [6]:
temp_corp = []
common_words = zip(*c.most_common(n=50))[0]
for doc in corpus:
    doc = doc.split()
    doc = filter(lambda x: x not in common_words, doc)
    doc = " ".join(doc)
    temp_corp.append(doc)
corpus = temp_corp

Less words now!

In [7]:
print len(list(vocab))
print len(words)
vocab = list(vocab)

9007
116819


#### Shaping data for the java model

It requires strange files where words are indexed, and we provide the word-vectors corropsonding to the word indicies. 

In [8]:
wordids = {word: i for i, word in enumerate(vocab)}

In [9]:
vecs = {i: wvmodel[word] for word, i in wordids.iteritems()}

In [10]:
index_corpus = []
for doc in corpus:
    docids = []
    for word in doc.split():
        docids.append(wordids[word])
    index_corpus.append(docids)

Sanity checking output - list (corpus) of lists(docs) of ints(words)

In [14]:
index_corpus[:2]

[[7942,
  2469,
  2542,
  4611,
  3594,
  1744,
  1453,
  1572,
  3379,
  3482,
  6915,
  4678,
  3163,
  6690,
  4794,
  6852,
  7264,
  1221,
  7412,
  6399,
  1445,
  7616,
  4899,
  2016,
  4787,
  931,
  3379,
  7986,
  3335,
  1427,
  4553,
  8979,
  8199,
  8341,
  3347,
  8887,
  1856,
  1367,
  5649,
  3379,
  6503,
  697,
  2207,
  5727,
  4970,
  3764,
  4441,
  1676,
  6061,
  2578,
  2799,
  5832,
  1857,
  428,
  2542,
  5226,
  8985,
  2804,
  2937,
  7421,
  1510,
  3268,
  3322,
  3897,
  8957,
  4812,
  8289,
  6908,
  1427,
  7346,
  6912,
  4292,
  2799,
  5478,
  3773,
  6262,
  7780,
  2804,
  2937,
  8690,
  1895,
  5008,
  7780,
  8655,
  3166,
  6026,
  2578,
  486,
  3349,
  2469,
  7101,
  3166,
  962,
  7974,
  5383,
  2307,
  4576,
  2189,
  2469,
  1329,
  5952,
  1521,
  2048,
  2982,
  3296,
  2318,
  7745,
  6027,
  1257,
  4010,
  3600,
  5374,
  7877,
  2542,
  2469,
  4441,
  605,
  8483,
  2426,
  3335,
  365,
  7942,
  5727,
  8341,
  2207,
  2996,

In [30]:
np.array(vecs.values()).shape

(9007, 50)

Writing to file...it does not want things delmited by a comma

Also, no lists or anything, just new lines for a new word-vector or new document

In [23]:
with open("/Users/michael/Documents/GaussianLDA/dasnips.txt", 'w') as f:
    for doc in index_corpus:
#         f.write(str(doc).replace("[", "").replace("]","") + "\n")
        for word in doc:
            f.write(str(word) + " ")
        f.write("\n")

In [19]:
# lazy way of creating file that did not exist before
with open("/Users/michael/Documents/GaussianLDA/dasnips_vecs50.txt", 'w') as f:
    None

Saving the word-vectors to file

In [89]:
np.savetxt("/Users/michael/Documents/GaussianLDA/dasnips_vecs50.txt", np.array(vecs.values()))

### Loading Das Results

We've run it, now lets look at the results

In [31]:
fp = "/Users/michael/Documents/Gaussian_LDA-master/output/"

In [28]:
!ls /Users/michael/Documents/Gaussian_LDA-master/output/sample50D2Ta

ls: /Users/michael/Documents/Gaussian_LDA-master/output/sample50D2Ta: No such file or directory


In [46]:
from collections import defaultdict

#### Super annoying to load the data in

must read data, turn it into floats, and throw into numpy array..

Only doing the means so we can examine them.  Some instances, they come out all exactly the same. 

#### Throws error b/c of junk with same name that the program outputs.  Just ignore it there for now, it gets our topic params out just fine

In [81]:
das_data = defaultdict(dict)
counter = 1
for f in os.listdir("/Users/michael/Documents/Gaussian_LDA-master/output/"):
    if f.startswith("sample50D2Ta"):
        with open(fp+f, 'r') as datum:

#             print datum.readlines()[1:]
            topics = datum.readlines()
            das_data[counter]['mean'] = np.array([float(mean) for mean in topics[0].split()])
            das_data[counter]['cov'] = topics[1:]
            counter += 1

ValueError: could not convert string to float: Starting

In [82]:
das_data[1]['mean']

## Checking rough meaning of each topic-mean 

Using Gensim Word2Vec model's Most_Similar() method.  This gives us a super quick and dirty idea of the peak of the distribution lays in the coordinate space.   While this may not be informative of the covariance and what words it might produce, its fast + easy

More data munging just to look at it

In [90]:
for k in range(1, 51): # choose on number of topics.. only doing +1 since I was lazy and made it start at one, and not zero. 
    mean = das_data[k]['mean'].T
    print wvmodel.most_similar(positive=[mean])

[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (u'different', 0.8317035436630249)]
[(u'rather', 0.8870659470558167), (u'specific', 0.8716172575950623), (u'certain', 0.87103271484375), (u'particular', 0.8699130415916443), (u'furthermore', 0.8485186100006104), (u'means', 0.8480140566825867), (u'example', 0.8467985391616821), (u'thus', 0.8414475321769714), (u'therefore', 0.8393896222114563), (

In [65]:
for k in range(1, 51):
    das_data[k]['mean'] = np.array(das_data[k]['mean'])
    das_data[k]['cov'] = np.array(das_data[k]['cov'])