In [1]:
import spacy
import codecs
from __future__ import print_function
nlp = spacy.load('en', disable=['parser', 'ner'])

In [2]:
def tag(token):
    word = token.text
    tag = token.tag_
    return word+'_'+tag

In [75]:
import thinc.extra.datasets
data, _ = thinc.extra.datasets.imdb()
texts, _ = zip(*data[-10000:])

In [76]:
print(len(texts))

10000


In [77]:
filtered = []
for doc in texts:
    filtered.append(' '.join(w.text for w in nlp(doc) if w.pos_ in ['NOUN', 'VERB']))

In [78]:
from collections import Counter
word_freq = Counter()
for t in filtered:
    word_freq.update(t.split())
print(word_freq.most_common(5))

[(u'is', 43748), (u'was', 20075), (u'movie', 17216), (u'film', 16060), (u"'s", 12189)]


In [79]:
vocab = [x[0] for x in word_freq.most_common(1550)]
assert len(vocab) == 1550

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
docs = filtered
count_model = CountVectorizer(vocabulary=vocab)
X = count_model.fit_transform(docs)
Xc = (X.T * X) 
Xc.setdiag(0)
print(Xc.todense())

[[    0 99400 92004 ...   467   432   512]
 [99400     0 47475 ...   206   219   174]
 [92004 47475     0 ...   152   193   143]
 ...
 [  467   206   152 ...     0     2     2]
 [  432   219   193 ...     2     0     0]
 [  512   174   143 ...     2     0     0]]


In [81]:
# invert the vocabulary
di = dict([[v,k] for k,v in count_model.vocabulary_.items()])

In [82]:
Xc.shape

(1550, 1550)

In [83]:
with open('mono.dm',"w") as o:
    for word, counts in zip(vocab, Xc.toarray()):
        o.write(word.encode('utf-8')+' '+" ".join(map(str, counts)))
        o.write('\n')

In [84]:
with open('mono.rows', 'w') as o:
    for w in vocab:
        o.write(w.encode('utf-8'))
        o.write('\n')

In [85]:
with open('mono.cols', 'w') as o:
    for w in vocab:
        o.write(w.encode('utf-8'))
        o.write('\n')

In [86]:
from composes.semantic_space.space import Space

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data = "./mono.dm",
                       rows = "./mono.rows",
                       cols = "./mono.cols",
                       format = "dm")

In [92]:
from composes.similarity.cos import CosSimilarity


[('watch', 0.9999999999999999), ('like', 0.9960006811821124), ('lot', 0.9957635405505887), ('give', 0.9955334607061698), ('watching', 0.9953829088219986)]


In [93]:
for w in word_freq.most_common(5):
    print(my_space.get_neighbours(w[0], 5, CosSimilarity()))

[('is', 0.9999999999999998), ('seemed', 0.8046856388600236), ('sat', 0.7979291245288295), ('went', 0.7901582461772928), ('rented', 0.789837450921472)]
[('was', 1.0000000000000002), ('fails', 0.958414549691608), ('deserves', 0.9567530492543871), ('makes', 0.9534803352206943), ('add', 0.9530460131948736)]
[('movie', 0.9999999999999998), ('show', 0.9791018375391118), ('series', 0.9767045873087746), ('land', 0.9744854424598389), ('belief', 0.9719459802706958)]
[('film', 1.0), ('series', 0.9739397474509586), ('show', 0.973368591107264), ('door', 0.97231024407701), ('return', 0.971558815826768)]
[('is', 0.0), ('was', 0.0), ('movie', 0.0), ('film', 0.0), ("'s", 0.0)]
