In [33]:
import nltk
import string
import numpy
from sklearn.datasets import fetch_20newsgroups
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from gensim.models import Word2Vec


dataset = fetch_20newsgroups(subset='all')

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(dataset.data)

from sklearn.decomposition import TruncatedSVD
n_comp = 10
svd = TruncatedSVD(n_components=n_comp)
X_svd = svd.fit_transform(X_tfidf)

y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(X_svd, y, test_size = 0.3, random_state = 1)

N_train, _ = X_train.shape 
N_test,  _ = X_test.shape 

print (N_train, N_test)

In [22]:

knn = KNeighborsClassifier().fit(X_train, y_train)

y_train_predict = knn.predict(X_train)
y_test_predict = knn.predict(X_test)

err_train = numpy.mean(y_train != y_train_predict)
err_test  = numpy.mean(y_test  != y_test_predict)

print (err_train, err_test)

(0.36014251061249242, 0.5452776795189247)


In [23]:

print (classification_report(y_test, y_test_predict))

             precision    recall  f1-score   support

          0       0.40      0.52      0.45       251
          1       0.21      0.38      0.27       289
          2       0.37      0.38      0.38       318
          3       0.41      0.40      0.40       304
          4       0.32      0.31      0.31       278
          5       0.33      0.39      0.36       290
          6       0.47      0.49      0.48       300
          7       0.31      0.34      0.32       311
          8       0.36      0.38      0.37       297
          9       0.50      0.55      0.52       283
         10       0.74      0.62      0.68       314
         11       0.89      0.75      0.81       312
         12       0.28      0.22      0.24       287
         13       0.34      0.35      0.35       297
         14       0.69      0.51      0.59       313
         15       0.59      0.65      0.62       283
         16       0.58      0.53      0.55       267
         17       0.81      0.72      0.76   

In [34]:
for i_t,i in enumerate(svd.components_,1):
    ind = numpy.argsort(i)
    print ("Popular words in category #",i_t,numpy.asarray(tfidf.get_feature_names())[ind[-10:-1]][:])

('Popular words in category #', 1, array([u'for', u'you', u'it', u'is', u'that', u'in', u'and', u'of', u'to'], 
      dtype='<U180'))
('Popular words in category #', 2, array([u'they', u'we', u'were', u'his', u'was', u'that', u'god', u'he',
       u'of'], 
      dtype='<U180'))
('Popular words in category #', 3, array([u'team', u'armenians', u'drive', u'and', u'on', u'armenian',
       u'game', u'of', u'was'], 
      dtype='<U180'))
('Popular words in category #', 4, array([u'writes', u'article', u'team', u'university', u'in', u'game',
       u'his', u'was', u'he'], 
      dtype='<U180'))
('Popular words in category #', 5, array([u'team', u'him', u'my', u'was', u'it', u'game', u'drive', u'scsi',
       u'his'], 
      dtype='<U180'))
('Popular words in category #', 6, array([u'writes', u're', u'government', u'key', u'clipper', u'netcom',
       u'stratus', u'they', u'you'], 
      dtype='<U180'))
('Popular words in category #', 7, array([u'kent', u'his', u'netcom', u'uk', u'stratus', u

In [26]:
dataset_tokenized = [nltk.word_tokenize(text) for text in dataset.data]

texts = []
for text in dataset_tokenized:
    lemms = []
    for word in text:
        if word not in string.punctuation:
            lemms.append(word.lower())
    texts.append(lemms)

In [27]:
print (len(texts), len(texts[0]), len(texts[1]))

(18846, 158, 133)


In [29]:
model = Word2Vec(texts)
model.most_similar(positive=['black', 'white'], negative=['red'])

[(u'house', 0.5042642951011658),
 (u'called', 0.48002660274505615),
 (u'holes', 0.47237879037857056),
 (u'helios.usq.edu.au', 0.45455724000930786),
 (u'gang', 0.43754374980926514),
 (u'filled', 0.4052257835865021),
 (u'cult', 0.40019476413726807),
 (u'chatnam', 0.38969773054122925),
 (u'inside', 0.36760005354881287),
 (u'burning', 0.35901063680648804)]

In [30]:
model.doesnt_match("Earth Mars Jupiter power".split())

'power'

In [31]:
model.similarity('black', 'red')

0.58408312229729997