# 06.06 Classwork №4: Cингулярное разложение и Word2Vec

In [2]:
import numpy as np

##Загрузка данных

In [1]:
from sklearn.datasets import fetch_20newsgroups
cats = ['alt.atheism', 'rec.motorcycles',  'rec.autos', 'sci.electronics']
dataset = fetch_20newsgroups(categories=cats, subset='all')

In [4]:
print ("Dataset categories: ", dataset.target_names)
print ("Dataset texts: ", len(dataset.data))

Dataset categories:  ['alt.atheism', 'rec.autos', 'rec.motorcycles', 'sci.electronics']
Dataset texts:  3769


##Векторизация и сингулярное разложение

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(dataset.data)

In [6]:
from sklearn.decomposition import TruncatedSVD
n_comp = 10
svd = TruncatedSVD(n_components=n_comp)
X_svd = svd.fit_transform(X_tfidf)

In [7]:
print (X_svd.shape)
print (svd.components_)

(3769, 10)
[[  5.40635309e-03   1.09778971e-02   2.51324104e-04 ...,   2.12522492e-04
    2.06266117e-04   2.06266117e-04]
 [  9.93053031e-03   1.79756417e-02   8.26957258e-04 ...,   4.12102399e-04
    3.35655130e-04   3.35655130e-04]
 [  4.72569250e-03   2.63779571e-04   6.41619660e-04 ...,   1.91137403e-04
    3.88905662e-04   3.88905662e-04]
 ..., 
 [  1.80630693e-03   2.18007139e-02  -9.47554897e-04 ...,   9.24762463e-05
   -1.09704778e-03  -1.09704778e-03]
 [  1.60778310e-02   1.06610561e-02   1.01579050e-03 ...,  -2.61294700e-04
    2.39940632e-03   2.39940632e-03]
 [ -6.21291720e-03  -7.48404779e-03   3.20521518e-04 ...,   5.25722813e-05
   -1.02095604e-03  -1.02095604e-03]]


## Классификация

In [8]:
from sklearn.cross_validation import train_test_split
y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(X_svd, y, test_size = 0.3, random_state = 1)

N_train, _ = X_train.shape 
N_test,  _ = X_test.shape 

print (N_train, N_test)

2638 1131


In [9]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier().fit(X_train, y_train)

y_train_predict = knn.predict(X_train)
y_test_predict = knn.predict(X_test)

err_train = np.mean(y_train != y_train_predict)
err_test  = np.mean(y_test  != y_test_predict)

print (err_train, err_test)

0.103487490523 0.14765694076


In [10]:
from sklearn.metrics import classification_report
print (classification_report(y_test, y_test_predict))

             precision    recall  f1-score   support

          0       0.98      0.93      0.95       220
          1       0.80      0.81      0.80       317
          2       0.83      0.85      0.84       299
          3       0.84      0.84      0.84       295

avg / total       0.85      0.85      0.85      1131



## Интерпретация топиков

In [11]:
for i_t,i in enumerate(svd.components_,1):
    ind = np.argsort(i)
    print ("Most popular words in category #",i_t,np.asarray(tfidf.get_feature_names())[ind[-10:-1]][:])

Most popular words in category # 1 ['edu' 'you' 'it' 'in' 'that' 'is' 'and' 'of' 'to']
Most popular words in category # 2 ['hp' 'dod' 'sun' 'com' 'ca' 'my' 'on' 'bike' 'car']
Most popular words in category # 3 ['jon' 'wpd' 'solntze' 'caltech' 'sun' 'livesey' 'keith' 'sgi' 'edu']
Most popular words in category # 4 ['god' 'to' 'her' 'egreen' 'green' 'com' 'ed' 'you' 'east']
Most popular words in category # 5 ['sun' 'was' 'caltech' 'com' 'wpd' 'solntze' 'jon' 'livesey' 'sgi']
Most popular words in category # 6 ['my' 'nec' 'have' 'ca' 'behanna' 'livesey' 'sgi' 'to' 'car']
Most popular words in category # 7 ['bike' 'ca' 'bnr' 'demon' 'nec' 'behanna' 'co' 'morgan' 'tony']
Most popular words in category # 8 ['nec' 'beauchaine' 'bobbe' 'behanna' 'sandvik' 'vice' 'he' 'ico' 'tek']
Most popular words in category # 9 ['kent' 'hp' 'beauchaine' 'bobbe' 'vice' 'ico' 'tek' 'sandvik' 'uk']
Most popular words in category # 10 ['vice' 'ico' 'allan' 'pasadena' 'schneider' 'cco' 'morality' 'the'
 'caltech

##Word2vec

In [4]:
import nltk
import string
dataset_tokenized = [nltk.word_tokenize(text) for text in dataset.data]

texts = []
for text in dataset_tokenized:
    lemms = []
    for word in text:
        if word not in string.punctuation:
            lemms.append(word.lower())
    texts.append(lemms)

In [5]:
print (len(texts), len(texts[0]), len(texts[1]))

3769 134 428


In [22]:
from gensim.models import Word2Vec
model = Word2Vec(texts)



In [30]:
model.most_similar(positive=['honda', 'helmet'], negative=['auto'])

[('motorcycle', 0.8870781660079956),
 ('lubed', 0.8047821521759033),
 ('head', 0.7949617505073547),
 ('wife', 0.7829919457435608),
 ('seat', 0.7651523947715759),
 ('wallet', 0.7623045444488525),
 ('hands', 0.7563285827636719),
 ('dog', 0.7508450746536255),
 ('frame', 0.7453364133834839),
 ('house', 0.7442944049835205)]

In [31]:
model.doesnt_match("pc tablet cluster dog".split())

'dog'

In [72]:
model.doesnt_match("circuit motherboard screen code".split())

'motherboard'

А вот и нет. Лишнее слово тут - cat. Всё остальное относится к компьютерам.

In [70]:
model.similarity('god', 'satan')

0.85874348349521235

In [29]:
model['satan']

array([-0.0184273 ,  0.00619601, -0.42118284,  0.28704453,  0.58555907,
       -0.21817216,  0.08192006,  0.01356775, -0.1778322 , -0.15329054,
       -0.12459341,  0.18103896, -0.03870576, -0.1133403 ,  0.11081257,
       -0.01971509, -0.12565637,  0.0716049 , -0.54352778, -0.20821916,
       -0.2454417 , -0.06420164, -0.09143994,  0.23850578, -0.37113723,
       -0.14094523, -0.3717812 ,  0.41719395,  0.13052195, -0.24853545,
        0.39064136, -0.58536935, -0.18217732,  0.0215153 , -0.24226755,
        0.42140931,  0.0378667 ,  0.34588763, -0.48749039, -0.37451002,
        0.17843001,  0.40116221, -0.27440837,  0.14714156,  0.33964202,
        0.34375665, -0.49631339, -0.55757093, -0.08487422, -0.31132206,
        0.06261025,  0.16420554,  0.10875834, -0.06008864,  0.16958977,
        0.31347817, -0.28694597,  0.0787242 , -0.38555342, -0.17179225,
       -0.24191099, -0.05923447,  0.04102996,  0.02831565,  0.21226011,
       -0.1936094 ,  0.00486322, -0.07231928, -0.12243412,  0.13