In [2]:
import os
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import enchant
from sklearn.neighbors import NearestNeighbors
import re
from scipy.sparse import csr_matrix

In [3]:
def log_progress(sequence, every=None, size=None):
    """Yield the items of ``sequence`` while driving an ipywidgets progress bar.

    Parameters
    ----------
    sequence : iterable
        Items to pass through unchanged, in order.
    every : int, optional
        Refresh the widget on the first item and then on every ``every``-th
        item. When omitted and the size is known, it is 1 for up to 200 items
        and ``size // 200`` (about every 0.5%) otherwise.
    size : int, optional
        Total number of items. When omitted, ``len(sequence)`` is tried; if
        that raises ``TypeError`` the sequence is treated as an iterator of
        unknown length.

    Yields
    ------
    object
        Each record of ``sequence``.

    Raises
    ------
    AssertionError
        If the length is unknown and ``every`` was not supplied.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                # Floor division keeps the step an integer. With true division
                # (Python 3, or `from __future__ import division`) size=300
                # would give 1.5 and `index % every == 0` would skip most
                # refreshes. On Python 2 ints the result is unchanged.
                every = size // 200     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        # Unknown length: show a full bar in "info" style instead of a ratio.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except:
        # Deliberately broad: whatever interrupts the iteration, mark the bar
        # as failed and let the exception propagate unchanged.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        # index stays 0 for an empty sequence, which is shown as '?'.
        label.value = str(index or '?')

In [4]:
# Shared text-processing objects used by the cells below.

# Tf-idf weighting with the built-in English stop-word list and ASCII accent
# stripping; the vocabulary size is not capped.
vectorizer = TfidfVectorizer(strip_accents='ascii', stop_words='english')

# 15 of the 20 newsgroups are used. Left out: misc.forsale, sci.crypt, sci.med,
# sci.space and talk.politics.guns.
initial_categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.electronics',
    'soc.religion.christian',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# en_US dictionary used to decide which tokens count as words.
eng_dict = enchant.Dict('en_US')

In [5]:
# Load the training split of the selected newsgroups with headers, footers and
# quotes removed, then list the fields of the returned object.
ng_train = fetch_20newsgroups(
    subset='train',
    categories=initial_categories,
    remove=('headers', 'footers', 'quotes'),
)
print(ng_train.keys())

['description', 'DESCR', 'filenames', 'target_names', 'data', 'target']


*Выполним фильтрацию несловарных элементов. Для этого сначала необходимо выполнить токенизацию.*

In [6]:
# Rebuild every post from only those tokens (two or more word characters) that
# the en_US dictionary accepts; one output string per input post, same order.
data_prep = []
for raw_text in ng_train.data:
    tokens = re.findall(u'(?u)\\b\\w\\w+\\b', raw_text)
    data_prep.append(' '.join(tok for tok in tokens if eng_dict.check(tok)))

*Составим массив tf-idf индексов для предобработанных текстов*

In [7]:
# Fit the vectorizer on the dictionary-filtered texts (data_prep) rather than
# on the raw posts in ng_train.data, and keep the resulting tf-idf matrix.
vectors_train = vectorizer.fit_transform(data_prep)
vectors_train

<8401x31912 sparse matrix of type '<type 'numpy.float64'>'
	with 446852 stored elements in Compressed Sparse Row format>

*Отфильтруем тексты с пустыми tf-idf индексами*

In [8]:
# Drop the documents whose tf-idf row has no stored entries (nothing survived
# the dictionary and stop-word filtering) and densify the rest.
#
# The per-row counts come straight from the sparse matrix, so the rows are
# selected while still sparse and densified once. This avoids the per-row
# toarray()/asarray/transpose/squeeze chain, which made extra copies, collapsed
# to 1-D when a single document survived, and failed when none did.
row_nnz = vectors_train.getnnz(axis=1)

# data_orig_proc_cor[k] is the row number in vectors_train / data_prep /
# ng_train.data of the k-th kept document; ex_ind lists the dropped rows.
data_orig_proc_cor = np.flatnonzero(row_nnz > 0).tolist()
ex_ind = np.flatnonzero(row_nnz == 0).tolist()

# Dense copy of the kept rows (8144 x 31912 float64 in the recorded run).
vectors_train_proc = vectors_train[data_orig_proc_cor].toarray()

# Index by the kept rows directly instead of testing `i not in ex_ind` on a
# list for every document.
data_proc = [data_prep[i] for i in data_orig_proc_cor]

print(vectors_train_proc)
print(vectors_train_proc.shape)
print(len(data_proc))

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
(8144, 31912)
8144


In [9]:
# Build the nearest-neighbour index over the dense tf-idf rows, using a ball
# tree with Euclidean distance. n_neighbors=1 is set here; the later
# kneighbors call passes n_neighbors=10 explicitly.
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree', metric='euclidean').fit(vectors_train_proc)

---

In [10]:
# Free-text query whose nearest training documents are looked up below.
# NOTE(review): 'harware' looks like a typo for 'hardware'. The training
# vocabulary is built from dictionary-filtered text, so a misspelled token
# presumably contributes nothing to the query vector — confirm whether the
# spelling is intentional before changing it (the recorded outputs depend on it).
request = 'something about computers electronic components harware etc'

*Составим tf-idf для запроса*

In [11]:
# Turn the query into a tf-idf row with the already fitted vectorizer. The
# string goes through vectorizer.decode() first and is wrapped in a one-element
# list so it is treated as a single document. Unlike the training texts, the
# query is not passed through the dictionary filter.
vectors_req = vectorizer.transform([vectorizer.decode(request)])
vectors_req

<1x31912 sparse matrix of type '<type 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [12]:
# Query the index with the densified query row for 10 neighbours. inds holds
# positions within vectors_train_proc (mapped back to corpus rows further
# down); dsts is presumably the matching distances — confirm against the
# kneighbors documentation.
dsts, inds = nbrs.kneighbors(vectors_req.toarray(), n_neighbors=10)

In [13]:
# Remove the length-1 axes from the index array so it can be iterated as a flat
# sequence of row positions, then display it.
inds = np.squeeze(inds)
inds

array([1054,  994,  309, 8121, 3552,   89, 4292, 1459, 3943, 3842])

In [14]:
# Translate positions in the filtered matrix back to row numbers of the
# original corpus: data_orig_proc_cor[k] is the original index of filtered
# row k.
inds_corr = np.asarray([data_orig_proc_cor[ind] for ind in inds])
inds_corr

array([1091, 1026,  319, 8377, 3672,   91, 4447, 1510, 4085, 3975])

*Top-10 оригинальных текстов*

In [21]:
# Print the original, unfiltered post behind each retrieved neighbour, numbered
# from 1. The corpus list is indexed directly: wrapping all of ng_train.data in
# np.asarray would copy every post into one fixed-width string array just to
# pick out a handful of items.
for i, doc_idx in enumerate(inds_corr, start=1):
    print('--------------------------- Top-{} ---------------------------'.format(i))
    print(ng_train.data[doc_idx])
    print('---------------------------- end -----------------------------')
    print('')

--------------------------- Top-1 ---------------------------

 Computers are a special case.. and it's a pretty good idea to
 leave them on.. cuz everytime you turn on a computer, you're 
 putting a surge of electricity through its delicate components.
  Imagine you're turning on your computer 5 or more times a day.
 You're increasing the chances of damaging the chips, memory,
 etc on all the components of your computer. So you may save
 a few cents here and there in electricity bills, but it won't
 look like much when it come time to fix your computer.
---------------------------- end -----------------------------

--------------------------- Top-2 ---------------------------

Good point...also, I wouldn't be surprised that the components
they use off-shore are of inferior quality.  As long as it was
properly designed and robust, premium components are used, it
shouldn't matter where it is assembled.


An amp that runs hot has no bearing on how it's gonna sound.
The amp you have prob