In [30]:
import os
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import re
import enchant
import gensim
from sklearn.ensemble import RandomForestClassifier

In [2]:
def log_progress(sequence, every=None, size=None):
    """Yield the items of ``sequence`` while displaying a notebook progress widget.

    Parameters
    ----------
    sequence : iterable
        Items to iterate over. If it has no ``len()`` and ``size`` is not
        given, it is treated as an iterator of unknown length.
    every : int, optional
        Refresh the widget every ``every`` items. Defaults to 1 for sizes up
        to 200 and to ``size // 200`` (about every 0.5%) otherwise. Required
        when the length is unknown.
    size : int, optional
        Total number of items; defaults to ``len(sequence)`` when available.

    Yields
    ------
    The items of ``sequence``, unchanged and in order.

    Raises
    ------
    AssertionError
        If the length is unknown and ``every`` was not supplied.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                # Integer division on purpose: with true division `every`
                # becomes a float (e.g. 11314 / 200 == 56.57) and
                # `index % every == 0` is then almost never satisfied.
                every = size // 200     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        # Unknown length: show a full bar and only report the running count.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except BaseException:
        # Same reach as the former bare `except:`: any interruption
        # (including KeyboardInterrupt) marks the bar as failed, then
        # propagates unchanged.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')

In [3]:
# Enchant dictionary requested with the 'en' language tag.
dict_eng = enchant.Dict('en')

**Обучение собственной модели:**

In [6]:
# Raw post bodies for both splits, with headers, footers and quoted replies
# removed by the loader.
texts_train, texts_test = (
    fetch_20newsgroups(subset=split, remove=('headers', 'footers', 'quotes')).data
    for split in ('train', 'test')
)

print(len(texts_train), len(texts_test))

11314 7532


In [25]:
# Target class ids for both splits, loaded with the same `remove` settings
# as the texts.
labels_train, labels_test = (
    fetch_20newsgroups(subset=split, remove=('headers', 'footers', 'quotes')).target
    for split in ('train', 'test')
)

print(len(labels_train), len(labels_test))

11314 7532


In [7]:
# Clean the training texts: keep tokens of two or more word characters that
# pass the dictionary check (performed before lower-casing), then join them
# with spaces and lower-case the result.
texts_prep_train = []

for raw_text in log_progress(texts_train):
    candidates = re.findall('(?u)\\b\\w\\w+\\b', raw_text)
    texts_prep_train.append(
        ' '.join(word for word in candidates if dict_eng.check(word)).lower()
    )

VBox(children=(HTML(value=''), IntProgress(value=0, max=11314)))

In [8]:
# Clean the test texts: keep tokens of two or more word characters that
# pass the dictionary check (performed before lower-casing), then join them
# with spaces and lower-case the result.
texts_prep_test = []

for raw_text in log_progress(texts_test):
    candidates = re.findall('(?u)\\b\\w\\w+\\b', raw_text)
    texts_prep_test.append(
        ' '.join(word for word in candidates if dict_eng.check(word)).lower()
    )

VBox(children=(HTML(value=''), IntProgress(value=0, max=7532)))

In [26]:
# Tokenize every cleaned training text. Documents that end up with no tokens
# are skipped together with their label, so both lists stay aligned.
corpus_train = []
labels_train_ = []

for cleaned_text, label in zip(log_progress(texts_prep_train), labels_train):
    doc_tokens = list(gensim.utils.tokenize(cleaned_text, lower=True))
    if not doc_tokens:
        continue
    corpus_train.append(doc_tokens)
    labels_train_.append(label)

VBox(children=(HTML(value=''), IntProgress(value=0, max=11314)))

In [27]:
# Tokenize every cleaned test text. Documents that end up with no tokens
# are skipped together with their label, so both lists stay aligned.
corpus_test = []
labels_test_ = []

for cleaned_text, label in zip(log_progress(texts_prep_test), labels_test):
    doc_tokens = list(gensim.utils.tokenize(cleaned_text, lower=True))
    if not doc_tokens:
        continue
    corpus_test.append(doc_tokens)
    labels_test_.append(label)

VBox(children=(HTML(value=''), IntProgress(value=0, max=7532)))

In [14]:
# Wrap each token list in a TaggedDocument whose single tag is its position
# in the corresponding corpus.
documents_train = [
    gensim.models.doc2vec.TaggedDocument(words=doc_tokens, tags=[doc_id])
    for doc_id, doc_tokens in enumerate(corpus_train)
]
documents_test = [
    gensim.models.doc2vec.TaggedDocument(words=doc_tokens, tags=[doc_id])
    for doc_id, doc_tokens in enumerate(corpus_test)
]

In [19]:
# Train a Doc2Vec model on the tagged training documents only.
model = gensim.models.Doc2Vec(
    documents_train,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

In [20]:
# Infer one vector per training document with the trained model and stack
# them into a single array.
vectors_train = np.asarray(
    [model.infer_vector(doc_tokens) for doc_tokens in corpus_train]
)
print(vectors_train.shape)

(11000, 100)


In [21]:
# Infer one vector per test document with the trained model and stack
# them into a single array.
vectors_test = np.asarray(
    [model.infer_vector(doc_tokens) for doc_tokens in corpus_test]
)
print(vectors_test.shape)

(7303, 100)


In [31]:
# Fit a random forest on the inferred document vectors. `random_state` is
# fixed so that re-running the notebook builds the same forest; without it
# every run yields a different classifier and a different accuracy.
predictor = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    vectors_train, labels_train_
)

In [32]:
# Predicted class id for every test document vector.
preds = predictor.predict(vectors_test)

In [33]:
# Accuracy: fraction of test documents whose predicted label equals the true one.
np.mean(preds == labels_test_)

0.3602629056552102

In [34]:
# Display the predicted labels for visual comparison with the true ones.
preds

array([ 9,  1,  8, ..., 13,  4,  0])

In [39]:
# Display the true test labels (of the non-empty documents) as an array.
np.asarray(labels_test_)

array([ 7,  5,  0, ...,  9,  6, 15])