In [1]:
from sklearn.datasets import fetch_20newsgroups

### Выбираем категории и загружаем dataset

In [2]:
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train',
    categories=categories, shuffle=True, random_state=42)

In [3]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

### Смотрим на размер выборки

In [4]:
len(twenty_train.data)
len(twenty_train.filenames)

2257

In [5]:
print("\n".join(twenty_train.data[0].split("\n")[:3]))
print(twenty_train.target_names[twenty_train.target[0]])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
comp.graphics


In [6]:
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [7]:
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


### Выполняем предварительную обработку признаков

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 35788)

In [9]:
count_vect.vocabulary_.get(u'algorithm')

4690

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer

In [11]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

### Разделяем выборку на train и test

In [12]:
X = X_train_tfidf
y = twenty_train.target

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Обучаем модель

In [15]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
clf = DecisionTreeClassifier(max_depth=10).fit(X_train, y_train)

In [18]:
# clf = RandomForestClassifier(max_depth=10).fit(X_train, y_train)

### Оцениваем качество предсказания модели

In [19]:
predicted = clf.predict_proba(X_test)

In [20]:
from sklearn.metrics import classification_report

In [21]:
print(classification_report(y_pred=predicted.argmax(axis=1), y_true=y_test))

             precision    recall  f1-score   support

          0       0.82      0.66      0.73       143
          1       0.46      0.94      0.62       184
          2       0.89      0.49      0.63       208
          3       0.91      0.61      0.73       210

avg / total       0.78      0.67      0.67       745



### Обработываем новые данные

In [22]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
