In [21]:
import os.path as path

import pandas as pd
from tqdm import tqdm
from joblib import delayed, Parallel, dump
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np

from models.db_session import create_session, global_init
from models.news import News

from classes import Corpus, Document, Vectorizer

In [2]:
# Initialise the DB layer with the SQLite file under db/, then open a session
# that the following cells use for queries.
db_file = path.join('db', 'news.sqlite')
global_init(db_file)
session = create_session()

In [3]:
# Seed passed as `random_state` to train_test_split so the train/test split is reproducible.
SEED = 0

In [4]:
# Build the corpus: load every News row, run the vectorizer over each article's
# full text in parallel, and register one Document per article.
corpus = Corpus()
print('Создаем корпус новостей...')
vectorizer = Vectorizer()
news = session.query(News).all()

# n_jobs=-1 lets joblib use all available CPU cores; tqdm reports progress as
# tasks are dispatched. Results come back in the same order as `news`.
lemmas_per_article = Parallel(n_jobs=-1)(
    delayed(vectorizer)(article.full_text)
    for article in tqdm(news)
)

# Use distinct loop names: iterating with `news` as the loop variable would
# rebind `news` to the last article and lose the full list after this cell.
for article, lemmas in zip(news, lemmas_per_article):
    corpus.add_document(Document(news=article, corpus=corpus, lemmas=lemmas))

Создаем корпус новостей...


100%|██████████| 2126/2126 [02:22<00:00, 14.95it/s]


In [5]:
# Number of entries in corpus.lemmas before the frequency filter is applied.
len(corpus.lemmas)

19429

In [6]:
# Keep only the words that occur at least 5 times.
# NOTE(review): the result of filter() is not reassigned, so it appears to modify
# `corpus` in place — this cell is then not idempotent with respect to earlier displays.

corpus.filter(min_freq=5)
len(corpus.lemmas)

5266

In [7]:
# Write the corpus (after frequency filtering) to corpus.pkl under data/.
corpus_file = path.join('data', 'corpus.pkl')
corpus.save(corpus_file)

In [8]:
# TF-IDF features for the corpus with normalization enabled; used as the model input below.
# Presumably one row per document, aligned with corpus.categories_labels — TODO confirm
# against Corpus.get_tf_idf.
tf_idf = corpus.get_tf_idf(is_normalize=True)

In [9]:
# Hold out 15% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    tf_idf,
    corpus.categories_labels,
    test_size=0.15,
    random_state=SEED,
)

In [10]:
# Multinomial Naive Bayes classifier; alpha is its additive smoothing parameter.
model = MultinomialNB(alpha=0.01)
# Fit on the training split only; the held-out split is used for evaluation later.
model.fit(X_train, y_train)

In [27]:
# Per-class metrics on the training split: this shows how well the model fits
# the data it was trained on, not how well it generalizes.
predicted = model.predict(X_train)
# Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there.
# zero_division=np.nan reports undefined metrics as NaN.
dc = metrics.classification_report(y_train, predicted, zero_division=np.nan, output_dict=True)
# Transpose so each class is a row, round to two decimals, print as a GitHub-style table.
print(pd.DataFrame(dc).T.round(2).to_markdown(tablefmt='github'))

|                   |   precision |   recall |   f1-score |   support |
|-------------------|-------------|----------|------------|-----------|
| 69-я параллель    |        1    |     1    |       1    |      4    |
| Бывший СССР       |        0.95 |     0.89 |       0.92 |    246    |
| Забота о себе     |        0.96 |     1    |       0.98 |     48    |
| Из жизни          |        1    |     1    |       1    |     52    |
| Интернет и СМИ    |        0.94 |     0.84 |       0.89 |     69    |
| Культура          |        0.9  |     1    |       0.95 |     76    |
| Мир               |        0.9  |     0.97 |       0.94 |    355    |
| Моя страна        |        1    |     1    |       1    |     15    |
| Наука и техника   |        0.97 |     0.99 |       0.98 |     75    |
| Путешествия       |        1    |     1    |       1    |     55    |
| Россия            |        0.96 |     0.9  |       0.93 |    315    |
| Силовые структуры |        0.97 |     0.98 |       0.97 |    1

In [28]:
# Per-class metrics on the held-out test split.
predicted = model.predict(X_test)
# Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there.
# zero_division=np.nan reports undefined metrics (e.g. recall for a class with
# no test samples) as NaN.
dc = metrics.classification_report(y_test, predicted, zero_division=np.nan, output_dict=True)
# Transpose so each class is a row, round to two decimals, print as a GitHub-style table.
print(pd.DataFrame(dc).T.round(2).to_markdown(tablefmt='github'))

|                   |   precision |   recall |   f1-score |   support |
|-------------------|-------------|----------|------------|-----------|
| 69-я параллель    |        0    |   nan    |       0    |      0    |
| Бывший СССР       |        0.85 |     0.8  |       0.82 |     50    |
| Забота о себе     |        0.67 |     1    |       0.8  |      4    |
| Из жизни          |        0.64 |     0.9  |       0.75 |     10    |
| Интернет и СМИ    |        0.9  |     0.5  |       0.64 |     18    |
| Культура          |        0.77 |     0.77 |       0.77 |     13    |
| Мир               |        0.81 |     0.92 |       0.86 |     59    |
| Моя страна        |        1    |     0.33 |       0.5  |      3    |
| Наука и техника   |        1    |     0.73 |       0.84 |     11    |
| Путешествия       |        1    |     0.71 |       0.83 |     14    |
| Россия            |        0.83 |     0.84 |       0.83 |     57    |
| Силовые структуры |        0.93 |     0.96 |       0.95 |     

In [13]:
# Serialize the fitted classifier with joblib. The trailing semicolon suppresses
# the cell output (dump returns the list of written file names).
classifier_file = path.join('data', 'classifier.pkl')
dump(model, classifier_file);