In [1]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt

In [2]:
# The five computer-related newsgroups that serve as the class labels.
categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
]
# Load the training split restricted to those groups
# (the dataset is downloaded on first use).
newsgroups = fetch_20newsgroups(subset='train', categories=categories)
# Raw message texts, one string per post.
data = newsgroups.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
# Preview only the first couple of messages: displaying the whole list
# would print every document of the corpus into the notebook output.
data[:2]

["From: lemons@cadsys.enet.dec.com\nSubject: Xremote into X11R6?\nReply-To: lemons@cadsys.enet.dec.com ()\nOrganization: Digital Equipment Corporation\nLines: 12\nX-Newsreader: mxrn 6.18\n\n\nHi!\n\nI remember reading (or hallucinating) that NCD's PC-Xremote functionality had \nbeen given, by NCD, to MIT for inclusion in X11R6.  Is this true?  If so,\n(set mode/cheap) can I just wait for X11R6 to get compressed serial line\nX server support?\n\nThanks!\n\nTerry Lemons\nDigital Equipment Corporation\n",
 'From: jas@ISI.EDU (Jeff Sullivan)\nSubject: ADB Mouse II (ergo) -- when?\nOrganization: USC-ISI\nLines: 11\nDistribution: comp\nNNTP-Posting-Host: tigger.isi.edu\n\n\nWhen is Apple supposed to start bundlign the new ergonomic ADB Mouse\nII with all CPUs sold?\n\njas\n\n--\n--------------------------------------------------------------------------\nJeffrey A. Sullivan             | Research Scientist et al.\njas@isi.edu (Internet)          | Information Sciences Institute\n72511,402    

In [4]:
def accuracy_score_for_classes(
    y_true: np.ndarray,
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Compute the accuracy metric separately for every class.

    For each distinct label found in ``y_true`` only the samples whose
    true label equals it are kept, and accuracy is evaluated on that
    subset.

    y_true - ground-truth class labels
    y_pred - predicted class labels
    Returns a dict: key - class label, value - accuracy for that class.
    """
    # Put both label vectors side by side so they can be filtered together.
    frame = pd.DataFrame(data={'t': y_true, 'p': y_pred})

    def _accuracy_for(label):
        # Keep only the rows whose true label is the requested one.
        rows = frame[frame['t'] == label]
        return accuracy_score(rows['t'].values, rows['p'].values)

    # np.unique yields the labels in sorted order, which fixes the dict order.
    return {label: _accuracy_for(label) for label in np.unique(y_true)}

def print_accuracy_score_for_classes(
    y_true: np.ndarray,
    y_pred: np.ndarray) -> None:
    """
    Print the per-class accuracy as a two-column table.

    y_true - ground-truth class labels
    y_pred - predicted class labels
    Nothing is printed when no classes are found.
    """
    per_class = accuracy_score_for_classes(y_true, y_pred)
    if not per_class:
        return
    # Header row, then one "label <tab> accuracy" row per class.
    print('Метка \t Accuracy')
    for label, acc in per_class.items():
        print('{} \t {}'.format(label, acc))

In [5]:
# Learn the corpus vocabulary; fit() returns the vectorizer itself.
vocabVect = CountVectorizer().fit(data)
# Mapping: token -> column index in the count matrix.
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков - {}'.format(len(corpusVocab)))

Количество сформированных признаков - 66735


In [6]:
# Print a small sample of the vocabulary (entries 1..9 in insertion order)
# in the form token=column_index.
for token, column in list(corpusVocab.items())[1:10]:
    print('{}={}'.format(token, column))

lemons=37752
cadsys=19759
enet=26209
dec=23472
com=21393
subject=55891
xremote=64377
into=34000
x11r6=63320


In [7]:
# Encode every document in `data` with the fitted vectorizer.
# NOTE(review): despite the name, `data` comes from the 'train' subset
# loaded above, so this is not a held-out test set.
test_features = vocabVect.transform(data)
# Show the matrix summary as the cell output.
test_features

<2936x66735 sparse matrix of type '<class 'numpy.int64'>'
	with 406296 stored elements in Compressed Sparse Row format>

In [8]:
# Length of row 0, i.e. the number of feature columns.
# Read it from the matrix shape: calling todense() would materialise the
# entire documents x vocabulary matrix in memory just to measure one row.
test_features.shape[1]

66735

In [9]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vocabVect, 'get_feature_names_out'):
    feature_names = vocabVect.get_feature_names_out()
else:
    feature_names = vocabVect.get_feature_names()
# Show a slice of the (alphabetically ordered) feature names as a plain list.
list(feature_names[100:120])

['013846',
 '0139',
 '014',
 '014237',
 '01451',
 '015',
 '0150',
 '0158',
 '015844',
 '01609',
 '01701',
 '01752',
 '0179',
 '01800',
 '01801',
 '01803',
 '0182',
 '01821',
 '0183',
 '0184']

In [10]:
def VectorizeAndClassify(vectorizers_list, classifiers_list):
    """
    Cross-validate every vectorizer/classifier combination and print the result.

    vectorizers_list - text vectorizers to try
    classifiers_list - classifiers to try
    Each pair is wrapped in a Pipeline and scored on the texts and targets
    of the module-level ``newsgroups`` dataset with 3-fold cross-validation;
    the mean accuracy over the folds is printed. Nothing is returned.
    """
    for vectorizer in vectorizers_list:
        for classifier in classifiers_list:
            model = Pipeline(steps=[("vectorizer", vectorizer), ("classifier", classifier)])
            fold_scores = cross_val_score(
                model,
                newsgroups['data'],
                newsgroups['target'],
                scoring='accuracy',
                cv=3,
            )
            mean_accuracy = fold_scores.mean()
            report_lines = (
                'Векторизация - {}'.format(vectorizer),
                'Модель для классификации - {}'.format(classifier),
                'Accuracy = {}'.format(mean_accuracy),
                '===========================',
            )
            for line in report_lines:
                print(line)

In [11]:
# Both vectorizers reuse the vocabulary already learned from the corpus.
vectorizers_list = [
    CountVectorizer(vocabulary=corpusVocab),
    TfidfVectorizer(vocabulary=corpusVocab),
]
# Classifiers to compare, each with default hyper-parameters.
classifiers_list = [LogisticRegression(), MultinomialNB()]
VectorizeAndClassify(vectorizers_list, classifiers_list)



Векторизация - CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary={'from': 28722, 'lemons': 37752, 'cadsys': 19759, 'enet': 26209, 'dec': 23472, 'com': 21393, 'subject': 55891, 'xremote': 64377, 'into': 34000, 'x11r6': 63320, 'reply': 51283, 'to': 57856, 'organization': 46122, 'digital': 24087, 'equipment': 26441, 'corporation': 22056, 'lines': 38125, '...721, '9959': 12679, 'ins8250a': 33785, 'ins82c50a': 33786, 'ins8250': 33784, 'exceptionally': 26861})
Модель для классификации - LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_s



Векторизация - TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary={'from': 28722, 'lemons': 37752, 'cadsys': 19759, 'enet': 26209, 'dec': 23472, 'com': 21393, 'subject': 55891, 'xremote': 64377, 'into': 34000, 'x11r6': 63320, 'reply': 51283, 'to': 57856, 'organization': 46122, 'digital': 24087, 'equipment': 26441, 'corporation': 22056, 'lines': 38125, '...721, '9959': 12679, 'ins8250a': 33785, 'ins82c50a': 33786, 'ins8250': 33784, 'exceptionally': 26861})
Модель для классификации - LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, m

## Лучшая точность у TfidfVectorizer с LogisticRegression