In [3]:
## токенизатор из предыдущих заданий

import re
from collections import Counter

import numpy as np
import pymorphy2


# Morphological analyzer instance, created once at module level and used by tokenize().
MORPH = pymorphy2.MorphAnalyzer()

#-----------------------------------------------------------
# Candidate tokens: maximal runs of word characters (\w).
re_tokenizer = re.compile(r'[\w]+')
# Tokens to discard. tokenize() applies this pattern with .match(), which only
# anchors at the start of the token, so any token that begins with a digit or
# an underscore is rejected.
re_stopwordy = re.compile(r'[\d_]+|[\d]{2,}[\w]+')
# Memo of token -> normal form, filled in by tokenize().
morph_cache = {}


def tokenize(s):
    """Turn a text into a list of normalized word forms.

    The text is lower-cased and every 'ё' is replaced with 'е'. Words are the
    matches of `re_tokenizer`; a word that `re_stopwordy` matches at its start
    is dropped. Each remaining word is replaced by the `normal_form` of its
    first `MORPH.parse` result, memoized in the module-level `morph_cache`.

    Args:
        s: input text.

    Returns:
        List of normal forms, in the order the words occur in the text.
    """

    def lemma_of(word):
        # Only a missing (None) cache entry triggers a new parse.
        lemma = morph_cache.get(word)
        if lemma is None:
            lemma = MORPH.parse(word)[0].normal_form
            morph_cache[word] = lemma
        return lemma

    words = re_tokenizer.findall(s.lower().replace('ё', 'е'))
    return [lemma_of(word) for word in words if re_stopwordy.match(word) is None]

In [65]:
## загрузка обучающей выборки

import os

def _read_first_lines(directory):
    """Return the first line of every non-empty regular file in `directory`.

    Args:
        directory: path of the directory to scan (not recursive).

    Returns:
        List of first lines (trailing newline kept), ordered by file name.
    """
    first_lines = []
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories such as .ipynb_checkpoints, which open() cannot read.
        if not os.path.isfile(path):
            continue
        # NOTE(review): files are decoded with the platform default encoding, as
        # before; pass encoding=... here once the corpus encoding is confirmed.
        with open(path, 'r') as collection_file:
            # Only the first line is used, so avoid reading the whole file.
            first_line = collection_file.readline()
        # readline() returns '' only for an empty file; such a file has no text
        # to train on and is skipped instead of raising IndexError.
        if first_line:
            first_lines.append(first_line)
    return first_lines


train_texts = []
train_labels = []
# One sub-directory per class; every document in it receives that class label.
for class_dir, class_label in (('train/0', 'Политика'), ('train/1', 'Технологии')):
    class_texts = _read_first_lines(class_dir)
    train_texts.extend(class_texts)
    train_labels.extend([class_label] * len(class_texts))

In [139]:
from numpy import log

class NaiveBayes:
    """Naive Bayes text classifier over tokenize() output with additive smoothing.

    The model is trained in the constructor: class priors are document shares
    and per-class word probabilities are smoothed token frequencies.

    Attributes:
        alpha: additive smoothing constant.
        classes: distinct labels, in order of first appearance in `y`.
        vocabulary: sorted list of every distinct token in the training texts.
        class_probs: label -> share of training documents carrying that label.
        word_probs: token -> {label -> smoothed P(token | label)}.
        number_words: total number of tokens over all training documents.
    """

    def __init__(self, X, y, alpha = 1):
        """Tokenize the training texts and estimate all probabilities.

        Args:
            X: iterable of training texts.
            y: sequence of labels, one per text, aligned with `X`.
            alpha: additive smoothing constant (default 1).

        Raises:
            ValueError: if `X` and `y` contain a different number of items.
        """
        docs = [tokenize(text) for text in X]
        if len(docs) != len(y):
            raise ValueError(
                "X and y must have the same length, got %d texts and %d labels"
                % (len(docs), len(y)))

        self.alpha = alpha
        # dict.fromkeys keeps first-appearance order, so the class order (and
        # therefore print order and tie-breaking) is the same on every run,
        # unlike iteration over a set.
        self.classes = list(dict.fromkeys(y))
        self.vocabulary = sorted({token for doc in docs for token in doc})
        self.class_probs = dict()
        self.word_probs = {word: dict() for word in self.vocabulary}
        self.number_words = 0

        vocabulary_size = len(self.vocabulary)
        for n_class in self.classes:
            class_docs = [doc for doc, label in zip(docs, y) if label == n_class]
            self.class_probs[n_class] = len(class_docs) / len(y)

            # Count every token of the class in a single pass instead of
            # re-scanning all documents once per vocabulary word.
            counts = Counter()
            for doc in class_docs:
                counts.update(doc)
            number_words_in_class = sum(counts.values())
            self.number_words += number_words_in_class

            denominator = number_words_in_class + self.alpha * vocabulary_size
            for word in self.vocabulary:
                # Counter returns 0 for tokens that never occur in this class.
                self.word_probs[word][n_class] = (counts[word] + self.alpha) / denominator

    def classification(self, text, verbose=True):
        """Return the most probable class for `text`.

        The score of a class is the log prior plus the sum of log word
        probabilities over the tokens of `text`.

        Args:
            text: text to classify.
            verbose: when true (default), print the score of every class.

        Returns:
            The label with the highest score; on a tie, the one that comes
            first in `self.classes`.
        """
        probs = dict()
        tokens = tokenize(text)
        # Computed lazily so that texts without unseen tokens never evaluate it.
        unseen_log_prob = None
        for n_class in self.classes:
            score = log(self.class_probs[n_class])
            for word in tokens:
                # Dict lookup instead of scanning the vocabulary list.
                per_class = self.word_probs.get(word)
                if per_class is not None:
                    score += log(per_class[n_class])
                else:
                    # NOTE(review): the term for unseen tokens uses the
                    # corpus-wide word count, so it is the same for every class
                    # and does not affect the ranking; kept as is to preserve
                    # the reported scores.
                    if unseen_log_prob is None:
                        unseen_log_prob = log(
                            self.alpha / (self.number_words + self.alpha * len(self.vocabulary)))
                    score += unseen_log_prob
            probs[n_class] = score
            if verbose:
                print("Класс ", n_class, ": ", probs[n_class])
        return max(probs, key=probs.get)
            
                
            


In [140]:
## Classifier usage example
# Train on the loaded corpus with the default alpha.
NB = NaiveBayes(train_texts, train_labels)
sample_text = 'Сайт магазина электроники «М.Видео» опубликовал характеристики смартфона «Яндекс.Телефон» за три дня до релиза. Запись была удалена, но доступна в кэше. Согласно опубликованным данным, смартфон обладает 4 Гб оперативной памяти и 64 Гб внутренней.'
# classification() prints one score line per class and returns the predicted
# label, which is the last expression of the cell.
NB.classification(sample_text)

Класс  Технологии :  -212.64333002704893
Класс  Политика :  -215.69353806539553


'Технологии'